In [1]:
import numpy as np
from scipy import special
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
#pip install stop-words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

In [2]:
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."

# compile sample documents into a list
#doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20

dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]

doc_set = data_samples[:10]

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_set:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

#print(dictionary.token2id)
    
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

#print corpus[0]

In [3]:
# find the optimizing values of the vatiational parameters 
# alpha, beta : hyper-parameters
def E_step(words, num_topic, alpha, beta, stop=1e-3, itrMax = 100) : 
    num_word = len(words)
    # initialization
    phi = np.zeros((num_word,num_topic))  # N*K
    phi[:] = 1.0/num_topic  
    gamma = np.zeros(num_topic) 
    gamma = alpha + num_word/num_topic
    
    converged_phi = 1.0
    converged_gamma = 1.0
    
    itr = 0
    while(itr <= itrMax and (converged_phi > stop or converged_gamma > stop )) : 
        itr = itr + 1; 
        phi_new = np.zeros((num_word,num_topic))
        for n in range(0,num_word) :
            for i in range(0,num_topic) :
                w_n = words[n][0]
                phi_new[n,i] = beta[i,w_n]  * np.exp(special.digamma(gamma[i])  -  special.digamma(np.sum(gamma))) 
        phi_new = phi_new / np.sum(phi_new, axis = 1)[:,None]
        gamma_new = alpha + np.sum(phi_new, axis = 0)
        
        converged_phi = np.sum(np.abs(phi_new-phi))
        converged_gamma = np.sum(np.abs(gamma_new-gamma))

        phi = phi_new
        gamma = gamma_new
        ll_new = log_likelihood([words],alpha,beta,[phi_new],[gamma_new])
        print "E step : ", itr, ll_new
    return [phi, gamma]
        
            
def M_step(num_topic,num_vocabulary,corpus,alpha, beta, phi_doc,gamma_doc, stop = 1e-3, itrMax = 100) :
    print "M step before beta update ", log_likelihood(corpus,alpha,beta,phi_doc,gamma_doc)
    beta = np.zeros((num_topic,num_vocabulary))  #K*V      
    M = len(corpus)
    for m in range(0,M) :
        for n in range(0,len(corpus[m])) :
            for i in range(0,num_topic) :
                j = corpus[m][n][0]
                beta[i,j] = beta[i,j] + phi_doc[m][n,i]  * corpus[m][n][1]
    beta = beta / np.sum(beta,axis = 1)[:,None]
    
    print "M step before after update ", log_likelihood(corpus,alpha,beta,phi_doc,gamma_doc)
    
    itr = 0
    ll_new = log_likelihood(corpus,alpha,beta,phi_doc,gamma_doc)
    converged = 1.0
    
    while(itr <= itrMax and converged > stop) : 
        
        itr = itr + 1
        ll_old = ll_new 
        
        if(np.isnan(alpha).any()) : 
            alpha = alpha / 10.0

        g = np.zeros(num_topic)
        g = M * (special.digamma(np.sum(alpha)) - special.digamma(alpha))  #gradient 
        for d in range(0,M) : 
            g = g + special.digamma(gamma_doc[d]) - special.digamma(np.sum(gamma_doc[d]))

        h =  - M * special.polygamma(1,alpha) # vector along the diagonal of hessien
        z =  special.polygamma(1,np.sum(alpha))   # constant
        c = np.sum(g/h)/(1.0/z + np.sum(1.0/h))
        Hg = (g-c)/h
        

        alpha = alpha -  Hg
        ll_new = log_likelihood(corpus,alpha,beta,phi_doc,gamma_doc)
        
        converged = np.abs(ll_new-ll_old)
        print "M step : ", itr, ll_new, np.sqrt(np.sum(Hg**2))
        
    return[beta,alpha]

            
def log_likelihood(corpus,alpha,beta,phi_doc,gamma_doc) : 
    l = 0
    M = len(phi_doc) # number of document
    num_topic = len(alpha)
    for m in range(0,M) : 
        gamma = gamma_doc[m]
        phi = phi_doc[m]
        words = corpus[m]
        len_word = len(words)
        term1 = special.gammaln(np.sum(alpha)) - np.sum(special.gammaln(alpha)) \
                + np.sum((alpha-1)*(special.digamma(gamma)- special.digamma(np.sum(gamma))))
        term2 = np.sum(phi * (special.digamma(gamma)- special.digamma(np.sum(gamma)))) 
        
        l += term1 + term2
        
        term3 = 0
        for n in range(0,len_word) :
            w_n = words[n][0]
            term3 += np.sum(phi[n,:] * np.log(beta[:,w_n]))
        l += term3
       
        term4 = - special.gammaln(np.sum(gamma)) + np.sum(special.gammaln(gamma)) \
              - np.sum((gamma-1)*(special.digamma(gamma)- special.digamma(np.sum(gamma))))
            
        term5 = 0  #term5 = - np.sum(phi * np.log(phi))
        for n in range(0,len_word) :
            for i in range(0,num_topic):
                if(phi[n,i] > 0 ) :
                    term5 += phi[n,i] * np.log(phi[n,i])
        
        l += term4 + term5
            
        if(np.isnan(term4) or np.isinf(term4)) : 
            print "term4 have nan!!!", gamma 
        
        if(np.isnan(term5) or np.isinf(term5)) : 
            print "term5 have nan!!!", phi 
                
    return l


def EM(num_topic,num_vocabulary,alpha,beta,corpus, itrMax = 20) : 
    print -np.sort(-beta,axis = 1)[:,:4]
    phi_doc = []  # list of size M
    gamma_doc = []  # list of size M 
    
    ## E step : find optimized variational parameters for each document
    for m in range(0,len(corpus)) : # E step  : 
        [phi,gamma] = E_step(corpus[m], num_topic, alpha, beta, 1e-3)
        phi_doc.append(phi)
        gamma_doc.append(gamma) 

    ## M step
    [beta,alpha] = M_step(num_topic,num_vocabulary,corpus,alpha,beta,phi_doc,gamma_doc)
    
    converged = 1.0 
    ll_new = log_likelihood(corpus,alpha,beta,phi_doc,gamma_doc)
    itr = 0
    while(itr <= itrMax and converged > 1e-3 ) :
        ll_old = ll_new 
        itr = itr + 1
        ll_old = ll_new
        phi_doc = []  # list of size M
        gamma_doc = []  # list of size M 
        for m in range(0,len(corpus)) : # E step  : 
            [phi,gamma] = E_step(corpus[m], num_topic, alpha, beta, 1e-3)
            phi_doc.append(phi)
            gamma_doc.append(gamma)      

        ## M step
        [beta,alpha] = M_step(num_topic,num_vocabulary,corpus,alpha,beta,phi_doc,gamma_doc)
        ll_new = log_likelihood(corpus,alpha,beta,phi_doc,gamma_doc)
        converged = np.abs(ll_new - ll_old)
        
        print " EM step iteration " , itr , ll_new
        
    return [ll_new, alpha, beta]

    


In [4]:
old_settings = np.seterr(all='print') 
num_topic = 2
num_w = 3
num_vocabulary = len(dictionary)
alpha = np.zeros(num_topic)
alpha = np.random.rand(num_topic)
beta = np.zeros((num_topic,num_vocabulary))
beta[:] = 1.0/num_vocabulary

[ll,alpha, beta] = EM(num_topic,num_vocabulary,alpha,beta, corpus)
print -np.sort(-beta,axis = 1)[:,:num_w]
print np.argsort(-beta,axis = 1)[:,:num_w]


[[ 0.0012987  0.0012987  0.0012987  0.0012987]
 [ 0.0012987  0.0012987  0.0012987  0.0012987]]
E step :  1 -508.180118881
E step :  2 -508.178774717
E step :  3 -508.176779868
E step :  4 -508.174145079
E step :  5 -508.170880952
E step :  6 -508.166997945
E step :  7 -508.162506378
E step :  8 -508.157416433
E step :  9 -508.151738156
E step :  10 -508.145481458
E step :  11 -508.138656118
E step :  12 -508.131271786
E step :  13 -508.123337981
E step :  14 -508.114864096
E step :  15 -508.105859399
E step :  16 -508.096333033
E step :  17 -508.08629402
E step :  18 -508.075751262
E step :  19 -508.064713541
E step :  20 -508.053189524
E step :  21 -508.041187759
E step :  22 -508.028716682
E step :  23 -508.015784617
E step :  24 -508.002399776
E step :  25 -507.988570261
E step :  26 -507.974304066
E step :  27 -507.959609078
E step :  28 -507.944493079
E step :  29 -507.928963748
E step :  30 -507.913028659
E step :  31 -507.896695287
E step :  32 -507.879971005
E step :  33 -507.8

E step :  40 -684.696774506
E step :  41 -684.680662101
E step :  42 -684.664255117
E step :  43 -684.647557592
E step :  44 -684.630573527
E step :  45 -684.613306879
E step :  46 -684.595761569
E step :  47 -684.577941477
E step :  48 -684.559850447
E step :  49 -684.541492282
E step :  50 -684.522870749
E step :  51 -684.503989577
E step :  52 -684.484852458
E step :  53 -684.465463048
E step :  54 -684.445824966
E step :  55 -684.425941794
E step :  56 -684.40581708
E step :  57 -684.385454336
E step :  58 -684.364857039
E step :  59 -684.344028631
E step :  60 -684.32297252
E step :  61 -684.301692081
E step :  62 -684.280190653
E step :  63 -684.258471543
E step :  64 -684.236538026
E step :  65 -684.214393343
E step :  66 -684.192040701
E step :  67 -684.169483278
E step :  68 -684.146724219
E step :  69 -684.123766636
E step :  70 -684.100613612
E step :  71 -684.077268198
E step :  72 -684.053733414
E step :  73 -684.030012249
E step :  74 -684.006107665
E step :  75 -683.9820

E step :  15 -492.02233085
E step :  16 -492.012535367
E step :  17 -492.002215539
E step :  18 -491.991380792
E step :  19 -491.980040421
E step :  20 -491.968203595
E step :  21 -491.955879355
E step :  22 -491.943076618
E step :  23 -491.929804178
E step :  24 -491.916070707
E step :  25 -491.901884759
E step :  26 -491.887254768
E step :  27 -491.872189053
E step :  28 -491.856695817
E step :  29 -491.840783151
E step :  30 -491.824459034
E step :  31 -491.807731333
E step :  32 -491.790607809
E step :  33 -491.773096113
E step :  34 -491.755203792
E step :  35 -491.736938287
E step :  36 -491.718306938
E step :  37 -491.699316982
E step :  38 -491.679975555
E step :  39 -491.660289694
E step :  40 -491.640266341
E step :  41 -491.619912338
E step :  42 -491.599234434
E step :  43 -491.578239283
E step :  44 -491.556933447
E step :  45 -491.535323397
E step :  46 -491.513415513
E step :  47 -491.491216086
E step :  48 -491.468731319
E step :  49 -491.44596733
E step :  50 -491.4229

M step :  77 -7177.59531202 0.152531385399
M step :  78 -7177.58031682 0.150636265117
M step :  79 -7177.5658077 0.14876363539
M step :  80 -7177.55176685 0.146913279652
M step :  81 -7177.53817722 0.145084980481
M step :  82 -7177.52502249 0.143278519839
M step :  83 -7177.51228702 0.141493679281
M step :  84 -7177.49995583 0.139730240159
M step :  85 -7177.48801455 0.1379879838
M step :  86 -7177.47644942 0.136266691671
M step :  87 -7177.46524722 0.134566145539
M step :  88 -7177.45439527 0.132886127602
M step :  89 -7177.4438814 0.131226420629
M step :  90 -7177.43369394 0.129586808066
M step :  91 -7177.42382166 0.127967074155
M step :  92 -7177.41425378 0.126367004027
M step :  93 -7177.40497995 0.124786383794
M step :  94 -7177.39599019 0.123225000634
M step :  95 -7177.38727495 0.121682642865
M step :  96 -7177.37882502 0.120159100015
M step :  97 -7177.37063152 0.118654162886
M step :  98 -7177.36268595 0.117167623611
M step :  99 -7177.35498009 0.115699275704
M step :  100 -7

M step :  8 -6896.63800694 0.440880595996
M step :  9 -6896.51550706 0.420930813538
M step :  10 -6896.40116073 0.401894033852
M step :  11 -6896.29447681 0.38372824348
M step :  12 -6896.19498973 0.366393352322
M step :  13 -6896.10225844 0.349851106603
M step :  14 -6896.01586534 0.334065005805
M step :  15 -6895.9354153 0.319000223373
M step :  16 -6895.86053466 0.304623531009
M step :  17 -6895.7908703 0.290903226399
M step :  18 -6895.72608872 0.277809064186
M step :  19 -6895.66587515 0.265312190051
M step :  20 -6895.60993271 0.253385077756
M step :  21 -6895.55798155 0.24200146899
M step :  22 -6895.50975811 0.231136315915
M step :  23 -6895.46501426 0.220765726254
M step :  24 -6895.42351662 0.210866910839
M step :  25 -6895.38504581 0.201418133466
M step :  26 -6895.34939573 0.192398662989
M step :  27 -6895.31637294 0.183788727522
M step :  28 -6895.28579596 0.175569470678
M step :  29 -6895.25749464 0.167722909729
M step :  30 -6895.23130962 0.160231895631
M step :  31 -689

E step :  4 -172.844428487
E step :  5 -172.438053053
E step :  6 -172.330521128
E step :  7 -172.303072198
E step :  8 -172.296140247
E step :  9 -172.294394606
E step :  1 -2625.09353614
E step :  2 -2530.54636325
E step :  3 -2516.081999
E step :  4 -2513.99955511
E step :  5 -2513.74267647
E step :  6 -2513.71260545
E step :  7 -2513.70911516
E step :  1 -404.639126434
E step :  2 -370.122668745
E step :  3 -362.157304103
E step :  4 -360.707939526
E step :  5 -360.499470935
E step :  6 -360.472317256
E step :  7 -360.468850655
E step :  1 -259.981208549
E step :  2 -238.633223979
E step :  3 -235.740733102
E step :  4 -235.45444402
E step :  5 -235.430754891
E step :  6 -235.42886144
E step :  1 -481.453991751
E step :  2 -445.619561851
E step :  3 -438.49234364
E step :  4 -437.010995809
E step :  5 -436.753571137
E step :  6 -436.713148783
E step :  7 -436.70696005
E step :  8 -436.706016666
M step before beta update  -5446.64434254
M step before after update  -5424.18983355
M s

E step :  45 -372.811213721
E step :  46 -380.795205665
E step :  47 -377.106423336
E step :  48 -712.719651345
E step :  49 -378.9850501
E step :  50 -760.297022721
E step :  51 -382.714700679
E step :  52 -382.720198523
E step :  53 -381.404259471
E step :  54 -373.948103167
E step :  55 -761.506320812
E step :  56 -373.124841577
E step :  57 -381.302963593
E step :  58 -372.812404281
E step :  59 -380.799764773
E step :  60 -376.263878824
E step :  61 -761.506332727
E step :  62 -373.124831415
E step :  63 -381.30234582
E step :  64 -372.81217827
E step :  65 -380.798910771
E step :  66 -376.507844649
E step :  67 -761.506473848
E step :  68 -373.12144068
E step :  69 -381.076316994
E step :  70 -372.763131046
E step :  71 -380.507834006
E step :  72 -374.391726279
E step :  73 -372.97057538
E step :  74 -380.493812247
E step :  75 -372.82038045
E step :  76 -380.826346884
E step :  77 -378.863542641
E step :  78 -376.038192489
E step :  79 -372.76453143
E step :  80 -380.518290122


E step :  81 -512.568541026
E step :  82 -268.4892599
E step :  83 -259.667331535
E step :  84 -268.409763438
E step :  85 -261.170131536
E step :  86 -339.606223909
E step :  87 -261.769944394
E step :  88 -268.453742752
E step :  89 -260.397327494
E step :  90 -512.603248755
E step :  91 -268.500068181
E step :  92 -260.329778639
E step :  93 -268.420391967
E step :  94 -261.081410692
E step :  95 -395.917699017
E step :  96 -268.555448278
E step :  97 -259.89065247
E step :  98 -512.641616694
E step :  99 -268.517780333
E step :  100 -260.817642171
E step :  101 -269.46047504
E step :  1 -195.93208296
E step :  2 -349.218505052
E step :  3 -196.046218836
E step :  4 -191.749069602
E step :  5 -196.036209579
E step :  6 -193.98312291
E step :  7 -377.982264068
E step :  8 -196.078628327
E step :  9 -195.109523787
E step :  10 -195.585813663
E step :  11 -193.181324464
E step :  12 -196.036209579
E step :  13 -193.98312291
E step :  14 -377.982264068
E step :  15 -196.078628327
E step

E step :  41 -294.490164159
E step :  42 -313.348176297
E step :  43 -292.081072723
E step :  44 -288.726373757
E step :  45 -294.554022183
E step :  46 -356.603978728
E step :  47 -288.594763096
E step :  48 -291.48569117
E step :  49 -298.228883353
E step :  50 -289.752813847
E step :  51 -314.379285371
E step :  52 -308.669200721
E step :  53 -574.117905728
E step :  54 -582.500100543
E step :  55 -288.602566068
E step :  56 -291.450821436
E step :  57 -301.316597469
E step :  58 -581.381141891
E step :  59 -288.529412158
E step :  60 -291.53683329
E step :  61 -291.545621274
E step :  62 -291.623048009
E step :  63 -288.582168115
E step :  64 -291.521661975
E step :  65 -294.490164159
E step :  66 -313.348176297
E step :  67 -292.081072723
E step :  68 -288.726373757
E step :  69 -294.554022183
E step :  70 -356.603978728
E step :  71 -288.594763096
E step :  72 -291.48569117
E step :  73 -298.228883353
E step :  74 -289.752813847
E step :  75 -314.379285371
E step :  76 -308.66920

E step :  44 -352.528617993
E step :  45 -353.580746185
E step :  46 -359.627143241
E step :  47 -369.048267264
E step :  48 -368.663749702
E step :  49 -361.015651527
E step :  50 -366.0867442
E step :  51 -353.222890976
E step :  52 -355.557389719
E step :  53 -370.331960024
E step :  54 -363.286204445
E step :  55 -369.826964523
E step :  56 -353.302350907
E step :  57 -354.263125403
E step :  58 -434.068594603
E step :  59 -353.249215465
E step :  60 -355.38391719
E step :  61 -366.937882712
E step :  62 -365.980448456
E step :  63 -352.386936573
E step :  64 -353.295560012
E step :  65 -352.601928961
E step :  66 -2511.71975004
E step :  67 -353.189892498
E step :  68 -355.555753238
E step :  69 -369.206932283
E step :  70 -367.943365906
E step :  71 -353.208025297
E step :  72 -352.528617993
E step :  73 -353.580746185
E step :  74 -359.627143241
E step :  75 -369.048267264
E step :  76 -368.663749702
E step :  77 -361.015651527
E step :  78 -366.0867442
E step :  79 -353.2228909

E step :  53 -1864.29330244
E step :  54 -1971.84851883
E step :  55 -1864.20026692
E step :  56 -1868.85905197
E step :  57 -1868.75296533
E step :  58 -1859.17068427
E step :  59 -5190.71821856
E step :  60 -1867.45904883
E step :  61 -1859.56494988
E step :  62 -1978.75899247
E step :  63 -1859.55513401
E step :  64 -1971.18274882
E step :  65 -1862.0767619
E step :  66 -1869.68543154
E step :  67 -1869.9059307
E step :  68 -1868.15191578
E step :  69 -1869.36625914
E step :  70 -1866.68128538
E step :  71 -1862.03173507
E step :  72 -1869.21632835
E step :  73 -1866.67033874
E step :  74 -1861.96199494
E step :  75 -1868.45159029
E step :  76 -1987.9290582
E step :  77 -1865.61864747
E step :  78 -1876.87882753
E step :  79 -1868.25064005
E step :  80 -1859.98846335
E step :  81 -1862.68782374
E step :  82 -2085.14708651
E step :  83 -1887.95706996
E step :  84 -1868.70786682
E step :  85 -1869.62975247
E step :  86 -1869.32375216
E step :  87 -1864.79473558
E step :  88 -1862.6679

M step :  82 -4766.36292077 2.21148519708
M step :  83 -4772.26040318 1.61589686673
M step :  84 -4785.1475808 0.820290127266
M step :  85 -4771.47879702 0.825739399396
M step :  86 -4788.92639941 0.761896272346
M step :  87 -4778.4203227 1.06082644019
M step :  88 -4766.78907121 1.03801731148
M step :  89 -4751.79603657 0.902421219084
M step :  90 -4741.69126167 1.0689529994
M step :  91 -4729.96602483 1.02208913023
M step :  92 -4716.55280123 0.943973524171
M step :  93 -4706.38641596 1.09127845308
M step :  94 -4682.0295299 0.761566714768
M step :  95 -4693.32255985 0.507236020423
M step :  96 -4709.86961436 1.77403420607
M step :  97 -4734.5472153 2.00978935927
M step :  98 -4762.17299335 2.07427502239
M step :  99 -4792.49552191 2.33200484242
M step :  100 -4792.25918677 0.0737061511414
M step :  101 -4797.81790865 0.699636146861
 EM step iteration  5 -4797.81790865
E step :  1 -319.781624792
E step :  2 -319.868431956
E step :  3 -319.929505289
E step :  4 -319.979324933
E step :

E step :  57 -2468.46003316
E step :  58 -1789.78476603
E step :  59 -1814.73480862
E step :  60 -1783.68023648
E step :  61 -1814.32300332
E step :  62 -2468.46003316
E step :  63 -1789.78476603
E step :  64 -1814.73480862
E step :  65 -1783.68023648
E step :  66 -1814.32300332
E step :  67 -2468.46003316
E step :  68 -1789.78476603
E step :  69 -1814.73480862
E step :  70 -1783.68023648
E step :  71 -1814.32300332
E step :  72 -2468.46003316
E step :  73 -1789.78476603
E step :  74 -1814.73480862
E step :  75 -1783.68023648
E step :  76 -1814.32300332
E step :  77 -2468.46003316
E step :  78 -1789.78476603
E step :  79 -1814.73480862
E step :  80 -1783.68023648
E step :  81 -1814.32300332
E step :  82 -2468.46003316
E step :  83 -1789.78476603
E step :  84 -1814.73480862
E step :  85 -1783.68023648
E step :  86 -1814.32300332
E step :  87 -2468.46003316
E step :  88 -1789.78476603
E step :  89 -1814.73480862
E step :  90 -1783.68023648
E step :  91 -1814.32300332
E step :  92 -2468.4

E step :  23 -498.750180145
E step :  24 -483.676319463
E step :  25 -440.259410016
E step :  26 -428.579239114
E step :  27 -419.408797311
E step :  28 -449.446042339
E step :  29 -498.614055102
E step :  30 -498.723302406
E step :  31 -493.463109517
E step :  32 -415.753717768
E step :  33 -416.558742944
E step :  34 -448.993581862
E step :  35 -436.913184288
E step :  36 -1823.54769753
E step :  37 -498.741270549
E step :  38 -484.511886853
E step :  39 -1823.54769753
E step :  40 -498.741270549
E step :  41 -484.511886853
E step :  42 -1823.54769753
E step :  43 -498.741270549
E step :  44 -484.511886853
E step :  45 -1823.54769753
E step :  46 -498.741270549
E step :  47 -484.511886853
E step :  48 -1823.54769753
E step :  49 -498.741270549
E step :  50 -484.511886853
E step :  51 -1823.54769753
E step :  52 -498.741270549
E step :  53 -484.511886853
E step :  54 -1823.54769753
E step :  55 -498.741270549
E step :  56 -484.511886853
E step :  57 -1823.54769753
E step :  58 -498.74

E step :  7 -460.967856716
E step :  8 -314.693960203
E step :  9 -460.967856716
E step :  10 -314.693960203
E step :  11 -460.967856716
E step :  12 -314.693960203
E step :  13 -460.967856716
E step :  14 -314.693960203
E step :  15 -460.967856716
E step :  16 -314.693960203
E step :  17 -460.967856716
E step :  18 -314.693960203
E step :  19 -460.967856716
E step :  20 -314.693960203
E step :  21 -460.967856716
E step :  22 -314.693960203
E step :  23 -460.967856716
E step :  24 -314.693960203
E step :  25 -460.967856716
E step :  26 -314.693960203
E step :  27 -460.967856716
E step :  28 -314.693960203
E step :  29 -460.967856716
E step :  30 -314.693960203
E step :  31 -460.967856716
E step :  32 -314.693960203
E step :  33 -460.967856716
E step :  34 -314.693960203
E step :  35 -460.967856716
E step :  36 -314.693960203
E step :  37 -460.967856716
E step :  38 -314.693960203
E step :  39 -460.967856716
E step :  40 -314.693960203
E step :  41 -460.967856716
E step :  42 -314.69396

E step :  76 -351.807725292
E step :  77 -360.577084156
E step :  78 -351.813284934
E step :  79 -359.286189016
E step :  80 -612.877116038
E step :  81 -428.612890123
E step :  82 -351.821279044
E step :  83 -359.046948315
E step :  84 -1601.2008727
E step :  85 -351.807716081
E step :  86 -360.576287797
E step :  87 -351.813417155
E step :  88 -359.290554623
E step :  89 -585.183473118
E step :  90 -351.72953469
E step :  91 -457.150285911
E step :  92 -351.807725292
E step :  93 -360.577084156
E step :  94 -351.813284934
E step :  95 -359.286189016
E step :  96 -612.877116038
E step :  97 -428.612890123
E step :  98 -351.821279044
E step :  99 -359.046948315
E step :  100 -1601.2008727
E step :  101 -351.807716081
E step :  1 -315.719294839
E step :  2 -686.801293066
E step :  3 -315.414419296
E step :  4 -399.837671724
E step :  5 -318.551912784
E step :  6 -345.409993243
E step :  7 -314.095706179
E step :  8 -625.059376732
E step :  9 -314.76666474
E step :  10 -6189.57210673
E s

E step :  96 -2767.16476108
E step :  97 -2756.13149084
E step :  98 -2767.16476108
E step :  99 -2756.13149084
E step :  100 -2767.16476108
E step :  101 -2756.13149084
E step :  1 -419.312066307
E step :  2 -414.637026017
E step :  3 -731.796290692
E step :  4 -425.341486547
E step :  5 -413.149322336
E step :  6 -594.717604922
E step :  7 -419.495623442
E step :  8 -415.665974957
E step :  9 -414.231484956
E step :  10 -607.01682164
E step :  11 -579.548723848
E step :  12 -521.893154898
E step :  13 -435.681471249
E step :  14 -594.716666794
E step :  15 -419.384084215
E step :  16 -411.916466048
E step :  17 -594.71666567
E step :  18 -419.384038925
E step :  19 -411.903745343
E step :  20 -594.71666567
E step :  21 -419.384038925
E step :  22 -411.903745343
E step :  23 -594.71666567
E step :  24 -419.384038925
E step :  25 -411.903745343
E step :  26 -594.71666567
E step :  27 -419.384038925
E step :  28 -411.903745343
E step :  29 -594.71666567
E step :  30 -419.384038925
E ste

E step :  55 -2745.80128681
E step :  56 -2733.62453359
E step :  57 -3108.86540363
E step :  58 -2745.80128681
E step :  59 -2733.62453359
E step :  60 -3108.86540363
E step :  61 -2745.80128681
E step :  62 -2733.62453359
E step :  63 -3108.86540363
E step :  64 -2745.80128681
E step :  65 -2733.62453359
E step :  66 -3108.86540363
E step :  67 -2745.80128681
E step :  68 -2733.62453359
E step :  69 -3108.86540363
E step :  70 -2745.80128681
E step :  71 -2733.62453359
E step :  72 -3108.86540363
E step :  73 -2745.80128681
E step :  74 -2733.62453359
E step :  75 -3108.86540363
E step :  76 -2745.80128681
E step :  77 -2733.62453359
E step :  78 -3108.86540363
E step :  79 -2745.80128681
E step :  80 -2733.62453359
E step :  81 -3108.86540363
E step :  82 -2745.80128681
E step :  83 -2733.62453359
E step :  84 -3108.86540363
E step :  85 -2745.80128681
E step :  86 -2733.62453359
E step :  87 -3108.86540363
E step :  88 -2745.80128681
E step :  89 -2733.62453359
E step :  90 -3108.8

E step :  11 -479.545017973
E step :  12 -350.97533133
E step :  13 -346.342206831
E step :  14 -476.034530451
E step :  15 -479.545017973
E step :  16 -350.97533133
E step :  17 -346.342206831
E step :  18 -476.034530451
E step :  19 -479.545017973
E step :  20 -350.97533133
E step :  21 -346.342206831
E step :  22 -476.034530451
E step :  23 -479.545017973
E step :  24 -350.97533133
E step :  25 -346.342206831
E step :  26 -476.034530451
E step :  27 -479.545017973
E step :  28 -350.97533133
E step :  29 -346.342206831
E step :  30 -476.034530451
E step :  31 -479.545017973
E step :  32 -350.97533133
E step :  33 -346.342206831
E step :  34 -476.034530451
E step :  35 -479.545017973
E step :  36 -350.97533133
E step :  37 -346.342206831
E step :  38 -476.034530451
E step :  39 -479.545017973
E step :  40 -350.97533133
E step :  41 -346.342206831
E step :  42 -476.034530451
E step :  43 -479.545017973
E step :  44 -350.97533133
E step :  45 -346.342206831
E step :  46 -476.034530451
E

E step :  4 -406.913351045
E step :  5 -578.649648886
E step :  6 -446.837108785
E step :  7 -412.470755096
E step :  8 -578.624264955
E step :  9 -440.298827676
E step :  10 -578.649648904
E step :  11 -446.837107063
E step :  12 -412.468663552
E step :  13 -578.625439582
E step :  14 -439.773352794
E step :  15 -578.649648904
E step :  16 -446.837107063
E step :  17 -412.468663552
E step :  18 -578.625439582
E step :  19 -439.773352794
E step :  20 -578.649648904
E step :  21 -446.837107063
E step :  22 -412.468663552
E step :  23 -578.625439582
E step :  24 -439.773352794
E step :  25 -578.649648904
E step :  26 -446.837107063
E step :  27 -412.468663552
E step :  28 -578.625439582
E step :  29 -439.773352794
E step :  30 -578.649648904
E step :  31 -446.837107063
E step :  32 -412.468663552
E step :  33 -578.625439582
E step :  34 -439.773352794
E step :  35 -578.649648904
E step :  36 -446.837107063
E step :  37 -412.468663552
E step :  38 -578.625439582
E step :  39 -439.77335279

E step :  41 -438.876226047
E step :  42 -438.206516357
E step :  43 -444.292297799
E step :  44 -444.423720697
E step :  45 -447.556239512
E step :  46 -459.606691108
E step :  47 -444.245568405
E step :  48 -444.33088941
E step :  49 -444.540843108
E step :  50 -545.199342424
E step :  51 -1345.70111157
E step :  52 -444.302724433
E step :  53 -444.451348069
E step :  54 -451.617439883
E step :  55 -438.876224221
E step :  56 -438.206517357
E step :  57 -444.292297405
E step :  58 -444.423719711
E step :  59 -447.556116045
E step :  60 -459.629720274
E step :  61 -444.243146896
E step :  62 -444.326417249
E step :  63 -444.525516186
E step :  64 -502.874812908
E step :  65 -439.28164326
E step :  66 -438.101899229
E step :  67 -444.611046657
E step :  68 -1317.60880678
E step :  69 -441.614847077
E step :  70 -1345.70111152
E step :  71 -444.302724484
E step :  72 -444.451348211
E step :  73 -451.617465507
E step :  74 -438.876310707
E step :  75 -438.206470056
E step :  76 -444.2923

E step :  32 -464.03418845
E step :  33 -373.931183617
E step :  34 -372.263825721
E step :  35 -2909.41568729
E step :  36 -373.923462709
E step :  37 -373.71670503
E step :  38 -464.009845518
E step :  39 -373.926680534
E step :  40 -373.470527825
E step :  41 -2354.85391306
E step :  42 -504.295293779
E step :  43 -374.481721124
E step :  44 -374.477474728
E step :  45 -373.515218427
E step :  46 -391.659312953
E step :  47 -373.880298016
E step :  48 -373.919277433
E step :  49 -373.869552253
E step :  50 -464.009859633
E step :  51 -373.926457235
E step :  52 -373.494191006
E step :  53 -1029.10442314
E step :  54 -454.853924765
E step :  55 -464.03418845
E step :  56 -373.931183617
E step :  57 -372.263825721
E step :  58 -2909.41568729
E step :  59 -373.923462709
E step :  60 -373.71670503
E step :  61 -464.009845518
E step :  62 -373.926680534
E step :  63 -373.470527825
E step :  64 -2354.85391306
E step :  65 -504.295293779
E step :  66 -374.481721124
E step :  67 -374.477474

E step :  2 -440.529228937
E step :  1 -356.254790328
E step :  2 -356.254790328
E step :  1 -658.373763932
E step :  2 -658.373763938
M step before beta update  -7225.23125391
M step before after update  -6822.55549245
M step :  1 -6822.55016002 0.00653601396727
M step :  2 -6822.53946951 0.00911665486455
M step :  3 -6822.51865799 0.0129708266136
M step :  4 -6822.48032101 0.0180479919877
M step :  5 -6822.41462146 0.0243416411793
M step :  6 -6822.31188486 0.0314388016255
M step :  7 -6822.16836607 0.0382834577228
M step :  8 -6821.99308134 0.0431645697565
M step :  9 -6821.80922211 0.0443246254163
M step :  10 -6821.64460291 0.0411598968809
M step :  11 -6821.51713524 0.0348809680225
M step :  12 -6821.42915164 0.0276142452564
M step :  13 -6821.37305192 0.0209773913274
M step :  14 -6821.3390114 0.0156141468058
M step :  15 -6821.31895312 0.011533110175
M step :  16 -6821.30733339 0.00850848375817
M step :  17 -6821.3006685 0.00628730007998
M step :  18 -6821.29686793 0.0046579645

In [5]:
id2token = {}
for token in dictionary.token2id :
    id2token[dictionary.token2id[token]] = token
    
weights = -np.sort(-beta,axis = 1)[:,:num_w]
res = np.argsort(-beta,axis = 1)[:,:num_w]
for i in range(0,res.shape[0]) :
    topic = {}
    for j in range(0,res.shape[1]) : 
        topic[id2token[res[i,j]]]= weights[i,j]
    print topic, len(topic)



{u's': 0.018450184501845018, u'b': 0.0092250922509225092, u't': 0.01107011070110701} 3
{u'h': 0.015555555555555555, u'j': 0.015555555555555555, u'f': 0.017777777777777778} 3


In [7]:
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)

print(ldamodel.print_topics(num_topics=2, num_words=3))

x = 100
print np.log(special.gamma(x)), special.digamma(x)
print special.polygamma(0, x), special.digamma(x)

[(0, u'0.010*"s" + 0.009*"t" + 0.008*"b"'), (1, u'0.013*"s" + 0.008*"israel" + 0.008*"will"')]
359.13420537 4.60016185274
4.60016185274 4.60016185274
