In [3]:
import numpy as np
from scipy import special
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
#pip install stop-words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

In [4]:
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."

# compile sample documents into a list
#doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20

dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]

doc_set = data_samples[:10]

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_set:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

#print(dictionary.token2id)
    
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

#print corpus[0]

In [3]:
    
    beta = np.zeros((num_topic,num_vocabulary))
    for i in range(0,num_topic) :
        denominator = 0
        numerator = np.zeros((num_vocabulary))
        for j in range(0,num_vocabulary) :
            for d in range(0,M) :
                for n in range(0,len(corpus[d])) :
                    s = corpus[d][n][0]
                    numerator[s] = numerator[s] + phi_doc[d][n,i] * corpus[d][n][1]
                    denominator = denominator + phi_doc[d][n,i] * corpus[d][n][1]
        beta[i,:] = numerator / denominator
    print "beta2", beta

NameError: name 'num_topic' is not defined

In [10]:
# find the optimizing values of the vatiational parameters 
# alpha, beta : hyper-parameters
def E_step(words, num_topic, alpha, beta, stop=1e-3, itrMax = 100) : 
    num_word = len(words)
    # initialization
    phi = np.zeros((num_word,num_topic))  # N*K
    phi[:] = 1.0/num_topic  
    gamma = np.zeros(num_topic) 
    gamma = alpha + num_word/num_topic
    
    converged_phi = 1.0
    converged_gamma = 1.0
    
    itr = 0
    while(itr <= itrMax and (converged_phi > stop or converged_gamma > stop )) : 
        itr = itr + 1; 
        phi_new = np.zeros((num_word,num_topic))
        for n in range(0,num_word) :
            for i in range(0,num_topic) :
                w_n = words[n][0]
                phi_new[n,i] = beta[i,w_n]  * np.exp(special.digamma(gamma[i])  -  special.digamma(np.sum(gamma))) 
        phi_new = phi_new / np.sum(phi_new, axis = 1)[:,None]
        gamma_new = alpha + np.sum(phi_new, axis = 0)
        
        converged_phi = np.sum(np.abs(phi_new-phi))
        converged_gamma = np.sum(np.abs(gamma_new-gamma))

        phi = phi_new
        gamma = gamma_new
        ll_new = log_likelihood([words],alpha,beta,[phi_new],[gamma_new])
        print "E step : ", itr, ll_new
    return [phi, gamma]
        
            
def M_step(num_topic,num_vocabulary,corpus,alpha, beta, phi_doc,gamma_doc, stop = 1e-3, itrMax = 100) :
    print "M step before beta update ", log_likelihood(corpus,alpha,beta,phi_doc,gamma_doc)
    beta = np.zeros((num_topic,num_vocabulary))  #K*V      
    M = len(corpus)
    for m in range(0,M) :
        for n in range(0,len(corpus[m])) :
            for i in range(0,num_topic) :
                j = corpus[m][n][0]
                beta[i,j] = beta[i,j] + phi_doc[m][n,i]  * corpus[m][n][1]
    beta = beta / np.sum(beta,axis = 1)[:,None]
    
    print "M step before after update ", log_likelihood(corpus,alpha,beta,phi_doc,gamma_doc)
    
    itr = 0
    ll_new = log_likelihood(corpus,alpha,beta,phi_doc,gamma_doc)
    converged = 1.0
    
    while(itr <= itrMax and converged > stop) : 
        
        itr = itr + 1
        ll_old = ll_new 
        
        if(np.isnan(alpha).any()) : 
            alpha = alpha / 10.0

        g = np.zeros(num_topic)
        g = M * (special.digamma(np.sum(alpha)) - special.digamma(alpha))  #gradient 
        for d in range(0,M) : 
            g = g + special.digamma(gamma_doc[d]) - special.digamma(np.sum(gamma_doc[d]))

        h =  - M * special.polygamma(1,alpha) # vector along the diagonal of hessien
        z =  special.polygamma(1,np.sum(alpha))   # constant
        c = np.sum(g/h)/(1.0/z + np.sum(1.0/h))
        Hg = (g-c)/h
        

        alpha = alpha -  Hg
        ll_new = log_likelihood(corpus,alpha,beta,phi_doc,gamma_doc)
        
        converged = np.abs(ll_new-ll_old)
        print "M step : ", itr, ll_new, np.sqrt(np.sum(Hg**2))
        
    return[beta,alpha]

            
def log_likelihood(corpus,alpha,beta,phi_doc,gamma_doc) : 
    l = 0
    M = len(phi_doc) # number of document
    num_topic = len(alpha)
    for m in range(0,M) : 
        gamma = gamma_doc[m]
        phi = phi_doc[m]
        words = corpus[m]
        len_word = len(words)
        term1 = special.gammaln(np.sum(alpha)) - np.sum(special.gammaln(alpha)) \
                + np.sum((alpha-1)*(special.digamma(gamma)- special.digamma(np.sum(gamma))))
        term2 = np.sum(phi * (special.digamma(gamma)- special.digamma(np.sum(gamma)))) 
        
        l += term1 + term2
        
        term3 = 0
        for n in range(0,len_word) :
            w_n = words[n][0]
            term3 += np.sum(phi[n,:] * np.log(beta[:,w_n]))
        l += term3
       
        term4 = - special.gammaln(np.sum(gamma)) + np.sum(special.gammaln(gamma)) \
              - np.sum((gamma-1)*(special.digamma(gamma)- special.digamma(np.sum(gamma))))
            
        term5 = 0  #term5 = - np.sum(phi * np.log(phi))
        for n in range(0,len_word) :
            for i in range(0,num_topic):
                if(phi[n,i] > 0 ) :
                    term5 += phi[n,i] * np.log(phi[n,i])
        
        l += term4 + term5
            
        if(np.isnan(term4) or np.isinf(term4)) : 
            print "term4 have nan!!!", gamma 
        
        if(np.isnan(term5) or np.isinf(term5)) : 
            print "term5 have nan!!!", phi 
                
    return l


def EM(num_topic,num_vocabulary,alpha,beta,corpus, itrMax = 20) : 
    print -np.sort(-beta,axis = 1)[:,:4]
    phi_doc = []  # list of size M
    gamma_doc = []  # list of size M 
    
    ## E step : find optimized variational parameters for each document
    for m in range(0,len(corpus)) : # E step  : 
        [phi,gamma] = E_step(corpus[m], num_topic, alpha, beta, 1e-3)
        phi_doc.append(phi)
        gamma_doc.append(gamma) 

    ## M step
    [beta,alpha] = M_step(num_topic,num_vocabulary,corpus,alpha,beta,phi_doc,gamma_doc)
    
    converged = 1.0 
    ll_new = log_likelihood(corpus,alpha,beta,phi_doc,gamma_doc)
    itr = 0
    while(itr <= itrMax and converged > 1e-3 ) :
        ll_old = ll_new 
        itr = itr + 1
        ll_old = ll_new
        phi_doc = []  # list of size M
        gamma_doc = []  # list of size M 
        for m in range(0,len(corpus)) : # E step  : 
            [phi,gamma] = E_step(corpus[m], num_topic, alpha, beta, 1e-3)
            phi_doc.append(phi)
            gamma_doc.append(gamma)      

        ## M step
        [beta,alpha] = M_step(num_topic,num_vocabulary,corpus,alpha,beta,phi_doc,gamma_doc)
        ll_new = log_likelihood(corpus,alpha,beta,phi_doc,gamma_doc)
        converged = np.abs(ll_new - ll_old)
        
        print " EM step iteration " , itr , ll_new
        
    return [ll_new, alpha, beta]

    


In [18]:
old_settings = np.seterr(all='print') 
num_topic = 2
num_w = 3
num_vocabulary = len(dictionary)
alpha = np.zeros(num_topic)
alpha = np.random.rand(num_topic)
beta = np.zeros((num_topic,num_vocabulary))
beta[:] = 1.0/num_vocabulary

[ll,alpha, beta] = EM(num_topic,num_vocabulary,alpha,beta, corpus)
print -np.sort(-beta,axis = 1)[:,:num_w]
print np.argsort(-beta,axis = 1)[:,:num_w]


[[ 0.0012987  0.0012987  0.0012987  0.0012987]
 [ 0.0012987  0.0012987  0.0012987  0.0012987]]
E step :  1 -508.514553943
E step :  2 -508.514524673
E step :  3 -508.514480691
E step :  4 -508.514421863
E step :  5 -508.514348054
E step :  6 -508.514259128
E step :  7 -508.514154947
E step :  8 -508.514035375
E step :  9 -508.513900272
E step :  10 -508.513749498
E step :  11 -508.513582914
E step :  12 -508.513400376
E step :  13 -508.513201742
E step :  14 -508.512986868
E step :  15 -508.512755609
E step :  16 -508.51250782
E step :  17 -508.512243354
E step :  18 -508.511962062
E step :  19 -508.511663795
E step :  20 -508.511348404
E step :  21 -508.511015737
E step :  22 -508.510665643
E step :  23 -508.510297967
E step :  24 -508.509912556
E step :  25 -508.509509253
E step :  26 -508.509087904
E step :  27 -508.508648349
E step :  28 -508.50819043
E step :  29 -508.507713987
E step :  30 -508.50721886
E step :  31 -508.506704886
E step :  32 -508.506171901
E step :  33 -508.505

E step :  34 -685.375903901
E step :  35 -685.375483577
E step :  36 -685.375049768
E step :  37 -685.374602385
E step :  38 -685.374141339
E step :  39 -685.373666544
E step :  40 -685.373177909
E step :  41 -685.372675345
E step :  42 -685.372158761
E step :  43 -685.371628068
E step :  44 -685.371083175
E step :  45 -685.370523989
E step :  46 -685.36995042
E step :  47 -685.369362374
E step :  48 -685.368759759
E step :  49 -685.368142482
E step :  50 -685.367510448
E step :  51 -685.366863563
E step :  52 -685.366201733
E step :  53 -685.365524862
E step :  54 -685.364832854
E step :  55 -685.364125613
E step :  56 -685.363403042
E step :  57 -685.362665044
E step :  58 -685.361911521
E step :  59 -685.361142374
E step :  60 -685.360357505
E step :  61 -685.359556813
E step :  62 -685.358740201
E step :  63 -685.357907566
E step :  64 -685.357058808
E step :  65 -685.356193825
E step :  66 -685.355312516
E step :  67 -685.354414778
E step :  68 -685.353500507
E step :  69 -685.352

E step :  32 -3264.65385408
E step :  33 -3264.65377876
E step :  34 -3264.65370111
E step :  35 -3264.65362111
E step :  36 -3264.65353877
E step :  37 -3264.65345409
E step :  38 -3264.65336704
E step :  39 -3264.65327765
E step :  40 -3264.65318589
E step :  41 -3264.65309178
E step :  42 -3264.6529953
E step :  43 -3264.65289645
E step :  44 -3264.65279522
E step :  45 -3264.65269162
E step :  46 -3264.65258565
E step :  47 -3264.65247729
E step :  48 -3264.65236654
E step :  49 -3264.65225341
E step :  50 -3264.65213788
E step :  51 -3264.65201996
E step :  52 -3264.65189964
E step :  53 -3264.65177691
E step :  54 -3264.65165178
E step :  55 -3264.65152424
E step :  56 -3264.65139429
E step :  57 -3264.65126192
E step :  58 -3264.65112713
E step :  59 -3264.65098991
E step :  60 -3264.65085027
E step :  61 -3264.6507082
E step :  62 -3264.6505637
E step :  63 -3264.65041676
E step :  64 -3264.65026737
E step :  65 -3264.65011555
E step :  66 -3264.64996127
E step :  67 -3264.6498

E step :  32 -588.907600669
E step :  33 -588.907133844
E step :  34 -588.906650966
E step :  35 -588.906151913
E step :  36 -588.905636564
E step :  37 -588.905104796
E step :  38 -588.904556488
E step :  39 -588.903991515
E step :  40 -588.903409752
E step :  41 -588.902811076
E step :  42 -588.902195359
E step :  43 -588.901562475
E step :  44 -588.900912296
E step :  45 -588.900244695
E step :  46 -588.899559542
E step :  47 -588.898856708
E step :  48 -588.898136061
E step :  49 -588.897397471
E step :  50 -588.896640805
E step :  51 -588.89586593
E step :  52 -588.895072713
E step :  53 -588.894261018
E step :  54 -588.893430711
E step :  55 -588.892581655
E step :  56 -588.891713714
E step :  57 -588.890826748
E step :  58 -588.88992062
E step :  59 -588.88899519
E step :  60 -588.888050318
E step :  61 -588.887085861
E step :  62 -588.886101679
E step :  63 -588.885097628
E step :  64 -588.884073565
E step :  65 -588.883029344
E step :  66 -588.88196482
E step :  67 -588.880879

E step :  16 -475.623073575
E step :  17 -475.622757659
E step :  18 -475.622552628
E step :  19 -475.622419574
E step :  1 -312.186694311
E step :  2 -311.972655974
E step :  3 -311.829860036
E step :  4 -311.744199413
E step :  5 -311.695170644
E step :  6 -311.667749191
E step :  7 -311.652594551
E step :  8 -311.64427206
E step :  9 -311.639717085
E step :  10 -311.637228681
E step :  11 -311.635870608
E step :  12 -311.635129826
E step :  13 -311.634725875
E step :  14 -311.634505635
E step :  1 -576.103630974
E step :  2 -576.013472141
E step :  3 -575.932713829
E step :  4 -575.868201935
E step :  5 -575.81953446
E step :  6 -575.783996464
E step :  7 -575.758556973
E step :  8 -575.740575651
E step :  9 -575.727970838
E step :  10 -575.719183502
E step :  11 -575.713080164
E step :  12 -575.708851672
E step :  13 -575.705927119
E step :  14 -575.703906778
E step :  15 -575.702512205
E step :  16 -575.701550111
E step :  17 -575.700886627
E step :  18 -575.700429193
E step :  19

E step :  7 -468.40392463
E step :  8 -468.32945269
E step :  9 -468.28973746
E step :  10 -468.268598507
E step :  11 -468.257358516
E step :  12 -468.251385235
E step :  13 -468.248211761
E step :  14 -468.24652602
E step :  15 -468.245630632
E step :  16 -468.245155064
E step :  1 -309.81951219
E step :  2 -307.953622697
E step :  3 -307.01544215
E step :  4 -306.590876309
E step :  5 -306.406355429
E step :  6 -306.327472459
E step :  7 -306.293980806
E step :  8 -306.279802111
E step :  9 -306.273806857
E step :  10 -306.271273153
E step :  11 -306.270202595
E step :  12 -306.269750297
E step :  1 -573.45783178
E step :  2 -570.893104518
E step :  3 -569.022165892
E step :  4 -567.81216103
E step :  5 -567.073151507
E step :  6 -566.635125307
E step :  7 -566.3797113
E step :  8 -566.23213104
E step :  9 -566.147295123
E step :  10 -566.098669394
E step :  11 -566.070844523
E step :  12 -566.054937445
E step :  13 -566.045848489
E step :  14 -566.040656852
E step :  15 -566.037691

M step :  14 -5680.29172698 0.157778956501
M step :  15 -5694.01349766 0.461993088874
M step :  16 -5666.25088868 0.272121323543
M step :  17 -5689.62735258 0.334291592669
M step :  18 -5673.89789139 0.371041999204
M step :  19 -4423.01864125 262.365869068
M step :  20 -4415.87231277 0.439645032332
M step :  21 -4364.1242804 0.331576582842
M step :  22 -4387.91923776 0.028788830792
M step :  23 -4413.15036086 0.337830924342
M step :  24 -4404.53646288 0.199011444176
M step :  25 -4435.96929806 0.773448582535
M step :  26 -4426.48516342 0.0279466968458
M step :  27 -4411.03004019 0.105801582432
M step :  28 -4461.78561041 0.439600535992
M step :  29 -4454.82112287 0.0140003041527
M step :  30 -4447.81908688 0.0176638298807
M step :  31 -4440.7131031 0.0252206750149
M step :  32 -4433.29287715 0.0409905009503
M step :  33 -4424.73485446 0.0752150122857
M step :  34 -4407.22824689 0.165703009184
M step :  35 -4438.54223094 3.97768565696
M step :  36 -4408.89767894 0.285155257272
M step : 

E step :  5 -2685.14334922
E step :  6 -2739.77413003
E step :  7 -3026.64355017
E step :  8 -2918.70163083
E step :  9 -2888.96740915
E step :  10 -3132.79795477
E step :  11 -2919.86072212
E step :  12 -2908.48861251
E step :  13 -2616.00023808
E step :  14 -2686.72617682
E step :  15 -2676.74443996
E step :  16 -2853.38920317
E step :  17 -2615.17827475
E step :  18 -2618.92726071
E step :  19 -3154.98113382
E step :  20 -3042.11026173
E step :  21 -2615.18483299
E step :  22 -2682.7565607
E step :  23 -2628.13738709
E step :  24 -3022.8606012
E step :  25 -3029.13592074
E step :  26 -2908.62824691
E step :  27 -2615.26508859
E step :  28 -2711.6469427
E step :  29 -2908.48863813
E step :  30 -2615.99924276
E step :  31 -2683.31403167
E step :  32 -2711.64694417
E step :  33 -2908.4886382
E step :  34 -2615.99924013
E step :  35 -2683.26027595
E step :  36 -2711.64694417
E step :  37 -2908.4886382
E step :  38 -2615.99924013
E step :  39 -2683.26027595
E step :  40 -2711.64694417
E 

E step :  47 -2625.18302422
E step :  48 -3856.40752772
E step :  49 -2612.40740766
E step :  50 -3716.8774745
E step :  51 -2650.98129634
E step :  52 -3859.95267262
E step :  53 -2950.79904234
E step :  54 -2611.76101594
E step :  55 -3016.85210471
E step :  56 -2617.00155514
E step :  57 -3191.6130982
E step :  58 -2635.91193758
E step :  59 -3075.59061878
E step :  60 -2654.98911601
E step :  61 -3862.9433675
E step :  62 -2625.7462113
E step :  63 -2619.0660126
E step :  64 -2914.79933946
E step :  65 -2610.25660058
E step :  66 -2611.86386634
E step :  67 -3856.42291773
E step :  68 -2613.1237857
E step :  69 -2687.38779706
E step :  70 -3872.53912696
E step :  71 -2626.49911053
E step :  72 -3813.38517502
E step :  73 -3012.7389166
E step :  74 -2611.86386634
E step :  75 -3856.42291773
E step :  76 -2613.1237857
E step :  77 -2687.38779706
E step :  78 -3872.53912696
E step :  79 -2626.49911053
E step :  80 -3813.38517502
E step :  81 -3012.7389166
E step :  82 -2611.86386634
E

E step :  31 -181.735054751
E step :  32 -189.535760201
E step :  33 -187.435995031
E step :  34 -181.380471915
E step :  35 -181.41900537
E step :  36 -181.447297876
E step :  37 -181.483319398
E step :  38 -181.557215062
E step :  39 -181.735054751
E step :  40 -189.535760201
E step :  41 -187.435995031
E step :  42 -181.380471915
E step :  43 -181.41900537
E step :  44 -181.447297876
E step :  45 -181.483319398
E step :  46 -181.557215062
E step :  47 -181.735054751
E step :  48 -189.535760201
E step :  49 -187.435995031
E step :  50 -181.380471915
E step :  51 -181.41900537
E step :  52 -181.447297876
E step :  53 -181.483319398
E step :  54 -181.557215062
E step :  55 -181.735054751
E step :  56 -189.535760201
E step :  57 -187.435995031
E step :  58 -181.380471915
E step :  59 -181.41900537
E step :  60 -181.447297876
E step :  61 -181.483319398
E step :  62 -181.557215062
E step :  63 -181.735054751
E step :  64 -189.535760201
E step :  65 -187.435995031
E step :  66 -181.380471

E step :  48 -472.453028323
E step :  49 -470.400459543
E step :  50 -470.475650042
E step :  51 -470.646200817
E step :  52 -472.16770286
E step :  53 -473.759332091
E step :  54 -470.043626546
E step :  55 -2961.766641
E step :  56 -470.405699985
E step :  57 -470.482961171
E step :  58 -470.668359173
E step :  59 -474.485048751
E step :  60 -467.461252333
E step :  61 -470.381172263
E step :  62 -470.454132735
E step :  63 -470.583254621
E step :  64 -472.189857465
E step :  65 -470.384339999
E step :  66 -470.456967628
E step :  67 -470.591050542
E step :  68 -472.691713487
E step :  69 -470.459684708
E step :  70 -470.598698278
E step :  71 -473.107222701
E step :  72 -470.712550145
E step :  73 -481.036501787
E step :  74 -470.455568166
E step :  75 -470.587177869
E step :  76 -472.453028323
E step :  77 -470.400459543
E step :  78 -470.475650042
E step :  79 -470.646200817
E step :  80 -472.16770286
E step :  81 -473.759332091
E step :  82 -470.043626546
E step :  83 -2961.76664

M step :  6 -6600.4229114 0.00408847922819
M step :  7 -6600.38986586 0.00562437124721
M step :  8 -6600.31833854 0.00774216232272
M step :  9 -6600.15058586 0.0107003612302
M step :  10 -6599.68477952 0.015015200861
M step :  11 -6597.54396372 0.0223832507439
M step :  12 -6622.98581711 0.0528092136615
M step :  13 -6562.36094927 0.064640533089
M step :  14 -6499.45576033 0.0788654957652
M step :  15 -5748.4396334 1.01203002051
M step :  16 -4563.79155791 1.63875208808
M step :  17 -1598.99906193 4.02571227323
M step :  18 1033.99604438 3.57664155421
M step :  19 6012.82024386 6.78304474179
M step :  20 7071.81472568 1.50611615868
M step :  21 12719.5543687 7.68533183107
M step :  22 16494.3280466 5.16052068237
M step :  23 22173.7553235 7.72721942617
M step :  24 23332.6089085 1.55164086522
M step :  25 150222.255582 192.16196643
M step :  26 150541.14546 0.419465458238
M step :  27 157250.14809 9.1753865182
M step :  28 162796.866796 7.54634056773
M step :  29 164198.20446 1.9243503

E step :  96 -482.41799483
E step :  97 -351.857457329
E step :  98 -482.422527794
E step :  99 -352.459548457
E step :  100 -481.424974896
E step :  101 -482.41799483
E step :  1 -285.972151621
E step :  2 -422.925460517
E step :  3 -432.288720608
E step :  4 -286.013609508
E step :  5 -420.415976833
E step :  6 -395.541192763
E step :  7 -285.926076299
E step :  8 -426.139997836
E step :  9 -432.241658325
E step :  10 -304.835405124
E step :  11 -327.315425388
E step :  12 -284.170386553
E step :  13 -432.288720608
E step :  14 -286.013609504
E step :  15 -420.415980146
E step :  16 -395.541192742
E step :  17 -285.926075578
E step :  18 -426.139874103
E step :  19 -432.241906846
E step :  20 -304.867079375
E step :  21 -323.387930043
E step :  22 -395.541351132
E step :  23 -285.932159753
E step :  24 -426.593752651
E step :  25 -337.706350813
E step :  26 -395.49113284
E step :  27 -284.650295033
E step :  28 -432.293177093
E step :  29 -286.559247108
E step :  30 -360.351032047
E 

E step :  62 -3748.36645604
E step :  63 -3800.97867083
E step :  64 -9734.60566806
E step :  65 -3747.59966669
E step :  66 -3747.95200806
E step :  67 -3748.41538277
E step :  68 -9734.74840453
E step :  69 -3747.73174179
E step :  70 -3748.23813734
E step :  71 -3764.72766905
E step :  72 -3747.45665712
E step :  73 -3747.77508219
E step :  74 -3748.36645604
E step :  75 -3800.97867083
E step :  76 -9734.60566806
E step :  77 -3747.59966669
E step :  78 -3747.95200806
E step :  79 -3748.41538277
E step :  80 -9734.74840453
E step :  81 -3747.73174179
E step :  82 -3748.23813734
E step :  83 -3764.72766905
E step :  84 -3747.45665712
E step :  85 -3747.77508219
E step :  86 -3748.36645604
E step :  87 -3800.97867083
E step :  88 -9734.60566806
E step :  89 -3747.59966669
E step :  90 -3747.95200806
E step :  91 -3748.41538277
E step :  92 -9734.74840453
E step :  93 -3747.73174179
E step :  94 -3748.23813734
E step :  95 -3764.72766905
E step :  96 -3747.45665712
E step :  97 -3747.7

E step :  31 -641.427145553
E step :  32 -375.393433414
E step :  33 -371.619839163
E step :  34 -418.937443761
E step :  35 -371.131885374
E step :  36 -641.427145553
E step :  37 -375.393433414
E step :  38 -371.619839163
E step :  39 -418.937443761
E step :  40 -371.131885374
E step :  41 -641.427145553
E step :  42 -375.393433414
E step :  43 -371.619839163
E step :  44 -418.937443761
E step :  45 -371.131885374
E step :  46 -641.427145553
E step :  47 -375.393433414
E step :  48 -371.619839163
E step :  49 -418.937443761
E step :  50 -371.131885374
E step :  51 -641.427145553
E step :  52 -375.393433414
E step :  53 -371.619839163
E step :  54 -418.937443761
E step :  55 -371.131885374
E step :  56 -641.427145553
E step :  57 -375.393433414
E step :  58 -371.619839163
E step :  59 -418.937443761
E step :  60 -371.131885374
E step :  61 -641.427145553
E step :  62 -375.393433414
E step :  63 -371.619839163
E step :  64 -418.937443761
E step :  65 -371.131885374
E step :  66 -641.42

E step :  26 -238.083678302
E step :  27 -491.897109161
E step :  28 -228.28802133
E step :  29 -228.390944305
E step :  30 -239.219703021
E step :  31 -246.095620219
E step :  32 -230.730363457
E step :  33 -223.965182399
E step :  34 -223.927926858
E step :  35 -229.920686689
E step :  36 -223.7996961
E step :  37 -268.791547551
E step :  38 -491.89710862
E step :  39 -228.287928103
E step :  40 -228.393123745
E step :  41 -239.168478859
E step :  42 -230.146928765
E step :  43 -282.766793957
E step :  44 -482.612524817
E step :  45 -223.824371066
E step :  46 -491.89710862
E step :  47 -228.287928103
E step :  48 -228.393123745
E step :  49 -239.168478859
E step :  50 -230.146928765
E step :  51 -282.766793957
E step :  52 -482.612524817
E step :  53 -223.824371066
E step :  54 -491.89710862
E step :  55 -228.287928103
E step :  56 -228.393123745
E step :  57 -239.168478859
E step :  58 -230.146928765
E step :  59 -282.766793957
E step :  60 -482.612524817
E step :  61 -223.82437106

E step :  22 -253.689112873
E step :  23 -568.793543652
E step :  24 -509.80621586
E step :  25 -231.602566777
E step :  26 -241.965419135
E step :  27 -228.553019402
E step :  28 -279.702694062
E step :  29 -231.67963993
E step :  30 -241.793300902
E step :  31 -279.702694062
E step :  32 -231.67963993
E step :  33 -241.793300902
E step :  34 -279.702694062
E step :  35 -231.67963993
E step :  36 -241.793300902
E step :  37 -279.702694062
E step :  38 -231.67963993
E step :  39 -241.793300902
E step :  40 -279.702694062
E step :  41 -231.67963993
E step :  42 -241.793300902
E step :  43 -279.702694062
E step :  44 -231.67963993
E step :  45 -241.793300902
E step :  46 -279.702694062
E step :  47 -231.67963993
E step :  48 -241.793300902
E step :  49 -279.702694062
E step :  50 -231.67963993
E step :  51 -241.793300902
E step :  52 -279.702694062
E step :  53 -231.67963993
E step :  54 -241.793300902
E step :  55 -279.702694062
E step :  56 -231.67963993
E step :  57 -241.793300902
E s

E step :  13 -398.726575366
E step :  14 -423.860056819
E step :  15 -421.060488302
E step :  16 -426.82508391
E step :  17 -402.26101256
E step :  18 -414.905767204
E step :  19 -399.282620695
E step :  20 -415.817395571
E step :  21 -399.348343276
E step :  22 -398.9243771
E step :  23 -443.389080474
E step :  24 -408.57121828
E step :  25 -423.860037735
E step :  26 -421.071386122
E step :  27 -425.38346312
E step :  28 -421.941325159
E step :  29 -420.831368733
E step :  30 -423.860094757
E step :  31 -421.005999915
E step :  32 -428.483019391
E step :  33 -797.004376235
E step :  34 -423.860125218
E step :  35 -420.849721892
E step :  36 -495.001268006
E step :  37 -437.4822663
E step :  38 -415.036045723
E step :  39 -423.860067516
E step :  40 -421.050515942
E step :  41 -427.79747941
E step :  42 -394.889517823
E step :  43 -423.785216208
E step :  44 -880.040191393
E step :  45 -878.860277329
E step :  46 -423.860037811
E step :  47 -421.071370216
E step :  48 -425.385983837
E

E step :  32 -3198.8280457
E step :  33 -3218.00543573
E step :  34 -3198.8280457
E step :  35 -3218.00543573
E step :  36 -3198.8280457
E step :  37 -3218.00543573
E step :  38 -3198.8280457
E step :  39 -3218.00543573
E step :  40 -3198.8280457
E step :  41 -3218.00543573
E step :  42 -3198.8280457
E step :  43 -3218.00543573
E step :  44 -3198.8280457
E step :  45 -3218.00543573
E step :  46 -3198.8280457
E step :  47 -3218.00543573
E step :  48 -3198.8280457
E step :  49 -3218.00543573
E step :  50 -3198.8280457
E step :  51 -3218.00543573
E step :  52 -3198.8280457
E step :  53 -3218.00543573
E step :  54 -3198.8280457
E step :  55 -3218.00543573
E step :  56 -3198.8280457
E step :  57 -3218.00543573
E step :  58 -3198.8280457
E step :  59 -3218.00543573
E step :  60 -3198.8280457
E step :  61 -3218.00543573
E step :  62 -3198.8280457
E step :  63 -3218.00543573
E step :  64 -3198.8280457
E step :  65 -3218.00543573
E step :  66 -3198.8280457
E step :  67 -3218.00543573
E step :  

In [19]:
id2token = {}
for token in dictionary.token2id :
    id2token[dictionary.token2id[token]] = token
    
weights = -np.sort(-beta,axis = 1)[:,:num_w]
res = np.argsort(-beta,axis = 1)[:,:num_w]
for i in range(0,res.shape[0]) :
    topic = {}
    for j in range(0,res.shape[1]) : 
        topic[id2token[res[i,j]]]= weights[i,j]
    print topic, len(topic)



{u'enough': nan, u'u': nan, u'bought': nan} 3
{u'media': 0.018477800787308016, u's': 0.035995648300272194, u'say': 0.017653359078200002} 3


In [214]:
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary, passes=20)

print(ldamodel.print_topics(num_topics=10, num_words=3))

x = 100
print np.log(special.gamma(x)), special.digamma(x)
print special.polygamma(0, x), special.digamma(x)

[(0, u'0.015*"arab" + 0.015*"israel" + 0.015*"countri"'), (1, u'0.001*"work" + 0.001*"say" + 0.001*"mile"'), (2, u'0.021*"say" + 0.021*"palestinean" + 0.014*"s"'), (3, u'0.001*"s" + 0.001*"b" + 0.001*"f"'), (4, u'0.018*"s" + 0.018*"let" + 0.018*"score"'), (5, u'0.027*"s" + 0.027*"u" + 0.027*"media"'), (6, u'0.016*"b" + 0.014*"f" + 0.014*"s"'), (7, u'0.031*"t" + 0.025*"msg" + 0.019*"salt"'), (8, u'0.019*"enough" + 0.019*"say" + 0.019*"mile"'), (9, u'0.018*"will" + 0.012*"s" + 0.012*"1000"')]
359.13420537 4.60016185274
4.60016185274 4.60016185274


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20

dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print data_samples[0]


print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')

tfidf = tfidf_vectorizer.fit_transform(doc_a)
print tfidf


# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')

tf = tf_vectorizer.fit_transform(doc_a)

print()

Well i'm not sure about the story nad it did seem biased. What
I disagree with is your statement that the U.S. Media is out to
ruin Israels reputation. That is rediculous. The U.S. media is
the most pro-israeli media in the world. Having lived in Europe
I realize that incidences such as the one described in the
letter have occured. The U.S. media as a whole seem to try to
ignore them. The U.S. is subsidizing Israels existance and the
Europeans are not (at least not to the same degree). So I think
that might be a reason they report more clearly on the
atrocities.
	What is a shame is that in Austria, daily reports of
the inhuman acts commited by Israeli soldiers and the blessing
received from the Government makes some of the Holocaust guilt
go away. After all, look how the Jews are treating other races
when they got power. It is unfortunate.

Extracting tf-idf features for NMF...


ValueError: Iterable over raw text documents expected, string object received.