In [1]:
%matplotlib inline
%load_ext line_profiler
import numpy as np
import scipy as sp
import scipy.sparse as spar
import scipy.special as spec
import sys
from matplotlib import pyplot as plt
from lda import LDA
from sklearn.decomposition import LatentDirichletAllocation as SKLDA

import cProfile

In [2]:
# Synthetic corpus: M documents by V vocabulary terms, each entry an
# independent Bernoulli(0.3) draw, stored as a float CSR matrix.
M, V = 100, 1000
X = spar.csr_matrix(np.random.binomial(1, .3, size=(M, V)), dtype=float)

In [None]:
# Even for a reasonable setup such as a 10K-word vocabulary, 5K documents and 20 topics, the tensor indexed by
# <document, word, topic> has 10^9 entries, i.e. about 7.5 GiB at 8 bytes per entry. This is why we cannot keep
# all of $\phi$ in memory explicitly. Instead, we iterate over the documents one by one and accumulate the
# $\phi$ parameter as we go.

In [None]:
# Line-by-line profile of LDA.fit on the synthetic matrix X.
# %lprun comes from the line_profiler extension loaded in the first cell.
lda = LDA()
#cProfile.run("lda.fit(X)")
%lprun -f lda.fit lda.fit(X)

In [3]:
# Fresh LDA instance with default constructor arguments; this is the object
# that is later fitted on the newsgroups matrix.
lda = LDA()
#b, g = lda.fit(X)

In [4]:
# Load the training split of the 20 Newsgroups corpus
# (scikit-learn downloads and caches it on first use).
from sklearn.datasets import fetch_20newsgroups
ng = fetch_20newsgroups(subset='train')

In [5]:
# Sanity check: size of the training split (length of the filenames array).
ng.filenames.shape

(11314,)

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts. Per the CountVectorizer documentation, max_df=.7 drops
# terms that occur in more than 70% of the documents and min_df=20 drops terms
# that occur in fewer than 20 documents.
vec = CountVectorizer(max_df=.7, min_df=20)
# ngvec is the sparse document-term count matrix; vec.vocabulary_ maps each
# kept term to its column index.
ngvec = vec.fit_transform(ng.data)

In [None]:
# Line-by-line profile of the private lda._doc_update method while fitting on
# the newsgroups matrix (requires the line_profiler extension).
%lprun -f lda._doc_update lda.fit(ngvec)

In [38]:
# Fit our LDA on the newsgroups counts. `b` is passed to topic_summaries
# below, which treats its rows as topics and its columns as vocabulary
# indices. `g` is not used elsewhere in this notebook view (presumably the
# per-document variational parameters -- TODO confirm against lda.py).
b, g = lda.fit(ngvec)

Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9


In [39]:
def lookup(x):
    """Return the list of vocabulary terms whose column index equals ``x``.

    Reads the fitted global ``vec`` (a CountVectorizer). This is a linear scan
    of the whole vocabulary, so prefer an inverted dictionary when resolving
    many indices (as ``topic_summaries`` does).
    """
    return [k for k, v in vec.vocabulary_.items() if v == x]


def topic_summaries(b, n_topics=5, n_words=50, vocabulary=None):
    """Print the highest-weighted words of the leading topics of ``b``.

    Parameters
    ----------
    b : 2-D array
        Topic-word weights: one row per topic, one column per vocabulary
        index.
    n_topics : int, optional
        Maximum number of topics (rows) to print. Clamped to the number of
        rows actually present, so a matrix with fewer rows no longer raises
        IndexError.
    n_words : int, optional
        Maximum number of words to print per topic. Clamped to the number of
        columns.
    vocabulary : dict, optional
        Mapping ``term -> column index``. Defaults to ``vec.vocabulary_`` of
        the notebook's fitted CountVectorizer.

    Output
    ------
    For each topic ``i`` a line ``Topic i`` followed by one line of
    space-separated words in ascending weight order (the heaviest word comes
    last), matching the order produced by ``argsort``.
    """
    if vocabulary is None:
        vocabulary = vec.vocabulary_

    # Invert the vocabulary once instead of rescanning it for every word.
    index_to_word = dict((index, word) for word, index in vocabulary.items())

    ranked = b.argsort()
    n_cols = ranked.shape[1]
    n_words = max(0, min(n_words, n_cols))
    # Slice from an explicit start so that n_words == 0 yields no columns
    # (a plain [-0:] slice would select every column).
    top = ranked[:, n_cols - n_words:]

    for i in range(max(0, min(n_topics, top.shape[0]))):
        # Single-argument print() calls behave identically on Python 2 and 3.
        print("Topic %d" % i)
        print(" ".join(index_to_word[j] for j in top[i]))

In [40]:
# Baseline for comparison: scikit-learn's LDA with 5 topics, trained with the
# batch variational method on the same count matrix.
# NOTE(review): `n_topics` was renamed to `n_components` in later scikit-learn
# releases; switch the keyword when upgrading. No random_state is set, so the
# fitted topics differ from run to run.
sklda = SKLDA(n_topics=5, learning_method="batch", verbose=True)
sklda.fit(ngvec)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.4s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   20.1s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   17.7s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.4s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.9s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.7s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.2s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.8s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.3s finished


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_jobs=1, n_topics=5, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=True)

In [41]:
# Top words per topic for the scikit-learn model (components_ holds its
# topic-word matrix).
topic_summaries(sklda.components_)

Topic 0
good your no an ca know he has up get any do all don so one there like out as just what about would me can university or at host nntp are posting not they be was if with but this article my writes have on re com you edu
Topic 1
ww mx p3 ei 27 mp sp 75 sk 6e mt tm p2 air 86 mc lj m5 17 sq ml 04 4t mk m_ wt 14 mr hz mu mw uw m3 bj mq 45 mv d9 lk 9v ah sl 34 as wm 1d9 pl 145 max ax
Topic 2
my does software would available get so about data using how information program key space all other also what has file one do some windows there your system any but which com edu at use by will not an as if have can are or you be with this on
Topic 3
40 league 55 are vs first apr april 23 22 21 24 as 19 50 17 season be 18 play edu 93 13 hockey 30 by ca drive new games 14 he 1993 with 11 12 year 20 16 25 game 15 will was scsi on team at 10 00
Topic 4
think had when don me more them which god some has his my re edu your so will at their can about no were do one an would who all people there what 

In [42]:
# Top words per topic for our own LDA fit, for side-by-side comparison with
# the scikit-learn topics.
topic_summaries(b)

Topic 0
writes need which about article so am some please does what get anyone like one all has know will distribution would as do use there me at thanks com an university my are not re any but be if can you nntp or host this posting have on with edu
Topic 1
think more know will he no get any do has an up don me would like just so there one as com all my about out what can host posting nntp or they not at university are was if with but be this have article you on writes edu re
Topic 2
he this up out many first these about which into way do make also or those an would will some but on our well are more time one has was because been only when have with then by not had we no all as them people they their were who
Topic 3
will other more me know has some my just any don your we people nntp no host who so at one all about would do an posting they there was can by with what or but as if com have on article are edu not you be this writes re
Topic 4
my 27 april other cmu 26 this information ru

In [None]:
# Bytes needed to hold ngvec as a dense array at 8 bytes per entry.
# np.prod replaces np.product, a deprecated alias that NumPy 2.0 removed.
np.prod(ngvec.shape) * 8

In [None]:
# Two ways of getting the column indices of the entries in row 14 of ngvec
# (assumes ngvec is CSR, as returned by CountVectorizer -- TODO confirm):
#   1. slice the CSR `indices` array directly using `indptr`;
#   2. slice out the row as a sparse matrix and call nonzero().
# NOTE(review): (1) returns every stored entry, including explicit zeros if
# any are present, whereas (2) only returns truly nonzero entries.
%timeit ngvec.indices[ngvec.indptr[14]:ngvec.indptr[14+1]]
%timeit ngvec[14, :].nonzero()[1]