In [1]:
%matplotlib inline
%load_ext line_profiler
import numpy as np
import scipy as sp
import scipy.sparse as spar
import scipy.special as spec
import sys
from matplotlib import pyplot as plt
from lda import LDA
from sklearn.decomposition import LatentDirichletAllocation as SKLDA

import cProfile

In [2]:
# Small synthetic input for profiling: an M x V matrix (presumably documents x
# vocabulary) whose entries are independent Bernoulli(0.3) draws, stored as a
# float CSR matrix.
M = 100
V = 1000
SEED = 0  # fixed so every Restart & Run All profiles exactly the same matrix
rng = np.random.RandomState(SEED)
X_dense = rng.binomial(1, .3, size=(M, V))
X = spar.csr_matrix(X_dense, dtype=float)

In [None]:
# Even for a modest setup (10K-word vocabulary, 5K documents, 20 topics), the tensor indexed by
# <document, word, topic> explodes to roughly 7.5 GiB (10K x 5K x 20 entries at 8 bytes each).
# This is why we cannot explicitly keep all of $\phi$ in memory.
# Instead, we iterate over the documents one by one and accumulate the $\phi$ parameter as we go.

In [None]:
# Line-profile the fit method of a default-configured LDA on the small synthetic matrix X.
lda = LDA()
# Function-level alternative to the line profiler, kept for reference:
#cProfile.run("lda.fit(X)")
# -f registers lda.fit as the function whose lines get timed.
%lprun -f lda.fit lda.fit(X)

In [None]:

#b, g = lda.fit(X)

In [3]:
from sklearn.datasets import fetch_20newsgroups
# Load only the training split of the 20 Newsgroups corpus.
ng = fetch_20newsgroups(subset='train')

In [4]:
# Size of the training split, read off the filenames array.
ng.filenames.shape

(11314,)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Turn the raw posts into a term-count matrix. Per the CountVectorizer docs, a
# float max_df is a document-frequency proportion and an integer min_df is an
# absolute document count, so this drops terms that appear in more than 70% of
# the posts or in fewer than 20 of them.
vec = CountVectorizer(max_df=.7, min_df=20)
ngvec = vec.fit_transform(ng.data)

In [None]:
%lprun lda.fit(ngvec)

In [6]:
# Refit with n_jobs=8 and time a single run. The two values returned by fit are
# kept as b and g; b is what topic_summaries prints further down.
lda = LDA(n_jobs=8)
%time b, g = lda.fit(ngvec)

Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
CPU times: user 3.99 s, sys: 997 ms, total: 4.99 s
Wall time: 3min 41s


In [None]:
# Leftover interactive aid: %debug opens the post-mortem debugger on the most
# recent traceback. It plays no part in the analysis.
%debug

In [7]:
def lookup(x):
    """Return, as a list, every term in vec.vocabulary_ whose column index equals x."""
    return [term for term, column in vec.vocabulary_.items() if column == x]

def topic_summaries(b, n_topics=5, n_words=50, vocabulary=None):
    """Print the highest-weighted words of the leading topics.

    For each of the first ``n_topics`` rows of ``b`` this prints a
    ``Topic <i>`` header followed by one line holding the ``n_words`` terms
    with the largest weights in that row, ordered from weakest to strongest
    (the strongest term comes last).

    Parameters
    ----------
    b : array-like, two-dimensional
        Weight matrix with one row per topic and one column per vocabulary index.
    n_topics : int or None, optional
        How many topics to print. Clamped to the number of rows in ``b`` so a
        model with fewer topics no longer raises IndexError. ``None`` prints all.
    n_words : int, optional
        How many terms to print per topic (at most the number of columns).
    vocabulary : dict or None, optional
        Mapping ``term -> column index``. Defaults to ``vec.vocabulary_`` from
        the vectorizer fitted earlier in the notebook.

    Raises
    ------
    KeyError
        If a selected column index has no term in ``vocabulary``.
    """
    if vocabulary is None:
        vocabulary = vec.vocabulary_

    # Invert the vocabulary once instead of scanning it for every printed word.
    index_to_term = dict((column, term) for term, column in vocabulary.items())

    b = np.asarray(b)
    if n_topics is None:
        n_topics = b.shape[0]
    n_topics = min(n_topics, b.shape[0])

    # argsort is ascending, so the heaviest terms sit in the trailing columns.
    first_kept = max(b.shape[1] - n_words, 0)
    top_columns = b.argsort()[:, first_kept:]

    for i in range(n_topics):
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print("Topic %d" % i)
        print(" ".join(index_to_term[column] for column in top_columns[i]))

In [None]:
# Baseline for comparison: scikit-learn's LatentDirichletAllocation in batch
# mode with 5 topics, fitted on the same count matrix.
# NOTE(review): newer scikit-learn releases spell this argument n_components —
# confirm against the installed version before re-running.
sklda = SKLDA(n_topics=5, learning_method="batch", verbose=True)
sklda.fit(ngvec)

In [None]:
# Top words per topic according to the scikit-learn baseline (rows of components_).
topic_summaries(sklda.components_)

In [8]:
# Top words per topic according to b, the first value returned by the timed lda.fit call.
topic_summaries(b)

Topic 0
don by some all has any know like up just cs so reply me will one ca distribution how about get at what there as would was com but do an are they university if this or not can on with be you writes have edu re nntp posting host
Topic 1
been like do me some has an only were can out with think would he com more just there people don no at we one so your or by about have who my all as they what edu if was on be re are but you writes this article not
Topic 2
thanks just edu article system writes about only know like me up all no at get does by your would one has using which so what how some my will as do there any are you com use re but be an not if have or with can on this
Topic 3
they public against today 1993 into than not over do one we or there during may has american people after would us national world also new two who now first these re be are its at years on been which was other have their were an as with government by
Topic 4
drive so 16 as computer will 10 new am know ab

In [None]:
# Bytes a dense copy of ngvec would need at 8 bytes per entry.
# np.prod replaces np.product, a deprecated alias that NumPy 2.0 removed.
np.prod(ngvec.shape) * 8

In [None]:
# Time two ways of getting the column indices stored for row 14: slicing the raw
# indices array between consecutive indptr entries versus row indexing + nonzero().
# assumes ngvec is in CSR layout (indptr delimiting rows) — TODO confirm
%timeit ngvec.indices[ngvec.indptr[14]:ngvec.indptr[14+1]]
%timeit ngvec[14, :].nonzero()[1]