In [1]:
%matplotlib inline
%load_ext line_profiler
import numpy as np
import scipy as sp
import scipy.sparse as spar
import scipy.special as spec
import sys
from matplotlib import pyplot as plt
from lda import LDA
from sklearn.decomposition import LatentDirichletAllocation as SKLDA

import cProfile

In [2]:
# Small synthetic binary matrix used to profile LDA.fit quickly.
M = 100   # number of rows of X (presumably documents -- confirm against LDA.fit)
V = 1000  # number of columns of X (presumably vocabulary size -- confirm against LDA.fit)
SEED = 0

# A seeded, local generator keeps the synthetic input (and therefore the
# profiling runs on it) reproducible across kernel restarts without touching
# NumPy's global random state.
rng = np.random.RandomState(SEED)

# Independent Bernoulli(0.3) entries, drawn directly in the (M, V) layout.
X_dense = rng.binomial(1, .3, size=(M, V))
X = spar.csr_matrix(X_dense, dtype=float)

In [None]:
# Even for a modest setup of 10K vocabulary words, 5K documents and 20 topics, the dense tensor indexed by
# <document, word, topic> has 10^4 * 5*10^3 * 20 = 10^9 entries, i.e. about 7.5G at 8 bytes per entry.
# This is why we can't explicitly keep all of $\phi$ in memory.
# Instead, we iterate over the documents one by one and accumulate the $\phi$ parameter as we go.

In [None]:
# Line-profile the fit method of the LDA class (imported from `lda`) on the
# synthetic matrix X, using line_profiler's %lprun magic.
lda = LDA()
# Alternative: function-level profile of the same call with cProfile.
#cProfile.run("lda.fit(X)")
%lprun -f lda.fit lda.fit(X)

In [3]:

#b, g = lda.fit(X)

In [4]:
# Load the training subset of the 20 Newsgroups corpus.
from sklearn.datasets import fetch_20newsgroups
ng = fetch_20newsgroups(subset='train')

In [5]:
# Size of the loaded subset (presumably one filename per document -- confirm).
ng.filenames.shape

(11314,)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts for the newsgroup posts.
# Per the CountVectorizer documentation, max_df=.7 drops terms that appear in
# more than 70% of the documents and min_df=20 drops terms that appear in
# fewer than 20 documents.
vec = CountVectorizer(max_df=.7, min_df=20)
ngvec = vec.fit_transform(ng.data)

In [11]:
%lprun lda.fit(ngvec)

UsageError: Could not find function u'lda._doc_update'.
AttributeError: 'LDA' object has no attribute '_doc_update'

In [14]:
# Time one full fit on the newsgroups count matrix with n_jobs=8.
# fit returns two values, unpacked here as b and g; b is later passed to
# topic_summaries.
# NOTE(review): presumably b holds the topic-word weights and g the
# per-document topic parameters -- confirm against LDA.fit.
lda = LDA(n_jobs=8)
%time b, g = lda.fit(ngvec)

Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
CPU times: user 3.92 s, sys: 974 ms, total: 4.89 s
Wall time: 2min 54s


In [None]:
# NOTE(review): leftover debugging cell. %debug opens an interactive
# post-mortem debugger on the most recent traceback, so it should be removed
# before the notebook is shared or run end to end.
%debug

In [15]:
def lookup(x):
    """Return the list of words whose index in ``vec.vocabulary_`` equals ``x``.

    The whole vocabulary is scanned on every call. The result is an empty
    list when no word maps to ``x``.
    """
    matches = []
    for word, index in vec.vocabulary_.items():
        if index == x:
            matches.append(word)
    return matches

def topic_summaries(b, n_topics=5, n_words=50, vocabulary=None):
    """Print the highest-weighted vocabulary words of the first topics in ``b``.

    For each printed topic the output is a ``Topic <i>`` header line followed
    by one line of space-separated words, ordered by increasing weight (the
    strongest word comes last).

    Parameters
    ----------
    b : 2-D array
        One row per topic, one column per vocabulary index. Each row is
        arg-sorted to find its largest entries.
    n_topics : int, optional
        Maximum number of topics (leading rows of ``b``) to print. Clipped to
        the number of rows, so a matrix with fewer rows no longer raises
        ``IndexError``. Defaults to 5.
    n_words : int, optional
        Number of top words printed per topic. Clipped to the number of
        columns. Defaults to 50.
    vocabulary : dict, optional
        Mapping from word to column index. Defaults to ``vec.vocabulary_``,
        read from the notebook's module-level ``vec``.

    Raises
    ------
    KeyError
        If a selected column index has no word in ``vocabulary``.
    """
    if vocabulary is None:
        vocabulary = vec.vocabulary_

    # Invert word -> index once, instead of rescanning the whole vocabulary
    # for every printed word.
    index_to_word = dict((index, word) for word, index in vocabulary.items())

    # Explicit start offset: a plain ``[-n_words:]`` slice would select every
    # column when n_words is 0.
    start = max(b.shape[1] - n_words, 0)
    top_columns = b.argsort()[:, start:]

    for topic in range(min(n_topics, top_columns.shape[0])):
        # %-formatting with the call form of print gives the same output on
        # Python 2 and Python 3.
        print("Topic %d" % topic)
        print(" ".join(index_to_word[column] for column in top_columns[topic]))

In [None]:
# Fit scikit-learn's LatentDirichletAllocation (imported as SKLDA) on the same
# count matrix with 5 topics and the batch learning method.
# NOTE(review): `n_topics` was renamed `n_components` in later scikit-learn
# releases -- confirm against the installed version.
sklda = SKLDA(n_topics=5, learning_method="batch", verbose=True)
sklda.fit(ngvec)

In [None]:
# Top words per topic from the scikit-learn model's components_ matrix.
topic_summaries(sklda.components_)

In [16]:
# Top words per topic from b, the first value returned by lda.fit(ngvec).
topic_summaries(b)

Topic 0
charley roby ourselves knives uwec phrase prophets mangoe ifas blindly arrogance jayne weapon refering atheists christ fires condemned excuses geneva judaism religious existence frightened irrational conviction dies 7415 eternity pagan beaverton laws batf christians eau morality kulikauskas claire beauchaine bobbe mmalt mcovingt covington aisun3 manhattan 706 n4tmi belief 0358 30602
Topic 1
finals souviens uic domi champs homosexual amherst rri ns rangers nd cunixc ahl ink nubus marlins windows hello faraday monica maxtor group dos6 royals psuvm russ controller cramer 165 vlb ati md dal clayton ak296 halifax ide daker wlsmith consent card promiscuous bus keller psu lcs eisa isa kkeller dalhousie
Topic 2
input ext anybody outputs 301 syl motorcycles fixing hydro labs behanna panels rgb vhs fuel levine ipx ecs nada stereo fan link glide harley microsoft kth thermal vga jody bar hassle jubilee inch 350 priced formats xxmessage jlevine polytechnic fxwg nec cb360t slip sphere scroll

In [None]:
# Bytes a dense copy of ngvec would occupy at 8 bytes per entry.
# np.prod is used because np.product is a deprecated alias that was removed
# in NumPy 2.0.
np.prod(ngvec.shape) * 8

In [None]:
# Compare two ways of getting the column indices stored for row 14 of ngvec:
# slicing the sparse index arrays (indices/indptr) directly versus slicing out
# the row and calling nonzero().
# NOTE(review): assumes ngvec is in CSR format; the two results agree only if
# the row stores no explicit zeros -- confirm.
%timeit ngvec.indices[ngvec.indptr[14]:ngvec.indptr[14+1]]
%timeit ngvec[14, :].nonzero()[1]