In [1]:
%matplotlib inline
%load_ext line_profiler
import numpy as np
import scipy as sp
import scipy.sparse as spar
import scipy.special as spec
import sys
from matplotlib import pyplot as plt
from lda import LDA, _doc_update, _slice_doc_update
from sklearn.decomposition import LatentDirichletAllocation as SKLDA

import cProfile

In [2]:
# Synthetic corpus for quick profiling runs: M documents over a V-word
# vocabulary, each entry an independent Bernoulli(0.3) draw.
M = 100
V = 1000
# A private, seeded generator makes the matrix identical on every
# Restart & Run All without touching NumPy's global random state.
X = np.random.RandomState(0).binomial(1, .3, size=(M, V))
# Keep only the nonzero entries, stored as floats in CSR format.
X = spar.csr_matrix(X, dtype=float)

In [None]:
# Even for a modest setup of 10K vocabulary words, 5K documents and 20 topics, the tensor indexed by
# <document, word, topic> has 10^9 entries, i.e. roughly 7.5 GB at 8 bytes per entry. This is why we cannot
# keep all of $\phi$ in memory explicitly.
# Instead, we iterate over the documents one by one and accumulate the $\phi$ parameter as we go.

In [None]:
# Build the project's LDA model with its default constructor arguments.
lda = LDA()
# Profiling commands kept for reference; uncomment one to profile a fit on
# the synthetic matrix X (function-level via cProfile, line-level via %lprun).
#cProfile.run("lda.fit(X)")
# %lprun -f lda.fit lda.fit(X)

In [None]:

#b, g = lda.fit(X)

In [5]:
from sklearn.datasets import fetch_20newsgroups
# Load the training split of the 20 Newsgroups corpus. The notebook later
# vectorizes ng.data and prints ng.target_names.
ng = fetch_20newsgroups(subset='train')

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the newsgroup posts. Per the CountVectorizer docs,
# a float max_df is a document-frequency proportion and an integer min_df is
# an absolute document count, so this drops terms found in more than 70% of
# the posts or in fewer than 20 of them, in addition to English stop words.
vec = CountVectorizer(max_df=.7, min_df=20, stop_words="english")
# Document-term count matrix; column indices are given by vec.vocabulary_.
ngvec = vec.fit_transform(ng.data)

In [None]:
# Line-by-line profile of _slice_doc_update during a fit on the newsgroup
# matrix (requires the line_profiler extension loaded in the first cell).
%lprun -f _slice_doc_update lda.fit(ngvec)

In [7]:
# Fit a 10-topic model with n_jobs=8. fit returns a pair that is unpacked into
# b and g; b is what gets passed to topic_summaries further down.
# NOTE(review): the recorded output shows ~4 min wall time against ~5 s CPU
# time -- presumably the work runs in worker processes that %time does not
# account for; confirm against the LDA implementation.
lda = LDA(K=10, n_jobs=8)
%time b, g = lda.fit(ngvec)

Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
CPU times: user 3.38 s, sys: 1.26 s, total: 4.65 s
Wall time: 4min 12s


In [None]:
# NOTE(review): post-mortem debugger left over from an interactive session;
# consider removing this cell before sharing the notebook.
%debug

In [8]:
def lookup(x):
    """Return every term in ``vec.vocabulary_`` whose column index equals ``x``.

    Scans the whole vocabulary on each call, so it is only meant for
    one-off interactive queries; ``topic_summaries`` builds an inverse
    index once instead of calling this per word.
    """
    return [k for k, v in vec.vocabulary_.items() if v == x]


def topic_summaries(b, n_top=50, vocabulary=None):
    """Print the highest-weighted words of every topic in ``b``.

    Parameters
    ----------
    b : 2-D array-like
        One row per topic, one column per vocabulary index.
    n_top : int, optional
        Number of words to print per topic (default 50, as before).
    vocabulary : dict, optional
        Mapping from term to column index. Defaults to ``vec.vocabulary_``
        of the notebook's fitted vectorizer.

    For each topic a ``Topic <i>`` header is printed, followed by one line of
    space-separated words in ascending order of weight (the heaviest word is
    last). All rows of ``b`` are summarised, whatever the number of topics.

    Raises
    ------
    KeyError
        If a selected column index has no entry in ``vocabulary``.
    """
    if vocabulary is None:
        vocabulary = vec.vocabulary_
    # Invert the term -> index mapping once rather than scanning the whole
    # vocabulary for every printed word.
    index_to_word = dict((int(idx), word) for word, idx in vocabulary.items())

    order = np.asarray(b).argsort(axis=1)
    # A slice of [-0:] would select every column, so treat n_top <= 0 as "none".
    top = order[:, -n_top:] if n_top > 0 else order[:, :0]

    # Iterate over the actual rows instead of a hard-coded range(10), which
    # raised IndexError for models with fewer than ten topics.
    for i, row in enumerate(top):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Topic %d" % i)
        print(" ".join(index_to_word[int(j)] for j in row))

In [None]:
# List the newsgroup names that serve as class labels. The parenthesised
# single-argument form prints the same text on Python 2 and Python 3.
print(ng.target_names)

In [None]:
# scikit-learn's LDA with batch learning, fitted on the same count matrix as a
# reference for the project's implementation.
# NOTE(review): newer scikit-learn releases name the topic-count argument
# n_components instead of n_topics -- confirm against the installed version.
sklda = SKLDA(n_topics=5, learning_method="batch", verbose=True)
sklda.fit(ngvec)

In [None]:
# Top words per topic for the scikit-learn model, using its components_
# attribute as the topic-word matrix.
topic_summaries(sklda.components_)

In [9]:
# Top words per topic for the project's own LDA fit (b is the first value
# returned by lda.fit above).
topic_summaries(b)

Topic 0
got cc andrew nhl people league columbia best news canada david know did player cmu turks time turkey world better armenia distribution reply games just play think good season baseball hockey like don armenians players armenian cs turkish nntp host posting game year team com university ca writes article edu
Topic 1
argument death claim christianity jews hell atheists human religious word caltech point said did way man moral world law evidence question true nntp like host truth christ life faith religion church just know christians posting bible university say christian does think believe don jesus article people com writes god edu
Topic 2
25 mw 94 04 p3 06 col m5 salmon pt hr gi 45 mas 14 dakota m4 ah 75 usd 225 mr sl mn om 68 34 d9 mq tm km 145 6e ei m3 tg m_ ml 4t lj wt z5 i4 bj 9v wm 1d9 pl max ax
Topic 3
33 36 louis chicago 01 division 37 35 pittsburgh hockey 93 34 38 gm new cup 21 03 team 20 montreal win game 02 pts 55 28 st 27 play 23 24 19 la 25 30 26 games period 15 18 

In [None]:
# Bytes needed to hold ngvec densely at 8 bytes per entry. np.prod is the
# supported spelling; the np.product alias is deprecated and absent from
# NumPy 2.x.
np.prod(ngvec.shape) * 8

In [None]:
# Two ways to get the column indices stored for row 14: slicing the raw
# indices array with indptr, versus row indexing followed by nonzero().
# (Assumes ngvec is in CSR format, as the indptr/indices access implies.)
%timeit ngvec.indices[ngvec.indptr[14]:ngvec.indptr[14+1]]
%timeit ngvec[14, :].nonzero()[1]

In [None]:
# Cost of reading the first 45 entries of row 3: densify the whole matrix and
# then slice, versus slice the sparse matrix and densify only the slice.
%timeit X.toarray()[3, :45]
%timeit X[3, :45].A

In [None]:
from scipy.sparse import lil_matrix
# Copy of X in LIL (list-of-lists) format, used to compare slicing speed
# against the CSR original.
lilX = lil_matrix(X)

In [None]:
# Same row slice as timed on the CSR matrix X, here on the LIL copy.
%timeit lilX[3, :45].A