In [10]:
%matplotlib inline
%load_ext line_profiler
import numpy as np
import scipy as sp
import scipy.sparse as spar
import scipy.special as spec
import sys
from matplotlib import pyplot as plt
from lda import LDA, _doc_update, _slice_doc_update
from sklearn.decomposition import LatentDirichletAllocation as SKLDA

import pickle
import cProfile

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [2]:
# Synthetic benchmark input: an M x V binary matrix in which each entry is 1
# with probability 0.3, stored as a float CSR sparse matrix.
M = 100
V = 1000
SEED = 0  # fixed seed so that re-running the notebook regenerates the same matrix
X_dense = np.random.RandomState(SEED).binomial(1, .3, size=(M, V))
X = spar.csr_matrix(X_dense, dtype=float)

In [3]:
# Even for a modest setup such as a 10K-word vocabulary, 5K documents and 20 topics, the dense tensor indexed by
# <document, word, topic> explodes to about 7.5 GiB (10,000 * 5,000 * 20 entries at 8 bytes each). This is why we cannot explicitly keep all of $\phi$ in memory.
# Instead, we iterate over the documents one by one and accumulate the $\phi$ parameter.

In [4]:
# Load the training split of the 20 Newsgroups corpus through scikit-learn.
# Later cells read `ng.data` (the raw posts) and `ng.target_names`.
from sklearn.datasets import fetch_20newsgroups
ng = fetch_20newsgroups(subset='train')

In [5]:
# Turn the raw newsgroup posts into a document-term count matrix.
from sklearn.feature_extraction.text import CountVectorizer

# Vocabulary pruning per the CountVectorizer documentation: max_df=.7 drops terms
# that occur in more than 70% of the documents, min_df=20 drops terms that occur in
# fewer than 20 documents, and stop_words="english" removes the built-in stop words.
vec = CountVectorizer(max_df=.7, min_df=20, stop_words="english")
# `vec.vocabulary_` (term -> column index) is used further down to map columns back to words.
ngvec = vec.fit_transform(ng.data)

In [6]:
# Fit the local LDA implementation on the vectorized newsgroup corpus and time the fit.
# K is presumably the number of topics and n_jobs the number of parallel workers — confirm in lda.py.
# `b` is later passed to topic_summaries as a topic-by-word matrix; `g` is not used in the visible cells.
# NOTE(review): the recorded CPU time (~4 s) is far below the wall time (~3.7 min), presumably
# because the heavy work runs in worker processes that %time does not account for — confirm.
lda = LDA(K=10, n_jobs=8)
%time b, g = lda.fit(ngvec)

Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
CPU times: user 3.23 s, sys: 1.18 s, total: 4.42 s
Wall time: 3min 40s


In [None]:
# Line-profile _slice_doc_update while refitting the model (uses the line_profiler extension loaded in the first cell).
# NOTE(review): `lda` was created with n_jobs=8; if the update runs in worker processes the
# profiler in this process may record nothing — confirm, or profile with a single job.
%lprun -f _slice_doc_update lda.fit(ngvec)

In [None]:
# NOTE(review): leftover interactive post-mortem debugger call with no recorded execution count;
# it likely should be removed before the notebook is shared or run end-to-end.
%debug

In [7]:
def lookup(x):
    """Return every term of the fitted vectorizer whose column index equals ``x``.

    Reads the notebook-level ``vec`` (a fitted vectorizer exposing
    ``vocabulary_``, a term -> column-index mapping) at call time.

    Parameters
    ----------
    x : int
        Column index to look up.

    Returns
    -------
    list
        The matching terms; empty when no term maps to ``x``.
    """
    return [k for k, v in vec.vocabulary_.items() if v == x]


def topic_summaries(b, n_words=50, vocabulary=None):
    """Print the highest-weighted words of every topic in ``b``.

    For each row of ``b`` a ``Topic <i>`` header is printed, followed by one
    line with the ``n_words`` highest-weighted words separated by spaces.
    Words are listed in ascending order of weight, so the most heavily
    weighted word comes last.

    Parameters
    ----------
    b : numpy.ndarray
        Two-dimensional topic-by-word weight matrix; column ``j`` corresponds
        to the term whose vocabulary index is ``j``.
    n_words : int, optional
        Number of words to show per topic (default 50). Values larger than
        the number of columns are clipped to the number of columns.
    vocabulary : dict, optional
        Mapping of term -> column index. Defaults to ``vec.vocabulary_`` of
        the notebook-level vectorizer.

    Raises
    ------
    KeyError
        If a selected column index has no term in ``vocabulary``.
    """
    if vocabulary is None:
        vocabulary = vec.vocabulary_

    # Invert the vocabulary once instead of scanning it for every printed word.
    index_to_word = dict((idx, word) for word, idx in vocabulary.items())

    order = b.argsort()
    n_cols = order.shape[1]
    keep = max(0, min(n_words, n_cols))
    # Slice from an explicit start so that keep == 0 yields no columns
    # (a bare ``-0:`` slice would select all of them).
    top = order[:, n_cols - keep:]

    # Iterate over the topics actually present rather than a hard-coded 10,
    # so that models with any number of topics can be summarised.
    for i in range(top.shape[0]):
        print("Topic %d" % i)
        print(" ".join(index_to_word[j] for j in top[i]))

In [None]:
# Show the newsgroup category names. The parenthesised single-argument form
# prints the same text on Python 2 and Python 3.
print(ng.target_names)

In [None]:
# Fit scikit-learn's LDA (batch learning method) on the same count matrix for comparison.
# NOTE(review): `n_topics` was renamed `n_components` in later scikit-learn releases — confirm against the installed version.
# NOTE(review): this model uses 5 topics, whereas the custom LDA above was fitted with K=10.
sklda = SKLDA(n_topics=5, learning_method="batch", verbose=True)
sklda.fit(ngvec)

In [None]:
# Summarise the topic-word matrix learned by the scikit-learn model.
topic_summaries(sklda.components_)

In [8]:
# Summarise the topic-word matrix `b` returned by the custom LDA fit.
topic_summaries(b)

Topic 0
ve truth wrong different university christ faith religion thing doesn read want right man said reason church world mean evidence christians really fact did things bible true life point make christian question time good way jesus believe like know say does article just com think don writes people god edu
Topic 1
cliff unt 1k 17 km fr morris edu mpg hi bk mv colin 75 nh gi ct 3d 86 lc air distribution bu app kindly lk ua bc hz 14 ne im end tm p2 meg sg ai wm 34 su lu ub wa bd hst pl bj uw max
Topic 2
chicago did roger runs second gm 12 15 won 20 division canada nntp host com best time toronto posting just 00 better player 25 new teams like vs cs 10 nhl baseball don league think university win good article players season writes hockey play games year ca team game edu
Topic 3
oh 13 don 17 15 __ magnus john acs gmt freenet columbia steve david virginia new cso org sun world mail like computer thanks cmu cwru apr cleveland netcom just andrew ohio know ca uiuc cc news usa state distri

In [None]:
# Bytes a dense copy of ngvec would need at 8 bytes per entry.
# np.prod replaces the np.product alias, which newer NumPy releases no longer provide.
np.prod(ngvec.shape) * 8

In [None]:
# Compare two ways of reading the column indices of the entries in row 14:
# slicing the raw `indices` array with the `indptr` row boundaries ...
%timeit ngvec.indices[ngvec.indptr[14]:ngvec.indptr[14+1]]
# ... versus extracting the row through matrix indexing and calling nonzero().
%timeit ngvec[14, :].nonzero()[1]

In [None]:
# Compare densifying the whole matrix and then slicing ...
%timeit X.toarray()[3, :45]
# ... with slicing the sparse matrix first and densifying only the slice (`.A`).
%timeit X[3, :45].A

In [None]:
# Build a LIL-format copy of X so its slice access can be timed against the CSR matrix.
from scipy.sparse import lil_matrix
lilX = lil_matrix(X)

In [None]:
# Time the same slice-then-densify access on the LIL copy.
%timeit lilX[3, :45].A