In [1]:
%matplotlib inline
%load_ext line_profiler
import numpy as np
import scipy as sp
import scipy.sparse as spar
import scipy.special as spec
import sys
from matplotlib import pyplot as plt
from lda import LDA, _doc_update, _slice_doc_update
from sklearn.decomposition import LatentDirichletAllocation as SKLDA

import pickle
import cProfile

In [2]:
# Small synthetic 0/1 matrix used by the sparse-indexing timing cells further down.
# NOTE(review): presumably M is the number of documents (rows) and V the vocabulary
# size (columns) -- confirm against how lda.py interprets its input.
M = 100
V = 1000
SEED = 0
# A private RandomState keeps the draw reproducible across "Restart & Run All"
# without reseeding NumPy's global generator (which other code may rely on).
rng = np.random.RandomState(SEED)
# Draw directly in the (M, V) shape and wrap it as float CSR in one step, so X
# never refers to the intermediate dense integer array.
X = spar.csr_matrix(rng.binomial(1, .3, size=(M, V)), dtype=float)

In [3]:
# Even for a modest setup (10K-word vocabulary, 5K documents and 20 topics), a dense tensor indexed by
# <document, word, topic> needs 10,000 * 5,000 * 20 entries; at 8 bytes per entry that is 8e9 bytes (about 7.5 GiB).
# This is why we cannot explicitly keep all of $\phi$ in memory.
# Instead, we iterate over the documents one by one and accumulate the $\phi$ parameter as we go.

In [4]:
# Load the training split of the 20 Newsgroups corpus through scikit-learn's dataset
# fetcher. `ng.data` (raw documents) and `ng.target_names` are used in later cells.
from sklearn.datasets import fetch_20newsgroups
ng = fetch_20newsgroups(subset='train')

In [5]:
# Turn the raw newsgroup posts into a sparse document-term count matrix.
from sklearn.feature_extraction.text import CountVectorizer

# Per the CountVectorizer documentation: max_df=.7 drops terms that occur in more than
# 70% of the documents, min_df=20 drops terms that occur in fewer than 20 documents,
# and stop_words="english" removes scikit-learn's built-in English stop-word list.
vec = CountVectorizer(max_df=.7, min_df=20, stop_words="english")
# `vec.vocabulary_` (term -> column index) is populated by this fit and used for word lookups later.
ngvec = vec.fit_transform(ng.data)

In [None]:
# Fit the project's own LDA implementation (imported from the local `lda` module) on the
# newsgroup counts and report the wall-clock time of the fit.
# NOTE(review): presumably K is the number of topics and n_jobs the number of parallel
# workers; `b` is later passed to topic_summaries, while the meaning of `g` is not visible
# in this notebook -- confirm both against lda.py.
lda = LDA(K=10, n_jobs=8)
%time b, g = lda.fit(ngvec)

Epoch: 0
Perplexity: 1.40531939272
Epoch: 1


In [None]:
# Line-by-line profile of _slice_doc_update (via the line_profiler extension loaded in the
# first cell) while running a full lda.fit on the newsgroup matrix.
%lprun -f _slice_doc_update lda.fit(ngvec)

In [None]:
# NOTE(review): leftover interactive post-mortem debugger call. On a fresh
# "Restart & Run All" it either has no traceback to inspect or blocks waiting for
# input -- consider removing this cell before sharing the notebook.
%debug

In [None]:
def lookup(x):
    """Return the list of vocabulary terms of ``vec`` whose column index equals ``x``.

    Scans the whole vocabulary on every call, so it is only meant for one-off
    interactive look-ups.
    """
    return [k for k, v in vec.vocabulary_.items() if v == x]


def topic_summaries(b, n_words=50, vocabulary=None):
    """Print the highest-weighted words of every topic in ``b``.

    Parameters
    ----------
    b : 2-D array with one row per topic and one column per vocabulary term.
        Every row is summarised, so models with any number of topics work
        (the row count used to be hard-coded to 10, which raised IndexError
        for matrices with fewer rows).
    n_words : int, optional
        Number of words printed per topic (default 50, as before). Must be
        at least 1.
    vocabulary : dict mapping term -> column index, optional
        Defaults to ``vec.vocabulary_`` of the notebook's CountVectorizer.

    Raises
    ------
    ValueError
        If ``n_words`` is smaller than 1.

    For each topic a ``Topic <i>`` header line is printed, followed by one
    line with the words in ascending order of weight (strongest word last).
    """
    if n_words < 1:
        # A slice of -0: would silently select every column instead of none.
        raise ValueError("n_words must be at least 1")
    if vocabulary is None:
        vocabulary = vec.vocabulary_
    # Invert term -> column once instead of scanning the whole vocabulary for
    # every printed word.
    index_to_word = dict((col, word) for word, col in vocabulary.items())
    # argsort is ascending, so the last n_words columns of each row are the
    # largest weights of that topic.
    top_columns = b.argsort()[:, -n_words:]
    for i in range(top_columns.shape[0]):
        # Single-argument print(...) gives identical output on Python 2 and 3.
        print("Topic %d" % i)
        print(" ".join(index_to_word[col] for col in top_columns[i]))

In [None]:
# Names of the newsgroups in the corpus, for comparison with the learned topics.
# The parenthesised single-argument form prints the same text on Python 2 and
# is also valid on Python 3, where the bare print statement is a syntax error.
print(ng.target_names)

In [None]:
# Reference model: scikit-learn's LatentDirichletAllocation fitted with batch
# variational updates on the same count matrix, for comparison with the custom LDA.
# Note that it uses 5 topics whereas the custom LDA above is fitted with K=10.
# NOTE(review): newer scikit-learn releases name this argument `n_components`
# instead of `n_topics` -- confirm against the installed version.
sklda = SKLDA(n_topics=5, learning_method="batch", verbose=True)
sklda.fit(ngvec)

In [None]:
# Summarise the topic-word matrix learned by the scikit-learn model.
topic_summaries(sklda.components_)

In [None]:
# Summarise `b`, the first value returned by the custom LDA's fit.
topic_summaries(b)

In [None]:
# Bytes a dense copy of the document-term matrix would need at 8 bytes per entry.
# np.prod is the canonical spelling; the np.product alias was removed in NumPy 2.0.
np.prod(ngvec.shape) * 8

In [None]:
# Two ways of getting the column indices stored for row 14 of the sparse matrix:
# slicing the `indices` array between the row's `indptr` bounds directly, versus
# extracting the row and calling nonzero() on it.
%timeit ngvec.indices[ngvec.indptr[14]:ngvec.indptr[14+1]]
%timeit ngvec[14, :].nonzero()[1]

In [None]:
# Two ways of getting the first 45 columns of row 3 as a dense array: densify the
# whole matrix and then slice, versus slice the sparse matrix and densify only the
# slice (`.A` is the sparse-matrix shorthand for `.toarray()`).
%timeit X.toarray()[3, :45]
%timeit X[3, :45].A

In [None]:
# Build a LIL (list-of-lists) copy of X so the same slice can be timed on a
# different sparse format.
from scipy.sparse import lil_matrix
lilX = lil_matrix(X)

In [None]:
# Same slice-then-densify measurement as for the CSR matrix, on the LIL copy.
%timeit lilX[3, :45].A