In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [3]:
%%time
# Load the parsed corpus from CSV; a later cell vectorizes its 'text' column.
df = pd.read_csv('../data/parsedv3.csv')

CPU times: user 9.75 s, sys: 1.24 s, total: 11 s
Wall time: 11 s


In [4]:
# Upper bound on vocabulary size kept by the bag-of-words vectorizer.
MAX_FEATURES = 1000

# Raw term-count vectorizer.
#   min_df=2   -> drop terms that occur in fewer than two documents
#   max_df=0.5 -> drop terms that occur in more than half of the documents
tf_vectorizer = CountVectorizer(min_df=2, max_df=0.5,
                                max_features=MAX_FEATURES,
                                lowercase=True,
                                # Tokens are runs of 3+ ASCII letters/hyphens.
                                # Raw string: '\-' in a plain string literal is
                                # an invalid escape sequence and triggers a
                                # warning on modern Python; the regex text
                                # itself is unchanged.
                                token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}',
                                stop_words='english')

In [5]:
%%time
# Learn the vocabulary from df['text'] and build the document-term matrix.
dtm = tf_vectorizer.fit_transform(df['text'])
# Evaluate the matrix dimensions as the cell's last expression.
dtm.shape

CPU times: user 43.8 s, sys: 639 ms, total: 44.4 s
Wall time: 44.5 s


In [6]:
# Number of latent topics to extract.
NUM_TOPICS = 10
# Fixed seed so that topic assignments are reproducible across kernel
# restarts; without it every run yields a different topic ordering/content.
RANDOM_STATE = 0

# Online variational-Bayes LDA trained in mini-batches of 1024 documents
# for 10 passes, logging each pass (verbose=1).
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS,
                                      max_iter=10,
                                      learning_method='online',
                                      verbose=1,
                                      batch_size=1024,
                                      random_state=RANDOM_STATE,
                                      n_jobs=7)

In [7]:
%%time
# Fit the LDA model on the document-term matrix and keep the transformed
# output of the same call in lda_Z.
lda_Z = lda_model.fit_transform(dtm)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
CPU times: user 4.21 s, sys: 1.37 s, total: 5.58 s
Wall time: 2min 4s


In [10]:
%%time
# Project the training matrix through the already-fitted model.
# NOTE(review): lda_Z was produced by fit_transform on this same dtm, so this
# presumably recomputes the same matrix -- confirm and consider reusing lda_Z.
doc_topic_dist = lda_model.transform(dtm)

CPU times: user 238 ms, sys: 158 ms, total: 396 ms
Wall time: 6.25 s


In [12]:
# Inspect the transform output for the row at index 1 (one value per topic).
doc_topic_dist[1]

array([ 0.00169523,  0.00169518,  0.11586473,  0.00169511,  0.00169541,
        0.0016953 ,  0.00169517,  0.07294067,  0.04795884,  0.75306436])

In [None]:
def print_topics(model, vectorizer, top_n=10):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model
        Fitted estimator exposing ``components_``; each row is treated as
        one topic's per-feature weights.
    vectorizer
        Fitted vectorizer whose feature order matches the columns of
        ``model.components_``. Must provide ``get_feature_names_out()``
        or the legacy ``get_feature_names()``.
    top_n : int, default 10
        Number of terms to show per topic, in descending weight order.

    Returns
    -------
    None
        Output is written to stdout: a ``Topic <idx>:`` header followed by
        a list of ``(term, weight)`` tuples for each topic.
    """
    # Resolve the vocabulary once instead of rebuilding it for every term.
    # Prefer the current scikit-learn accessor; fall back to the legacy one
    # (removed in scikit-learn 1.2) so older environments keep working.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so slice from the end backwards to get the
        # indices of the top_n largest weights, largest first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        # str() keeps the printed term a plain Python string even when the
        # vocabulary comes back as a NumPy string array.
        print([(str(feature_names[i]), topic[i]) for i in top_indices])
 
# Show the top terms (default top_n) of each topic in the fitted LDA model,
# using the count vectorizer to map feature indices back to terms.
print("LDA Model:")
print_topics(lda_model, tf_vectorizer)