In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
%%time
df = pd.read_csv('../data/parsed.csv')

In [3]:
MAX_FEATURES = 1000
tf_vectorizer = CountVectorizer(min_df=2, max_df=0.5,
                                max_features=MAX_FEATURES,
                                lowercase=True, 
                                token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}',
                                stop_words='english')

In [4]:
%%time
dtm = tf_vectorizer.fit_transform(df['text'])
dtm.shape

(21337, 1000)

In [7]:
NUM_TOPICS = 10
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS,
                                      max_iter=10,
                                      learning_method='online',
                                      verbose = 1,
                                      batch_size=1024,
                                      n_jobs = 7)

In [8]:
%%time
lda_Z = lda_model.fit_transform(dtm)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
CPU times: user 4.61 s, sys: 1.38 s, total: 5.99 s
Wall time: 1min 21s


In [9]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, tf_vectorizer)

LDA Model:
Topic 0:
[('vessel', 18354.411887463484), ('damages', 15125.539097584573), ('water', 13716.236381203998), ('indian', 13481.424462219864), ('goods', 12637.991243457387), ('treaty', 11628.810988319379), ('ship', 11040.97555226338), ('policy', 10943.979820111328), ('duty', 10509.470076541193), ('master', 10081.449527080158)]
Topic 1:
[('tax', 71480.970780716962), ('taxes', 23281.59565647885), ('income', 22754.663564454891), ('value', 17431.072240431495), ('taxation', 15245.086588208766), ('revenue', 14817.675159703347), ('paid', 14688.560995810782), ('commissioner', 12679.944144896193), ('pay', 12147.493656234588), ('year', 11342.304783444488)]
Topic 2:
[('government', 37351.717498009588), ('constitution', 26791.336874635235), ('criminal', 21101.282826057253), ('officers', 15840.260288488356), ('officer', 15403.203917003797), ('amendment', 15008.026537175185), ('indictment', 13180.64217413888), ('offense', 12982.290607010515), ('search', 12289.190880410939), ('crime', 12078.706