In [1]:
#libraries for topic modeling
import pandas as pd
import sys
import numpy as np
import csv
import nltk
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from IPython.display import display

# uncomment following line to pip install pyLDAvis as needed
#!{sys.executable} -m pip install pyLDAvis
from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn

In [2]:
df_R = pd.read_csv('output/df-R-cleaned.csv', encoding='utf-8', na_filter=False)
df_n = pd.read_csv('output/df-n.csv', encoding='utf-8')
df_all = df_R.merge(df_n, left_on='file_name', right_on='n_id')

In [4]:
#back down to the 7,773 articles in our primary corpus
len(df_all)

7773

In [5]:
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.70, min_df=0.10,
                                max_features=None)

tf = tf_vectorizer.fit_transform(df_all.body.values.astype('U'))

Extracting tf features for LDA...


In [6]:
# Define Search Param
search_params = {'n_components': [35, 40, 45, 50, 55], 'max_iter':[10,20]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(tf)



GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_components': [35, 40, 45, 50, 55], 'max_iter': [10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [7]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(tf))

Best Model's Params:  {'max_iter': 20, 'n_components': 40}
Best Log Likelihood Score:  -22045585.387935583
Model Perplexity:  744.5775763714838


In [8]:
print(model.get_params())

{'cv': None, 'error_score': 'raise', 'estimator__batch_size': 128, 'estimator__doc_topic_prior': None, 'estimator__evaluate_every': -1, 'estimator__learning_decay': 0.7, 'estimator__learning_method': None, 'estimator__learning_offset': 10.0, 'estimator__max_doc_update_iter': 100, 'estimator__max_iter': 10, 'estimator__mean_change_tol': 0.001, 'estimator__n_components': 10, 'estimator__n_jobs': 1, 'estimator__n_topics': None, 'estimator__perp_tol': 0.1, 'estimator__random_state': None, 'estimator__topic_word_prior': None, 'estimator__total_samples': 1000000.0, 'estimator__verbose': 0, 'estimator': LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, 