# Use GridSearchCV to find the best-performing topic model

### Import libraries for topic modeling

In [None]:
import pandas as pd
import sys
import numpy as np
import csv
import nltk
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from IPython.display import display

# uncomment following line to pip install pyLDAvis as needed
#!{sys.executable} -m pip install pyLDAvis
from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn

### Import and merge CSV files 
Create dataframes from the CSV files containing the cleaned metadata and the bag of words created in notebook 2. Merge them into a single dataframe by matching `file_name` in the metadata (`df_R`) to `n_id` in the n-gram data (`df_n`).

In [None]:
# Cleaned article metadata; na_filter=False keeps empty cells as empty
# strings instead of converting them to NaN.
df_R = pd.read_csv(
    'output/df-R-cleaned.csv',
    encoding='utf-8',
    na_filter=False,
)

# Bag-of-words data, one row per n-gram file.
df_n = pd.read_csv('output/df-n.csv', encoding='utf-8')

# Join metadata rows to n-gram rows where df_R.file_name equals df_n.n_id
# (pandas' default inner join, so unmatched rows on either side are dropped).
df_all = pd.merge(df_R, df_n, left_on='file_name', right_on='n_id')

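The merge is an inner join (the pandas default), so n-gram articles that are not listed in the cleaned metadata dataframe are dropped, as are any metadata rows without a matching n-gram file.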

In [None]:
# Report how many rows survived the merge.
print('Current number of articles', len(df_all.index))

### Run CountVectorizer
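Fit the vectorizer to the words from each article (available in `df_all['body']`). Terms that appear in fewer than 10% or in more than 70% of the articles are excluded from the vocabulary.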

In [None]:
print("Extracting tf features for LDA...")

# Float min_df/max_df are document-frequency proportions: a term is kept only
# if it occurs in at least 10% and at most 70% of the articles. The vocabulary
# size is otherwise unlimited (max_features=None).
vectorizer_options = {
    'max_df': 0.70,
    'min_df': 0.10,
    'max_features': None,
}
tf_vectorizer = CountVectorizer(**vectorizer_options)

# Cast the article bodies to unicode strings before vectorizing.
# NOTE(review): any missing body value becomes the literal text 'nan' here.
article_texts = df_all['body'].values.astype('U')

# Document-term count matrix used as the LDA input.
tf = tf_vectorizer.fit_transform(article_texts)

### GridSearchCV
Define the grid of parameters (number of topics and maximum number of iterations) to test with LDA.

In [None]:
# Parameter grid: five topic counts crossed with two iteration limits,
# i.e. ten candidate configurations in total.
search_params = dict(
    n_components=[35, 40, 45, 50, 55],
    max_iter=[10, 20],
)

# Base estimator; every setting not listed in the grid stays at its default.
lda = LatentDirichletAllocation()

# No explicit scoring is given, so candidates are compared using the
# estimator's own score method.
model = GridSearchCV(estimator=lda, param_grid=search_params)

# Run the search on the document-term matrix.
model.fit(tf)

### GridSearch: Best models
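See the [GridSearchCV documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) for how candidate models are scored and selected, and the [LatentDirichletAllocation documentation](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html) for more information on the meaning of the log likelihood score (higher is better) and perplexity (lower is better).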

In [None]:
# Estimator for the winning parameter combination.
best_lda_model = model.best_estimator_

# Parameter combination selected by the search.
winning_params = model.best_params_
print("Best Model's Params: ", winning_params)

# Mean cross-validated score of the winning combination.
mean_cv_score = model.best_score_
print("Best Log Likelihood Score: ", mean_cv_score)

# Perplexity of the best model on the same matrix the search was fitted on.
fitted_data_perplexity = best_lda_model.perplexity(tf)
print("Model Perplexity: ", fitted_data_perplexity)

# Full configuration of the grid-search object itself.
search_config = model.get_params()
print(search_config)