# LDA Topic Modelling

* This notebook is showcases the process of building an NLP Topic Model using `Latent Dirichlet Allocation` method. 
* The dataset we are going to use are `text` and `soft text` from `scrapped_fox_data_clean.csv`. 

## Table Of Contents

## Installations


In [1]:
# ## installing required libraries
# ! pip install beautifulsoup4
# ! pip install pandas
# ! pip install numpy
# ! pip install plotly
# ! pip install nbformat
# ! pip install ipykernel
# ! pip install matplotlip
# ! pip install wordcloud
# ! pip install gensim
# ! pip install pyLDAvis
# ! pip install nltk
# ! pip install -U pip setuptools wheel
# ! pip install -U spacy
# ! python -m spacy download en_core_web_trf 
# ! python -m spacy download en_core_web_md
# ! pip install joblib
# ! pip install tqdm

## Imports

In [28]:

import pandas as pd
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)

import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
from pprint import pprint

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as io

# loading library
import pickle

from joblib import dump, load

from tqdm.auto import tqdm

import os
import sys
sys.path.insert(0, os.path.abspath('../utils'))

## importing custom modules
import common_utils
import gensim_utils
import sklearn_utils


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reading Data

In [16]:
## reading manaully scrapped data
data = pd.read_csv('../data/scrapped_fox_data_clean.csv')
print(data.shape)

(3972, 12)


## Utility Functions

### Preparing Stop Words

In [111]:
## extending stopwords
# lets break down the cleaning functions into smaller functions
stop_words = nltk.corpus.stopwords.words('english')

## trying to remove stopwords from stopwords super set. 
stopwords_super_set = pd.read_csv("../data/stopwords/sw1k.csv")

## filtering stopwords to pronouns and other type
stopwords_to_remove = list(stopwords_super_set.loc[(stopwords_super_set["type"] == "G" ), "term"])


# stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'say', 'one', 'time', 'people',
#                   'know', 'like', 'tell', 'get', 'year', 'go', 'around', 'award', 'actually', 'carry',
#                    'new', 'it', 'show', 'news', 'go', 'fox', 'make', 'do', 'not', 'say',
#                    'also', 'love', 'it', 'star', 'go', 'do', 'say', 'not', 'said'
#                    ])

stop_words.extend(stopwords_to_remove)
stop_words.extend(['monday', 'tuesday', 'wednesday', 
                   'thursday', 'friday', 'saturday', 'sunday'])
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Common Utility Functions

In [112]:
# nlp = spacy.load('en_core_web_md')
# # nlp = spacy.load('en_core_web_trf')
# nlp.add_pipe('merge_entities')
# # nlp.add_pipe("merge_noun_chunks")
# tqdm.pandas(desc="processing")

# # Utility Functions for Text Cleaning
# def sent_to_words(sentences):
#     for sentence in tqdm(sentences):
#         yield (simple_preprocess(str(sentence), deacc=True))

# # function to clean html tags from text


# def clean_html(html):
#     # parse html content
#     soup = BeautifulSoup(html, "html.parser")
#     for data in soup(['style', 'script', 'code', 'a']):
#         # Remove tags
#         data.decompose()
#     # return data by retrieving the tag content
#     return ' '.join(soup.stripped_strings)

# # function to convert text to lowercase


# def lower_case(text):
#     return text.lower()

# # function to remove line breaks


# def remove_line_breaks(text):
#     return re.sub(r'\n', '', text)

# # function to remove punctuation


# def remove_punctuation(text):
#     return text.translate(str.maketrans('', '', string.punctuation))

# # function to remove numbers


# def remove_numbers(text):
#     return re.sub(r'\d+', '', text)

# # function to remove extra spaces


# def remove_extra_spaces(text):
#     text = text.replace(u'\xa0', u' ')
#     return text
#     # return re.sub(' +', ' ', text)

# # function to remove stopwords


# def remove_stopwords(texts):
#     preprocess_text = simple_preprocess(str(texts), deacc=True)
#     word_list = [word for word in preprocess_text if word not in stop_words]
#     return " ".join(word_list)
#     # return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

# # helper function to create pos tags


# def create_pos_tag(str_sent):
#     return nlp(str_sent)

# # function for text lemmatization using spac
# ##'ADJ', 'VERB'
# def lemmatization(texts, allowed_postags=['PROPN', 'NOUN']):
#     """https://spacy.io/api/annotation"""
#     texts_out = []
#     for sent in tqdm(texts):
#         doc = nlp(" ".join(sent))
#         texts_out.append(
#             [token.lemma_ for token in doc if (token.pos_ in allowed_postags and token.is_stop == False and token.text not in stop_words)])
#     return texts_out


# def tokenization(texts, allowed_postags=['PROPN', 'NOUN']):
#     """https://spacy.io/api/annotation"""
#     texts_out = []
#     for sent in tqdm(texts):
#         doc = nlp(" ".join(sent))
#         texts_out.append(
#             ["_".join(token.text.split(" ")) for token in doc if (token.pos_ in allowed_postags and token.is_stop == False and token.text not in stop_words)])
#     return texts_out

# def lemmatization_without_pos(texts):
#     """https://spacy.io/api/annotation"""
#     texts_out = []
#     for sent in texts:
#         doc = nlp(" ".join(sent))
#         texts_out.append(
#             [token.lemma_ for token in doc])
#     return texts_out


# def make_bigrams(texts, bigram_mod):
#     return [bigram_mod[doc] for doc in texts]


# def make_trigrams(texts, bigram_mod, trigram_mod):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]

# ## helper function to create pos tags distribution
# def create_pos_tags_distribution(docs = []):
#     token_distribution = {}
#     is_alpha = 0
#     is_stop = 0
#     for doc in tqdm(docs):
#         for token in doc:
#             token_distribution[token.pos_] = token_distribution.get(token.pos_, 0) + 1
#             if(token.is_alpha):
#                 is_alpha += 1
#             if(token.is_stop):
#                 is_stop += 1
#     return token_distribution, is_alpha, is_stop


# # function to create n-grams from noun chunks
# def create_noun_chunk_ngrams(docs):
#     n_gram_docs = []
#     for doc in docs:
#         doc_text = doc.text
#         for chunk in doc.noun_chunks:
#             chunk_n_gram = "_".join(chunk.text.split(" "))
#             doc_text = doc_text.replace(chunk.text, chunk_n_gram)
#         n_gram_docs.append(doc_text.split(" "))
#     return n_gram_docs


# def lemmatization_noun_chunks(texts):
#     """https://spacy.io/api/annotation"""
#     texts_out = []
#     for sent in texts:
#         doc = nlp(" ".join(sent))
#         texts_out.append([token.lemma_ for token in doc if (
#             ("_" in token.text) or ## if the token is a noun chunk allow that
#             (token.pos_ in ['NOUN', 'PROPN'] and token.is_alpha and token.is_stop == False) ## if the token is a noun or proper noun allow that
#         )])
#     return texts_out

### Gensim Models Utility Functions

In [203]:
# ## function to compute optimal parameters for LDA model
# def compute_coherence_values(corpus, id2word, texts, num_topics, passes, chunk_sizes=[200], iterations=[100]):
#     """
#     Compute c_v coherence for various number of topics

#     Parameters:
#     ----------
#     dictionary : Gensim dictionary
#     corpus : Gensim corpus
#     texts : List of input texts
#     limit : Max num of topics

#     Returns:
#     -------
#     model_list : List of LDA topic models
#     coherence_values : Coherence values corresponding to the LDA model with respective number of topics
#     """
#     coherence_values = []
#     model_list = []
#     params = []
#     for num_topic in tqdm(num_topics):
#         # for chunk_size in tqdm(chunk_sizes):
#             for num_passes in tqdm(passes):
#                 for iteration in tqdm(iterations):
#                     model = LdaModel(corpus=corpus,
#                                     id2word=id2word,
#                                     num_topics=num_topic,
#                                     random_state=100,
#                                     update_every=1,
#                                     # chunksize=chunk_size,
#                                     passes=num_passes,
#                                     iterations=iteration,
#                                     per_word_topics=True)
#                     model_list.append(model)
#                     # Compute Perplexity
#                     perplexity = model.log_perplexity(corpus)
                    
#                     # Compute Coherence Score
#                     coherence_model_lda = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
#                     cv_coherence = coherence_model_lda.get_coherence()
                    
#                     coherence_model_umass = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='u_mass')
#                     umass_coherence = coherence_model_umass.get_coherence()
                                    
#                     coherence_values.append({
#                         "perplexity": perplexity,
#                         "cv_coherence": cv_coherence,
#                         "umass_coherence": umass_coherence,
#                     })
#                     params.append({'num_topics': num_topic, 'chunk_size': "chunk_size", 'passes': num_passes, 'iterations': iteration})

#     return model_list, coherence_values, params

# def analyze_gensim_lda_model(lda_model, corpus, id2word, texts, num_topics, passes, chunk_sizes=[200]):
#     # Compute Perplexity
#     print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.
#     # Compute Coherence Score
#     coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
#     coherence_lda = coherence_model_lda.get_coherence()
#     print('\nCoherence Score: ', coherence_lda)

# ## helper functions to visualize LDA model
# def visualize_gensim_lda_model(lda_model, corpus, id2word, filename="gensim_lda.html"):
#     # Visualize the topics
#     pyLDAvis.enable_notebook()
#     vis = gensimvis.prepare(lda_model, corpus, id2word)
#     vis.save(filename)

### Sklearn Model Utility Functions

In [114]:
# import numpy as np


# # Styling
# def color_green(val):
#     color = 'green' if val > .1 else 'black'
#     return 'color: {col}'.format(col=color)

# def make_bold(val):
#     weight = 700 if val > .1 else 400
#     return 'font-weight: {weight}'.format(weight=weight)


# def print_sklearn_sparcity(data_vectorized):
#     # Materialize the sparse data
#     data_dense = data_vectorized.todense()

#     # Compute Sparsicity = Percentage of Non-Zero cells
#     print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")


# def create_sklearn_dominent_topic_dataframe(lda_model, data_vectorized):
#     lda_output = lda_model.transform(data_vectorized)
#     # column names
#     topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]
#     # index names
#     docnames = ["Doc" + str(i) for i in range(len(data))]
    
#     df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
#     # Get dominant topic for each document
#     dominant_topic = np.argmax(df_document_topic.values, axis=1)
#     df_document_topic['dominant_topic'] = dominant_topic
#     return df_document_topic

# def print_sklearn_dominant_topics(lda_model, data_vectorized):
#     df_document_topic = create_sklearn_dominent_topic_dataframe(lda_model, data_vectorized)
#     # Apply Style
#     df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
#     return df_document_topics

# def print_sklearn_topic_distribution(lda_model, data_vectorized):
#     df_document_topic = create_sklearn_dominent_topic_dataframe(lda_model, data_vectorized)
#     df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents").rename(columns={'index':'Topic'})
#     # df_topic_distribution.columns = ["Topic Num", "Num Documents"]
#     return df_topic_distribution


# # Show top n keywords for each topic
# def show_sklearn_topics(vectorizer, lda_model, n_words=20):
#     keywords = np.array(vectorizer.get_feature_names_out())
#     topic_keywords = []
#     for topic_weights in lda_model.components_:
#         top_keyword_locs = (-topic_weights).argsort()[:n_words]
#         topic_keywords.append(keywords.take(top_keyword_locs))
#     return topic_keywords

# def format_sklearn_topics(topic_keywords):
#     df_topic_keywords = pd.DataFrame(topic_keywords)
#     df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
#     df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
#     return df_topic_keywords

# def analyze_sklearn_lda_model(lda_model, data_vectorized):
#     # Log Likelyhood: Higher the better
#     print("Log Likelihood: ", lda_model.score(data_vectorized))
#     # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
#     print("Perplexity: ", lda_model.perplexity(data_vectorized))

# ## helper function to visualize lda model
# def visualize_sklearn_lda_model(lda_model, data_vectorized, vectorizer, mds='tsne'):    
#     pyLDAvis.enable_notebook()
#     panel2 = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds=mds)
#     return panel2

## Text Pre-processing

In [18]:
def preprocess_text(text):
     text = common_utils.clean_html(text)
     text = common_utils.lower_case(text)
     text = common_utils.remove_line_breaks(text)
     text = common_utils.remove_punctuation(text)
     text = common_utils.remove_numbers(text)
     text = common_utils.remove_extra_spaces(text)
     # text = remove_stopwords(text)
     return text

data["clean_text"] = data["text"].progress_apply(preprocess_text)

processing: 100%|██████████| 3972/3972 [00:01<00:00, 2367.07it/s]


### Tokenizing

In [116]:
data_words = list(common_utils.sent_to_words(data['clean_text']))

100%|██████████| 3972/3972 [00:04<00:00, 899.81it/s]


## EDA on text Data

In [117]:
## check for duplicates
data["clean_text"].duplicated().sum()

0

In [118]:
data['text_word_count'] = data['clean_text'].progress_apply(lambda x: len(str(x).split(" ")))

data['text_word_count'].describe()


processing: 100%|██████████| 3972/3972 [00:00<00:00, 31052.12it/s]


count    3972.000000
mean      615.245972
std       342.681649
min        29.000000
25%       402.000000
50%       538.000000
75%       738.000000
max      9672.000000
Name: text_word_count, dtype: float64

In [119]:
## checking the distribution of word count in text
fig = px.histogram(data, x="text_word_count", title="Distribution of Word Count in text")
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



##### Notes
* Word counts are fairly distributed. 

In [120]:
from joblib import Parallel, delayed

def chunker(iterable, total_length, chunksize):
    return (iterable[pos: pos + chunksize] for pos in range(0, total_length, chunksize))

def flatten(list_of_lists):
    print("flatten a list of lists to a combined list")
    return [item for sublist in list_of_lists for item in sublist]

def process_chunk(texts):
    nlp_list = list(nlp.pipe(texts, batch_size=100, n_process=100))
    print("processing chunk..")
    return nlp_list

def preprocess_parallel(texts, chunksize=1000):
    executor = Parallel(n_jobs=50, backend='multiprocessing', prefer="processes")
    do = delayed(process_chunk)
    tasks = (do(chunk) for chunk in chunker(texts, len(data), chunksize=chunksize))
    print("Processing {} texts in {} jobs".format(len(data), 50))
    result = executor(tasks)
    return flatten(result)

In [121]:
clean_text_list = list(data['clean_text'])
len(clean_text_list)

3972

In [122]:
## lets create POS tags for each text and see the distribution of POS tags
docs = [nlp(text) for text in tqdm(data['clean_text'])]


100%|██████████| 3972/3972 [04:02<00:00, 16.40it/s]


In [123]:
## creating pos tags distribution
token_distribution, is_alpha, is_stop = common_utils.create_pos_tags_distribution(docs)

100%|██████████| 3972/3972 [00:01<00:00, 3799.23it/s]


In [124]:
## convert the dictionary to a dataframe
token_distribution_df = pd.DataFrame.from_dict(token_distribution, orient='index', columns=['count']).reset_index().rename(columns={"index": "tags"})


In [125]:
## lets create a distribution of POS tags
## checking the distribution of word count in text
fig = px.histogram(token_distribution_df, x="tags", y="count", title="Distribution POS Tags in text")
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [126]:
## lets see how many words are alpha and how many are stop words
print(f"we have total {data['text_word_count'].sum()} words in the text. Out of which {is_alpha} are alpha and {is_stop} are stop words")

##we have total 2384515 words in the text. Out of which 1275765 are alpha and 780851 are stop words


we have total 2443757 words in the text. Out of which 2210843 are alpha and 1065545 are stop words


##### Notes
* It seems we still have some numerical and non-alpha values even after cleaning it. Lets check them out so make sure if it needs more cleaning


In [127]:
for doc in docs:
    for token in doc:
        if(token.is_alpha == False):
            print(token.text, token.pos_, token.is_stop,len(token.text.split(" ")))

maggie hassan PROPN False 2
new hampshire PROPN False 2
don bolduc PROPN False 2
’s VERB True 1
– PUNCT False 1
– PUNCT False 1
saint anselm college’s PROPN False 3
new hampshire PROPN False 2
wednesday evening NOUN False 2
  SPACE False 2
rick wiley PROPN False 2
fox news NOUN False 2
  SPACE False 2
kevin donohoe PROPN False 2
nancy pelosi PROPN False 2
’s PART True 1
san francisco PROPN False 2
kate constantini PROPN False 2
fox news NOUN False 2
’s VERB True 1
’s AUX True 1
new hampshire PROPN False 2
midseptember bolduc PROPN False 2
chuck morse NOUN False 2
gop gov PROPN False 2
chris sununu PROPN False 2
the past six weeks NOUN False 4
’s PART True 1
’s VERB True 1
six days NOUN False 2
  SPACE False 2
’s PART True 1
last year NOUN False 2
the supreme court’s PROPN False 3
fox news NOUN False 2
earlier this month NOUN False 3
sen lindsey graham PROPN False 3
south carolina PROPN False 2
’s PART True 1
’s PART True 1
  SPACE False 2
’s AUX True 1
’s AUX True 1
’s PART True 1
n’t 

##### Notes
* Interesting, `Spacy` is tagging `noun` and `entity` chunks as `non alpha` tokens.  That could be an issue. We'll have to make sure lemmatization function can handle this. 

##### Notes
* So `maximum` tags are
    * `NOUN` - noun
    * `VERB` - verb    
    * `ADP` - adposition    
    * `PROPN`- proper noun
    * `PUNCT` - punctuation
    * `ADJ` - adjective
* Since these are news article texts, I think useful tags are, 
    * `PROPN`
    * `NOUN`
    * `ADJ` - Not sure about adjective yet. 
    * `VERB` - Not sure about verb yet. 
* We can remove rest of the words and still have a decent topic model. 
* We can also use the `is_stop` and `is_alpha` tags to remove the stopwords and non alpha tokens.
    * Lets update the helper functions accordingly. 

### Word Frequency

In [91]:
from collections import Counter
# lets see if we can calculate word frequency
# all tokens that arent stop words or punctuations
words = []
for doc in tqdm(docs):
    doc_words = [token.text for token in doc if token.pos_ in ['PROPN', 'NOUN'] and not token.is_stop and not token.is_punct and token.is_alpha]
    words.append(doc_words)

flat_list = [item for sublist in words for item in sublist]


word_counts = Counter(flat_list)
word_counts.most_common(10)


100%|██████████| 3972/3972 [00:00<00:00, 4774.75it/s]


[('news', 11478),
 ('fox', 11456),
 ('biden', 8447),
 ('president', 7925),
 ('house', 6630),
 ('trump', 5970),
 ('state', 5862),
 ('democrats', 5299),
 ('senate', 4572),
 ('republicans', 4512)]

In [92]:
## lets try and plot the word cloud
word_counts_df = pd.DataFrame.from_dict(word_counts, orient='index', columns=['count']).reset_index().rename(columns={"index": "word"})

fig = px.histogram(word_counts_df, x="word", y="count", title="Distribution POS Tags in text")
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [93]:
word_counts_df.describe()

Unnamed: 0,count
count,24086.0
mean,34.059827
std,216.496122
min,1.0
25%,1.0
50%,3.0
75%,11.0
max,11478.0


##### Notes
* Interesting the difference between `mean` and `max` frequency is quick big.  Wonder if that would cause issues in the model. 
* May be IF-IDF might be a good candidate to balance out the frequency difference

In [94]:
## lets look at spacy merge entities 
# nlp.add_pipe("merge_noun_chunks")
merged_docs = [nlp(text) for text in tqdm(data['clean_text'])]


100%|██████████| 3972/3972 [03:49<00:00, 17.29it/s]


## Creating Bigram & Tigram Models

In [128]:
# Build the bigram and trigram models
bigram = Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

## Gensim LDA with BOW

### Lemmatization

In [165]:
# Do lemmatization
data_lemmatized = common_utils.tokenization(data_words)

print(data_lemmatized[:1])

100%|██████████| 3972/3972 [03:50<00:00, 17.27it/s]

[['governor', 'term', 'democratic', 'sen', 'maggie_hassan', 'new_hampshire', 'republican', 'challenger', 'don_bolduc', 'aim', 'inflation', 'abortion', 'border', 'crisis', 'denialism', 'debate', 'battleground', 'race', 'handful', 'gop', 'senate', 'majority', 'crossfire', 'debate', 'army', 'tours', 'duty', 'war', 'afghanistan', 'debate', 'saint_anselm_college_new_hampshire_institute_of_politics', 'wednesday_evening', 'bolduc', 'campaign', 'bystander', 'crowd', 'debate', 'swing', 'campaign', 'bolduc', 'rick_wiley', 'bolduc', 'campaign', 'fox_news', 'punch', 'activist', 'countdown', 'stake', 'midterm', 'elections', 'campaign', 'communications', 'kevin_donohoe', 'incident', 'behavior', 'activist', 'campaign', 'volunteers', 'debate', 'bolduc', 'altercation', 'debate', 'hassan', 'attack', 'speaker', 'nancy_pelosi', 'husband', 'san_francisco', 'gop', 'senate', 'nominee', 'problems', 'republicans', 'democrat', 'fuel', 'individual', 'bolduc', 'campaign', 'spokesperson', 'kate_constantini', 'fox_




### Create Dictionary & Corpus

In [168]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
## filter out words that occur less than 10 documents, or more than 75% of the documents.
id2word.filter_extremes(no_below=10, no_above=0.75, keep_n=10000)
# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 8), (1, 2), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 2), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 13), (16, 1), (17, 10), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 2), (33, 1), (34, 11), (35, 2), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 5), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 2), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 4), (56, 2), (57, 1), (58, 1), (59, 1), (60, 2), (61, 6), (62, 1), (63, 5), (64, 1), (65, 1), (66, 1), (67, 8), (68, 1), (69, 1), (70, 1), (71, 1), (72, 2), (73, 3), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 2), (80, 1), (81, 1), (82, 2), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 5), (91, 4), (92, 3), (93, 3), (94, 1), (95, 2), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 3), (103, 2), (104, 2), (105, 1), (106, 1), (107, 2), (108, 1), (109, 1), (110,

In [169]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('abortion', 8),
  ('activist', 2),
  ('afghanistan', 1),
  ('aim', 1),
  ('army', 2),
  ('attack', 1),
  ('attacks', 1),
  ('ballot', 1),
  ('ban', 2),
  ('battleground', 2),
  ('behavior', 1),
  ('bid', 1),
  ('biden', 1),
  ('biden_administration', 1),
  ('bills', 1),
  ('bolduc', 13),
  ('border', 1),
  ('campaign', 10),
  ('candidates', 1),
  ('care', 1),
  ('challenger', 2),
  ('chance', 1),
  ('chris_sununu', 1),
  ('chuck_morse', 1),
  ('click', 1),
  ('clips', 1),
  ('communications', 1),
  ('conclusion', 1),
  ('contest', 1),
  ('countdown', 1),
  ('credentials', 1),
  ('crisis', 1),
  ('crowd', 2),
  ('cycle', 1),
  ('debate', 11),
  ('decisions', 2),
  ('democrat', 1),
  ('democratic', 1),
  ('digit', 1),
  ('discourse', 1),
  ('doctor', 1),
  ('don_bolduc', 5),
  ('donald_trump', 1),
  ('doubt', 1),
  ('drug', 1),
  ('duty', 1),
  ('earlier_this_month', 1),
  ('efforts', 1),
  ('elections', 2),
  ('endorsement', 1),
  ('enforcement', 1),
  ('error', 1),
  ('extremism', 1

### Building the Topic Model

In [180]:
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=15,
                     random_state=100,
                     update_every=1,
                    #  chunksize=200,
                     passes=250,
                     iterations=120,
                     alpha='auto',
                     per_word_topics=True)

In [182]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

[(0,
  '0.031*"china" + 0.028*"biden" + 0.018*"taiwan" + 0.017*"ukraine" + '
  '0.016*"administration" + 0.015*"president" + 0.015*"russia" + '
  '0.013*"the_united_states" + 0.013*"afghanistan" + 0.010*"war"'),
 (1,
  '0.049*"police" + 0.023*"crime" + 0.019*"enforcement" + 0.013*"violence" + '
  '0.012*"campaign" + 0.012*"officers" + 0.010*"attorney" + 0.009*"bail" + '
  '0.007*"justice" + 0.007*"charges"'),
 (2,
  '0.031*"justice" + 0.019*"supreme_court" + 0.018*"justices" + '
  '0.016*"lawsuit" + 0.015*"the_supreme_court" + 0.012*"judge" + '
  '0.011*"amendment" + 0.010*"courts" + 0.010*"opinion" + 0.009*"district"'),
 (3,
  '0.045*"iran" + 0.026*"israel" + 0.020*"queen" + 0.014*"north_carolina" + '
  '0.013*"johnson" + 0.013*"biden" + 0.013*"president" + 0.011*"regime" + '
  '0.010*"senators" + 0.009*"king"'),
 (4,
  '0.072*"voters" + 0.051*"democrats" + 0.036*"poll" + 0.028*"elections" + '
  '0.027*"republicans" + 0.026*"midterm" + 0.018*"inflation" + 0.017*"georgia" '
  '+ 0.016*

### Analyzing Model

In [183]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCV Coherence Score: ', coherence_lda)

coherence_model_umass = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='u_mass')
coherence_lda = coherence_model_umass.get_coherence()
print('\nuMass Coherence Score: ', coherence_lda)


Perplexity:  -6.929646047519937

CV Coherence Score:  0.5554490793630982

uMass Coherence Score:  -2.4774150787821116


##### Notes to Delete

###### Manual Grid Search
-------------------------------------------------

```
Worth Saving
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=30,
                     random_state=100,
                     update_every=1,
                     passes=250,
                     alpha='auto',
                     per_word_topics=True)

Perplexity:  -5.722810366840693
Coherence Score:  0.4411146751458711
```

---------------------------------------------------------
```
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=25,
                     random_state=100,
                     update_every=1,
                    #  chunksize=200,
                     passes=250,
                     alpha='auto',
                     per_word_topics=True)


Perplexity:  -5.715112582723044
Coherence Score:  0.4510137020042296
```
---------------------------------------------------------------

```
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=15,
                     random_state=100,
                     update_every=1,
                    #  chunksize=200,
                     passes=250,
                     alpha='auto',
                     per_word_topics=True)


Perplexity:  -5.73645354360661
Coherence Score:  0.4648348586102282                     
```
----------------------------------------------------------------------
```
Works with token.lemma_ and not token.text
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=15,
                     random_state=100,
                     update_every=1,
                    #  chunksize=200,
                     passes=250,
                     alpha='auto',
                     per_word_topics=True)


Perplexity:  -6.204518508321434
Coherence Score:  0.47021907190499096
```
-----------------------------------------------------------------------------
##### Tokenized Data without `noun chunks` and `named entity chunks`
```
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=15,
                     random_state=100,
                     update_every=1,
                    #  chunksize=200,
                     passes=250,
                     alpha='auto',
                     per_word_topics=True)



Perplexity:  -6.875836169151711
Coherence Score:  0.5762156296311478
```
------------------------------------------------------------------------------
##### Tokenized data with noun and entitiy chunks

```
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=15,
                     random_state=100,
                     update_every=1,
                    #  chunksize=200,
                     passes=250,
                     alpha='auto',
                     per_word_topics=True)

Perplexity:  -7.3730853354515915
Coherence Score:  0.5226321487478642
```
----------------------------------------------------------------------------------
##### Tokenized with just named_entity chunks

```
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=15,
                     random_state=100,
                     update_every=1,
                    #  chunksize=200,
                     passes=250,
                     alpha='auto',
                     per_word_topics=True)


Perplexity:  -6.91509928727058
Coherence Score:  0.5916526920931731
```


-----------------------------------------------------------------------------------
##### Tokenized with just named_entity chunks

```
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=15,
                     random_state=100,
                     update_every=1,
                    #  chunksize=200,
                     passes=250,
                     iterations=100,
                     alpha='auto',
                     per_word_topics=True)


Perplexity:  -6.923146151007259
CV Coherence Score:  0.602559024821954
uMass Coherence Score:  -2.101086011129721
```

### Visualize Topics

In [184]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.



lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=15,
                     random_state=100,
                     update_every=1,
                    #  chunksize=200,
                     passes=250,
                     iterations=120,
                     alpha='auto',
                     per_word_topics=True)

### Finding Best Parms



In [190]:
## compute_coherence_values(corpus, id2word, texts, num_topics, passes, chunk_sizes=[200], iterations=[100]):
model_list, coherence_values, params = gensim_utils.compute_coherence_values(corpus=corpus, id2word=id2word, texts=data_lemmatized, num_topics=[10, 15, 20, 25], passes=[200, 250, 300], iterations=[100, 120, 150])


  0%|          | 0/4 [00:00<?, ?it/s]
[A
[A
[A
100%|██████████| 3/3 [23:50<00:00, 476.92s/it]

[A
[A
[A
100%|██████████| 3/3 [29:18<00:00, 586.18s/it]

[A
[A
[A
100%|██████████| 3/3 [34:39<00:00, 693.31s/it]
100%|██████████| 3/3 [1:27:49<00:00, 1756.42s/it]
 25%|██▌       | 1/4 [1:27:49<4:23:27, 5269.26s/it]
[A
[A
[A
100%|██████████| 3/3 [23:15<00:00, 465.24s/it]

[A
[A
[A
100%|██████████| 3/3 [28:31<00:00, 570.39s/it]

[A
[A
[A
100%|██████████| 3/3 [33:47<00:00, 675.86s/it]
100%|██████████| 3/3 [1:25:34<00:00, 1711.49s/it]
 50%|█████     | 2/4 [2:53:23<2:52:59, 5189.98s/it]
[A
[A
[A
100%|██████████| 3/3 [23:55<00:00, 478.62s/it]

[A
[A
[A
100%|██████████| 3/3 [29:34<00:00, 591.46s/it]

[A
[A
[A
100%|██████████| 3/3 [34:29<00:00, 689.96s/it]
100%|██████████| 3/3 [1:28:00<00:00, 1760.05s/it]
 75%|███████▌  | 3/4 [4:21:23<1:27:11, 5231.15s/it]
[A
[A
[A
100%|██████████| 3/3 [23:46<00:00, 475.61s/it]

[A
[A
[A
100%|██████████| 3/3 [29:25<00:00, 588.34s/it]


In [193]:
## lets write to pickle file for future use
# for models in model_list:
#     with open(f"../pickles/text_models/gensim_bow/{models.num_topics}_topics_{models.passes}_passes_{models.iterations}_iterations.pkl", "wb") as f:
#         pickle.dump(models, f)

In [202]:
## les convert the coherence_values list to a dataframe
# coherence_values_df = pd.DataFrame(coherence_values, columns=['perplexity', 'cv_coherence', 'umass_coherence'])


# params_df = pd.DataFrame(params, columns=['num_topics', 'passes'])


# combined_df = pd.concat([params_df, coherence_values_df], axis=1)
# combined_df

# combined_df.to_csv('../pickles/text_models/gensim_bow/combined_df.csv', index=False)

Unnamed: 0,num_topics,passes,perplexity,cv_coherence,umass_coherence
0,10,200,-6.998504,0.530079,-1.928838
1,10,200,-7.003896,0.527507,-1.899064
2,10,200,-7.009518,0.522806,-1.9413
3,10,250,-6.998219,0.527192,-1.926723
4,10,250,-7.00366,0.527507,-1.899524
5,10,250,-7.009314,0.521979,-1.939337
6,10,300,-6.998064,0.527192,-1.925657
7,10,300,-7.003501,0.527507,-1.89975
8,10,300,-7.009193,0.521979,-1.939096
9,15,200,-6.927175,0.598702,-2.078357


### Write to Pickle File

In [269]:
# create an iterator object with write permission - model.pkl
## just commenting to make sure we don't overwrite the file

# path = "../pickles/text_models/gensim_bow"

# # with open(path + '/features_v1', 'wb') as files:
# #     pickle.dump(features, files)
    
# with open(path + '/model_v1', 'wb') as files:
#     pickle.dump(lda_model, files)    

### Analyzing Best Params

In [9]:
coherence_bow_df = pd.read_csv('../pickles/text_models/gensim_bow/coherence_bow.csv')


Unnamed: 0,num_topics,passes,perplexity,cv_coherence,umass_coherence
0,10,200,-6.998504,0.530079,-1.928838
1,10,200,-7.003896,0.527507,-1.899064
2,10,200,-7.009518,0.522806,-1.9413
3,10,250,-6.998219,0.527192,-1.926723
4,10,250,-7.00366,0.527507,-1.899524
5,10,250,-7.009314,0.521979,-1.939337
6,10,300,-6.998064,0.527192,-1.925657
7,10,300,-7.003501,0.527507,-1.89975
8,10,300,-7.009193,0.521979,-1.939096
9,15,200,-6.927175,0.598702,-2.078357


In [12]:
## lets try and plot the coherence values
fig = px.bar(coherence_bow_df, y=['cv_coherence', 'umass_coherence'], title="Coherence Scores for different number of topics")
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



##### Notes
* From the above chart it looks like max we could attain is `0.602920` which is not ideal but also not too bad, looks like the model at index `15` is our best model. 


### Best Params

In [14]:
print(f"Ideal Params are: {coherence_bow_df.loc[coherence_bow_df['cv_coherence'].idxmax(),['num_topics', 'passes', ]]}")

Ideal Params are: num_topics     15.0
passes        300.0
Name: 15, dtype: float64


### Sanity Testing

In [None]:
## Lets load the mode from pickle file and run some sanity tests. 
with open("../pickles/text_models/gensim_bow/15_topics_300_passes_100_iterations.pkl", 'rb') as model:
        lda = pickle.load(model)
        



In [81]:
random_text = data["clean_text"][102]
random_text

'supreme court chief justice john roberts granted a stay temporarily blocking former president trump from having to turn over his tax records to democrats on the house ways and means committee trump and his legal team on monday filed asking the supreme court to block the release of his tax records  the house ways and means committee first requested six years of trumps tax returns in  donald trumps request to keep tax returns from congress denied by appeals court upon consideration of the application of counsel for the applicants it is so ordered that the mandate of the united states court of appeals for the district of columbia circuit case no  is hereby stayed pending further order of the undersigned or of the court roberts’ order states requesting that the committee respond before thursday nov  by noon in  the justice department said congress should be able to review the records a decision trump and his legal team have appealed  us district judge trevor mcfadden ruled last december t

In [82]:
data_words = list(common_utils.sent_to_words([random_text]))

data_lemmatized = common_utils.tokenization(data_words)


100%|██████████| 1/1 [00:00<00:00, 1003.18it/s]
100%|██████████| 1/1 [00:00<00:00, 26.25it/s]


In [83]:
bow = lda.id2word.doc2bow(data_lemmatized[0])

document_topics = lda.get_document_topics(bow)

In [84]:
sorted_document_topics = sorted(document_topics, key = lambda x: x[1], reverse=True)

for doc_topic in sorted_document_topics:
    print(f"Topic: {doc_topic[0]} - Probability: {doc_topic[1]}")


Topic: 11 - Probability: 0.6454530358314514
Topic: 2 - Probability: 0.20740783214569092
Topic: 12 - Probability: 0.0828285962343216
Topic: 9 - Probability: 0.05335969477891922


In [61]:
## lets try and get the top 10 words for each topic
top_words = lda.show_topics(formatted=False, num_words=10)

In [85]:
lda.show_topic(11)

[('fbi', 0.05228108),
 ('trump', 0.049099073),
 ('documents', 0.027911378),
 ('raid', 0.025399191),
 ('records', 0.023785453),
 ('investigation', 0.023728805),
 ('president', 0.022728289),
 ('doj', 0.018155092),
 ('maralago', 0.014479115),
 ('search', 0.014084321)]

## Gensim LDA with Bigram BOW

### Lemmatization

In [204]:
## lemmatization with bigrams
data_words_bigrams = common_utils.make_bigrams(data_words, bigram_mod)

data_lemmatized = common_utils.tokenization(data_words_bigrams)
# data_lemmatized = lemmatization_noun_chunks(data_words_bigrams)

# Remove Stop Words
## we are removing stop words in the lemmatization function using spacy is_stop flag
# data_words_nostops = remove_stopwords(data_lemmatized)

print(data_lemmatized[:1])

100%|██████████| 3972/3972 [03:45<00:00, 17.64it/s]

[['governor', 'term', 'sen_maggie_hassan', 'challenger', 'inflation', 'abortion', 'border', 'crisis', 'denialism', 'debate', 'race', 'gop', 'senate', 'majority', 'crossfire', 'debate', 'army', 'tours', 'duty', 'war', 'afghanistan', 'debate', 'saint_anselm_college_new_hampshire_institute', 'wednesday_evening', 'bolduc', 'campaign', 'bystander', 'crowd', 'debate', 'swing', 'campaign', 'bolduc', 'rick_wiley', 'bolduc', 'campaign', 'fox_news', 'punch', 'activist', 'stake', 'midterm_elections', 'hassans', 'campaign', 'communications_director', 'kevin_donohoe', 'incident', 'behavior', 'activist', 'campaign', 'volunteers', 'debate', 'bolduc', 'altercation', 'debate', 'hassan', 'attack', 'speaker_nancy', 'pelosi', 'husband', 'san_francisco', 'gop', 'senate', 'nominee', 'problems', 'republicans', 'democrat', 'fuel', 'individual', 'bolduc', 'campaign', 'spokesperson', 'kate_constantini', 'fox_news', 'tonight', 'temperature', 'discourse', 'debate', 'individual', 'crowd', 'gathered_outside', 'resp




### Create Dictionary & Corpus

In [205]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# id2word.filter_extremes(no_below=10, no_above=0.75, keep_n=10000)


# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 7), (1, 2), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 13), (19, 1), (20, 1), (21, 1), (22, 1), (23, 9), (24, 1), (25, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 2), (38, 1), (39, 11), (40, 2), (41, 1), (42, 1), (43, 1), (44, 1), (45, 2), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 2), (60, 4), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 7), (68, 1), (69, 1), (70, 5), (71, 1), (72, 8), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 2), (80, 3), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 2), (90, 1), (91, 1), (92, 1), (93, 2), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 3), (101, 1), (102, 4), (103, 3), (104, 3), (105, 1), (106, 2), (107, 1), (108, 1), (109, 1), (110, 

In [206]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]


[[('abortion', 7),
  ('activist', 2),
  ('afghanistan', 1),
  ('altercation', 1),
  ('app', 1),
  ('army', 2),
  ('attack', 1),
  ('attacks', 1),
  ('ballot', 1),
  ('ban', 2),
  ('behavior', 1),
  ('bid', 1),
  ('biden', 1),
  ('biden_administration', 1),
  ('big_lie', 1),
  ('big_pharma', 1),
  ('bills', 1),
  ('blockbuster_move', 1),
  ('bolduc', 13),
  ('border', 1),
  ('bottom_line', 1),
  ('buldoc', 1),
  ('bystander', 1),
  ('campaign', 9),
  ('candidates', 1),
  ('challenger', 2),
  ('chance', 1),
  ('chuck_morse', 1),
  ('click_here', 1),
  ('clips', 1),
  ('combustible_issue', 1),
  ('communications_director', 1),
  ('conclusion', 1),
  ('contest', 1),
  ('credentials', 1),
  ('crisis', 1),
  ('crossfire', 1),
  ('crowd', 2),
  ('cycle', 1),
  ('debate', 11),
  ('decisions', 2),
  ('democrat', 1),
  ('denialism', 1),
  ('discourse', 1),
  ('doctor', 1),
  ('don_bolduc', 2),
  ('donald_trump_repeated_unproven', 1),
  ('dumps', 1),
  ('duty', 1),
  ('earlier_this_month', 1),
  

### Building the Topic Model

In [153]:
# lda_model = LdaModel(corpus=corpus,
#                      id2word=id2word,
#                      num_topics=20,
#                      random_state=100,
#                      update_every=1,
#                     #  chunksize=200,
#                      passes=250,
#                      alpha='auto',
#                      per_word_topics=True)

lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=20,
                     random_state=100,
                     update_every=1,
                    #  chunksize=200,
                     passes=250,
                     alpha='auto',
                     per_word_topics=True)


In [154]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

[(0,
  '0.082*"trump" + 0.041*"president" + 0.033*"committee" + 0.017*"capitol" + '
  '0.012*"cheney" + 0.012*"fox" + 0.010*"donald_trump" + 0.009*"app" + '
  '0.009*"congress" + 0.008*"rep"'),
 (1,
  '0.018*"hunter" + 0.017*"hunter_biden" + 0.014*"letter" + 0.013*"biden" + '
  '0.013*"investigation" + 0.012*"fox" + 0.009*"president" + 0.008*"fox_news" '
  '+ 0.007*"emails" + 0.006*"johnson"'),
 (2,
  '0.038*"arizona" + 0.020*"alaska" + 0.017*"lake" + 0.016*"hobbs" + '
  '0.010*"kari_lake" + 0.008*"bush" + 0.008*"katie_hobbs" + 0.008*"murkowski" '
  '+ 0.007*"slavery" + 0.006*"constitution"'),
 (3,
  '0.025*"voters" + 0.021*"democrats" + 0.019*"gop" + 0.019*"senate" + '
  '0.018*"race" + 0.017*"campaign" + 0.016*"candidates" + 0.015*"fox" + '
  '0.015*"republicans" + 0.014*"candidate"'),
 (4,
  '0.035*"desantis" + 0.035*"florida" + 0.022*"governor" + 0.019*"president" + '
  '0.012*"fox" + 0.012*"fox_news" + 0.009*"california" + 0.008*"gop" + '
  '0.008*"iowa" + 0.008*"app"'),
 (5,
  '0

##### Notes
* So visually it seems we have a different topics when we use `bigrams`. 

### Analyzing Model

In [157]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, corpus=corpus, dictionary=id2word, coherence='u_mass')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.758935949671814

Coherence Score:  -3.140279325133822


Promising Models
```
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=20,
                     random_state=100,
                     update_every=1,
                    #  chunksize=200,
                     passes=250,
                     alpha='auto',
                     per_word_topics=True)


Perplexity:  -8.535495028799938
Coherence Score:  0.4446532328940426
```
--------------------------------------------------------




### Visualize Topics

In [158]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.



### Finding Best Params

In [207]:
## compute_coherence_values(corpus, id2word, texts, num_topics, passes, chunk_sizes=[200], iterations=[100]):
model_list, coherence_values, params = gensim_utils.compute_coherence_values(corpus=corpus, id2word=id2word, texts=data_lemmatized, num_topics=[10, 15, 20, 25], passes=[200, 250, 300], iterations=[100, 120, 150])

  0%|          | 0/4 [00:00<?, ?it/s]
[A
[A
[A
100%|██████████| 3/3 [24:23<00:00, 487.88s/it]

[A
[A
[A
100%|██████████| 3/3 [29:39<00:00, 593.26s/it]

[A
[A
[A
100%|██████████| 3/3 [35:10<00:00, 703.62s/it]
100%|██████████| 3/3 [1:29:14<00:00, 1784.77s/it]
 25%|██▌       | 1/4 [1:29:14<4:27:42, 5354.31s/it]
[A
[A
[A
100%|██████████| 3/3 [24:36<00:00, 492.30s/it]

[A
[A
[A
100%|██████████| 3/3 [30:22<00:00, 607.35s/it]

[A
[A
[A
100%|██████████| 3/3 [36:15<00:00, 725.18s/it]
100%|██████████| 3/3 [1:31:14<00:00, 1824.84s/it]
 50%|█████     | 2/4 [3:00:28<3:00:50, 5425.02s/it]
[A
[A
[A
100%|██████████| 3/3 [26:19<00:00, 526.47s/it]

[A
[A
[A
100%|██████████| 3/3 [33:19<00:00, 666.47s/it]

[A
[A
[A
100%|██████████| 3/3 [38:39<00:00, 773.26s/it]
100%|██████████| 3/3 [1:38:18<00:00, 1966.20s/it]
 75%|███████▌  | 3/4 [4:38:47<1:34:01, 5641.27s/it]
[A
[A
[A
100%|██████████| 3/3 [27:44<00:00, 554.71s/it]

[A
[A
[A
100%|██████████| 3/3 [33:45<00:00, 675.07s/it]


In [211]:
## lets write to pickle file for future use
## commenting this code to prevent overwriting the files
# for models in model_list:
#     with open(f"../pickles/text_models/gensim_bigram/{models.num_topics}_topics_{models.passes}_passes_{models.iterations}_iterations.pkl", "wb") as f:
#         pickle.dump(models, f)
# ## les convert the coherence_values list to a dataframe

# coherence_values_df = pd.DataFrame(coherence_values, columns=['perplexity', 'cv_coherence', 'umass_coherence'])


# params_df = pd.DataFrame(params, columns=['num_topics', 'passes', 'iterations'])


# combined_df = pd.concat([params_df, coherence_values_df], axis=1)
# combined_df

# combined_df.to_csv('../pickles/text_models/gensim_bigram/combined_df.csv', index=False)

### Plotting GridSearch Results

In [7]:
coherence_values_df = pd.read_csv('../pickles/text_models/gensim_bigram/combined_df.csv')
coherence_values_df

Unnamed: 0,num_topics,passes,iterations,perplexity,cv_coherence,umass_coherence
0,10,200,100,-8.270029,0.472249,-2.186222
1,10,200,120,-8.27381,0.459705,-2.194383
2,10,200,150,-8.278741,0.454264,-2.28705
3,10,250,100,-8.268745,0.47048,-2.176113
4,10,250,120,-8.272758,0.467805,-2.180342
5,10,250,150,-8.277754,0.454264,-2.283637
6,10,300,100,-8.267961,0.47048,-2.178966
7,10,300,120,-8.27185,0.467805,-2.181999
8,10,300,150,-8.277023,0.454264,-2.28347
9,15,200,100,-8.598768,0.500653,-3.692057
