# LDA Topic Modelling

* This notebook is showcases the process of building an NLP Topic Model using `Latent Dirichlet Allocation` method. 
* The dataset we are going to use are `title` and `soft title` from `scrapped_fox_data_clean.csv`. 

## Table Of Contents

## Installations


In [9]:
# ## installing required libraries
# ! pip install beautifulsoup4
# ! pip install pandas
# ! pip install numpy
# ! pip install plotly
# ! pip install nbformat
# ! pip install ipykernel
# ! pip install matplotlip
# ! pip install wordcloud
# ! pip install gensim
# ! pip install pyLDAvis
# ! pip install nltk
# ! pip install -U pip setuptools wheel
# ! pip install -U spacy
# ! python -m spacy download en_core_web_trf 

## Imports

In [10]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
from pprint import pprint

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as io

# loading library
import pickle

from joblib import dump, load

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
  from imp import reload


## Reading Data

In [11]:
## reading manaully scrapped data
data = pd.read_csv('../data/scrapped_fox_data_clean.csv')
print(data.shape)

(3972, 12)


## Utility Functions

### Preparing Stop Words

In [12]:
## extending stopwords
# lets break down the cleaning functions into smaller functions
stop_words = nltk.corpus.stopwords.words('english')

## trying to remove stopwords from stopwords super set. 
stopwords_super_set = pd.read_csv("../data/stopwords/sw1k.csv")

## filtering stopwords to pronouns and other type
stopwords_to_remove = list(stopwords_super_set.loc[(stopwords_super_set["type"] == "G" ), "term"])


# stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'say', 'one', 'time', 'people',
#                   'know', 'like', 'tell', 'get', 'year', 'go', 'around', 'award', 'actually', 'carry',
#                    'new', 'it', 'show', 'news', 'go', 'fox', 'make', 'do', 'not', 'say',
#                    'also', 'love', 'it', 'star', 'go', 'do', 'say', 'not', 'said'
#                    ])

# stop_words.extend(stopwords_to_remove)
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Common Utility Functions

In [25]:
nlp = spacy.load('en_core_web_trf')
nlp.add_pipe('merge_entities')
nlp.add_pipe("merge_noun_chunks")

# Utility Functions for Text Cleaning
def sent_to_words(sentences):
    for sentence in sentences:
        yield (simple_preprocess(str(sentence), deacc=True))

# function to clean html tags from text


def clean_html(html):
    # parse html content
    soup = BeautifulSoup(html, "html.parser")
    for data in soup(['style', 'script', 'code', 'a']):
        # Remove tags
        data.decompose()
    # return data by retrieving the tag content
    return ' '.join(soup.stripped_strings)

# function to convert text to lowercase


def lower_case(text):
    return text.lower()

# function to remove line breaks


def remove_line_breaks(text):
    return re.sub(r'\n', '', text)

# function to remove punctuation


def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# function to remove numbers


def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# function to remove extra spaces


def remove_extra_spaces(text):
    return re.sub(' +', ' ', text)

# function to remove stopwords


def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

# helper function to create pos tags


def create_pos_tag(str_sent):
    return nlp(str_sent)

# function for text lemmatization using spac
##'ADJ', 'VERB'
def lemmatization(texts, allowed_postags=['PROPN', 'NOUN']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append(
            ["_".join(token.lemma_.split(" ")) for token in doc if (token.pos_ in allowed_postags and token.is_alpha and token.is_stop == False)])
    return texts_out

def lemmatization_without_pos(texts):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append(
            [token.lemma_ for token in doc])
    return texts_out


def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]


def make_trigrams(texts, bigram_mod, trigram_mod):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

## helper function to create pos tags distribution
def create_pos_tags_distribution(docs = []):
    token_distribution = {}
    is_alpha = 0
    is_stop = 0
    for doc in docs:
        for token in doc:
            token_distribution[token.pos_] = token_distribution.get(token.pos_, 0) + 1
            if(token.is_alpha):
                is_alpha += 1
            if(token.is_stop):
                is_stop += 1
    return token_distribution, is_alpha, is_stop


# function to create n-grams from noun chunks
def create_noun_chunk_ngrams(docs):
    n_gram_docs = []
    for doc in docs:
        doc_text = doc.text
        for chunk in doc.noun_chunks:
            chunk_n_gram = "_".join(chunk.text.split(" "))
            doc_text = doc_text.replace(chunk.text, chunk_n_gram)
        n_gram_docs.append(doc_text.split(" "))
    return n_gram_docs


def lemmatization_noun_chunks(texts):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if (
            ("_" in token.text) or ## if the token is a noun chunk allow that
            (token.pos_ in ['NOUN', 'PROPN'] and token.is_alpha and token.is_stop == False) ## if the token is a noun or proper noun allow that
        )])
    return texts_out

### Gensim Models Utility Functions

In [14]:
## function to compute optimal parameters for LDA model
def compute_coherence_values(dictionary, corpus, id2word, texts, num_topics, passes, chunk_sizes=[200]):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    params = []
    for num_topic in num_topics:
        for chunk_size in chunk_sizes:
            for num_passes in passes:
                model = LdaModel(corpus=corpus,
                                 id2word=id2word,
                                 num_topics=num_topic,
                                 random_state=100,
                                 update_every=1,
                                 chunksize=chunk_size,
                                 passes=num_passes,
                                 per_word_topics=True)
                model_list.append(model)
                coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
                coherence_lda = coherencemodel.get_coherence()
                coherence_values.append(coherence_lda)
                params.append({'num_topics': num_topic, 'chunk_size': chunk_size, 'passes': num_passes})

    return model_list, coherence_values, params

def analyze_gensim_lda_model(lda_model, corpus, id2word, texts, num_topics, passes, chunk_sizes=[200]):
    # Compute Perplexity
    print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.
    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

## helper functions to visualize LDA model
def visualize_gensim_lda_model(lda_model, corpus, id2word, filename="gensim_lda.html"):
    # Visualize the topics
    pyLDAvis.enable_notebook()
    vis = gensimvis.prepare(lda_model, corpus, id2word)
    vis.save(filename)

### Sklearn Model Utility Functions

In [15]:
import numpy as np


# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)


def print_sklearn_sparcity(data_vectorized):
    # Materialize the sparse data
    data_dense = data_vectorized.todense()

    # Compute Sparsicity = Percentage of Non-Zero cells
    print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")


def create_sklearn_dominent_topic_dataframe(lda_model, data_vectorized):
    lda_output = lda_model.transform(data_vectorized)
    # column names
    topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]
    # index names
    docnames = ["Doc" + str(i) for i in range(len(data))]
    
    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
    # Get dominant topic for each document
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic
    return df_document_topic

def print_sklearn_dominant_topics(lda_model, data_vectorized):
    df_document_topic = create_sklearn_dominent_topic_dataframe(lda_model, data_vectorized)
    # Apply Style
    df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
    return df_document_topics

def print_sklearn_topic_distribution(lda_model, data_vectorized):
    df_document_topic = create_sklearn_dominent_topic_dataframe(lda_model, data_vectorized)
    df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents").rename(columns={'index':'Topic'})
    # df_topic_distribution.columns = ["Topic Num", "Num Documents"]
    return df_topic_distribution


# Show top n keywords for each topic
def show_sklearn_topics(vectorizer, lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names_out())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

def format_sklearn_topics(topic_keywords):
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
    return df_topic_keywords

def analyze_sklearn_lda_model(lda_model, data_vectorized):
    # Log Likelyhood: Higher the better
    print("Log Likelihood: ", lda_model.score(data_vectorized))
    # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
    print("Perplexity: ", lda_model.perplexity(data_vectorized))

## helper function to visualize lda model
def visualize_sklearn_lda_model(lda_model, data_vectorized, vectorizer, mds='tsne'):    
    pyLDAvis.enable_notebook()
    panel2 = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds=mds)
    return panel2

## Text Pre-processing

In [16]:
def preprocess_text(text):
     text = clean_html(text)
     text = lower_case(text)
     text = remove_line_breaks(text)
     text = remove_punctuation(text)
     text = remove_numbers(text)
     text = remove_extra_spaces(text)
     return text

data["cleaned_title"] = data["title"].apply(preprocess_text)



### Tokenizing

In [17]:
data_words = list(sent_to_words(data['cleaned_title']))

## EDA on Title Data

In [10]:
## check for duplicates
data["cleaned_title"].duplicated().sum()

0

In [11]:
data['title_word_count'] = data['cleaned_title'].apply(lambda x: len(str(x).split(" ")))

data['title_word_count'].describe()


count    3972.000000
mean       13.474572
std         2.650764
min         3.000000
25%        12.000000
50%        14.000000
75%        15.000000
max        23.000000
Name: title_word_count, dtype: float64

In [12]:
## checking the distribution of word count in title
fig = px.histogram(data, x="title_word_count", title="Distribution of Word Count in Title")
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



##### Notes
* Word counts are fairly distributed. 

In [13]:
## lets create POS tags for each title and see the distribution of POS tags
# docs = [nlp(text) for text in data['title']]
docs = data["cleaned_title"].apply(create_pos_tag)

In [14]:
## creating pos tags distribution
token_distribution, is_alpha, is_stop = create_pos_tags_distribution(docs)

In [15]:
## convert the dictionary to a dataframe
token_distribution_df = pd.DataFrame.from_dict(token_distribution, orient='index', columns=['count']).reset_index().rename(columns={"index": "tags"})


In [16]:
## lets create a distribution of POS tags
## checking the distribution of word count in title
fig = px.histogram(token_distribution_df, x="tags", y="count", title="Distribution POS Tags in Title")
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [17]:
## lets see how many words are alpha and how many are stop words
print(f"we have total {data['title_word_count'].sum()} words in the title. Out of which {is_alpha} are alpha and {is_stop} are stop words")

we have total 53521 words in the title. Out of which 53595 are alpha and 13786 are stop words


##### Notes
* So `maximum` tags are
    * `PROPN`- proper noun
    * `VERB` - verb
    * `ADP` - adposition
    * `NOUN` - noun
    * `PUNCT` - punctuation
    * `ADJ` - adjective
* Since these are news article titles, I think useful tags are, 
    * `PROPN`
    * `NOUN`
    * `VERB`
    * `ADJ` - Not sure about adjective yet. 
* We can remove rest of the words and still have a decent topic model. 
* We can also use the `is_stop` and `is_alpha` tags to remove the stopwords and non alpha tokens.
    * Lets update the helper functions accordingly. 

### Word Frequency

In [18]:
from collections import Counter
# lets see if we can calculate word frequency
# all tokens that arent stop words or punctuations
words = []
for doc in docs:
    doc_words = [token.text for token in doc if token.pos_ in ['PROPN', 'NOUN'] and not token.is_stop and not token.is_punct and token.is_alpha]
    words.append(doc_words)

flat_list = [item for sublist in words for item in sublist]


word_counts = Counter(flat_list)
word_counts.most_common(10)


[('biden', 693),
 ('gop', 385),
 ('house', 374),
 ('trump', 363),
 ('senate', 286),
 ('democrats', 245),
 ('republicans', 228),
 ('abortion', 228),
 ('bill', 194),
 ('court', 185)]

In [19]:
## lets try and plot the word cloud
word_counts_df = pd.DataFrame.from_dict(word_counts, orient='index', columns=['count']).reset_index().rename(columns={"index": "word"})

fig = px.histogram(word_counts_df, x="word", y="count", title="Distribution POS Tags in Title")
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [20]:
word_counts_df.describe()

Unnamed: 0,count
count,4498.0
mean,6.131614
std,20.633604
min,1.0
25%,1.0
50%,2.0
75%,4.0
max,693.0


##### Notes
* Interesting the difference between `mean` and `max` frequency is quick big.  Wonder if that would cause issues in the model. 

In [21]:
## lets look at spacy merge entities 
# nlp.add_pipe("merge_noun_chunks")
merged_docs = [nlp(text) for text in data['cleaned_title']]


In [22]:
# test_str = "graham cruz tell mayorkas hes on notice for possible impeachment over border crisis"

# texts = [(t.lemma_, t.pos_) for t in merged_docs]
# texts
for merged_doc in merged_docs:
    print([("_".join(t.lemma_.split(" ")), t.pos_) for t in merged_doc])

[('hassan', 'PROPN'), ('and', 'CCONJ'), ('bolduc', 'PROPN'), ('trade', 'VERB'), ('fire', 'NOUN'), ('in', 'ADP'), ('final', 'ADJ'), ('showdown', 'NOUN'), ('after', 'SCONJ'), ('gop', 'PROPN'), ('nominee', 'NOUN'), ('come', 'VERB'), ('under', 'ADP'), ('attack', 'NOUN'), ('arrive', 'VERB'), ('at', 'ADP'), ('debate', 'NOUN')]
[('biden', 'PROPN'), ('suggest', 'VERB'), ('vote', 'VERB'), ('for', 'ADP'), ('republicans', 'PROPN'), ('be', 'AUX'), ('a', 'DET'), ('threat', 'NOUN'), ('to', 'ADP'), ('democracy', 'NOUN')]
[('nycs', 'PROPN'), ('naked', 'ADJ'), ('cowboy', 'NOUN'), ('make', 'VERB'), ('endorsement', 'NOUN'), ('for', 'ADP'), ('gov', 'PROPN'), ('while', 'SCONJ'), ('perform', 'VERB'), ('on', 'ADP'), ('times', 'PROPN'), ('square', 'PROPN'), ('restore', 'VERB'), ('law', 'NOUN'), ('and', 'CCONJ'), ('order', 'NOUN')]
[('wisconsin', 'PROPN'), ('court', 'NOUN'), ('shoot', 'VERB'), ('down', 'ADP'), ('liberal', 'ADJ'), ('group', 'NOUN'), ('attempt', 'NOUN'), ('to', 'PART'), ('change', 'VERB'), ('rul

##### Notes
* I think `merge_entities` and `merge_noun_entities` is what we want, it might end up with fewer words in vocab and we might not need bigrams/trigrams

In [23]:
words = []
for merged_doc in merged_docs:
    doc_words = [token.text for token in merged_doc if token.pos_ in ['PROPN', 'NOUN'] and not token.is_stop and not token.is_punct and token.is_alpha]
    words.append(doc_words)

flat_list = [item for sublist in words for item in sublist]


word_counts = Counter(flat_list)
word_counts.most_common(10)

[('biden', 693),
 ('gop', 385),
 ('house', 374),
 ('trump', 363),
 ('senate', 286),
 ('democrats', 245),
 ('republicans', 228),
 ('abortion', 228),
 ('bill', 194),
 ('court', 185)]

In [24]:
## lets try and plot the word cloud
word_counts_df = pd.DataFrame.from_dict(word_counts, orient='index', columns=['count']).reset_index().rename(columns={"index": "word"})

fig = px.histogram(word_counts_df, x="word", y="count", title="Distribution POS Tags in Title")
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [25]:
word_counts_df.describe()

Unnamed: 0,count
count,4498.0
mean,6.131614
std,20.633604
min,1.0
25%,1.0
50%,2.0
75%,4.0
max,693.0


##### Notes
* Not too much difference in frequency distribution

## Creating Bigram & Tigram Models

In [26]:
# Build the bigram and trigram models
bigram = Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[10]]])

['rnc', 'chair', 'ronna', 'mcdaniel', 'says', 'gop', 'seeing', 'huge', 'enthusiasm', 'with', 'less', 'than', 'week', 'until', 'election', 'day']


## Gensim LDA with BOW

### Lemmatization

In [75]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words)

print(data_lemmatized[:1])

[['hassan', 'bolduc', 'fire', 'attack', 'debate']]


### Create Dictionary & Corpus

In [76]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
## filter out words that occur less than 10 documents, or more than 75% of the documents.
id2word.filter_extremes(no_below=30, no_above=0.75, keep_n=10000)
# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[]]


In [77]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]


[[]]

### Building the Topic Model

In [78]:
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=50,
                     random_state=100,
                     update_every=1,
                    #  chunksize=200,
                    #  passes=250,
                    #  alpha='auto',
                     per_word_topics=True)


In [79]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

[(14,
  '0.716*"inflation" + 0.161*"biden" + 0.039*"republicans" + 0.031*"abortion" '
  '+ 0.020*"senate" + 0.010*"texas" + 0.010*"migrant" + 0.001*"poll" + '
  '0.001*"doj" + 0.001*"desantis"'),
 (27,
  '0.622*"senate" + 0.148*"biden" + 0.134*"trump" + 0.029*"election" + '
  '0.029*"dems" + 0.015*"republicans" + 0.015*"inflation" + 0.000*"poll" + '
  '0.000*"abortion" + 0.000*"desantis"'),
 (44,
  '0.709*"desantis" + 0.277*"migrant" + 0.001*"poll" + 0.001*"texas" + '
  '0.001*"dem" + 0.001*"trump" + 0.001*"doj" + 0.001*"abortion" + '
  '0.001*"fetterman" + 0.001*"fbi"'),
 (23,
  '0.568*"election" + 0.152*"gop" + 0.152*"democrats" + 0.035*"senate" + '
  '0.035*"republicans" + 0.035*"midterm" + 0.001*"texas" + 0.001*"poll" + '
  '0.001*"abortion" + 0.001*"desantis"'),
 (37,
  '0.649*"midterm" + 0.177*"democrats" + 0.110*"gop" + 0.053*"republicans" + '
  '0.000*"texas" + 0.000*"fbi" + 0.000*"poll" + 0.000*"abortion" + 0.000*"dem" '
  '+ 0.000*"doj"'),
 (21,
  '0.424*"biden" + 0.212*"amer

### Analyzing Model

In [80]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -4.594951833209757


### Visualize Topics

In [34]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.



## Gensim LDA with Bigram BOW

### Lemmatization

In [35]:
## lemmatization with bigrams
data_words_bigrams = make_bigrams(data_words, bigram_mod)
# data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ'])
data_lemmatized = lemmatization_noun_chunks(data_words_bigrams)

# Remove Stop Words
## we are removing stop words in the lemmatization function using spacy is_stop flag
# data_words_nostops = remove_stopwords(data_lemmatized)

print(data_lemmatized[:1])

[['fire', 'final', 'showdown', 'nominee', 'attack', 'debate']]


### Create Dictionary & Corpus

In [36]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]]


In [37]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]


[[('attack', 1),
  ('debate', 1),
  ('final', 1),
  ('fire', 1),
  ('nominee', 1),
  ('showdown', 1)]]

### Building the Topic Model

In [38]:
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=10,
                     random_state=100,
                     update_every=1,
                     chunksize=200,
                     passes=200,
                    #  alpha='auto',
                     per_word_topics=True)


In [39]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

[(0,
  '0.045*"candidate" + 0.041*"group" + 0.022*"conservative" + 0.018*"home" + '
  '0.016*"chinese" + 0.016*"people" + 0.014*"effort" + 0.014*"shooting" + '
  '0.013*"fund" + 0.013*"election"'),
 (1,
  '0.037*"leader" + 0.026*"majority" + 0.025*"vulnerable" + 0.024*"death" + '
  '0.021*"reelection" + 0.020*"district" + 0.016*"private" + 0.016*"party" + '
  '0.016*"fundraising" + 0.013*"bid"'),
 (2,
  '0.074*"primary" + 0.035*"democratic" + 0.027*"midterm" + 0.024*"handout" + '
  '0.024*"right" + 0.023*"vote" + 0.023*"ban" + 0.023*"candidate" + '
  '0.018*"agent" + 0.017*"family"'),
 (3,
  '0.037*"migrant" + 0.037*"election" + 0.035*"ruling" + 0.027*"decision" + '
  '0.024*"policy" + 0.024*"threat" + 0.023*"gun" + 0.021*"school" + '
  '0.019*"day" + 0.019*"plan"'),
 (4,
  '0.047*"border" + 0.038*"police" + 0.032*"crisis" + 0.021*"spending" + '
  '0.021*"political" + 0.020*"migrant" + 0.020*"activist" + 0.018*"security" + '
  '0.018*"deal" + 0.018*"office"'),
 (5,
  '0.046*"official" 

##### Notes
* So visually it seems we have a different topics when we use `bigrams`. 

### Analyzing Model

In [40]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.841529029229702


### Visualize Topics

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

NameError: name 'pyLDAvis' is not defined

## Gensim LDA with Trigram BOW

### Lemmatization

In [None]:
## lemmatization with trigrams
data_words_trigrams = make_trigrams(data_words, bigram_mod, trigram_mod)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
# nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_trigrams)

# Remove Stop Words
## we are removing stop words in the lemmatization function using spacy is_stop flag
# data_words_nostops = remove_stopwords(data_lemmatized)

print(data_lemmatized[:1])


### Create Dictionary & Corpus

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

In [None]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]


### Building the Topic Model

In [None]:
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=30,
                     random_state=100,
                     update_every=1,
                     chunksize=200,
                     passes=200,
                    #  alpha='auto',
                     per_word_topics=True)


In [None]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

### Analyzing Model

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### Visualize Topics

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

## Gensim LDA with Spacy noun-chunks n-Grams

##### Notes
* In this case we are trying to use `Spacy's` noun-chunks to create n-grams.
* We'll need to first create tokens from clean text. 
* We'll then need a function to replace the nouns in (noun chunks) with ngram word. 
* The `lemmatize` this, 
    * We'll need to test with and without POS to see if our filtering affects ngrams. 

### Creating Noun Chunks

In [None]:
## Lets start by reviewing noun chunks created by spacy
docs = [create_pos_tag(" ".join(x)) for x in data_words]

### Creating Noun Chunks Ngrams

In [None]:
n_grams = create_noun_chunk_ngrams(docs)
n_grams

In [None]:
data_lemmatized = lemmatization_without_pos(n_grams)

In [None]:
data_lemmatized

##### Notes
* So since we are doing `lemmatization` without any `POS` some parts of speech words are present in our lemmatized data. 
* Words like `doesn't`, `believe`, `think` etc are present. I think we should update the function to ignore certain `POS` rather than just include all words. 
* Lets do a quick check on how these `POS` are distributed. 

In [None]:
n_gram_pos = [create_pos_tag(" ".join(x)) for x in data_lemmatized]
n_gram_pos_distribution, is_alpha, is_stop = create_pos_tags_distribution(n_gram_pos)

In [None]:
token_distribution_df = pd.DataFrame.from_dict(n_gram_pos_distribution, orient='index', columns=['count']).reset_index().rename(columns={"index": "tags"})
## lets create a distribution of POS tags
## checking the distribution of word count in title
fig = px.histogram(token_distribution_df, x="tags", y="count", title="Distribution POS Tags in Title")
fig.show()

##### Notes
* Lets see how our `ngrams` are tagged

In [None]:
for token in n_gram_pos:
    for token in token:
        ## Only print the noun chunks
        if("_" in token.text):            
            print(token.text, token.pos_, token.dep_)

##### Notes
* Looks like most of the noun chunks are tagged as `NOUN`, `PROPN` or `ADJ`, but lets confirm it using visualization. 


In [None]:
noun_chunks_tags = {}
for token in n_gram_pos:
    for token in token:
        ## Only print the noun chunks
        if("_" in token.text):
            ## increment the count of the noun chunk
            noun_chunks_tags[token.pos_] = noun_chunks_tags.get(token.pos_, 0) + 1

noun_chunks_tags

##### Notes
* So as assumed most of them are `NOUN` or `PROPN` but just to make sure we don't loose any chunks, lets modify our lemmatization script to handle this case. 

In [None]:
data_lemmatized = lemmatization_noun_chunks(n_grams)

In [None]:
data_lemmatized

##### Notes
* So the lemmatized data looks promising lets train the model and see. 

### Create Dictionary & Corpus

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

In [None]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]


### Building the Topic Model

In [None]:
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=30,
                     random_state=100,
                     update_every=1,
                     chunksize=200,
                     passes=200,
                    #  alpha='auto',
                     per_word_topics=True)


In [None]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

### Analyzing Model

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

##### Notes
* So our model didn't perform well, in fact regular model BOW did better than noun chunks. 

### Visualize Topics

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

## Sklearn LDA with Count Vectorization

##### Notes
* For Count Vecotorization we'l use SKLearn's LDA algorithm. The algorithm is same as Gensim, but the interface is different and it allows us to use CountVectorizations. 

### Lemmatization

In [27]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words)


print(data_lemmatized[:1])

[['hassan', 'bolduc', 'fire', 'showdown', 'gop', 'nominee', 'attack', 'debate']]


### Count Vectorizer

In [28]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform([" ".join(lem_word) for lem_word in data_lemmatized])

In [29]:
# Materialize the sparse data
data_dense = data_vectorized.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")


Sparsicity:  0.8972885528264708 %


### Building Topic Model

In [30]:
# Build LDA Model 35
lda_model = LatentDirichletAllocation(n_components=35,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                      learning_decay=0.5
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(learning_decay=0.5, learning_method='online',
                          n_components=35, n_jobs=-1, random_state=100)


### Analyzing Model

In [31]:
analyze_sklearn_lda_model(lda_model, data_vectorized)
# See model parameters
pprint(lda_model.get_params())

## 
# Log Likelihood:  -237441.44543701067
# Perplexity:  1308.0281367579253

Log Likelihood:  -124046.36022110481
Perplexity:  672.9156309210325
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.5,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 35,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [32]:
dominant_topics = print_sklearn_dominant_topics(lda_model, data_vectorized)
dominant_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29,Topic30,Topic31,Topic32,Topic33,Topic34,dominant_topic
Doc0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.58,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15
Doc1,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.21,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.21,0.01,0.01,0.01,0.01,0.41,0.01,0.01,0.01,0.01,0.01,0.01,28
Doc2,0.01,0.01,0.01,0.01,0.01,0.01,0.61,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.21,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,6
Doc3,0.58,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Doc4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.72,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20
Doc5,0.0,0.0,0.0,0.0,0.0,0.37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,5
Doc6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.55,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
Doc7,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.26,0.01,0.26,0.01,0.01,0.01,0.26,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,16
Doc8,0.13,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,6
Doc9,0.01,0.01,0.01,0.01,0.76,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,4


In [33]:
topic_distribution = print_sklearn_topic_distribution(lda_model, data_vectorized)
topic_distribution

Unnamed: 0,Topic,Num Documents
0,5,659
1,1,284
2,14,266
3,20,260
4,6,252
5,4,220
6,12,217
7,2,199
8,11,185
9,10,181


### Visualizing Topics

In [34]:
panel = visualize_sklearn_lda_model(lda_model, data_vectorized, vectorizer)
panel


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [35]:
## vectorizer, lda_model, n_words=20
df_topic_keywords = show_sklearn_topics(vectorizer, lda_model)
formatted_topics = format_sklearn_topics(df_topic_keywords)
formatted_topics

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Topic 0,group,wisconsin,rule,history,barnes,johnson,alarm,mandela,sanctuary,attempt,money,judge,ballot,ron,court,office,criticism,garland,lawsuit,state
Topic 1,border,migrant,doc,admin,crisis,immigrant,agent,patrol,funding,meeting,ice,dhs,fentanyl,ukraine,july,mayorkas,thomas,email,number,operation
Topic 2,democrat,nyc,plan,michigan,victory,abortion,assault,union,test,paul,schumer,push,pennsylvania,whitmer,congressman,tuesday,trial,teacher,line,jury
Topic 3,affidavit,term,wisconsin,help,worker,position,debt,grassley,challenger,social,steve,organization,charlie,hour,marthas,politic,angeles,doc,raid,prison
Topic 4,biden,police,officer,death,capitol,people,response,health,mother,protest,priority,invasion,gender,result,post,lake,katie,fundraiser,kari,letter
Topic 5,house,election,white,voter,dem,biden,midterm,gun,gop,republicans,donation,democrats,harris,control,kamala,claim,question,effort,money,republican
Topic 6,court,supreme,law,decision,policy,biden,john,iran,ohio,trumps,endorsement,order,haley,enforcement,ally,murder,nikki,firm,argument,fetterman
Topic 7,rep,fbi,committee,china,taiwan,family,hunter,fund,search,challenger,aide,team,pregnancy,business,warrant,head,mandate,whistleblower,danchenko,master
Topic 8,cost,attorney,prolife,january,document,arizona,protester,ted,politic,raid,karine,carolina,fbi,girl,university,kamala,council,senator,press,control
Topic 9,dems,crime,biden,immigration,change,tim,maga,newsom,ryan,mccarthy,lawsuit,nation,prison,safety,vacation,critic,vance,remark,green,hall


##### Notes
* Somehow I couldn't find out `coherence` for but from visualization it seems that the model has created topics which are easy to interpret and not overlapping
* I still see some topics that don't make sense but, we can tweak this further by creating n-grams, including more POS and skipping lemmatization. 

### GridSearch for Params

### Write to Pickle File

In [36]:
# create an iterator object with write permission - model.pkl
features = vectorizer.get_feature_names_out()

path = "../pickles/sklearn_count_vectorization/"

with open(path + '/features_v1', 'wb') as files:
    pickle.dump(features, files)
    
with open(path + '/model_v1', 'wb') as files:
    pickle.dump(lda_model, files)   

## Sklearn LDA with Bi-Grams Count Vectorization

### Lemmatization with Bigrams

In [37]:
## lemmatization with bigrams
data_words_bigrams = make_bigrams(data_words, bigram_mod)


data_lemmatized = lemmatization(data_words_bigrams)

# Remove Stop Words
## we are removing stop words in the lemmatization function using spacy is_stop flag
# data_words_nostops = remove_stopwords(data_lemmatized)

print(data_lemmatized[:1])

[['hassan', 'bolduc', 'fire', 'showdown', 'gop', 'nominee', 'attack', 'debate']]


In [38]:
data_lemmatized

[['hassan',
  'bolduc',
  'fire',
  'showdown',
  'gop',
  'nominee',
  'attack',
  'debate'],
 ['biden', 'republicans', 'threat', 'democracy'],
 ['nycs', 'cowboy', 'endorsement', 'gov', 'times', 'square', 'law', 'order'],
 ['wisconsin', 'court', 'group', 'attempt', 'rule', 'absentee', 'ballot'],
 ['texas', 'candidate', 'obama', 'tiktok', 'getoutthevote', 'message'],
 ['twitter', 'republic', 'gop', 'takeover', 'congress'],
 ['mccarthy', 'biden', 'nation', 'speech', 'maga', 'republicans'],
 ['tennessee', 'official', 'vote', 'nashville', 'race'],
 ['democrats', 'attempt', 'ally', 'midterm'],
 ['academic', 'biden', 'response', 'protest', 'crackdown'],
 ['rnc',
  'chair',
  'ronna',
  'mcdaniel',
  'gop',
  'enthusiasm',
  'week',
  'election',
  'day'],
 ['texas', 'authority', 'uber', 'driver'],
 ['biden', 'dems', 'remark', 'democracy', 'ballot', 'maga', 'republicans'],
 ['vote', 'statebystate', 'guide', 'absentee', 'ballot'],
 ['vote', 'election'],
 ['senate', 'midterm', 'candidate', 'ra

### Count Vectorizer

In [39]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform([" ".join(lem_word) for lem_word in data_lemmatized])

In [40]:
# Materialize the sparse data
data_dense = data_vectorized.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")


Sparsicity:  0.8767219157958923 %


### Building Topic Model

In [41]:
# Build LDA Model 35
lda_model = LatentDirichletAllocation(n_components=33,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(learning_method='online', n_components=33, n_jobs=-1,
                          random_state=100)


### Analyzing Model

In [42]:
analyze_sklearn_lda_model(lda_model, data_vectorized)

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -96697.14925724012
Perplexity:  562.5566079511259
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 33,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


### Visualizing Topics

In [43]:
dominant_topics = print_sklearn_dominant_topics(lda_model, data_vectorized)
dominant_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29,Topic30,Topic31,Topic32,dominant_topic
Doc0,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0
Doc1,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.29,0.01,0.01,0.01,0.01,0.32,0.01,0.01,0.01,0.01,0.21,0.01,0.01,0.01,0.01,23
Doc2,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.21,0.01,0.01,0.01,0.01,0.01,0.41,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.21,0.01,0.01,0.01,18
Doc3,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.29,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
Doc4,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.21,0.01,0.01,0.01,0.01,0.21,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.41,0.01,0.01,0.01,0.01,28
Doc5,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.26,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.51,0.01,0.01,30
Doc6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.56,32
Doc7,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.26,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.26,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.26,0.01,0.01,0.01,13
Doc8,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.41,0.01,0.4,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,10
Doc9,0.01,0.01,0.01,0.01,0.01,0.01,0.26,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.51,32


In [44]:
topic_distribution = print_sklearn_topic_distribution(lda_model, data_vectorized)
topic_distribution

Unnamed: 0,Topic,Num Documents
0,6,345
1,23,307
2,11,294
3,12,269
4,29,252
5,4,248
6,13,234
7,18,199
8,32,197
9,24,177


In [45]:
panel = visualize_sklearn_lda_model(lda_model, data_vectorized, vectorizer)
panel


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [46]:
## vectorizer, lda_model, n_words=20
df_topic_keywords = show_sklearn_topics(vectorizer, lda_model)
formatted_topics = format_sklearn_topics(df_topic_keywords)
formatted_topics

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Topic 0,manchin,west,showdown,energy,nominee,effort,county,view,passage,hassan,sinema,gop,debate,congress,attack,path,voting,abbott,sen,virginia
Topic 1,agent,meeting,dhs,team,mayorkas,email,job,loss,wyoming,tie,week,office,border,request,crisis,group,secretary,policy,mandate,ice
Topic 2,afghanistan,impeachment,day,education,trumps,rubio,abortion,nomination,kid,record,irs,rep,price,reform,border,right,battleground,rally,supporter,schumer
Topic 3,dems,office,pac,question,term,battle,irs,control,newsom,kansas,million,sanders,sinema,reform,talk,russia,mandate,ukraine,staffer,deal
Topic 4,campaign,president,florida,biden,school,wisconsin,immigration,iowa,country,bus,pandemic,approval,girl,johnson,arm,conservative,california,sex,threat,crime
Topic 5,police,policy,officer,month,people,capitol,mother,result,pompeo,trial,argument,info,organization,list,jury,democratic,death,vaccine,office,suspect
Topic 6,state,doj,sen,year,doc,support,probe,california,city,woman,washington,government,fentanyl,parent,prolife,history,director,drug,oregon,protest
Topic 7,assault,mayor,showdown,invasion,president,department,admin,guard,indiana,maryland,bus,fentanyl,nyc,dobbs,russia,paul,obama,walker,organization,state
Topic 8,virginia,department,voting,issue,state,county,west,flight,worker,spending,new,candidate,priority,gop,video,manchin,parent,right,email,union
Topic 9,judge,jan,hearing,lawsuit,video,search,affidavit,subpoena,mike,riot,fauci,raid,trump,tie,speech,trial,trumps,house,new,rhetoric


### Write to Pickle File

In [47]:
# create an iterator object with write permission - model.pkl
features = vectorizer.get_feature_names_out()

path = "../pickles/sklearn_count_vectorization_bigrams/"

with open(path + '/features_v1', 'wb') as files:
    pickle.dump(features, files)
    
with open(path + '/model_v1', 'wb') as files:
    pickle.dump(lda_model, files)   

## SKLearn LDA with noun-chunks n-grams

### Creating Noun Chunks

In [48]:
## Lets start by reviewing noun chunks created by spacy
docs = [create_pos_tag(" ".join(x)) for x in data_words]

### Creating Noun Chunks N-grams

In [49]:
n_grams = create_noun_chunk_ngrams(docs)

In [50]:
## lemmatization with bigrams
# data_words_bigrams = make_bigrams(data_words, bigram_mod)

# data_words_trigrams = make_trigrams(data_words, bigram_mod, trigram_mod)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
# nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
## performaing lemmatization with noun chunks to preserve the ngram words. 
# data_lemmatized = lemmatization_noun_chunks(n_grams)
data_lemmatized = lemmatization(n_grams)

# Remove Stop Words
## we are removing stop words in the lemmatization function using spacy is_stop flag
# data_words_nostops = remove_stopwords(data_lemmatized)

print(data_lemmatized[:1])

[['hassan', 'bolduc', 'fire', 'attack', 'debate']]


In [51]:
data_lemmatized

[['hassan', 'bolduc', 'fire', 'attack', 'debate'],
 ['biden', 'republicans', 'threat', 'democracy'],
 ['endorsement', 'gov', 'law', 'order'],
 ['attempt', 'rule'],
 ['obama', 'tiktok'],
 ['twitter', 'republic', 'congress'],
 ['mccarthy', 'biden', 'speech'],
 [],
 ['democrats', 'midterm'],
 ['biden', 'response'],
 ['gop', 'week'],
 [],
 ['biden', 'dems', 'democracy'],
 ['statebystate', 'guide'],
 ['voting', 'election'],
 [],
 ['new', 'jerseys', 'midterm', 'republicans', 'cult'],
 ['emmett', 'rally'],
 ['gop', 'wisconsin'],
 ['inflation'],
 ['dems', 'messaging'],
 ['officer'],
 ['biden'],
 ['gop'],
 ['voter'],
 ['day', 'midterms', 'americans', 'inflation'],
 ['colorado'],
 ['louisville', 'congress', 'age'],
 ['governor'],
 [],
 ['migrant'],
 ['aclu', 'migrant', 'hell'],
 ['congress', 'manchin'],
 ['oz'],
 ['republicans', 'edge', 'democrats'],
 ['backlash'],
 [],
 [],
 [],
 ['tweet', 'twitter', 'biden'],
 [],
 ['suit', 'panel'],
 ['gop'],
 [],
 [],
 ['fetterman', 'elimination'],
 ['trend'

### Count Vectorizer

In [52]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform([" ".join(lem_word) for lem_word in data_lemmatized])

In [53]:
# Materialize the sparse data
data_dense = data_vectorized.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")


Sparsicity:  0.6959284094351518 %


### Building Topic Model

In [54]:
# Build LDA Model 35
lda_model = LatentDirichletAllocation(n_components=33,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(learning_method='online', n_components=33, n_jobs=-1,
                          random_state=100)


### Analyzing Model

In [55]:
analyze_sklearn_lda_model(lda_model, data_vectorized)

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -17660.006384534805
Perplexity:  177.48524927676334
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 33,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


### Visualizing Topics

In [56]:
dominant_topics = print_sklearn_dominant_topics(lda_model, data_vectorized)
dominant_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29,Topic30,Topic31,Topic32,dominant_topic
Doc0,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.68,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,19
Doc1,0.01,0.01,0.01,0.01,0.01,0.21,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.61,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,16
Doc2,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.68,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,9
Doc3,0.02,0.02,0.52,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,2
Doc4,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.52,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,17
Doc5,0.02,0.52,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,1
Doc6,0.02,0.02,0.02,0.02,0.02,0.52,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,5
Doc7,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0
Doc8,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.34,0.01,0.01,0.01,0.01,0.01,0.01,0.34,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,9
Doc9,0.02,0.02,0.02,0.02,0.02,0.52,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,5


In [57]:
topic_distribution = print_sklearn_topic_distribution(lda_model, data_vectorized)
topic_distribution

Unnamed: 0,Topic,Num Documents
0,0,1946
1,5,415
2,16,237
3,14,171
4,2,148
5,9,115
6,15,104
7,1,77
8,3,76
9,6,68


In [58]:
panel = visualize_sklearn_lda_model(lda_model, data_vectorized, vectorizer)
panel


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [59]:
## vectorizer, lda_model, n_words=20
df_topic_keywords = show_sklearn_topics(vectorizer, lda_model)
formatted_topics = format_sklearn_topics(df_topic_keywords)
formatted_topics

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Topic 0,migrant,election,texas,manchin,aoc,abbott,prosecutor,democrats,midterm,congress,obama,rubio,issue,senate,trump,month,fbi,democracy,florida,house
Topic 1,congress,race,economy,governor,majority,americans,control,texas,month,abrams,abortion,group,migrant,woman,democrat,history,fauci,taiwan,judge,child
Topic 2,fbi,poll,china,house,taiwan,rule,republicans,biden,race,congress,senate,governor,candidate,raid,trump,doj,voter,school,midterm,democrats
Topic 3,desantis,report,policy,jan,violence,migrant,americans,pelosi,woman,doj,aoc,republicans,gop,senate,request,majority,lack,poll,biden,trump
Topic 4,judge,russia,record,fauci,tie,ukraine,people,office,crime,doj,action,desantis,china,issue,abbott,midterm,group,home,endorsement,america
Topic 5,biden,voter,president,plan,man,reelection,republican,midterm,mcconnell,democracy,dems,republicans,ukraine,week,majority,pence,walker,fetterman,poll,georgia
Topic 6,inflation,america,student,concern,president,year,republicans,home,gop,hochul,november,fauci,threat,abortion,vote,zeldin,midterm,policy,answer,biden
Topic 7,crime,concern,spotlight,zeldin,strategist,taiwan,bidens,parent,gop,law,house,hochul,control,obama,inflation,home,dem,answer,office,student
Topic 8,pentagon,child,parent,bidens,request,policy,death,abortion,gop,tie,voter,pence,group,race,tuesday,alarm,economy,support,school,vote
Topic 9,midterm,dem,people,law,strategist,bidens,endorsement,tie,november,lack,voter,gop,obama,president,abortion,covid,thousand,poll,governor,record


### Write to Pickle File

In [60]:
# create an iterator object with write permission - model.pkl
features = vectorizer.get_feature_names_out()

path = "../pickles/sklearn_count_vectorization_noun_chunks"

with open(path + '/features_v1', 'wb') as files:
    pickle.dump(features, files)
    
with open(path + '/model_v1', 'wb') as files:
    pickle.dump(lda_model, files)    

## SKLearn with Tri-Grams

In [61]:
## lemmatization with bigrams
# data_words_bigrams = make_bigrams(data_words, bigram_mod)

data_words_trigrams = make_trigrams(data_words, bigram_mod, trigram_mod)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
# nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
## performaing lemmatization with noun chunks to preserve the ngram words. 
data_lemmatized = lemmatization_noun_chunks(data_words_trigrams)

# Remove Stop Words
## we are removing stop words in the lemmatization function using spacy is_stop flag
# data_words_nostops = remove_stopwords(data_lemmatized)

print(data_lemmatized[:1])

[['hassan', 'bolduc', 'fire', 'showdown', 'gop', 'nominee', 'attack', 'debate']]


In [62]:
data_lemmatized

[['hassan',
  'bolduc',
  'fire',
  'showdown',
  'gop',
  'nominee',
  'attack',
  'debate'],
 ['biden', 'republicans', 'threat', 'democracy'],
 ['nycs', 'cowboy', 'endorsement', 'gov', 'times', 'square', 'law', 'order'],
 ['wisconsin', 'court', 'group', 'attempt', 'rule', 'absentee', 'ballot'],
 ['texas',
  'candidate',
  'beto_orourke',
  'obama',
  'tiktok',
  'getoutthevote',
  'message'],
 ['white_house', 'twitter', 'republic', 'gop', 'takeover', 'congress'],
 ['mccarthy', 'biden', 'nation', 'speech', 'maga', 'republicans'],
 ['tennessee', 'official', 'vote', 'nashville', 'race'],
 ['liz_cheney', 'democrats', 'attempt', 'ally', 'midterm'],
 ['academic', 'biden', 'response', 'protest', 'crackdown'],
 ['rnc',
  'chair',
  'ronna',
  'mcdaniel',
  'gop',
  'enthusiasm',
  'week',
  'election',
  'day'],
 ['texas', 'authority', 'uber', 'driver', 'illegal_immigrant'],
 ['biden', 'dems', 'remark', 'democracy', 'ballot', 'maga', 'republicans'],
 ['vote', 'statebystate', 'guide', 'absent

### Count Vectorizer

In [63]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform([" ".join(lem_word) for lem_word in data_lemmatized])

In [64]:
# Materialize the sparse data
data_dense = data_vectorized.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")


Sparsicity:  0.8790701577710641 %


### Building Topic Model

In [65]:
# Build LDA Model 35
lda_model = LatentDirichletAllocation(n_components=33,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(learning_method='online', n_components=33, n_jobs=-1,
                          random_state=100)


### Analyzing Model

In [66]:
analyze_sklearn_lda_model(lda_model, data_vectorized)

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -123889.42716658334
Perplexity:  593.900246802277
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 33,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


### Visualizing Topics

In [67]:
dominant_topics = print_sklearn_dominant_topics(lda_model, data_vectorized)
dominant_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,Topic20,Topic21,Topic22,Topic23,Topic24,Topic25,Topic26,Topic27,Topic28,Topic29,Topic30,Topic31,Topic32,dominant_topic
Doc0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.15,0.0,0.0,0.0,0.0,0.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.15,0.0,18
Doc1,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.21,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.21,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.41,32
Doc2,0.01,0.01,0.01,0.01,0.01,0.01,0.81,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,6
Doc3,0.0,0.0,0.29,0.0,0.0,0.0,0.0,0.0,0.15,0.15,0.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
Doc4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11
Doc5,0.01,0.17,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.17,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.51,0.01,31
Doc6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.72,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16
Doc7,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.26,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.26,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.26,9
Doc8,0.0,0.43,0.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
Doc9,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.76,0.01,0.01,0.01,0.01,0.01,0.01,0.01,25


In [68]:
topic_distribution = print_sklearn_topic_distribution(lda_model, data_vectorized)
topic_distribution

Unnamed: 0,Topic,Num Documents
0,1,324
1,15,317
2,9,298
3,8,293
4,2,262
5,6,246
6,5,221
7,20,216
8,18,174
9,16,171


In [69]:
panel = visualize_sklearn_lda_model(lda_model, data_vectorized, vectorizer)
panel


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [70]:
## vectorizer, lda_model, n_words=20
df_topic_keywords = show_sklearn_topics(vectorizer, lda_model)
formatted_topics = format_sklearn_topics(df_topic_keywords)
formatted_topics

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Topic 0,role,liz,democracy,fires,fight,help,girl,reform,maryland,claim,oregon,drug,speaker,cbp,bail,event,council,video,world,big
Topic 1,rep,report,dem,biden,california,cheney,liz,funding,union,afghanistan,ukraine,jill,fires,attorney,george,aide,number,job,saudi,arabia
Topic 2,voter,midterm,reelection,gop,record,democrats,tax,majority,poll,wisconsin,senate,candidate,power,bid,news,mother,chair,fox,barnes,result
Topic 3,tests,positive,democrats,poll,republicans,midterm,remark,doctor,southern,speech,maryland,big,demand,act,post,washington,voter,support,weapon,proposal
Topic 4,allegation,deal,ice,congress,war,town,virginia,angeles,colorado,year,washington,fires,week,press,brief,proposal,hutchinson,zeldin,family,twitter
Topic 5,border,migrant,biden,agent,patrol,southern,energy,administration,ice,meeting,dhs,answer,document,mayorkas,email,green,surge,taxis,demand,help
Topic 6,gov,primary,law,covid,abrams,stacey,child,georgia,endorsement,kemp,test,order,push,vaccine,life,fight,enforcement,fauci,watchdog,supporter
Topic 7,maryland,surgery,college,roe,oversight,hobbs,news,vineyard,review,fetterman,danchenko,council,hall,security,win,congressman,organization,lack,suspect,document
Topic 8,state,group,candidate,office,doj,year,mayor,support,staffer,roe,act,john,death,woman,reduction,department,prolife,man,history,south
Topic 9,court,supreme,raid,democrat,official,ruling,spending,threat,justice,decision,case,action,scotus,maralago,trail,agenda,challenge,graham,expert,college


### Write to Pickle File

In [71]:
# create an iterator object with write permission - model.pkl
features = vectorizer.get_feature_names_out()

path = "../pickles/sklearn_count_vectorization_trigrams"

with open(path + '/features_v1', 'wb') as files:
    pickle.dump(features, files)
    
with open(path + '/model_v1', 'wb') as files:
    pickle.dump(lda_model, files)   

## SKLearn LDA with TF-IDF Vectorization

##### Notes
* Earlier in the LDA we saw that there were a lot of terms with significantly higher frequency than the mean. 
* One theory is that these high frequency words might be biasing the topics, so we are going to use `TF-IDF` vectorization technique to see if we can fix that bias. 

### Lemmatization

In [26]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words)

print(data_lemmatized[:1])

[['hassan', 'bolduc', 'fire', 'attack', 'debate']]


### TF-IDF Vectorization

In [39]:
## helper function to create tfidf matrix
def create_tfidf_matrix(data, max_features=1000):
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = tfidf_vectorizer.fit_transform(data)
    return tfidf_matrix, tfidf_vectorizer

In [40]:
## create tfidf matrix
data_vectorized, vectorizer = create_tfidf_matrix([" ".join(lem_word) for lem_word in data_lemmatized])

### Building Topic Model

In [62]:

# Build LDA Model 35
lda_model = LatentDirichletAllocation(n_components=50,               # Number of topics
                                      max_iter=50,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                      learning_decay=0.5
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)

LatentDirichletAllocation(learning_decay=0.5, learning_method='online',
                          max_iter=50, n_components=50, n_jobs=-1,
                          random_state=100)


### Analyzing Model

In [64]:
analyze_sklearn_lda_model(lda_model, data_vectorized)

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -52474.09112618033
Perplexity:  532859.0293350841
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.5,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 50,
 'mean_change_tol': 0.001,
 'n_components': 50,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


### Visualizing Topics

In [65]:
## vectorizer, lda_model, n_words=20
df_topic_keywords = show_sklearn_topics(vectorizer, lda_model)
formatted_topics = format_sklearn_topics(df_topic_keywords)
formatted_topics

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Topic 0,biden,threat,pelosi,question,plan,reelection,putin,iowa,authority,haley,price,watchdog,montana,fed,district,target,possibility,business,opportunity,gaffe
Topic 1,migrant,texas,abbott,ice,emergency,politic,chicago,veteran,program,wife,flight,uvalde,chief,humanity,venezuelans,adams,ptsd,pga,fund,amendment
Topic 2,ruling,trip,venezuela,crowd,tip,explanation,socialism,photo,republicans,republican,reporter,report,rep,removal,remark,remainder,release,relationship,relation,rent
Topic 3,way,recession,blame,rent,committee,leavitt,naacp,public,january,relation,relationship,riot,release,right,remainder,remark,removal,review,rep,result
Topic 4,assault,cdc,demings,title,europe,tour,security,suspicion,proxy,asia,singapore,sleep,plenty,removal,rent,rep,report,republicans,reporter,republican
Topic 5,midterm,democrats,illinois,fight,hawley,twitter,expert,court,backlash,resident,rally,fentanyl,saudis,today,shell,odd,game,room,abolition,bounty
Topic 6,kid,cruz,august,bid,jr,birth,notice,murphy,virginians,study,resolution,fund,reach,practice,kleefisch,reporter,report,rep,restriction,removal
Topic 7,support,state,woman,crime,effect,decade,pac,restriction,surge,restaurant,da,schedule,fear,alabama,data,terrorism,squad,change,proofreading,immigrant
Topic 8,focus,city,slap,government,shutdown,entry,artwork,prince,outrage,rent,rep,report,reporter,republican,republicans,request,research,resident,resolution,respect
Topic 9,alarm,document,spike,silence,durbin,minority,boost,dui,step,leak,racism,demand,union,plague,afghans,son,reporter,republican,republicans,report


In [66]:
panel = visualize_sklearn_lda_model(lda_model, data_vectorized, vectorizer)
panel

  by='saliency', ascending=False).head(R).drop('saliency', 1)


### Write to Pickle File

In [67]:
# create an iterator object with write permission - model.pkl
features = vectorizer.get_feature_names_out()

path = "../pickles/sklearn_tfidf_vectorization"

with open(path + '/features_v1', 'wb') as files:
    pickle.dump(features, files)
    
with open(path + '/model_v1', 'wb') as files:
    pickle.dump(lda_model, files)    

### GridSeach for Best Topic Model

In [37]:
# search params
search_params = {
    'n_components': [10, 20, 30, 40, 50],
}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)

In [38]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

analyze_sklearn_lda_model(best_lda_model, data_vectorized)

Best Model's Params:  {'n_components': 10}
Log Likelihood:  -28025.690542902863
Perplexity:  1518.2500106498098


In [36]:
panel = visualize_sklearn_lda_model(best_lda_model, data_vectorized, vectorizer)
panel


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.



ValueError: perplexity must be less than n_samples

### Write to Pickle File

In [79]:
# create an iterator object with write permission - model.pkl
features = vectorizer.get_feature_names_out()

path = "../pickles/sklearn_tfidf_vectorization"

with open(path + '/grid_search_features_v1', 'wb') as files:
    pickle.dump(features, files)
    
with open(path + '/grid_search_model_v1', 'wb') as files:
    pickle.dump(best_lda_model, files)    

## SKLearn LDA with Bigrams TF-IDF Vectorization

### Lemmatization

In [80]:
## lemmatization with bigrams
data_words_bigrams = make_bigrams(data_words, bigram_mod)


data_lemmatized = lemmatization(data_words_bigrams)

# Remove Stop Words
## we are removing stop words in the lemmatization function using spacy is_stop flag
# data_words_nostops = remove_stopwords(data_lemmatized)

print(data_lemmatized[:1])

[['hassan', 'bolduc', 'fire', 'showdown', 'gop', 'nominee', 'attack', 'debate']]


### TF-IDF Vectorization

In [81]:
## helper function to create tfidf matrix
def create_tfidf_matrix(data, max_features=1000):
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features, max_df=0.95, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(data)
    return tfidf_matrix, tfidf_vectorizer

In [82]:
## create tfidf matrix
data_vectorized, vectorizer = create_tfidf_matrix([" ".join(lem_word) for lem_word in data_lemmatized])

### Building Topic Model

In [83]:

# Build LDA Model 35
lda_model = LatentDirichletAllocation(n_components=33,               # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=200,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)

LatentDirichletAllocation(batch_size=200, learning_method='online',
                          n_components=33, n_jobs=-1, random_state=100)


### Analyzing Model

In [84]:
analyze_sklearn_lda_model(lda_model, data_vectorized)

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -74425.9432870511
Perplexity:  7933.821099751111
{'batch_size': 200,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 33,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


### Visualizing Topics

In [85]:
## vectorizer, lda_model, n_words=20
df_topic_keywords = show_sklearn_topics(vectorizer, lda_model)
formatted_topics = format_sklearn_topics(df_topic_keywords)
formatted_topics

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Topic 0,plane,session,opec,iran,decision,argument,protester,threat,scotus,right,abortion,durham,case,community,teacher,communication,deal,effort,criticism,conversation
Topic 1,florida,biden,admin,desantis,pelosi,official,adviser,oil,meeting,rubio,iran,russia,office,bidens,challenge,visit,student,trump,staffer,recession
Topic 2,campaign,police,ad,reelection,group,security,thousand,fund,director,death,loss,arm,head,rep,member,dem,cash,army,medium,cotton
Topic 3,georgia,china,decision,sen,trump,threat,taiwan,fbi,probe,showdown,ag,hunter,charge,biden,trip,raid,gun,fraud,election,fight
Topic 4,wall,clout,filibuster,education,son,cruz,odea,supporter,issue,symptom,az,conversation,girl,sense,durbin,ohio,provision,agent,network,operation
Topic 5,comment,resident,measure,place,fall,language,taliban,ballot,semifascism,council,program,member,qaeda,highland,opponent,letter,rep,reelection,afghanistan,texas
Topic 6,americans,speech,ny,hochul,life,college,colorado,crime,fundraiser,zeldin,john,biden,view,post,point,poll,presidency,pa,gop,gov
Topic 7,democrats,abortion,midterm,biden,voter,poll,election,race,republicans,issue,gop,inflation,week,senate,house,crisis,state,ban,dems,day
Topic 8,texas,covid,migrant,biden,city,dc,mayor,activist,gov,abbott,jill,washington,effort,pandemic,bid,education,fauci,county,emergency,border
Topic 9,lead,staff,desantis,independent,cut,democrats,adam,governor,wh,view,zeldin,hochul,fundraising,criminal,maryland,candidate,letter,importance,backlash,vaccine


In [86]:
panel = visualize_sklearn_lda_model(lda_model, data_vectorized, vectorizer)
panel


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



### Write to Pickle File

In [87]:
# create an iterator object with write permission - model.pkl
features = vectorizer.get_feature_names_out()

path = "../pickles/sklearn_tfidf_bigram_vectorization"

with open(path + '/features_v1', 'wb') as files:
    pickle.dump(features, files)
    
with open(path + '/model_v1', 'wb') as files:
    pickle.dump(lda_model, files)    

## Random Testing

##### Notes
* Lets just do a quick test with the models that just saved to `pickle files` 
* In order to do that we need to take the texts through following steps, 
    * `Tokenize`
    * `Create Noun Chunks`
    * `Create Noun Chunks n-grams`
    * `Lemmatization`
    * `Count Vectorization`
    * `LDA Transform`

### Utiliy Functions

In [5]:
# helper function to load the model and features
def load_pickle_files(path, model_name = 'model_v1', features_name = 'features_v1'):
    with open(path + '/' + model_name, 'rb') as model:
        lda = pickle.load(model)

    # assuming you pickled the vectorizer
    with open(path + '/' + features_name, 'rb') as vocab:
        features = pickle.load(vocab)
    return (lda, features)


def create_noun_chunks(data_words):
    docs = [create_pos_tag(" ".join(x)) for x in data_words]
    return docs

In [7]:
def predict_topics(text, model, features):
    # tokenize the text
    # print("tokenizing the text...")
    data_words = list(sent_to_words(text))

    # create noun chunks
    # print("creating noun chunks...")
    docs = create_noun_chunks(data_words)

    # create noun chunks ngrams
    # print("creating noun chunk ngrams...")
    n_grams = create_noun_chunk_ngrams(docs)

    # lemmatization
    # print("lemmatization noun chunks...")
    data_lemmatized = lemmatization_noun_chunks(n_grams)

    # count vectorization
    # print("count vectorization...")
   
    vectorizer = CountVectorizer(vocabulary=features)
    # column names
    topicnames = ["Topic" + str(i) for i in range(model.n_components)]
    # print(topicnames)
    
    data_vectorized = vectorizer.fit_transform([" ".join(lem_word) for lem_word in data_lemmatized])

    ## Create a dataframe with topics as rows and features as columns
    ## each cell represents the weight of the feature in the topic
    df_topic_keywords = pd.DataFrame(model.components_)
    df_topic_keywords.columns = vectorizer.get_feature_names_out()
    df_topic_keywords.index = topicnames  # type: ignore
    
    topic_keywords = show_sklearn_topics(vectorizer, model)
    ## transform gives us the topic distribution for each document
    ## here we have a list of probabilities for each topic, index of the list is the topic number
    topic_probability_scores = model.transform(data_vectorized)
    
    ## from the dataframe we then select the row with the highest probability
    topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), :].values.tolist()  # type: ignore
    topic_list = list(topic_keywords)[np.argmax(topic_probability_scores)]
    topic.sort(reverse=True)    
    features_with_weights = list(zip(topic_list, topic))
    return features_with_weights

### Test Data

In [18]:
random_text = data.sample(1)["cleaned_title"].tolist()
random_text

['dc deputy mayor for public safety resigns after being charged with assault that was caught on camera']

## Random Testing SKLearn Count Vectorization

### Load Model

In [105]:
(lda_count_vectorization, features_count_vectorization) = load_pickle_files("../pickles/sklearn_count_vectorization")

In [106]:

# import time
# for text in random_text:
#     print(text)
#     features_with_weights = predict_topics(text, lda_count_vectorization, features_count_vectorization)
#     print(features_with_weights)
#     print(" ")

features_with_weights = predict_topics(random_text, lda_count_vectorization, features_count_vectorization)
print(random_text)
print(features_with_weights)

['japans shinzo abe dead biden stunned outraged over assassination']
[('group', 114.68562402347078), ('wisconsin', 31.549720550974893), ('rule', 23.714311189648843), ('history', 21.039490562066995), ('barnes', 11.372358107012298), ('johnson', 10.984358498183479), ('alarm', 9.85586292872311), ('mandela', 8.969919852394568), ('sanctuary', 7.827781837276203), ('attempt', 7.7318900062705795), ('money', 0.02857142857307145), ('judge', 0.02857142857264063), ('ballot', 0.028571428572136143), ('ron', 0.02857142857207247), ('court', 0.028571428571671643), ('office', 0.028571428571542698), ('criticism', 0.028571428571536914), ('garland', 0.02857142857152228), ('lawsuit', 0.028571428571478215), ('state', 0.028571428571471987)]


## Random Testing Count Vectorization With BiGrams

### Load Model

In [35]:
(lda_with_bigrams, features_with_bigrams) = load_pickle_files("../pickles/sklearn_count_vectorization_bigrams")

In [36]:
features_with_weights = predict_topics(random_text, lda_with_bigrams, features_with_bigrams)
print(random_text)
print(features_with_weights)

['nikki haley fires back after tax forms leaked to media republicans have been too nice for too long']
[('house', 221.78864797173534), ('republicans', 214.037864304175), ('democrats', 199.6092325307863), ('biden', 154.50547194995974), ('report', 77.17680233192355), ('admin', 75.8938763510415), ('attack', 57.95423026574956), ('district', 30.41881489133331), ('strategist', 23.62748062104983), ('source', 21.985574928040048), ('jill', 19.641841072832772), ('tie', 16.923786168307466), ('husband', 14.234109974992167), ('aid', 13.752253219508237), ('oversight', 13.15996362408658), ('military', 11.873087081569999), ('impeachment', 11.549612154406297), ('marriage', 11.196995389056948), ('government', 0.049383437320876425), ('probe', 0.03375038760917506)]


## Random Testing Count Vectorization With Tri-Grams

### Load Model

In [109]:
(lda_with_trigrams, features_with_trigrams) = load_pickle_files("../pickles/sklearn_count_vectorization_trigrams")

In [110]:
features_with_weights = predict_topics(random_text, lda_with_trigrams, features_with_trigrams)
print(random_text)
print(features_with_weights)

['japans shinzo abe dead biden stunned outraged over assassination']
[('role', 11.606396248700682), ('liz', 0.030307600395131893), ('democracy', 0.030307439579993357), ('fires', 0.030307386759344187), ('fight', 0.030307322495490594), ('help', 0.030307154370710916), ('girl', 0.030307145161753063), ('reform', 0.030307089556310424), ('maryland', 0.03030702240368765), ('claim', 0.03030702199158779), ('oregon', 0.030307020945159666), ('drug', 0.03030698400171824), ('speaker', 0.030306979956556353), ('cbp', 0.03030688950133688), ('bail', 0.030306885010462234), ('event', 0.030306879257749813), ('council', 0.030306868593786267), ('video', 0.030306850262292177), ('world', 0.030306836562752758), ('big', 0.0303068266989106)]


## Random Testing Count Vectorization With Noun Chunks

### Load Model

In [111]:
(lda_with_noun_chunks, features_with_noun_chunks) = load_pickle_files("../pickles/sklearn_count_vectorization_noun_chunks")

In [112]:
features_with_weights = predict_topics(random_text, lda_with_noun_chunks, features_with_noun_chunks)
print(random_text)
print(features_with_weights)

['japans shinzo abe dead biden stunned outraged over assassination']
[('migrant', 56.36425370712191), ('election', 33.58499262823224), ('texas', 32.671614964145505), ('manchin', 21.02989968646015), ('aoc', 17.444479867650315), ('abbott', 10.88867057115218), ('prosecutor', 9.371497564125493), ('democrats', 0.030407914530170586), ('midterm', 0.03037824317466148), ('congress', 0.030356612613276178), ('obama', 0.03031073210982518), ('rubio', 0.030309372033708502), ('issue', 0.030308602827961818), ('senate', 0.030307478732421376), ('trump', 0.030307147765158162), ('month', 0.030307013725051946), ('fbi', 0.030306885010462234), ('democracy', 0.030306857150331438), ('florida', 0.030306836354242993), ('house', 0.0303068266989106)]


## Random Testing TF-IDF Vectorizer

In [68]:
(lda_tf_idf, features_tf_idf) = load_pickle_files("../pickles/sklearn_tfidf_vectorization")

In [74]:
random_text = data.sample(1)["title"].tolist()
features_with_weights = predict_topics(random_text, lda_tf_idf, features_tf_idf)
print(random_text)
print(features_with_weights)

['GOP Senate candidate Ted Budd rips Democrat ‘deceptively running as moderate’ despite defund police ties']
[('desantis', 22.241735516015908), ('democrat', 7.8850497631064025), ('prosecutor', 7.499918469279549), ('violence', 7.418587582369679), ('cheney', 6.966455961030091), ('critic', 5.64125177991984), ('payment', 2.1791674487343355), ('victim', 1.9776818744584046), ('outrage', 1.8511793977167181), ('crist', 1.44298868473117), ('criminal', 0.9582096869007859), ('position', 0.9180484429734201), ('solution', 0.801116570597144), ('ally', 0.02000000113467131), ('steam', 0.020000000922128378), ('teaching', 0.020000000728663998), ('light', 0.020000000686481622), ('wyomings', 0.02000000056801035), ('woke', 0.0200000003701196), ('influence', 0.020000000258900277)]


## Random Testing Bigram TF-IDF Vectorizer

In [115]:
(lda_tf_idf, features_tf_idf) = load_pickle_files("../pickles/sklearn_tfidf_bigram_vectorization")

In [116]:
features_with_weights = predict_topics(random_text, lda_tf_idf, features_tf_idf)
print(random_text)
print(features_with_weights)

['japans shinzo abe dead biden stunned outraged over assassination']
[('mother', 8.863328627092345), ('sex', 7.179756651946168), ('end', 6.8408818663790845), ('rhetoric', 5.42818064460467), ('roe', 5.080464008598653), ('assassination', 3.7297409402620874), ('attacker', 3.630765386941288), ('teen', 3.201540776604294), ('target', 3.007155542088819), ('data', 2.635638806144962), ('pelosi', 0.030636112308660177), ('death', 0.03053417418219778), ('leak', 0.030456805236316217), ('homeland', 0.030450234287048607), ('report', 0.030429225092785155), ('officer', 0.030423720032843174), ('police', 0.03041783020696869), ('word', 0.030413475458809302), ('democrats', 0.030413470776437232), ('capitol', 0.030412788423152544)]


## Random Testing TF-IDF Grid Searched Param

In [19]:
(lda_tf_idf, features_tf_idf) = load_pickle_files("../pickles/sklearn_tfidf_vectorization", model_name = 'grid_search_model_v1', features_name = 'grid_search_features_v1')

In [24]:
random_text = data.sample(1)["cleaned_title"].tolist()
random_text
features_with_weights = predict_topics(random_text, lda_tf_idf, features_tf_idf)
print(random_text)
print(features_with_weights)

['nikki haley fires back after tax forms leaked to media republicans have been too nice for too long']
[('house', 53.32531054308415), ('white', 44.46060937824883), ('congresswoman', 36.04185220611481), ('biden', 34.93420638948118), ('republicans', 29.39137487046958), ('tax', 21.529418303127166), ('dems', 19.41012442673193), ('speech', 19.117692602708313), ('control', 16.43855417827374), ('question', 14.205618284604945), ('maga', 13.43899449736458), ('democrats', 12.682077945780794), ('change', 12.54327219550978), ('gop', 11.862242813733054), ('response', 11.628740580613702), ('fentanyl', 11.127551074239776), ('clinton', 10.956325197861249), ('congress', 10.734877351309471), ('government', 10.210000017393082), ('voting', 10.20019876851981)]
