# LDA Topic Modelling

* This notebook showcases the process of building an NLP topic model using the `Latent Dirichlet Allocation` method. 
* The data we are going to use are `text` and `soft text` from `scrapped_fox_data_clean.csv`. 

## Table Of Contents

## Installations


In [1]:
# ## installing required libraries
# ! pip install beautifulsoup4
# ! pip install pandas
# ! pip install numpy
# ! pip install plotly
# ! pip install nbformat
# ! pip install ipykernel
# ! pip install matplotlib
# ! pip install wordcloud
# ! pip install gensim
# ! pip install pyLDAvis
# ! pip install nltk
# ! pip install -U pip setuptools wheel
# ! pip install -U spacy
# ! python -m spacy download en_core_web_trf 
! python -m spacy download en_core_web_md
! pip install joblib

Collecting en-core-web-md==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1-py3-none-any.whl (42.8 MB)
     --------------------------------------- 42.8/42.8 MB 20.5 MB/s eta 0:00:00
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.4.1
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_md')



[notice] A new release of pip available: 22.2.2 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.2.2 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





## Imports

In [2]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
from pprint import pprint

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as io

# loading library
import pickle

from joblib import dump, load

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
  from imp import reload


## Reading Data

In [3]:
## reading manually scraped data
data = pd.read_csv('../data/scrapped_fox_data_clean.csv')
print(data.shape)

(3972, 12)


## Utility Functions

### Preparing Stop Words

In [4]:
## extending stopwords
# lets break down the cleaning functions into smaller functions
stop_words = nltk.corpus.stopwords.words('english')

## trying to remove stopwords from stopwords super set. 
stopwords_super_set = pd.read_csv("../data/stopwords/sw1k.csv")

## filtering stopwords to pronouns and other type
stopwords_to_remove = list(stopwords_super_set.loc[(stopwords_super_set["type"] == "G" ), "term"])


# stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'say', 'one', 'time', 'people',
#                   'know', 'like', 'tell', 'get', 'year', 'go', 'around', 'award', 'actually', 'carry',
#                    'new', 'it', 'show', 'news', 'go', 'fox', 'make', 'do', 'not', 'say',
#                    'also', 'love', 'it', 'star', 'go', 'do', 'say', 'not', 'said'
#                    ])

# stop_words.extend(stopwords_to_remove)
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Common Utility Functions

In [5]:
# NOTE(review): the install cell only downloads `en_core_web_md` (the
# `en_core_web_trf` download is commented out) — confirm the transformer
# pipeline is installed in this environment before relying on this load.
nlp = spacy.load('en_core_web_trf')
# nlp.add_pipe('merge_entities')
# nlp.add_pipe("merge_noun_chunks")

# Utility Functions for Text Cleaning
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with ``simple_preprocess``.

    Every item is coerced to ``str`` before tokenizing and ``deacc=True`` is
    passed through. Yields one token list per input item.
    """
    tokenized = (simple_preprocess(str(raw), deacc=True) for raw in sentences)
    yield from tokenized

# function to clean html tags from text


def clean_html(html):
    """Strip markup from ``html`` and return its remaining text.

    ``style``, ``script``, ``code`` and ``a`` elements are removed together
    with their contents; the remaining stripped strings are joined with
    single spaces.
    """
    parsed = BeautifulSoup(html, "html.parser")
    unwanted = parsed(['style', 'script', 'code', 'a'])
    for element in unwanted:
        # decompose() removes the element and everything inside it
        element.decompose()
    return ' '.join(parsed.stripped_strings)

# function to convert text to lowercase


def lower_case(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

# function to remove line breaks


def remove_line_breaks(text):
    """Replace every run of line breaks in ``text`` with a single space.

    A space is substituted (rather than deleting the break outright) so the
    last word of one line is not glued to the first word of the next, e.g.
    ``"foo\\nbar"`` becomes ``"foo bar"`` instead of ``"foobar"``. Carriage
    returns are treated as line breaks as well.
    """
    return re.sub(r'[\r\n]+', ' ', text)

# function to remove punctuation


def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

# function to remove numbers


def remove_numbers(text):
    """Strip every run of digits from ``text``."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# function to remove extra spaces


def remove_extra_spaces(text):
    """Collapse each run of consecutive space characters in ``text`` into one space."""
    space_run = re.compile(' +')
    return space_run.sub(' ', text)

# function to remove stopwords


def remove_stopwords(texts):
    """Tokenize ``texts`` and drop every token found in the module-level ``stop_words``.

    The input is coerced to ``str`` and tokenized with ``simple_preprocess``
    (``deacc=True``); the surviving tokens are returned as a single
    space-separated string.
    """
    kept_tokens = []
    for token in simple_preprocess(str(texts), deacc=True):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# helper function to create pos tags


def create_pos_tag(str_sent):
    """Run the module-level spaCy pipeline ``nlp`` on ``str_sent`` and return its result."""
    parsed = nlp(str_sent)
    return parsed

# function for text lemmatization using spac
##'ADJ', 'VERB'
def lemmatization(texts, allowed_postags=['PROPN', 'NOUN']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document (an iterable of token strings) is joined with spaces and
    parsed by the module-level pipeline ``nlp``. A token is kept when its
    ``pos_`` is in ``allowed_postags``, it is alphabetic (``is_alpha``) and it
    is not flagged as a stop word (``is_stop``). Spaces inside a lemma are
    replaced with underscores.

    Returns a list holding one list of lemmas per input document.
    Tag reference: https://spacy.io/api/annotation
    """
    def _keep(token):
        return token.pos_ in allowed_postags and token.is_alpha and not token.is_stop

    return [
        [token.lemma_.replace(" ", "_") for token in nlp(" ".join(sent)) if _keep(token)]
        for sent in texts
    ]

def lemmatization_without_pos(texts):
    """Lemmatize tokenized documents without any part-of-speech filtering.

    Each document is joined with spaces, parsed by the module-level pipeline
    ``nlp``, and the ``lemma_`` of every resulting token is kept.

    Returns a list holding one list of lemmas per input document.
    Reference: https://spacy.io/api/annotation
    """
    return [[token.lemma_ for token in nlp(" ".join(sent))] for sent in texts]


def make_bigrams(texts, bigram_mod):
    """Apply ``bigram_mod`` (via indexing) to every document in ``texts``.

    Returns a list with one transformed document per input document.
    """
    phrased = []
    for doc in texts:
        phrased.append(bigram_mod[doc])
    return phrased


def make_trigrams(texts, bigram_mod, trigram_mod):
    """Apply ``bigram_mod`` and then ``trigram_mod`` (via indexing) to every document.

    Returns a list with one transformed document per input document.
    """
    phrased = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        phrased.append(trigram_mod[with_bigrams])
    return phrased

## helper function to create pos tags distribution
def create_pos_tags_distribution(docs = []):
    """Count part-of-speech tags, alphabetic tokens and stop-word tokens.

    Parameters
    ----------
    docs : iterable of token sequences; each token must expose ``pos_``,
        ``is_alpha`` and ``is_stop``.

    Returns
    -------
    (tag_counts, alpha_total, stop_total) where ``tag_counts`` maps each
    ``pos_`` value to its number of occurrences and the two totals count the
    tokens whose ``is_alpha`` / ``is_stop`` flag is truthy.
    """
    pos_counts = {}
    alpha_total = 0
    stop_total = 0
    for token in (tok for doc in docs for tok in doc):
        tag = token.pos_
        pos_counts[tag] = pos_counts.get(tag, 0) + 1
        alpha_total += 1 if token.is_alpha else 0
        stop_total += 1 if token.is_stop else 0
    return pos_counts, alpha_total, stop_total


# function to create n-grams from noun chunks
def create_noun_chunk_ngrams(docs):
    """Turn each noun chunk of every doc into a single underscore-joined token.

    For each doc, the text covered by every span in ``doc.noun_chunks`` has its
    spaces replaced with underscores, and the resulting text is split on
    single spaces.

    Chunks are rewritten by their character offsets (``start_char`` /
    ``end_char``) so only the chunk itself changes. A plain ``str.replace``
    on the whole text would also rewrite identical character sequences
    elsewhere, e.g. the chunk "cat food" inside "scat food".

    Parameters
    ----------
    docs : iterable of parsed docs exposing ``text`` and ``noun_chunks``; each
        chunk exposes ``text``, ``start_char`` and ``end_char``.

    Returns
    -------
    list with one token list per doc.
    """
    n_gram_docs = []
    for doc in docs:
        doc_text = doc.text
        pieces = []
        cursor = 0
        for chunk in doc.noun_chunks:
            # Defensive: ignore a chunk that starts inside text already emitted.
            if chunk.start_char < cursor:
                continue
            pieces.append(doc_text[cursor:chunk.start_char])
            pieces.append("_".join(chunk.text.split(" ")))
            cursor = chunk.end_char
        # Keep whatever follows the last chunk unchanged.
        pieces.append(doc_text[cursor:])
        n_gram_docs.append("".join(pieces).split(" "))
    return n_gram_docs


def lemmatization_noun_chunks(texts):
    """Lemmatize tokenized documents, keeping underscore tokens and content nouns.

    Each document is joined with spaces and parsed by the module-level
    pipeline ``nlp``. A token's ``lemma_`` is kept when either

    * its text contains an underscore (the marker used for joined noun
      chunks), or
    * its ``pos_`` is ``NOUN`` or ``PROPN``, it is alphabetic and it is not
      flagged as a stop word.

    Returns a list holding one list of lemmas per input document.
    Reference: https://spacy.io/api/annotation
    """
    def _is_content_noun(token):
        return token.pos_ in ['NOUN', 'PROPN'] and token.is_alpha and not token.is_stop

    texts_out = []
    for sent in texts:
        kept = []
        for token in nlp(" ".join(sent)):
            if "_" in token.text or _is_content_noun(token):
                kept.append(token.lemma_)
        texts_out.append(kept)
    return texts_out

### Gensim Models Utility Functions

In [6]:
## function to compute optimal parameters for LDA model
def compute_coherence_values(dictionary, corpus, id2word, texts, num_topics, passes, chunk_sizes=[200]):
    """
    Train one LdaModel per parameter combination and score it with c_v coherence.

    Parameters:
    ----------
    dictionary : dictionary handed to CoherenceModel
    corpus : corpus handed to LdaModel
    id2word : id-to-word mapping handed to LdaModel
    texts : tokenized texts handed to CoherenceModel
    num_topics : iterable of topic counts to try
    passes : iterable of training-pass counts to try
    chunk_sizes : iterable of chunk sizes to try

    Returns:
    -------
    model_list : trained models, in evaluation order
    coherence_values : c_v coherence of each model, same order
    params : dicts with the 'num_topics', 'chunk_size' and 'passes' used for each model
    """
    # Same nesting order as explicit loops: topics -> chunk sizes -> passes.
    grid = [
        (topic_count, chunk_size, pass_count)
        for topic_count in num_topics
        for chunk_size in chunk_sizes
        for pass_count in passes
    ]

    model_list, coherence_values, params = [], [], []
    for topic_count, chunk_size, pass_count in grid:
        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=topic_count,
                         random_state=100,
                         update_every=1,
                         chunksize=chunk_size,
                         passes=pass_count,
                         per_word_topics=True)
        model_list.append(model)
        scorer = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(scorer.get_coherence())
        params.append({'num_topics': topic_count, 'chunk_size': chunk_size, 'passes': pass_count})

    return model_list, coherence_values, params

def analyze_gensim_lda_model(lda_model, corpus, id2word, texts, num_topics, passes, chunk_sizes=[200]):
    """Print the ``log_perplexity`` value and the c_v coherence of ``lda_model``.

    ``num_topics``, ``passes`` and ``chunk_sizes`` are accepted but not used.
    """
    # NOTE(review): gensim documents log_perplexity() as a per-word likelihood
    # bound rather than perplexity itself — confirm how this number should be
    # read before comparing models on it.
    bound = lda_model.log_perplexity(corpus)
    print('\nPerplexity: ', bound)

    scorer = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    score = scorer.get_coherence()
    print('\nCoherence Score: ', score)

## helper functions to visualize LDA model
def visualize_gensim_lda_model(lda_model, corpus, id2word, filename="gensim_lda.html"):
    """Prepare a pyLDAvis view of a gensim LDA model and write it to ``filename``.

    Parameters
    ----------
    lda_model, corpus, id2word : passed straight to ``gensimvis.prepare``.
    filename : path of the HTML file to write.
    """
    pyLDAvis.enable_notebook()
    vis = gensimvis.prepare(lda_model, corpus, id2word)
    # The prepared data object has no ``save`` method; pyLDAvis provides the
    # module-level ``save_html`` for writing a standalone HTML file.
    pyLDAvis.save_html(vis, filename)

### Sklearn Model Utility Functions

In [7]:
import numpy as np


# Styling
def color_green(val):
    """Return a CSS colour rule: green when ``val`` exceeds 0.1, black otherwise."""
    if val > .1:
        return 'color: green'
    return 'color: black'

def make_bold(val):
    """Return a CSS font-weight rule: 700 when ``val`` exceeds 0.1, 400 otherwise."""
    if val > .1:
        return 'font-weight: 700'
    return 'font-weight: 400'


def print_sklearn_sparcity(data_vectorized):
    """Print the percentage of strictly positive cells in a sparse matrix.

    The count is taken directly on the sparse matrix, so the full dense
    document-term matrix is never materialized in memory.

    Parameters
    ----------
    data_vectorized : 2-D sparse matrix (e.g. the output of a vectorizer's
        ``fit_transform``).
    """
    n_rows, n_cols = data_vectorized.shape
    # Comparing a sparse matrix with 0 yields a sparse boolean matrix whose
    # sum is the number of positive cells.
    positive_cells = (data_vectorized > 0).sum()

    # Compute Sparsicity = Percentage of Non-Zero cells
    print("Sparsicity: ", (positive_cells / (n_rows * n_cols)) * 100, "%")


def create_sklearn_dominent_topic_dataframe(lda_model, data_vectorized):
    """Build a document-topic table with the dominant topic of each document.

    Parameters
    ----------
    lda_model : fitted model exposing ``transform`` and ``n_components``.
    data_vectorized : matrix passed to ``lda_model.transform``.

    Returns
    -------
    DataFrame indexed ``Doc0..DocN`` with one ``TopicK`` column per topic
    (weights rounded to 2 decimals) plus a ``dominant_topic`` column holding
    the index of the highest-weighted topic.
    """
    lda_output = lda_model.transform(data_vectorized)
    # column names
    topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]
    # index names — derived from the transformed matrix itself so the function
    # does not depend on the size of the notebook-level `data` frame
    docnames = ["Doc" + str(i) for i in range(lda_output.shape[0])]

    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
    # Get dominant topic for each document from the unrounded weights:
    # rounding to 2 decimals can create ties that argmax resolves to the
    # wrong (first) topic.
    df_document_topic['dominant_topic'] = np.argmax(lda_output, axis=1)
    return df_document_topic

def print_sklearn_dominant_topics(lda_model, data_vectorized):
    """Return a styled view of the first 15 rows of the document-topic table.

    The table comes from ``create_sklearn_dominent_topic_dataframe``; the
    ``color_green`` and ``make_bold`` cell stylers are applied in that order.
    """
    preview = create_sklearn_dominent_topic_dataframe(lda_model, data_vectorized).head(15)
    styled = preview.style
    for css_rule in (color_green, make_bold):
        styled = styled.applymap(css_rule)
    return styled

def print_sklearn_topic_distribution(lda_model, data_vectorized):
    """Return how many documents have each topic as their dominant topic.

    Returns
    -------
    DataFrame with a ``Topic`` column (dominant topic index) and a
    ``Num Documents`` column, ordered as produced by ``value_counts``.
    """
    df_document_topic = create_sklearn_dominent_topic_dataframe(lda_model, data_vectorized)
    topic_counts = df_document_topic['dominant_topic'].value_counts()
    # Name the index explicitly before resetting it: pandas >= 2.0 names the
    # value_counts index after the source column, so the reset column is no
    # longer called 'index' and renaming 'index' -> 'Topic' silently does
    # nothing there.
    df_topic_distribution = topic_counts.rename_axis('Topic').reset_index(name="Num Documents")
    return df_topic_distribution


# Show top n keywords for each topic
def show_sklearn_topics(vectorizer, lda_model, n_words=20):
    """Return the ``n_words`` highest-weighted vocabulary terms of every topic.

    Terms come from ``vectorizer.get_feature_names_out()`` and weights from
    the rows of ``lda_model.components_``. The result is a list with one
    array of terms per topic, highest weight first.
    """
    vocabulary = np.array(vectorizer.get_feature_names_out())
    return [
        vocabulary[np.argsort(-weights)[:n_words]]
        for weights in lda_model.components_
    ]

def format_sklearn_topics(topic_keywords):
    """Wrap per-topic keyword lists in a labelled DataFrame.

    Rows are labelled ``Topic 0..N`` and columns ``Word 0..M``.
    """
    frame = pd.DataFrame(topic_keywords)
    n_topics, n_words = frame.shape
    frame.columns = [f"Word {col}" for col in range(n_words)]
    frame.index = [f"Topic {row}" for row in range(n_topics)]
    return frame

def analyze_sklearn_lda_model(lda_model, data_vectorized):
    """Print ``lda_model.score`` and ``lda_model.perplexity`` for ``data_vectorized``.

    Log likelihood: higher is better. Perplexity: lower is better
    (perplexity = exp(-1. * log-likelihood per word)).
    """
    metrics = (
        ("Log Likelihood: ", lda_model.score),
        ("Perplexity: ", lda_model.perplexity),
    )
    for label, metric in metrics:
        print(label, metric(data_vectorized))

## helper function to visualize lda model
def visualize_sklearn_lda_model(lda_model, data_vectorized, vectorizer, mds='tsne'):
    """Enable notebook rendering and return the pyLDAvis panel for a sklearn LDA model.

    ``mds`` is forwarded to ``pyLDAvis.sklearn.prepare``.
    """
    pyLDAvis.enable_notebook()
    return pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds=mds)

## Text Pre-processing

In [8]:
def preprocess_text(text):
    """Run the full cleaning pipeline on one raw document and return the result.

    Steps, in order: strip HTML, lowercase, remove line breaks, remove
    punctuation, remove numbers, collapse extra spaces, remove stop words.
    """
    pipeline = (
        clean_html,
        lower_case,
        remove_line_breaks,
        remove_punctuation,
        remove_numbers,
        remove_extra_spaces,
        remove_stopwords,
    )
    for step in pipeline:
        text = step(text)
    return text

data["clean_text"] = data["text"].apply(preprocess_text)

In [9]:
data["clean_text"]

0       former governor first term democratic sen magg...
1       president biden urged democrats wednesday show...
2       famous naked cowboy new york citys times squar...
3       liberal groups wisconsin seeking change rules ...
4       texas gubernatorial nominee beto rourke among ...
                              ...                        
3967    former vice president mike pence said monday a...
3968    house senate office building anywhere us capit...
3969    first fox gov ron desantis reelection campaign...
3970    first fox new documents exclusively obtained f...
3971    runs year reelection amid difficult political ...
Name: clean_text, Length: 3972, dtype: object

### Tokenizing

In [10]:
data_words = list(sent_to_words(data['clean_text']))

## EDA on text Data

In [11]:
## check for duplicates
data["clean_text"].duplicated().sum()

0

In [12]:
data['text_word_count'] = data['clean_text'].apply(lambda x: len(str(x).split(" ")))

data['text_word_count'].describe()


count    3972.000000
mean      356.933535
std       193.554195
min        20.000000
25%       233.000000
50%       315.000000
75%       429.000000
max      5011.000000
Name: text_word_count, dtype: float64

In [13]:
## checking the distribution of word count in text
fig = px.histogram(data, x="text_word_count", title="Distribution of Word Count in text")
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



##### Notes
* Word counts are fairly distributed. 

In [14]:
from joblib import Parallel, delayed

def chunker(iterable, total_length, chunksize):
    """Return a generator of consecutive slices of ``iterable``.

    Slices start at 0, ``chunksize``, 2 * ``chunksize``, ... below
    ``total_length``; each slice is ``iterable[start:start + chunksize]``.
    """
    starts = range(0, total_length, chunksize)
    return (iterable[start:start + chunksize] for start in starts)

def flatten(list_of_lists):
    """Concatenate the items of every sub-list into one flat list (one level deep)."""
    print("flatten a list of lists to a combined list")
    combined = []
    for sublist in list_of_lists:
        combined.extend(sublist)
    return combined

def process_chunk(texts):
    """Parse one chunk of texts with the module-level spaCy pipeline ``nlp``.

    This function is dispatched to worker processes by
    ``preprocess_parallel``, so the pipe itself runs single-process here.

    Returns the parsed docs as a list.
    """
    # n_process=1: parallelism is already provided by the joblib workers that
    # call this function. Asking spaCy for another 100 processes per worker
    # would nest process pools (pool workers generally cannot spawn children)
    # and massively oversubscribe the machine.
    nlp_list = list(nlp.pipe(texts, batch_size=100, n_process=1))
    print("processing chunk..")
    return nlp_list

def preprocess_parallel(texts, chunksize=1000, n_jobs=50):
    """Parse ``texts`` in parallel chunks and return all parsed docs as one list.

    Parameters
    ----------
    texts : sliceable, sized collection of texts.
    chunksize : number of texts handed to each ``process_chunk`` call.
    n_jobs : number of joblib worker processes (default 50, the value that
        was previously hard-coded).

    Returns
    -------
    Flat list with the results of every chunk, in chunk order.
    """
    # Size the chunking from the argument itself, not from the notebook-level
    # `data` frame, so the function works for any input collection.
    total = len(texts)
    executor = Parallel(n_jobs=n_jobs, backend='multiprocessing', prefer="processes")
    do = delayed(process_chunk)
    tasks = (do(chunk) for chunk in chunker(texts, total, chunksize=chunksize))
    print("Processing {} texts in {} jobs".format(total, n_jobs))
    result = executor(tasks)
    return flatten(result)

In [19]:
%%time
## lets create POS tags for each text and see the distribution of POS tags
# docs = [nlp(text) for text in data['clean_text']]
# n_process=-1 asks spaCy to use one worker per available CPU; a hard-coded
# 100 starts far more processes (each loading the pipeline) than there are cores.
docs = list(nlp.pipe(data['clean_text'], batch_size=100, n_process=-1))
# docs = data["clean_text"].apply(create_pos_tag)
# docs = preprocess_parallel(data['clean_text'], chunksize=1000)

In [None]:
docs

In [None]:
## creating pos tags distribution
token_distribution, is_alpha, is_stop = create_pos_tags_distribution(docs)

In [None]:
## convert the dictionary to a dataframe
token_distribution_df = pd.DataFrame.from_dict(token_distribution, orient='index', columns=['count']).reset_index().rename(columns={"index": "tags"})


In [None]:
## lets create a distribution of POS tags
# `title` (not `text`) is the px.histogram argument that sets the figure title
fig = px.histogram(token_distribution_df, x="tags", y="count", title="Distribution POS Tags in text")
fig.show()

In [None]:
## lets see how many words are alpha and how many are stop words
print(f"we have total {data['text_word_count'].sum()} words in the text. Out of which {is_alpha} are alpha and {is_stop} are stop words")

##### Notes
* So `maximum` tags are
    * `PROPN`- proper noun
    * `VERB` - verb
    * `ADP` - adposition
    * `NOUN` - noun
    * `PUNCT` - punctuation
    * `ADJ` - adjective
* Since these are news article texts, I think useful tags are, 
    * `PROPN`
    * `NOUN`
    * `VERB`
    * `ADJ` - Not sure about adjective yet. 
* We can remove rest of the words and still have a decent topic model. 
* We can also use the `is_stop` and `is_alpha` tags to remove the stopwords and non alpha tokens.
    * Lets update the helper functions accordingly. 

### Word Frequency

In [None]:
from collections import Counter
# lets see if we can calculate word frequency
# all tokens that arent stop words or punctuations
words = []
for doc in docs:
    doc_words = [token.text for token in doc if token.pos_ in ['PROPN', 'NOUN'] and not token.is_stop and not token.is_punct and token.is_alpha]
    words.append(doc_words)

flat_list = [item for sublist in words for item in sublist]


word_counts = Counter(flat_list)
word_counts.most_common(10)


In [None]:
## lets plot the word frequency distribution
word_counts_df = pd.DataFrame.from_dict(word_counts, orient='index', columns=['count']).reset_index().rename(columns={"index": "word"})

# `title` (not `text`) is the px.histogram argument that sets the figure title
fig = px.histogram(word_counts_df, x="word", y="count", title="Distribution of Word Frequency in text")
fig.show()

In [None]:
word_counts_df.describe()

##### Notes
* Interesting: the difference between the `mean` and `max` frequency is quite big. I wonder whether that would cause issues in the model. 

In [None]:
## lets look at spacy merge entities 
# nlp.add_pipe("merge_noun_chunks")
merged_docs = [nlp(text) for text in data['clean_text']]


In [None]:
# test_str = "graham cruz tell mayorkas hes on notice for possible impeachment over border crisis"

# texts = [(t.lemma_, t.pos_) for t in merged_docs]
# texts
for merged_doc in merged_docs:
    print([("_".join(t.lemma_.split(" ")), t.pos_) for t in merged_doc])

##### Notes
* I think `merge_entities` and `merge_noun_chunks` are what we want; we might end up with fewer words in the vocabulary and might not need bigrams/trigrams.

In [None]:
words = []
for merged_doc in merged_docs:
    doc_words = [token.text for token in merged_doc if token.pos_ in ['PROPN', 'NOUN'] and not token.is_stop and not token.is_punct and token.is_alpha]
    words.append(doc_words)

flat_list = [item for sublist in words for item in sublist]


word_counts = Counter(flat_list)
word_counts.most_common(10)

In [None]:
## lets plot the word frequency distribution
word_counts_df = pd.DataFrame.from_dict(word_counts, orient='index', columns=['count']).reset_index().rename(columns={"index": "word"})

# `title` (not `text`) is the px.histogram argument that sets the figure title
fig = px.histogram(word_counts_df, x="word", y="count", title="Distribution of Word Frequency in text")
fig.show()

In [None]:
word_counts_df.describe()

##### Notes
* Not too much difference in frequency distribution

## Creating Bigram & Trigram Models

In [None]:
# Build the bigram and trigram models
bigram = Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[10]]])

## Gensim LDA with BOW

### Lemmatization

In [None]:
# Do lemmatization keeping only proper nouns and nouns (the default `allowed_postags`)
data_lemmatized = lemmatization(data_words)

print(data_lemmatized[:1])

### Create Dictionary & Corpus

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
## filter out words that occur less than 10 documents, or more than 75% of the documents.
# id2word.filter_extremes(no_below=30, no_above=0.75, keep_n=10000)
# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

In [None]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]


### Building the Topic Model

In [None]:
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=10,
                     random_state=100,
                     update_every=1,
                    #  chunksize=200,
                    #  passes=250,
                    #  alpha='auto',
                     per_word_topics=True)


In [None]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

### Analyzing Model

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # NOTE(review): gensim documents log_perplexity() as a per-word likelihood bound (perplexity = 2^(-bound)), so a higher value is better here — confirm

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### Visualize Topics

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

## Gensim LDA with Bigram BOW

### Lemmatization

In [None]:
## lemmatization with bigrams
data_words_bigrams = make_bigrams(data_words, bigram_mod)
# data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ'])
data_lemmatized = lemmatization_noun_chunks(data_words_bigrams)

# Remove Stop Words
## we are removing stop words in the lemmatization function using spacy is_stop flag
# data_words_nostops = remove_stopwords(data_lemmatized)

print(data_lemmatized[:1])

### Create Dictionary & Corpus

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

In [None]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]


### Building the Topic Model

In [None]:
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=10,
                     random_state=100,
                     update_every=1,
                     chunksize=200,
                     passes=200,
                    #  alpha='auto',
                     per_word_topics=True)


In [None]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

##### Notes
* So visually it seems we get different topics when we use `bigrams`. 

### Analyzing Model

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # NOTE(review): gensim documents log_perplexity() as a per-word likelihood bound (perplexity = 2^(-bound)), so a higher value is better here — confirm

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### Visualize Topics

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

## Gensim LDA with Trigram BOW

### Lemmatization

In [None]:
## lemmatization with trigrams
data_words_trigrams = make_trigrams(data_words, bigram_mod, trigram_mod)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
# nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only proper nouns and nouns (the default `allowed_postags`)
data_lemmatized = lemmatization(data_words_trigrams)

# Remove Stop Words
## we are removing stop words in the lemmatization function using spacy is_stop flag
# data_words_nostops = remove_stopwords(data_lemmatized)

print(data_lemmatized[:1])


### Create Dictionary & Corpus

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

In [None]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]


### Building the Topic Model

In [None]:
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=30,
                     random_state=100,
                     update_every=1,
                     chunksize=200,
                     passes=200,
                    #  alpha='auto',
                     per_word_topics=True)


In [None]:
# Print the keywords of the topics (the model has 30 topics; print_topics() shows 20 by default)
pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

### Analyzing Model

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # NOTE(review): gensim documents log_perplexity() as a per-word likelihood bound (perplexity = 2^(-bound)), so a higher value is better here — confirm

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### Visualize Topics

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

## Gensim LDA with Spacy noun-chunks n-Grams

##### Notes
* In this case we are trying to use `Spacy's` noun-chunks to create n-grams.
* We'll need to first create tokens from clean text. 
* We'll then need a function to replace the nouns in (noun chunks) with ngram word. 
* Then `lemmatize` this, 
    * We'll need to test with and without POS to see if our filtering affects ngrams. 

### Creating Noun Chunks

In [None]:
## Lets start by reviewing noun chunks created by spacy
docs = [create_pos_tag(" ".join(x)) for x in data_words]

### Creating Noun Chunks Ngrams

In [None]:
n_grams = create_noun_chunk_ngrams(docs)
n_grams

In [None]:
data_lemmatized = lemmatization_without_pos(n_grams)

In [None]:
data_lemmatized

##### Notes
* So since we are doing `lemmatization` without any `POS` some parts of speech words are present in our lemmatized data. 
* Words like `doesn't`, `believe`, `think` etc are present. I think we should update the function to ignore certain `POS` rather than just include all words. 
* Lets do a quick check on how these `POS` are distributed. 

In [None]:
n_gram_pos = [create_pos_tag(" ".join(x)) for x in data_lemmatized]
n_gram_pos_distribution, is_alpha, is_stop = create_pos_tags_distribution(n_gram_pos)

In [None]:
token_distribution_df = pd.DataFrame.from_dict(n_gram_pos_distribution, orient='index', columns=['count']).reset_index().rename(columns={"index": "tags"})
## lets create a distribution of POS tags
# `title` (not `text`) is the px.histogram argument that sets the figure title
fig = px.histogram(token_distribution_df, x="tags", y="count", title="Distribution POS Tags in text")
fig.show()

##### Notes
* Lets see how our `ngrams` are tagged

In [None]:
for token in n_gram_pos:
    for token in token:
        ## Only print the noun chunks
        if("_" in token.text):            
            print(token.text, token.pos_, token.dep_)

##### Notes
* Looks like most of the noun chunks are tagged as `NOUN`, `PROPN` or `ADJ`, but lets confirm it using visualization. 


In [None]:
noun_chunks_tags = {}
for token in n_gram_pos:
    for token in token:
        ## Only print the noun chunks
        if("_" in token.text):
            ## increment the count of the noun chunk
            noun_chunks_tags[token.pos_] = noun_chunks_tags.get(token.pos_, 0) + 1

noun_chunks_tags

##### Notes
* So, as assumed, most of them are `NOUN` or `PROPN`, but just to make sure we don't lose any chunks, let's modify our lemmatization script to handle this case. 

In [None]:
data_lemmatized = lemmatization_noun_chunks(n_grams)

In [None]:
data_lemmatized

##### Notes
* So the lemmatized data looks promising lets train the model and see. 

### Create Dictionary & Corpus

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

In [None]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]


### Building the Topic Model

In [None]:
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=30,
                     random_state=100,
                     update_every=1,
                     chunksize=200,
                     passes=200,
                    #  alpha='auto',
                     per_word_topics=True)


In [None]:
# Print the keywords of the topics (the model has 30 topics; print_topics() shows 20 by default)
pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

### Analyzing Model

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # NOTE(review): gensim documents log_perplexity() as a per-word likelihood bound (perplexity = 2^(-bound)), so a higher value is better here — confirm

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

##### Notes
* So our model didn't perform well, in fact regular model BOW did better than noun chunks. 

### Visualize Topics

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

## Sklearn LDA with Count Vectorization

##### Notes
* For Count Vectorization we'll use sklearn's LDA algorithm. The algorithm is the same as Gensim's, but the interface is different and it lets us use `CountVectorizer`. 

### Lemmatization

In [None]:
# Do lemmatization keeping only proper nouns and nouns (the default `allowed_postags`)
data_lemmatized = lemmatization(data_words)


print(data_lemmatized[:1])

### Count Vectorizer

In [None]:
# Bag-of-words vectorizer over the lemmatized documents.
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,                   # convert all text to lowercase before tokenizing
    stop_words='english',             # remove scikit-learn's built-in English stop words
    token_pattern='[a-zA-Z0-9]{3,}',  # a token is a run of 3 or more alphanumeric characters
    min_df=10,                        # keep only terms that appear in at least 10 documents
    # max_features=50000,             # optional cap on the vocabulary size (disabled)
)

# Each document is a list of lemmas; re-join them into one string per document.
data_vectorized = vectorizer.fit_transform([" ".join(lemmas) for lemmas in data_lemmatized])

In [None]:
# "Sparsicity" = percentage of non-zero cells in the document-term matrix.
# Counted directly on the sparse matrix: converting it with .todense() would allocate
# the full (documents x vocabulary) array only to count its non-zero entries.
n_docs, n_terms = data_vectorized.shape
print("Sparsicity: ", (data_vectorized.count_nonzero() / (n_docs * n_terms)) * 100, "%")


### Building Topic Model

In [None]:
# Fit a 35-topic scikit-learn LDA model on the count-vectorized documents.
lda_model = LatentDirichletAllocation(
    n_components=35,           # number of topics
    learning_method='online',  # mini-batch (online) variational Bayes
    batch_size=128,            # documents used in each mini-batch update
    learning_decay=0.5,        # learning-rate decay for online updates
                               # NOTE(review): sklearn docs recommend (0.5, 1.0] for guaranteed
                               # convergence; 0.5 sits on the excluded boundary - confirm intended
    max_iter=10,               # maximum number of passes over the training data
    evaluate_every=-1,         # <= 0: do not evaluate perplexity during fitting
    random_state=100,          # fixed seed for reproducibility
    n_jobs=-1,                 # use all available CPUs
)
# fit_transform returns the document-topic distribution of the training documents
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # estimator repr

### Analyzing Model

In [None]:
analyze_sklearn_lda_model(lda_model, data_vectorized)
# See model parameters
pprint(lda_model.get_params())

## 
# Log Likelihood:  -237441.44543701067
# Perplexity:  1308.0281367579253

In [None]:
dominant_topics = print_sklearn_dominant_topics(lda_model, data_vectorized)
dominant_topics

In [None]:
topic_distribution = print_sklearn_topic_distribution(lda_model, data_vectorized)
topic_distribution

### Visualizing Topics

In [None]:
panel = visualize_sklearn_lda_model(lda_model, data_vectorized, vectorizer)
panel

In [None]:
## vectorizer, lda_model, n_words=20
df_topic_keywords = show_sklearn_topics(vectorizer, lda_model)
formatted_topics = format_sklearn_topics(df_topic_keywords)
formatted_topics

##### Notes
* I couldn't find a way to compute `coherence` for this model, but from the visualization it seems that the model has created topics which are easy to interpret and do not overlap.
* I still see some topics that don't make sense but, we can tweak this further by creating n-grams, including more POS and skipping lemmatization. 

### GridSearch for Params

### Write to Pickle File

In [None]:
import os

# Persist the fitted vocabulary and the LDA model so they can be reloaded later
# (see `load_pickle_files` in the "Random Testing" section).
# NOTE: only the feature names are saved, not the fitted vectorizer itself.
features = vectorizer.get_feature_names_out()

path = "../pickles/sklearn_count_vectorization/"
os.makedirs(path, exist_ok=True)  # open(..., 'wb') does not create missing directories

# os.path.join avoids the doubled separator that `path + '/features_v1'` produced
with open(os.path.join(path, 'features_v1'), 'wb') as features_file:
    pickle.dump(features, features_file)

with open(os.path.join(path, 'model_v1'), 'wb') as model_file:
    pickle.dump(lda_model, model_file)

## Sklearn LDA with Bi-Grams Count Vectorization

### Lemmatization with Bigrams

In [None]:
## lemmatization with bigrams
data_words_bigrams = make_bigrams(data_words, bigram_mod)


data_lemmatized = lemmatization(data_words_bigrams)

# Remove Stop Words
## we are removing stop words in the lemmatization function using spacy is_stop flag
# data_words_nostops = remove_stopwords(data_lemmatized)

print(data_lemmatized[:1])

In [None]:
# Show only the first few documents; displaying the whole corpus floods the notebook output
data_lemmatized[:5]

### Count Vectorizer

In [None]:
# Bag-of-words vectorizer over the bigram-augmented, lemmatized documents.
# NOTE(review): the token pattern contains no '_' so any token with an underscore is split
# at the underscore. If `make_bigrams` joins phrases with '_' (gensim's default delimiter),
# the bigrams are broken back into unigrams here - TODO confirm.
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,                   # convert all text to lowercase before tokenizing
    stop_words='english',             # remove scikit-learn's built-in English stop words
    token_pattern='[a-zA-Z0-9]{3,}',  # a token is a run of 3 or more alphanumeric characters
    min_df=10,                        # keep only terms that appear in at least 10 documents
    # max_features=50000,             # optional cap on the vocabulary size (disabled)
)

# Each document is a list of lemmas; re-join them into one string per document.
data_vectorized = vectorizer.fit_transform([" ".join(lemmas) for lemmas in data_lemmatized])

In [None]:
# "Sparsicity" = percentage of non-zero cells in the document-term matrix.
# Counted directly on the sparse matrix: converting it with .todense() would allocate
# the full (documents x vocabulary) array only to count its non-zero entries.
n_docs, n_terms = data_vectorized.shape
print("Sparsicity: ", (data_vectorized.count_nonzero() / (n_docs * n_terms)) * 100, "%")


### Building Topic Model

In [None]:
# Fit a 33-topic scikit-learn LDA model on the bigram count matrix.
lda_model = LatentDirichletAllocation(
    n_components=33,           # number of topics
    learning_method='online',  # mini-batch (online) variational Bayes
    batch_size=128,            # documents used in each mini-batch update
    max_iter=10,               # maximum number of passes over the training data
    evaluate_every=-1,         # <= 0: do not evaluate perplexity during fitting
    random_state=100,          # fixed seed for reproducibility
    n_jobs=-1,                 # use all available CPUs
)
# fit_transform returns the document-topic distribution of the training documents
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # estimator repr

### Analyzing Model

In [None]:
analyze_sklearn_lda_model(lda_model, data_vectorized)

# See model parameters
pprint(lda_model.get_params())

### Visualizing Topics

In [None]:
dominant_topics = print_sklearn_dominant_topics(lda_model, data_vectorized)
dominant_topics

In [None]:
topic_distribution = print_sklearn_topic_distribution(lda_model, data_vectorized)
topic_distribution

In [None]:
panel = visualize_sklearn_lda_model(lda_model, data_vectorized, vectorizer)
panel

In [None]:
## vectorizer, lda_model, n_words=20
df_topic_keywords = show_sklearn_topics(vectorizer, lda_model)
formatted_topics = format_sklearn_topics(df_topic_keywords)
formatted_topics

### Write to Pickle File

In [None]:
import os

# Persist the fitted vocabulary and the LDA model so they can be reloaded later
# (see `load_pickle_files` in the "Random Testing" section).
# NOTE: only the feature names are saved, not the fitted vectorizer itself.
features = vectorizer.get_feature_names_out()

path = "../pickles/sklearn_count_vectorization_bigrams/"
os.makedirs(path, exist_ok=True)  # open(..., 'wb') does not create missing directories

# os.path.join avoids the doubled separator that `path + '/features_v1'` produced
with open(os.path.join(path, 'features_v1'), 'wb') as features_file:
    pickle.dump(features, features_file)

with open(os.path.join(path, 'model_v1'), 'wb') as model_file:
    pickle.dump(lda_model, model_file)

## SKLearn LDA with noun-chunks n-grams

### Creating Noun Chunks

In [None]:
## Lets start by reviewing noun chunks created by spacy
docs = [create_pos_tag(" ".join(x)) for x in data_words]

### Creating Noun Chunks N-grams

In [None]:
n_grams = create_noun_chunk_ngrams(docs)

In [None]:
## Lemmatize the noun-chunk n-grams produced by `create_noun_chunk_ngrams` in the previous cell.
## (The gensim bigram/trigram helpers are not used in this section.)
# data_words_bigrams = make_bigrams(data_words, bigram_mod)

# data_words_trigrams = make_trigrams(data_words, bigram_mod, trigram_mod)

# Legacy spaCy setup kept for reference only (not executed here):
# python3 -m spacy download en
# nlp = spacy.load('en', disable=['parser', 'ner'])

# Alternative kept for reference: the noun-chunk-aware helper, which per the original
# note is meant to preserve the n-gram words. It is commented out; the generic
# `lemmatization` helper is what actually runs.
# data_lemmatized = lemmatization_noun_chunks(n_grams)
data_lemmatized = lemmatization(n_grams)

# No separate stop-word pass: per the original author's note, the lemmatization
# function already removes stop words using spaCy's is_stop flag.
# data_words_nostops = remove_stopwords(data_lemmatized)

# Peek at the first lemmatized document
print(data_lemmatized[:1])

In [None]:
# Show only the first few documents; displaying the whole corpus floods the notebook output
data_lemmatized[:5]

### Count Vectorizer

In [None]:
# Bag-of-words vectorizer over the lemmatized noun-chunk n-gram documents.
# NOTE(review): the token pattern contains no '_' so any token with an underscore is split
# at the underscore. If `create_noun_chunk_ngrams` joins words with '_', the n-grams are
# broken back into single words here - TODO confirm.
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,                   # convert all text to lowercase before tokenizing
    stop_words='english',             # remove scikit-learn's built-in English stop words
    token_pattern='[a-zA-Z0-9]{3,}',  # a token is a run of 3 or more alphanumeric characters
    min_df=10,                        # keep only terms that appear in at least 10 documents
    # max_features=50000,             # optional cap on the vocabulary size (disabled)
)

# Each document is a list of lemmas; re-join them into one string per document.
data_vectorized = vectorizer.fit_transform([" ".join(lemmas) for lemmas in data_lemmatized])

In [None]:
# "Sparsicity" = percentage of non-zero cells in the document-term matrix.
# Counted directly on the sparse matrix: converting it with .todense() would allocate
# the full (documents x vocabulary) array only to count its non-zero entries.
n_docs, n_terms = data_vectorized.shape
print("Sparsicity: ", (data_vectorized.count_nonzero() / (n_docs * n_terms)) * 100, "%")


### Building Topic Model

In [None]:
# Fit a 33-topic scikit-learn LDA model on the noun-chunk n-gram count matrix.
lda_model = LatentDirichletAllocation(
    n_components=33,           # number of topics
    learning_method='online',  # mini-batch (online) variational Bayes
    batch_size=128,            # documents used in each mini-batch update
    max_iter=10,               # maximum number of passes over the training data
    evaluate_every=-1,         # <= 0: do not evaluate perplexity during fitting
    random_state=100,          # fixed seed for reproducibility
    n_jobs=-1,                 # use all available CPUs
)
# fit_transform returns the document-topic distribution of the training documents
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # estimator repr

### Analyzing Model

In [None]:
analyze_sklearn_lda_model(lda_model, data_vectorized)

# See model parameters
pprint(lda_model.get_params())

### Visualizing Topics

In [None]:
dominant_topics = print_sklearn_dominant_topics(lda_model, data_vectorized)
dominant_topics

In [None]:
topic_distribution = print_sklearn_topic_distribution(lda_model, data_vectorized)
topic_distribution

In [None]:
panel = visualize_sklearn_lda_model(lda_model, data_vectorized, vectorizer)
panel

In [None]:
## vectorizer, lda_model, n_words=20
df_topic_keywords = show_sklearn_topics(vectorizer, lda_model)
formatted_topics = format_sklearn_topics(df_topic_keywords)
formatted_topics

### Write to Pickle File

In [None]:
import os

# Persist the fitted vocabulary and the LDA model so they can be reloaded later
# (see `load_pickle_files` in the "Random Testing" section).
# NOTE: only the feature names are saved, not the fitted vectorizer itself.
features = vectorizer.get_feature_names_out()

path = "../pickles/sklearn_count_vectorization_noun_chunks"
os.makedirs(path, exist_ok=True)  # open(..., 'wb') does not create missing directories

with open(os.path.join(path, 'features_v1'), 'wb') as features_file:
    pickle.dump(features, features_file)

with open(os.path.join(path, 'model_v1'), 'wb') as model_file:
    pickle.dump(lda_model, model_file)

## SKLearn with Tri-Grams

In [None]:
## Build trigram tokens from the tokenized documents, then lemmatize them.
# data_words_bigrams = make_bigrams(data_words, bigram_mod)

data_words_trigrams = make_trigrams(data_words, bigram_mod, trigram_mod)

# Legacy spaCy setup kept for reference only (not executed here):
# python3 -m spacy download en
# nlp = spacy.load('en', disable=['parser', 'ner'])

# Lemmatize with the noun-chunk-aware helper (per the original note: to preserve the n-gram words).
# NOTE(review): the bigram and noun-chunk sections call `lemmatization` instead - confirm
# that using `lemmatization_noun_chunks` only for trigrams is intended.
data_lemmatized = lemmatization_noun_chunks(data_words_trigrams)

# No separate stop-word pass: per the original author's note, the lemmatization
# function already removes stop words using spaCy's is_stop flag.
# data_words_nostops = remove_stopwords(data_lemmatized)

# Peek at the first lemmatized document
print(data_lemmatized[:1])

In [None]:
# Show only the first few documents; displaying the whole corpus floods the notebook output
data_lemmatized[:5]

### Count Vectorizer

In [None]:
# Bag-of-words vectorizer over the trigram-augmented, lemmatized documents.
# NOTE(review): the token pattern contains no '_' so any token with an underscore is split
# at the underscore. If `make_trigrams` joins phrases with '_' (gensim's default delimiter),
# the trigrams are broken back into unigrams here - TODO confirm.
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,                   # convert all text to lowercase before tokenizing
    stop_words='english',             # remove scikit-learn's built-in English stop words
    token_pattern='[a-zA-Z0-9]{3,}',  # a token is a run of 3 or more alphanumeric characters
    min_df=10,                        # keep only terms that appear in at least 10 documents
    # max_features=50000,             # optional cap on the vocabulary size (disabled)
)

# Each document is a list of lemmas; re-join them into one string per document.
data_vectorized = vectorizer.fit_transform([" ".join(lemmas) for lemmas in data_lemmatized])

In [None]:
# "Sparsicity" = percentage of non-zero cells in the document-term matrix.
# Counted directly on the sparse matrix: converting it with .todense() would allocate
# the full (documents x vocabulary) array only to count its non-zero entries.
n_docs, n_terms = data_vectorized.shape
print("Sparsicity: ", (data_vectorized.count_nonzero() / (n_docs * n_terms)) * 100, "%")


### Building Topic Model

In [None]:
# Fit a 33-topic scikit-learn LDA model on the trigram count matrix.
lda_model = LatentDirichletAllocation(
    n_components=33,           # number of topics
    learning_method='online',  # mini-batch (online) variational Bayes
    batch_size=128,            # documents used in each mini-batch update
    max_iter=10,               # maximum number of passes over the training data
    evaluate_every=-1,         # <= 0: do not evaluate perplexity during fitting
    random_state=100,          # fixed seed for reproducibility
    n_jobs=-1,                 # use all available CPUs
)
# fit_transform returns the document-topic distribution of the training documents
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # estimator repr

### Analyzing Model

In [None]:
analyze_sklearn_lda_model(lda_model, data_vectorized)

# See model parameters
pprint(lda_model.get_params())

### Visualizing Topics

In [None]:
dominant_topics = print_sklearn_dominant_topics(lda_model, data_vectorized)
dominant_topics

In [None]:
topic_distribution = print_sklearn_topic_distribution(lda_model, data_vectorized)
topic_distribution

In [None]:
panel = visualize_sklearn_lda_model(lda_model, data_vectorized, vectorizer)
panel

In [None]:
## vectorizer, lda_model, n_words=20
df_topic_keywords = show_sklearn_topics(vectorizer, lda_model)
formatted_topics = format_sklearn_topics(df_topic_keywords)
formatted_topics

### Write to Pickle File

In [None]:
import os

# Persist the fitted vocabulary and the LDA model so they can be reloaded later
# (see `load_pickle_files` in the "Random Testing" section).
# NOTE: only the feature names are saved, not the fitted vectorizer itself.
features = vectorizer.get_feature_names_out()

path = "../pickles/sklearn_count_vectorization_trigrams"
os.makedirs(path, exist_ok=True)  # open(..., 'wb') does not create missing directories

with open(os.path.join(path, 'features_v1'), 'wb') as features_file:
    pickle.dump(features, features_file)

with open(os.path.join(path, 'model_v1'), 'wb') as model_file:
    pickle.dump(lda_model, model_file)

## SKLearn LDA with TF-IDF Vectorization

##### Notes
* Earlier in the LDA we saw that there were a lot of terms with significantly higher frequency than the mean. 
* One theory is that these high frequency words might be biasing the topics, so we are going to use `TF-IDF` vectorization technique to see if we can fix that bias. 

### Lemmatization

In [None]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words)

print(data_lemmatized[:1])

### TF-IDF Vectorization

In [None]:
## helper function to create tfidf matrix
def create_tfidf_matrix(data, max_features=1000, min_df=2, max_df=0.95):
    """Fit a TF-IDF vectorizer on `data` and return the matrix together with the vectorizer.

    Args:
        data: Iterable of documents, one string per document.
        max_features: Maximum vocabulary size passed to `TfidfVectorizer`.
        min_df: Minimum document frequency for a term to be kept (defaults to the
            value that used to be hard-coded here).
        max_df: Maximum document frequency for a term to be kept (defaults to the
            value that used to be hard-coded here).

    Returns:
        Tuple `(tfidf_matrix, tfidf_vectorizer)`: the fitted-and-transformed matrix and
        the fitted vectorizer. English stop words are always removed.
    """
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features, max_df=max_df, min_df=min_df, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(data)
    return tfidf_matrix, tfidf_vectorizer

In [None]:
## create tfidf matrix
data_vectorized, vectorizer = create_tfidf_matrix([" ".join(lem_word) for lem_word in data_lemmatized])

### Building Topic Model

In [None]:

# Fit a 33-topic scikit-learn LDA model on the TF-IDF matrix.
lda_model = LatentDirichletAllocation(
    n_components=33,           # number of topics
    learning_method='online',  # mini-batch (online) variational Bayes
    batch_size=128,            # documents used in each mini-batch update
    max_iter=10,               # maximum number of passes over the training data
    evaluate_every=-1,         # <= 0: do not evaluate perplexity during fitting
    random_state=100,          # fixed seed for reproducibility
    n_jobs=-1,                 # use all available CPUs
)
# fit_transform returns the document-topic distribution of the training documents
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)

### Analyzing Model

In [None]:
analyze_sklearn_lda_model(lda_model, data_vectorized)

# See model parameters
pprint(lda_model.get_params())

### Visualizing Topics

In [None]:
## vectorizer, lda_model, n_words=20
df_topic_keywords = show_sklearn_topics(vectorizer, lda_model)
formatted_topics = format_sklearn_topics(df_topic_keywords)
formatted_topics

In [None]:
panel = visualize_sklearn_lda_model(lda_model, data_vectorized, vectorizer)
panel

### Write to Pickle File

In [None]:
import os

# Persist the fitted vocabulary and the LDA model so they can be reloaded later
# (see `load_pickle_files` in the "Random Testing" section).
# NOTE: only the feature names are saved, not the fitted TF-IDF vectorizer, so the IDF
# weights are not available when the model is reloaded.
features = vectorizer.get_feature_names_out()

path = "../pickles/sklearn_tfidf_vectorization"
os.makedirs(path, exist_ok=True)  # open(..., 'wb') does not create missing directories

with open(os.path.join(path, 'features_v1'), 'wb') as features_file:
    pickle.dump(features, features_file)

with open(os.path.join(path, 'model_v1'), 'wb') as model_file:
    pickle.dump(lda_model, model_file)

## SKLearn LDA with Bigrams TF-IDF Vectorization

### Lemmatization

In [None]:
## lemmatization with bigrams
data_words_bigrams = make_bigrams(data_words, bigram_mod)


data_lemmatized = lemmatization(data_words_bigrams)

# Remove Stop Words
## we are removing stop words in the lemmatization function using spacy is_stop flag
# data_words_nostops = remove_stopwords(data_lemmatized)

print(data_lemmatized[:1])

### TF-IDF Vectorization

In [None]:
## helper function to create tfidf matrix
## NOTE: this redefines `create_tfidf_matrix` from the "SKLearn LDA with TF-IDF Vectorization"
## section, which hard-coded min_df=2. Whichever definition cell ran last is the one in effect.
def create_tfidf_matrix(data, max_features=1000, min_df=1, max_df=0.95):
    """Fit a TF-IDF vectorizer on `data` and return the matrix together with the vectorizer.

    Args:
        data: Iterable of documents, one string per document.
        max_features: Maximum vocabulary size passed to `TfidfVectorizer`.
        min_df: Minimum document frequency for a term to be kept. Defaults to 1, which
            matches the previous behaviour of this cell (no `min_df` was passed).
        max_df: Maximum document frequency for a term to be kept (defaults to the
            value that used to be hard-coded here).

    Returns:
        Tuple `(tfidf_matrix, tfidf_vectorizer)`: the fitted-and-transformed matrix and
        the fitted vectorizer. English stop words are always removed.
    """
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features, max_df=max_df, min_df=min_df, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(data)
    return tfidf_matrix, tfidf_vectorizer

In [None]:
## create tfidf matrix
data_vectorized, vectorizer = create_tfidf_matrix([" ".join(lem_word) for lem_word in data_lemmatized])

### Building Topic Model

In [None]:

# Fit a 33-topic scikit-learn LDA model on the bigram TF-IDF matrix.
lda_model = LatentDirichletAllocation(
    n_components=33,           # number of topics
    learning_method='online',  # mini-batch (online) variational Bayes
    batch_size=200,            # documents used in each mini-batch update
    max_iter=10,               # maximum number of passes over the training data
    evaluate_every=-1,         # <= 0: do not evaluate perplexity during fitting
    random_state=100,          # fixed seed for reproducibility
    n_jobs=-1,                 # use all available CPUs
)
# fit_transform returns the document-topic distribution of the training documents
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)

### Analyzing Model

In [None]:
analyze_sklearn_lda_model(lda_model, data_vectorized)

# See model parameters
pprint(lda_model.get_params())

### Visualizing Topics

In [None]:
## vectorizer, lda_model, n_words=20
df_topic_keywords = show_sklearn_topics(vectorizer, lda_model)
formatted_topics = format_sklearn_topics(df_topic_keywords)
formatted_topics

In [None]:
panel = visualize_sklearn_lda_model(lda_model, data_vectorized, vectorizer)
panel

### Write to Pickle File

In [None]:
import os

# Persist the fitted vocabulary and the LDA model so they can be reloaded later
# (see `load_pickle_files` in the "Random Testing" section).
# NOTE: only the feature names are saved, not the fitted TF-IDF vectorizer, so the IDF
# weights are not available when the model is reloaded.
features = vectorizer.get_feature_names_out()

path = "../pickles/sklearn_tfidf_bigram_vectorization"
os.makedirs(path, exist_ok=True)  # open(..., 'wb') does not create missing directories

with open(os.path.join(path, 'features_v1'), 'wb') as features_file:
    pickle.dump(features, features_file)

with open(os.path.join(path, 'model_v1'), 'wb') as model_file:
    pickle.dump(lda_model, model_file)

## Random Testing

##### Notes
* Let's do a quick test with the models that we just saved to `pickle files`.
* In order to do that we need to take the texts through following steps, 
    * `Tokenize`
    * `Create Noun Chunks`
    * `Create Noun Chunks n-grams`
    * `Lemmatization`
    * `Count Vectorization`
    * `LDA Transform`

### Utility Functions

In [None]:
# helper function to load the model and features
def load_pickle_files(path):
    """Load a pickled LDA model and its feature vocabulary from a directory.

    Args:
        path: Directory that contains the `model_v1` and `features_v1` pickle files.

    Returns:
        Tuple `(lda, features)` holding the two unpickled objects, in that order.
    """
    def _unpickle(filename):
        # NOTE: pickle.load can execute arbitrary code - only load files you trust
        with open(path + filename, 'rb') as handle:
            return pickle.load(handle)

    lda = _unpickle('/model_v1')
    # features_v1 holds the saved feature names (not a fitted vectorizer)
    features = _unpickle('/features_v1')
    return (lda, features)


def create_noun_chunks(data_words):
    """Run `create_pos_tag` on every tokenized document.

    Args:
        data_words: Iterable of documents, each an iterable of token strings.

    Returns:
        List with one `create_pos_tag` result per document; each document's tokens
        are joined with single spaces before being passed in.
    """
    return [create_pos_tag(" ".join(tokens)) for tokens in data_words]

In [None]:
# with open('../pickles/noun_chunks_model_v1', 'rb') as model:
#     lda = pickle.load(model)

# # assuming you pickled the vectorizer
# with open('../pickles/noun_chunks_features_v1', 'rb') as vocab:
#     features = pickle.load(vocab)

In [None]:
def predict_topics(text, model, features):
    """Return the keywords and weights of the topic that best matches `text`.

    The input goes through tokenization, noun-chunk extraction, noun-chunk n-gram
    creation and lemmatization, is count-vectorized against the saved vocabulary,
    and is then scored with the fitted LDA model.

    NOTE(review): the same preprocessing (`lemmatization_noun_chunks` plus a plain
    `CountVectorizer` with default tokenization) is applied whatever `model` is passed.
    Most training cells in this notebook used `lemmatization`, a custom
    `token_pattern`, or `TfidfVectorizer` instead, so inference inputs may not match
    what the model was trained on - confirm before relying on the results.

    Args:
        text: Iterable of raw document strings (the notebook passes a one-element list).
        model: Fitted LDA model exposing `n_components`, `components_` and `transform`.
        features: Vocabulary the model was trained on; its length and order must match
            the columns of `model.components_`.

    Returns:
        List of `(keyword, weight)` tuples for the dominant topic: the entry that
        `show_sklearn_topics` returns for that topic, zipped with the topic's feature
        weights sorted in descending order.
    """
    # tokenize the text
    data_words = list(sent_to_words(text))

    # create noun chunks (runs `create_pos_tag` on each tokenized document)
    docs = create_noun_chunks(data_words)

    # create noun chunks ngrams
    n_grams = create_noun_chunk_ngrams(docs)

    # lemmatization
    data_lemmatized = lemmatization_noun_chunks(n_grams)

    # count vectorization against the fixed, saved vocabulary
    vectorizer = CountVectorizer(vocabulary=features)
    # row labels, one per topic
    topicnames = ["Topic" + str(i) for i in range(model.n_components)]

    data_vectorized = vectorizer.fit_transform([" ".join(lem_word) for lem_word in data_lemmatized])

    ## Create a dataframe with topics as rows and features as columns
    ## each cell represents the weight of the feature in the topic
    df_topic_keywords = pd.DataFrame(model.components_)
    df_topic_keywords.columns = vectorizer.get_feature_names_out()
    df_topic_keywords.index = topicnames  # type: ignore

    topic_keywords = show_sklearn_topics(vectorizer, model)

    ## transform gives us the topic distribution for each document:
    ## one row per document, one column per topic
    topic_probability_scores = np.asarray(model.transform(data_vectorized))

    ## np.argmax on a 2-D array returns an index into the FLATTENED array, which is only a
    ## valid topic number when the maximum lies in the first row. Unravel it so the topic
    ## index stays correct when more than one document is passed in.
    _, topic_idx = np.unravel_index(np.argmax(topic_probability_scores), topic_probability_scores.shape)

    ## weights of the dominant topic, highest first
    topic = sorted(df_topic_keywords.iloc[topic_idx, :].values.tolist(), reverse=True)  # type: ignore
    topic_list = list(topic_keywords)[topic_idx]
    features_with_weights = list(zip(topic_list, topic))
    return features_with_weights

### Test Data

In [None]:
random_text = data.sample(1)["clean_text"].tolist()
random_text

## Random Testing SKLearn Count Vectorization

### Load Model

In [None]:
(lda_count_vectorization, features_count_vectorization) = load_pickle_files("../pickles/sklearn_count_vectorization")

In [None]:

# import time
# for text in random_text:
#     print(text)
#     features_with_weights = predict_topics(text, lda_count_vectorization, features_count_vectorization)
#     print(features_with_weights)
#     print(" ")

features_with_weights = predict_topics(random_text, lda_count_vectorization, features_count_vectorization)
print(random_text)
print(features_with_weights)

## Random Testing Count Vectorization With BiGrams

### Load Model

In [None]:
(lda_with_bigrams, features_with_bigrams) = load_pickle_files("../pickles/sklearn_count_vectorization_bigrams")

In [None]:
features_with_weights = predict_topics(random_text, lda_with_bigrams, features_with_bigrams)
print(random_text)
print(features_with_weights)

## Random Testing Count Vectorization With Tri-Grams

### Load Model

In [None]:
(lda_with_trigrams, features_with_trigrams) = load_pickle_files("../pickles/sklearn_count_vectorization_trigrams")

In [None]:
features_with_weights = predict_topics(random_text, lda_with_trigrams, features_with_trigrams)
print(random_text)
print(features_with_weights)

## Random Testing Count Vectorization With Noun Chunks

### Load Model

In [None]:
(lda_with_noun_chunks, features_with_noun_chunks) = load_pickle_files("../pickles/sklearn_count_vectorization_noun_chunks")

In [None]:
features_with_weights = predict_topics(random_text, lda_with_noun_chunks, features_with_noun_chunks)
print(random_text)
print(features_with_weights)

## Random Testing TF-IDF Vectorizer

In [None]:
(lda_tf_idf, features_tf_idf) = load_pickle_files("../pickles/sklearn_tfidf_vectorization")

In [None]:
features_with_weights = predict_topics(random_text, lda_tf_idf, features_tf_idf)
print(random_text)
print(features_with_weights)

## Random Testing Bigram TF-IDF Vectorizer

In [None]:
(lda_tf_idf, features_tf_idf) = load_pickle_files("../pickles/sklearn_tfidf_bigram_vectorization")

In [None]:
features_with_weights = predict_topics(random_text, lda_tf_idf, features_tf_idf)
print(random_text)
print(features_with_weights)