# Latent Dirichlet Allocation with SkLearn

In [2]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
import pickle
from gensim import corpora, models
from nltk import pos_tag
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim 
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
import tmtoolkit

In [3]:
dtm = pd.read_pickle('dtm.pkl')
corpus = pd.read_pickle('all_text_clean.pkl')

In [4]:
dtm.head()

Unnamed: 0,aaa,aaaaaevgncl,aakiydertvy,aaron,abandonment,abbot,abbott,abc,abdus,abhorrence,...,zink,zombie,zombies,zone,zones,zuckerbergs,zux,zuyubaetw,zwf,Permalink
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1462379490609461
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1462715223909221
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1461150450732365
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1462158127298264
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1462708023909941


In [5]:
corpus.head()

Unnamed: 0,Permalink,full_text,text
1,1462379490609461,I’ve never seen this much unrest in the states...,never seen much unrest states anyone else
2,1462715223909221,ATTENTION BUSINESS OWNERS AND ALL THE MASKED P...,attention business owners masked people severi...
3,1461150450732365,This is whats wrong with attention seeking mil...,whats wrong attention seeking millenials young...
4,1462158127298264,Upset Californian Conservative,upset californian conservative
5,1462708023909941,Live Stream. CREW dragon launch.,live stream crew dragon launch


In [6]:
tokenizer = RegexpTokenizer(r'\w+')

In [7]:
corpus['tokenized_text'] = [tokenizer.tokenize(text) for text in corpus['text']]

In [8]:
corpus.head()

Unnamed: 0,Permalink,full_text,text,tokenized_text
1,1462379490609461,I’ve never seen this much unrest in the states...,never seen much unrest states anyone else,"[never, seen, much, unrest, states, anyone, else]"
2,1462715223909221,ATTENTION BUSINESS OWNERS AND ALL THE MASKED P...,attention business owners masked people severi...,"[attention, business, owners, masked, people, ..."
3,1461150450732365,This is whats wrong with attention seeking mil...,whats wrong attention seeking millenials young...,"[whats, wrong, attention, seeking, millenials,..."
4,1462158127298264,Upset Californian Conservative,upset californian conservative,"[upset, californian, conservative]"
5,1462708023909941,Live Stream. CREW dragon launch.,live stream crew dragon launch,"[live, stream, crew, dragon, launch]"


In [9]:
#picking out nouns only
is_noun = lambda pos: pos[:2] == 'NN'

nouns = []
for i in corpus.index:
    nouns.append([word for (word, pos) in pos_tag(corpus.loc[i,'tokenized_text']) if is_noun(pos)])
corpus['nouns'] = nouns

In [10]:
corpus.head()

Unnamed: 0,Permalink,full_text,text,tokenized_text,nouns
1,1462379490609461,I’ve never seen this much unrest in the states...,never seen much unrest states anyone else,"[never, seen, much, unrest, states, anyone, else]","[states, anyone]"
2,1462715223909221,ATTENTION BUSINESS OWNERS AND ALL THE MASKED P...,attention business owners masked people severi...,"[attention, business, owners, masked, people, ...","[attention, business, owners, people, severity..."
3,1461150450732365,This is whats wrong with attention seeking mil...,whats wrong attention seeking millenials young...,"[whats, wrong, attention, seeking, millenials,...","[whats, attention, millenials, woman, geco, co..."
4,1462158127298264,Upset Californian Conservative,upset californian conservative,"[upset, californian, conservative]",[]
5,1462708023909941,Live Stream. CREW dragon launch.,live stream crew dragon launch,"[live, stream, crew, dragon, launch]","[stream, crew, dragon, launch]"


In [11]:
#nouns and adjectives
is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'

nouns_adj = []
for i in corpus.index:
    nouns_adj.append([word for (word, pos) in pos_tag(corpus.loc[i,'tokenized_text']) if is_noun_adj(pos)])
corpus['nouns_adj'] = nouns_adj

# SkLearn

In [12]:
#thank you to this source:
#https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/

In [13]:
vectorizer = CountVectorizer()
tokenized_vectorized = vectorizer.fit_transform(corpus['text'])

ldamodel = LatentDirichletAllocation(n_components = 5, max_iter = 200, random_state = 42)
ldamodel_output = ldamodel.fit_transform(tokenized_vectorized)

# Log Likelyhood: Higher the better
print("Log Likelihood: ", ldamodel.score(tokenized_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", ldamodel.perplexity(tokenized_vectorized))

#Coherence Score- the higher the better
print('Coherence Score', tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        topic_word_distrib=ldamodel.components_, 
                        dtm=tokenized_vectorized, 
                        vocab=np.array([x for x in vectorizer.vocabulary_.keys()]), 
                        texts=corpus['tokenized_text'].values,
                        return_mean = True))

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic %d: {topic_idx}")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
display_topics(ldamodel, vectorizer.get_feature_names(), 10)



Log Likelihood:  -270680.3620976504
Perplexity:  4257.321437785357
Coherence Score 0.841874789026235
Topic %d: 0
th back circuit like unemployment get needs go work virus
Topic %d: 1
people see lol keep like stop america trump right need
Topic %d: 2
law people see need like hell got masks democrats black
Topic %d: 3
people business normal money new see mask want one like
Topic %d: 4
see people mask go one like vote get would let


## Nouns

In [14]:
corpus['nouns_cv'] = [" ".join(corpus.loc[i,'nouns']) for i in corpus.index]

vectorizer = CountVectorizer()
nouns_vectorized = vectorizer.fit_transform(corpus['nouns_cv'])

ldamodel = LatentDirichletAllocation(n_components = 5, max_iter = 200, random_state = 42)
ldamodel_output = ldamodel.fit_transform(nouns_vectorized)

# Log Likelyhood: Higher the better
print("Log Likelihood: ", ldamodel.score(nouns_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", ldamodel.perplexity(nouns_vectorized))

print('Coherence Score', tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        topic_word_distrib=ldamodel.components_, 
                        dtm=nouns_vectorized, 
                        vocab=np.array([x for x in vectorizer.vocabulary_.keys()]), 
                        texts=corpus['nouns'].values,
                        return_mean = True))

print()

        
display_topics(ldamodel, vectorizer.get_feature_names(), 10)

Log Likelihood:  -131589.7304088064
Perplexity:  3437.2430806186794
Coherence Score 0.8363488576778606

Topic %d: 0
people businesses see government control church business governor state country
Topic %d: 1
mask see business work people lol shit job needs bitch
Topic %d: 2
people law money time vote thing business constitution home god
Topic %d: 3
court th circuit hell news time refuse look unemployment course
Topic %d: 4
democrats state hope way guess masks virus idiot business world


## Nouns and Adjectives

In [15]:
corpus['nouns_adj_cv'] = [" ".join(corpus.loc[i,'nouns_adj']) for i in corpus.index]

vectorizer = CountVectorizer()
nouns_adj_vectorized = vectorizer.fit_transform(corpus['nouns_adj_cv'])

ldamodel = LatentDirichletAllocation(n_components = 5, max_iter = 200, random_state = 42)
ldamodel_output = ldamodel.fit_transform(nouns_adj_vectorized)

# Log Likelyhood: Higher the better
print("Log Likelihood: ", ldamodel.score(nouns_adj_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", ldamodel.perplexity(nouns_adj_vectorized))


print()

        
display_topics(ldamodel, vectorizer.get_feature_names(), 10)

Log Likelihood:  -179641.39753702295
Perplexity:  4021.6231374290937

Topic %d: 0
normal new th circuit court power mask america see constitution
Topic %d: 1
people money work state media sad virus see needs mask
Topic %d: 2
people business time vote businesses mask masks right open way
Topic %d: 3
bad black god people business see white ridiculous government course
Topic %d: 4
people law church see stupid mask evil day michigan power


In [16]:
print('Coherence Score', tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        topic_word_distrib=ldamodel.components_, 
                        dtm=nouns_adj_vectorized, 
                        vocab=np.array([x for x in vectorizer.vocabulary_.keys()]), 
                        texts=corpus['nouns_adj'].values,
                        return_mean = True))

Coherence Score 0.8362035339645146


## Gridsearch for best parameter

Based on the models ran above, the model with the text column has the best coherence score. Thus, this collection of words was used to gridsearch to find the best parameters.

In [17]:
vectorizer = CountVectorizer()
tokenized_vectorized = vectorizer.fit_transform(corpus['text'])
search_params = {'n_components': [5, 7, 9, 11, 13, 15], 'learning_decay': [.5, .7, .9]}
lda = LatentDirichletAllocation(random_state = 42)
model = GridSearchCV(lda, param_grid=search_params, cv = 5)
model.fit(tokenized_vectorized)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='batch',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1, random_state=42,
                                                 topic_word_prior=None,
                                                 total_samples=1000000.0,
                 

In [18]:
print("Best Model's Params: ", model.best_params_)
print("Best Log Likelihood Score: ", model.best_score_)

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 5}
Best Log Likelihood Score:  -74446.58413131862


## Best SkLearn Model

In [19]:
vectorizer = CountVectorizer()
tokenized_vectorized = vectorizer.fit_transform(corpus['text'])

ldamodel = LatentDirichletAllocation(n_components = 5, max_iter = 200, random_state = 42, learning_decay = 0.5)
ldamodel_output = ldamodel.fit_transform(tokenized_vectorized)

# Log Likelyhood: Higher the better
print("Log Likelihood: ", ldamodel.score(tokenized_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", ldamodel.perplexity(tokenized_vectorized))



def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
display_topics(ldamodel, vectorizer.get_feature_names(), 15)

Log Likelihood:  -270680.3620976504
Perplexity:  4257.321437785357
Topic 0
th back circuit like unemployment get needs go work virus refuse say well see crazy
Topic 1
people see lol keep like stop america trump right need going get state enough war
Topic 2
law people see need like hell got masks democrats black get know https god go
Topic 3
people business normal money new see mask want one like think shop get us sick
Topic 4
see people mask go one like vote get would let evil back going well time


In [20]:
print('Coherence Score', tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v',
                        topic_word_distrib=ldamodel.components_, 
                        dtm=tokenized_vectorized, 
                        vocab=np.array([x for x in vectorizer.vocabulary_.keys()]),
                        texts = corpus['tokenized_text'].values))

Coherence Score [0.8461030897422738, 0.8483235134711699, 0.844317435482765, 0.8341064988743219, 0.8365234075606445]


In [21]:
corpus['sklearn_topics'] = np.argmax(ldamodel_output, axis = 1)
corpus.head()

Unnamed: 0,Permalink,full_text,text,tokenized_text,nouns,nouns_adj,nouns_cv,nouns_adj_cv,sklearn_topics
1,1462379490609461,I’ve never seen this much unrest in the states...,never seen much unrest states anyone else,"[never, seen, much, unrest, states, anyone, else]","[states, anyone]","[much, unrest, states, anyone]",states anyone,much unrest states anyone,4
2,1462715223909221,ATTENTION BUSINESS OWNERS AND ALL THE MASKED P...,attention business owners masked people severi...,"[attention, business, owners, masked, people, ...","[attention, business, owners, people, severity...","[attention, business, owners, people, severity...",attention business owners people severity hoax...,attention business owners people severity hoax...,3
3,1461150450732365,This is whats wrong with attention seeking mil...,whats wrong attention seeking millenials young...,"[whats, wrong, attention, seeking, millenials,...","[whats, attention, millenials, woman, geco, co...","[whats, wrong, attention, millenials, young, w...",whats attention millenials woman geco covid ho...,whats wrong attention millenials young woman b...,3
4,1462158127298264,Upset Californian Conservative,upset californian conservative,"[upset, californian, conservative]",[],"[upset, californian, conservative]",,upset californian conservative,2
5,1462708023909941,Live Stream. CREW dragon launch.,live stream crew dragon launch,"[live, stream, crew, dragon, launch]","[stream, crew, dragon, launch]","[live, stream, crew, dragon, launch]",stream crew dragon launch,live stream crew dragon launch,2


In [22]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(ldamodel, tokenized_vectorized, vectorizer)
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [23]:
corpus.to_csv('./sklearn_topics.csv', index = False)