In [1]:
# Loading packages

import tqdm
import pandas as pd
import numpy as np

import nltk
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/franciscorfafonso/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the DataFrame from the pickle file
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source (presumably it is produced by an earlier preprocessing
# step of this project — confirm).
df = pd.read_pickle('data_preprocessed_filtered.pkl')
# Print the column / dtype summary as a quick check of what was loaded
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57790 entries, 0 to 58446
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      57790 non-null  object 
 1   text                          57790 non-null  object 
 2   created_at                    57790 non-null  object 
 3   campaign_week                 57790 non-null  int64  
 4   process_text_check            57790 non-null  object 
 5   name                          57790 non-null  object 
 6   handle                        57790 non-null  object 
 7   party                         57790 non-null  object 
 8   state_code                    57790 non-null  object 
 9   state_name                    57790 non-null  object 
 10  result_pctg                   57790 non-null  float64
 11  result_votes                  57790 non-null  int64  
 12  position                      57790 non-null  int64  
 13  t

In [3]:
# Create dictionary (needed for LDA) from the 'WordsLemmatized' column.
# Assumes each entry of that column is a list of lemmatized tokens — TODO confirm.
id2word = corpora.Dictionary(df['WordsLemmatized'])

In [4]:
# Tokenized documents: the same 'WordsLemmatized' column the dictionary was built from.
# They are converted to bag-of-words below and passed as `texts` to CoherenceModel.
texts = df['WordsLemmatized']

In [5]:
# Create TDM (Frequency): run every document through id2word.doc2bow
corpus = list(map(id2word.doc2bow, texts))

In [6]:
# supporting function
def compute_coherence_values(corpus, dictionary, k):
    """Train an LDA model with ``k`` topics and return its c_v coherence.

    The model is always fitted with ``random_state=100``, ``chunksize=1000``
    and ``passes=50``.

    Parameters
    ----------
    corpus
        Bag-of-words corpus the model is trained on.
    dictionary
        Dictionary used both as ``id2word`` for training and as the
        dictionary of the coherence model.
    k
        Number of topics.

    Returns
    -------
    The value returned by ``CoherenceModel.get_coherence()``.

    Notes
    -----
    The coherence model reads the notebook-level ``texts`` variable, so that
    variable must be defined before this function is called.
    """
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=1000,
                                           passes=50)

    # Score with the dictionary the model was trained with (the argument),
    # not with whatever the global `id2word` happens to be.
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=texts,
                                         dictionary=dictionary,
                                         coherence='c_v')

    return coherence_model_lda.get_coherence()

## Analysis

***Hyperparameter tuning***

In [7]:
grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 5
max_topics = 20
step_size = 1
# range() excludes its stop value, so with these settings k runs from 5 to 19
topics_range = range(min_topics, max_topics, step_size)

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [corpus]

corpus_title = ['100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Coherence': []
                 }

# Can take a long time to run: one LDA fit per (validation set, number of topics).
# The context manager closes the progress bar even if one of the fits raises.
with tqdm.tqdm(total=(len(topics_range)*len(corpus_title))) as pbar:
    # iterate through validation corpuses
    for title, corpus_set in zip(corpus_title, corpus_sets):
        # iterate through number of topics
        for k in topics_range:
            # get the coherence score for the given parameters
            cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word, k=k)
            print(f"Coherence for {k} topics: {cv}")  # Print coherence score for each topic
            # Save the model results
            model_results['Validation_Set'].append(title)
            model_results['Topics'].append(k)
            model_results['Coherence'].append(cv)

            pbar.update(1)

# Written to its own file: the alpha/beta grid search further down writes
# 'lda_tuning_results.csv', which used to overwrite this baseline sweep.
pd.DataFrame(model_results).to_csv('lda_baseline_tuning_results.csv', index=False)

  7%|▋         | 1/15 [02:36<36:29, 156.38s/it]

Coherence for 5 topics: 0.48910919287842136


 13%|█▎        | 2/15 [05:14<34:06, 157.40s/it]

Coherence for 6 topics: 0.48727876395053094


 20%|██        | 3/15 [07:47<31:07, 155.60s/it]

Coherence for 7 topics: 0.4907975867619279


 27%|██▋       | 4/15 [10:22<28:27, 155.19s/it]

Coherence for 8 topics: 0.4662316786202284


 33%|███▎      | 5/15 [13:05<26:20, 158.06s/it]

Coherence for 9 topics: 0.5201712965889764


 40%|████      | 6/15 [15:38<23:25, 156.20s/it]

Coherence for 10 topics: 0.5187084053150935


 47%|████▋     | 7/15 [18:21<21:06, 158.35s/it]

Coherence for 11 topics: 0.5214625870331038


 53%|█████▎    | 8/15 [20:45<17:57, 153.92s/it]

Coherence for 12 topics: 0.5195117637937859


 60%|██████    | 9/15 [23:11<15:08, 151.40s/it]

Coherence for 13 topics: 0.5411883052044304


 67%|██████▋   | 10/15 [25:42<12:36, 151.36s/it]

Coherence for 14 topics: 0.493154486713588


 73%|███████▎  | 11/15 [28:23<10:17, 154.41s/it]

Coherence for 15 topics: 0.4958139090313243


 80%|████████  | 12/15 [30:46<07:32, 150.78s/it]

Coherence for 16 topics: 0.5248908443948886


 87%|████████▋ | 13/15 [33:11<04:57, 148.96s/it]

Coherence for 17 topics: 0.5073611053634252


 93%|█████████▎| 14/15 [35:40<02:29, 149.06s/it]

Coherence for 18 topics: 0.5093934270809797


100%|██████████| 15/15 [38:08<00:00, 152.55s/it]

Coherence for 19 topics: 0.5206665548658689





In [8]:
# supporting function
# NOTE: this replaces the three-argument compute_coherence_values defined in an
# earlier cell. `a` and `b` default to gensim's own defaults for `alpha` and
# `eta`, so calls written for the earlier signature keep working.
def compute_coherence_values(corpus, dictionary, k, a='symmetric', b=None):
    """Train an LDA model for one hyperparameter combination and return its c_v coherence.

    The model is always fitted with ``random_state=100``, ``chunksize=1000``
    and ``passes=50``.

    Parameters
    ----------
    corpus
        Bag-of-words corpus the model is trained on.
    dictionary
        Dictionary used both as ``id2word`` for training and as the
        dictionary of the coherence model.
    k
        Number of topics.
    a
        Passed to ``LdaMulticore`` as ``alpha``.
    b
        Passed to ``LdaMulticore`` as ``eta``.

    Returns
    -------
    The value returned by ``CoherenceModel.get_coherence()``.

    Notes
    -----
    The coherence model reads the notebook-level ``texts`` variable, so that
    variable must be defined before this function is called.
    """
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=1000,
                                           passes=50,
                                           alpha=a,
                                           eta=b)

    # Score with the dictionary the model was trained with (the argument),
    # not with whatever the global `id2word` happens to be.
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=texts,
                                         dictionary=dictionary,
                                         coherence='c_v')

    return coherence_model_lda.get_coherence()

grid = {}
grid['Validation_Set'] = {}

# Grid search over the numbers of topics with the best baseline coherence values
topics_range = [13,9]

# Alpha parameter: numeric values from np.arange plus the strings
# 'symmetric' and 'asymmetric'
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter: numeric values from np.arange plus the string 'symmetric'
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [corpus]

corpus_title = ['100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                 }

# Can take a long time to run: one LDA fit per
# (validation set, number of topics, alpha, beta) combination.
# The context manager closes the progress bar even if one of the fits raises.
with tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title))) as pbar:
    # iterate through validation corpuses
    for title, corpus_set in zip(corpus_title, corpus_sets):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterate through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word,
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(title)
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)

                    pbar.update(1)

# Persist the complete grid once every combination has been scored
pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

100%|██████████| 60/60 [2:36:11<00:00, 156.20s/it]  


In [9]:
# Build the final LDA model with the configuration chosen after the grid search:
# 13 topics and an asymmetric alpha.
# NOTE(review): the original note on this cell reported beta=0.31 as the best
# value, yet eta below is 0.01 — confirm against the grid-search results which
# of the two is intended.
num_topics = 13
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=100,
    chunksize=1000,
    passes=50,
    alpha='asymmetric',
    eta=0.01,
)

print(lda_model.print_topics())

[(0, '0.022*"thank" + 0.017*"support" + 0.016*"u" + 0.015*"day" + 0.015*"state" + 0.013*"work" + 0.012*"senate" + 0.011*"people" + 0.010*"fight" + 0.010*"senator"'), (1, '0.014*"today" + 0.014*"one" + 0.011*"year" + 0.011*"name" + 0.010*"please" + 0.009*"help" + 0.009*"going" + 0.009*"lee" + 0.008*"paul" + 0.007*"mike"'), (2, '0.021*"people" + 0.014*"like" + 0.012*"would" + 0.012*"know" + 0.011*"say" + 0.011*"want" + 0.010*"think" + 0.008*"party" + 0.008*"one" + 0.007*"said"'), (3, '0.065*"right" + 0.036*"woman" + 0.023*"abortion" + 0.017*"freedom" + 0.015*"protect" + 0.013*"choice" + 0.012*"life" + 0.011*"decision" + 0.011*"court" + 0.011*"ban"'), (4, '0.051*"biden" + 0.033*"border" + 0.024*"joe" + 0.020*"policy" + 0.016*"crime" + 0.015*"crisis" + 0.013*"agenda" + 0.012*"ron" + 0.012*"democrat" + 0.010*"country"'), (5, '0.024*"energy" + 0.022*"american" + 0.020*"war" + 0.019*"oil" + 0.018*"big" + 0.015*"trump" + 0.015*"america" + 0.015*"president" + 0.012*"ukraine" + 0.012*"amp"'), (6

In [10]:
# Score the final model: c_v coherence over the tokenized documents
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=texts,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.5625463111357373


In [11]:
# Show the topics of the final model again (same call as in the cell that built it)
print(lda_model.print_topics())
# Index the model with the bag-of-words corpus and keep the result as doc_lda
doc_lda = lda_model[corpus]

[(0, '0.022*"thank" + 0.017*"support" + 0.016*"u" + 0.015*"day" + 0.015*"state" + 0.013*"work" + 0.012*"senate" + 0.011*"people" + 0.010*"fight" + 0.010*"senator"'), (1, '0.014*"today" + 0.014*"one" + 0.011*"year" + 0.011*"name" + 0.010*"please" + 0.009*"help" + 0.009*"going" + 0.009*"lee" + 0.008*"paul" + 0.007*"mike"'), (2, '0.021*"people" + 0.014*"like" + 0.012*"would" + 0.012*"know" + 0.011*"say" + 0.011*"want" + 0.010*"think" + 0.008*"party" + 0.008*"one" + 0.007*"said"'), (3, '0.065*"right" + 0.036*"woman" + 0.023*"abortion" + 0.017*"freedom" + 0.015*"protect" + 0.013*"choice" + 0.012*"life" + 0.011*"decision" + 0.011*"court" + 0.011*"ban"'), (4, '0.051*"biden" + 0.033*"border" + 0.024*"joe" + 0.020*"policy" + 0.016*"crime" + 0.015*"crisis" + 0.013*"agenda" + 0.012*"ron" + 0.012*"democrat" + 0.010*"country"'), (5, '0.024*"energy" + 0.022*"american" + 0.020*"war" + 0.019*"oil" + 0.018*"big" + 0.015*"trump" + 0.015*"america" + 0.015*"president" + 0.012*"ukraine" + 0.012*"amp"'), (6