In [2]:
# Loading packages

import tqdm
import pandas as pd
import numpy as np

import nltk
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/franciscorfafonso/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load the DataFrame from the pickle file
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a
# trusted source (presumably an earlier preprocessing notebook — confirm).
df = pd.read_pickle('data_preprocessed_filtered.pkl')
# Print column names, non-null counts and dtypes as a quick sanity check of the load
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57790 entries, 0 to 58446
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      57790 non-null  object 
 1   text                          57790 non-null  object 
 2   created_at                    57790 non-null  object 
 3   campaign_week                 57790 non-null  int64  
 4   process_text_check            57790 non-null  object 
 5   name                          57790 non-null  object 
 6   handle                        57790 non-null  object 
 7   party                         57790 non-null  object 
 8   state_code                    57790 non-null  object 
 9   state_name                    57790 non-null  object 
 10  result_pctg                   57790 non-null  float64
 11  result_votes                  57790 non-null  int64  
 12  position                      57790 non-null  int64  
 13  t

In [4]:
# Build the gensim Dictionary (token <-> integer id mapping) that LDA requires,
# from the lemmatised tokens of every document
id2word = corpora.Dictionary(documents=df['WordsLemmatized'])

In [5]:
# Tokenised documents: the lemmatised-words column, one entry per row of df
texts = df.loc[:, 'WordsLemmatized']

In [6]:
# Term-document matrix (frequencies): convert each document to its bag-of-words form
corpus = list(map(id2word.doc2bow, texts))

## Analysis

In [7]:

# Final LDA model. The grid search selected 9 topics with alpha='asymmetric'
# and beta (gensim's `eta`)='symmetric', so those values are used here.
num_topics = 9
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    # priors chosen by the grid search
    alpha='asymmetric',
    eta='symmetric',
    # training schedule
    passes=50,
    chunksize=1000,
    # fixed seed so the fit is repeatable
    random_state=100,
)

# Show the top terms of every topic
print(lda_model.print_topics())
# Apply the trained model to the corpus to get per-document topic distributions
doc_lda = lda_model[corpus]

[(0, '0.016*"thank" + 0.013*"day" + 0.013*"great" + 0.011*"u" + 0.011*"state" + 0.011*"support" + 0.010*"today" + 0.008*"amp" + 0.007*"work" + 0.007*"time"'), (1, '0.012*"cost" + 0.012*"$" + 0.011*"family" + 0.011*"need" + 0.010*"job" + 0.010*"care" + 0.009*"amp" + 0.009*"working" + 0.008*"make" + 0.007*"act"'), (2, '0.015*"people" + 0.011*"like" + 0.010*"know" + 0.009*"would" + 0.009*"one" + 0.008*"think" + 0.008*"say" + 0.008*"party" + 0.008*"want" + 0.006*"law"'), (3, '0.049*"right" + 0.026*"woman" + 0.017*"abortion" + 0.015*"protect" + 0.013*"freedom" + 0.010*"life" + 0.009*"law" + 0.009*"gun" + 0.009*"senate" + 0.009*"fight"'), (4, '0.027*"border" + 0.015*"war" + 0.012*"community" + 0.012*"must" + 0.012*"crisis" + 0.011*"country" + 0.010*"amp" + 0.010*"america" + 0.009*"crime" + 0.009*"secure"'), (5, '0.020*"big" + 0.017*"oil" + 0.014*"trump" + 0.012*"interest" + 0.012*"ron" + 0.011*"social" + 0.011*"security" + 0.010*"people" + 0.010*"hold" + 0.009*"johnson"'), (6, '0.071*"vote" 

In [8]:
# Evaluate the fitted model with the c_v topic-coherence measure
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=texts,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.5578812882140993


In [9]:
# Prepare the pyLDAvis data for the fitted model and render it inline
vis = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
pyLDAvis.display(vis)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [10]:
# Save the prepared visualisation as an HTML file
pyLDAvis.save_html(data=vis, fileobj='lda.html')

In [11]:
# Print the top terms of each topic again, directly above the cell that names the topics
topic_terms = lda_model.print_topics()
print(topic_terms)
# Per-document topic distributions from the trained model
doc_lda = lda_model[corpus]

[(0, '0.016*"thank" + 0.013*"day" + 0.013*"great" + 0.011*"u" + 0.011*"state" + 0.011*"support" + 0.010*"today" + 0.008*"amp" + 0.007*"work" + 0.007*"time"'), (1, '0.012*"cost" + 0.012*"$" + 0.011*"family" + 0.011*"need" + 0.010*"job" + 0.010*"care" + 0.009*"amp" + 0.009*"working" + 0.008*"make" + 0.007*"act"'), (2, '0.015*"people" + 0.011*"like" + 0.010*"know" + 0.009*"would" + 0.009*"one" + 0.008*"think" + 0.008*"say" + 0.008*"party" + 0.008*"want" + 0.006*"law"'), (3, '0.049*"right" + 0.026*"woman" + 0.017*"abortion" + 0.015*"protect" + 0.013*"freedom" + 0.010*"life" + 0.009*"law" + 0.009*"gun" + 0.009*"senate" + 0.009*"fight"'), (4, '0.027*"border" + 0.015*"war" + 0.012*"community" + 0.012*"must" + 0.012*"crisis" + 0.011*"country" + 0.010*"amp" + 0.010*"america" + 0.009*"crime" + 0.009*"secure"'), (5, '0.020*"big" + 0.017*"oil" + 0.014*"trump" + 0.012*"interest" + 0.012*"ron" + 0.011*"social" + 0.011*"security" + 0.010*"people" + 0.010*"hold" + 0.009*"johnson"'), (6, '0.071*"vote" 

In [12]:
# Human-readable label for each LDA topic id (0-8).
# Used to rename the per-document topic-probability columns.
# NOTE(review): labels were presumably assigned by hand from the top terms of the
# 9-topic model printed above; they must be revisited if the model is retrained.
topic_names = {
    0: 'Campaign Event',
    1: 'Social Protection',
    2: 'General Discussion',
    3: 'Abortion',
    4: 'Homeland Security',
    5: 'Candidate Denigration',
    6: 'Vote Instigation',
    7: 'Campaign Contribution',
    8: 'Economy Policies',
}

In [13]:
# Get the topic distribution for each document in the corpus
document_topics = [lda_model.get_document_topics(doc, minimum_probability=0.0) for doc in corpus]

# Collect the probabilities in a dense (documents x topics) matrix.
# Filling a pre-allocated array is far cheaper than growing a DataFrame one cell
# at a time, and any topic gensim omits for a document simply keeps its initial 0.
n_topics = lda_model.num_topics
topic_matrix = np.zeros((len(document_topics), n_topics))
for doc_num, doc_topics in enumerate(document_topics):
    for topic_num, prob in doc_topics:
        topic_matrix[doc_num, topic_num] = prob

# Build the DataFrame in a single call, using the readable topic name where one
# is defined and falling back to 'Topic_<id>' otherwise
topics_df = pd.DataFrame(
    topic_matrix,
    columns=[topic_names.get(topic_id, f'Topic_{topic_id}') for topic_id in range(n_topics)],
)

# Join the original DataFrame with the topics DataFrame.
# The index is reset so both frames share a 0..n-1 index and concat aligns rows
# positionally; the assertion guards against a corpus built from a different frame.
df = df.reset_index(drop=True)
assert len(df) == len(topics_df), "corpus and df must contain the same number of documents"
df2 = pd.concat([df, topics_df], axis=1)

df2.head(10)

Unnamed: 0,tweet_id,text,created_at,campaign_week,process_text_check,name,handle,party,state_code,state_name,...,WordsLemmatized,Campaign Event,Social Protection,General Discussion,Abortion,Homeland Security,Candidate Denigration,Vote Instigation,Campaign Contribution,Economy Policies
0,1589769441016877056,Deeply honored to be standing in the historic ...,2022-11-07,36,deeply honored to be standing in the historic ...,Don Bolduc,GenDonBolduc,Rep,NH,New Hampshire,...,"[deeply, honored, standing, historic, town, ha...",0.742276,0.007862,0.006269,0.005225,0.004478,0.003918,0.223988,0.003134,0.002849
1,1589758870242500608,Full house tonight! Looking forward to having ...,2022-11-07,36,full house tonight looking forward to having w...,Don Bolduc,GenDonBolduc,Rep,NH,New Hampshire,...,"[full, house, tonight, looking, forward, grani...",0.628943,0.009704,0.007751,0.006454,0.005532,0.09112,0.243105,0.003872,0.00352
2,1589753955592331264,Everyone’s here for the final town hall in Exe...,2022-11-07,36,everyone s here for the final town hall in exe...,Don Bolduc,GenDonBolduc,Rep,NH,New Hampshire,...,"[everyone, final, town, hall, election, many, ...",0.761384,0.011777,0.097995,0.007836,0.006716,0.005877,0.09944,0.004701,0.004274
3,1589723041205886976,Career politicians like @SenatorHassan will sa...,2022-11-07,36,career politicians like will say anything to g...,Don Bolduc,GenDonBolduc,Rep,NH,New Hampshire,...,"[career, politician, like, say, anything, get,...",0.008183,0.082605,0.107793,0.331403,0.067777,0.395138,0.002612,0.002351,0.002137
4,1589684478540406784,@sixohfree Thank you!!! 🇺🇸🇺🇸,2022-11-07,36,thank you,Don Bolduc,GenDonBolduc,Rep,NH,New Hampshire,...,[thank],0.609655,0.082244,0.065795,0.054829,0.046997,0.041122,0.036553,0.032898,0.029907
5,1589682964933197824,LET'S GO! https://t.co/dpCB6RFCZC,2022-11-07,36,let's go,Don Bolduc,GenDonBolduc,Rep,NH,New Hampshire,...,"[let's, go]",0.07331,0.054942,0.043892,0.036556,0.031332,0.027415,0.690684,0.021932,0.019938
6,1589682136436465664,🚨🚨🚨Join @GovChrisSununu in supporting our Repu...,2022-11-07,36,join in supporting our republican team in new ...,Don Bolduc,GenDonBolduc,Rep,NH,New Hampshire,...,"[join, supporting, republican, team, #votebold...",0.665969,0.011003,0.008806,0.007316,0.288172,0.005484,0.004875,0.004387,0.003988
7,1589673963508363265,🚨—ENDORSEMENT ALERT: Thank you to the @LogCabi...,2022-11-07,36,endorsement alert thank you to the for your su...,Don Bolduc,GenDonBolduc,Rep,NH,New Hampshire,...,"[endorsement, alert, thank, support, campaign,...",0.681041,0.007498,0.005988,0.288426,0.004274,0.003739,0.003324,0.002991,0.002719
8,1589668789863342080,.@newsmax @Tom_Basile covering our Seabrook me...,2022-11-07,36,covering our seabrook meet and greet on the ev...,Don Bolduc,GenDonBolduc,Rep,NH,New Hampshire,...,"[covering, meet, greet, eve, election, day, da...",0.402375,0.010297,0.008267,0.006859,0.166499,0.005142,0.392708,0.004114,0.00374
9,1589663927956549632,Started the morning off right! Linda’s Breakfa...,2022-11-07,36,started the morning off right linda s breakfas...,Don Bolduc,GenDonBolduc,Rep,NH,New Hampshire,...,"[started, morning, right, linda, breakfast, pl...",0.64009,0.010328,0.008235,0.077222,0.005878,0.005143,0.245248,0.004114,0.00374


In [14]:
import datetime

# Store tweet IDs as strings before exporting
# NOTE(review): presumably to stop Excel from rounding the long numeric IDs — confirm
df2['tweet_id'] = df2['tweet_id'].map(str)

# Timestamp the work-in-progress export so successive runs do not overwrite each other
now = datetime.datetime.now()
timestamp = now.strftime('%Y-%m-%d_%H-%M-%S')
export_path = f"documents_topics_wip_{timestamp}.xlsx"
df2.to_excel(export_path, sheet_name='documents_topics')


In [42]:
# Persist the documents-with-topic-probabilities frame to a pickle file
df2.to_pickle(path='documents_topics.pkl')