In [2]:
import pandas as pd
import numpy as np
import nltk
import gensim
from gensim.models import LdaMulticore
from pprint import pprint
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases
from gensim.models.phrases import Phraser

In [3]:
df = pd.read_csv('../data/dataset_merged.csv', lineterminator='\n')
df.dropna(axis=0, inplace=True)
df

Unnamed: 0,debate_number,content_type,network,author,text,published_at,like_count,public
0,1,comment,CBS,@Deano-n4w,Donald Trump's reasoning is lost üò¢,2024-08-27T09:51:01Z,0,True
1,1,comment,CBS,@noobpeepxpostyguitarists5381,2:05:31 this aged well,2024-08-26T09:48:00Z,0,True
2,1,comment,CBS,@noobpeepxpostyguitarists5381,The only facts is that words shouldn't be impo...,2024-08-26T09:44:19Z,1,True
3,1,comment,CBS,@sandhyagudigudi3998,Biden sir GodenBird flying ‚ù§Trumpsir like Baby...,2024-08-22T02:10:30Z,0,True
4,1,comment,CBS,@cherylcallahan5402,LIONEL NATION AND THE MATE ARE THE BEST BEST,2024-08-21T17:11:52Z,1,True
...,...,...,...,...,...,...,...,...
195467,4,livechat,WFLA,@joannektc,I love that JD Vance was rocking the pink tie ...,2024-10-02T04:06:35Z,100,True
195468,4,livechat,WFLA,@ChrisSchrum-t6y,Tampon Tim blows,2024-10-02T04:06:21Z,0,True
195469,4,livechat,WFLA,@JygjjVdcc,But tims middle class,2024-10-02T04:05:20Z,5,True
195470,4,livechat,WFLA,@NECKBRACEBRO,I felt like it was a tie.But I am still voting...,2024-10-02T04:04:56Z,14,True


In [15]:
df.isnull().sum()

debate_number      0
content_type       0
network            0
author           135
text              13
published_at       0
like_count         0
public             0
dtype: int64

In [33]:
mask1 = df['author'].isnull()
mask2 = df['text'].isnull()

df[mask1 & mask2]

Unnamed: 0,debate_number,content_type,network,author,text,published_at,like_count,public


In [29]:
_ = df[df['text'].isnull()]
_[_['debate_number'] == 3]

Unnamed: 0,debate_number,content_type,network,author,text,published_at,like_count,public
193057,3,livechat,NBC,@steviet9485,,2024-09-11T06:42:34Z,0,True


In [3]:
# nltk downloads and setup

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()
more_stopwords = ['www.youtube.com', 'https', 'http', 'www.youtube.com/watch']
stop_words = set(list(stopwords.words('english')) + more_stopwords)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/deeptikakannan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/deeptikakannan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/deeptikakannan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/deeptikakannan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
# preprocessing data

df = df.drop(columns=['debate_number', 'content_type', 'network', 'author', 'published_at', 'like_count', 'public'], axis=1)

df['text'] = df['text'].apply(str)

df['processed_text'] = df['text'].apply(lambda x: re.sub('[,\.!?]', '', x))

df['processed_text'] = df['processed_text'].apply(lambda s: emoji.replace_emoji(s, ''))

df['processed_text'] = df['processed_text'].apply(lambda x: x.lower())

df.head()

  df['processed_text'] = df['text'].apply(lambda x: re.sub('[,\.!?]', '', x))


Unnamed: 0,text,processed_text
0,Donald Trump's reasoning is lost üò¢,donald trump's reasoning is lost
1,2:05:31 this aged well,2:05:31 this aged well
2,The only facts is that words shouldn't be impo...,the only facts is that words shouldn't be impo...
3,Biden sir GodenBird flying ‚ù§Trumpsir like Baby...,biden sir godenbird flying trumpsir like baby ...
4,LIONEL NATION AND THE MATE ARE THE BEST BEST,lionel nation and the mate are the best best


In [5]:
# tokenize text

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data = df.processed_text.values.tolist()
data_words = list(sent_to_words(data))

print(data_words[:5][:2])

[['donald', 'trump', 'reasoning', 'is', 'lost'], ['this', 'aged', 'well']]


In [6]:
# remove stopwords
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

# bigram, trigram model buildings
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) 
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

# lemmatize keeping only noun, adj, vb, adv
def lemmatize(texts, allowed_postags=['NN', 'JJ', 'VB', 'RB']):
    texts_out = []
    for sent in texts:
        pos_tags = nltk.pos_tag(sent)
        texts_out.append([lemmatizer.lemmatize(word) for word, pos in pos_tags if pos[:2] in allowed_postags])
    return texts_out  

data_words_nostops = remove_stopwords(data_words)

data_words_bigrams = make_bigrams(data_words_nostops)

data_lemmatized = lemmatize(data_words_bigrams, allowed_postags=['NN', 'JJ', 'VB', 'RB'])

print(data_lemmatized[:5][:2])

[['donald', 'trump', 'reasoning', 'lost'], ['aged', 'well']]


In [7]:
# corpus and dictionary

id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

corpus[:5][:2]

[[(0, 1), (1, 1), (2, 1), (3, 1)], [(4, 1), (5, 1)]]

In [8]:
# LDA model

# number of topics - can change - TODO
num_topics = 10

# LDA model

# load pre-saved model
# lda_model = models.ldamodel.LdaModel.load('lda_model_full_dataset.model')

# train model
lda_model = LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=42, passes=10, workers=4, chunksize=100, per_word_topics=True)
# lda_model.save('lda_viz/full_dataset_10_topics.model')

In [9]:
from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.034*"cnn" + 0.029*"debate" + 0.029*"abc" + 0.028*"moderator" + '
  '0.027*"number" + 0.020*"ever" + 0.017*"seems" + 0.016*"job" + 0.014*"seen" '
  '+ 0.013*"news"'),
 (1,
  '0.099*"debate" + 0.057*"question" + 0.029*"answer" + 0.024*"time" + '
  '0.020*"lol" + 0.014*"presidential" + 0.014*"even" + 0.014*"candidate" + '
  '0.011*"minute" + 0.011*"first"'),
 (2,
  '0.289*"trump" + 0.053*"biden" + 0.031*"donald" + 0.025*"love" + '
  '0.019*"joke" + 0.014*"talking" + 0.013*"understand" + 0.012*"saying" + '
  '0.012*"stand" + 0.012*"went"'),
 (3,
  '0.029*"people" + 0.011*"war" + 0.010*"country" + 0.010*"want" + 0.010*"get" '
  '+ 0.010*"american" + 0.009*"money" + 0.009*"year" + 0.009*"care" + '
  '0.009*"comment"'),
 (4,
  '0.108*"biden" + 0.089*"joe" + 0.050*"kamala" + 0.045*"harris" + '
  '0.019*"watch" + 0.017*"lost" + 0.016*"sound" + 0.015*"bad" + 0.015*"look" + '
  '0.014*"start"'),
 (5,
  '0.059*"biden" + 0.057*"lie" + 0.038*"look" + 0.017*"liar" + 0.016*"lying" + '
  '0.0

In [10]:
# visualize initial model

vis = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, 'lda_viz/vis_full_dataset_initial_model.html')
print("Visualization saved")

Visualization saved


In [11]:
# coherence score

coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
print('Coherence Score: ', coherence_model_lda.get_coherence())

Coherence Score:  0.5628066643308023


In [8]:
# train model over each debate comments/livechat

# partition dataset

df = pd.read_csv('../Election-Data/dataset_merged.csv', lineterminator='\n')

debate_1 = df[df['debate_number'] == 1]
debate_2 = df[df['debate_number'] == 2]
debate_3 = df[df['debate_number'] == 3]
debate_4 = df[df['debate_number'] == 4]

print(debate_1.shape, debate_2.shape, debate_3.shape, debate_4.shape)

(98167, 8) (86131, 8) (10132, 8) (1042, 8)


In [9]:
# train per debate

def preprocess_train_validate_visualize(debate_name, df):
    print('Starting on {}'.format(debate_name))
    
    # preprocess text
    df = df.drop(columns=['debate_number', 'content_type', 'network', 'author', 'published_at', 'like_count', 'public'], axis=1)
    df['text'] = df['text'].apply(str)
    df['processed_text'] = df['text'].apply(lambda x: re.sub('[,\.!?]', '', x))
    df['processed_text'] = df['processed_text'].apply(lambda s: emoji.replace_emoji(s, ''))
    df['processed_text'] = df['processed_text'].apply(lambda x: x.lower())
    print('Text preprocessed')
    
    # tokenize
    data = df.processed_text.values.tolist()
    data_words = list(sent_to_words(data))
    print('Tokenized')
    
    # lemmatize
    data_words_nostops = remove_stopwords(data_words)
    data_words_bigrams = make_bigrams(data_words_nostops)
    data_lemmatized = lemmatize(data_words_bigrams, allowed_postags=['NN', 'JJ', 'VB', 'RB'])
    print('Lemmatized')
    
    # corpus and dictionary
    id2word = corpora.Dictionary(data_lemmatized)
    texts = data_lemmatized
    corpus = [id2word.doc2bow(text) for text in texts]
    print('Corpus & dictionary generated')
    
    # train LDA model
    lda_model = LdaMulticore(corpus=corpus, id2word=id2word, num_topics=10, random_state=42, passes=10, workers=4, chunksize=100, per_word_topics=True)
    print('LDA model trained')
    
    # visualize
    vis = gensimvis.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(vis, 'lda_viz/vis_{}_initial_model.html'.format(debate_name))
    print('Visualization saved')
    
    # calculate coherence
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    print('Coherence Score for {} is: {}'.format(debate_name, coherence_model_lda.get_coherence()))
    print()

debates = {'2020 Presidential Debate: Biden vs Trump': debate_1,
           '2024 Presidential Debate 1: Biden vs Trump': debate_2,
           '2024 Presidential Debate 2: Harris vs Trump': debate_3,
           '2024 Vice Presidential Debate: Vance vs Walz': debate_4}

for debate_name, df in debates.items():
    preprocess_train_validate_visualize(debate_name, df)

  df['processed_text'] = df['text'].apply(lambda x: re.sub('[,\.!?]', '', x))


Starting on 2020 Presidential Debate: Biden vs Trump
Text preprocessed
Tokenized
LDA model trained
Visualization saved
Coherence Score for 2020 Presidential Debate: Biden vs Trump is: 0.5280143302306499

Starting on 2024 Presidential Debate 1: Biden vs Trump
Text preprocessed
Tokenized
Lemmatized
Corpus & dictionary generated
LDA model trained
Visualization saved
Coherence Score for 2024 Presidential Debate 1: Biden vs Trump is: 0.5282959166998478

Starting on 2024 Presidential Debate 2: Harris vs Trump
Text preprocessed
Tokenized
Lemmatized
Corpus & dictionary generated
LDA model trained
Visualization saved
Coherence Score for 2024 Presidential Debate 2: Harris vs Trump is: 0.4857249073638247

Starting on 2024 Vice Presidential Debate: Vance vs Walz
Text preprocessed
Tokenized
Lemmatized
Corpus & dictionary generated
LDA model trained
Visualization saved
Coherence Score for 2024 Vice Presidential Debate: Vance vs Walz is: 0.3641264606396487



In [None]:
# hyperparameter tuning for the full dataset

import tqdm

def compute_coherence_values(corpus, dictionary, k, a, b):
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b,
                                           workers=3)  # Adjust 'workers' depending on your machine cores.
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

# alpha param
alpha = list(np.arange(0.01, 1, 0.5))  # Fewer steps
alpha.append('symmetric')
alpha.append('asymmetric')

# beta param
beta = list(np.arange(0.01, 1, 0.5))  # Fewer steps
beta.append('symmetric')

# topics range
min_topics = 2
max_topics = 10
step_size = 2
topics_range = range(min_topics, max_topics, step_size)

# single corpus sample
num_of_docs = len(corpus)
corpus_sample = gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75))

model_results = {'Topics': [], 'Alpha': [], 'Beta': [], 'Coherence': []}

pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)))

# iterate over different combinations
for k in topics_range:
    for a in alpha:
        for b in beta:
            cv = compute_coherence_values(corpus=corpus_sample, dictionary=id2word, k=k, a=a, b=b)
            model_results['Topics'].append(k)
            model_results['Alpha'].append(a)
            model_results['Beta'].append(b)
            model_results['Coherence'].append(cv)
            pbar.update(1)

pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
pbar.close()

  2%|‚ñè         | 1/48 [02:28<1:55:57, 148.04s/it]

TO BE CHANGED

In [None]:
pd.DataFrame(lda_model.print_topics(), columns=['topic', 'words & their probabilities'])

In [None]:
# model testing - coherence score
# measures how interpretable or meaningful the topics are; a higher score indicates better topic quality

coherence_model = CoherenceModel(model=lda_model, texts=df['processed_text'], dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

In [None]:
# varying number of topics

def compute_coherence_values(dictionary, corpus, texts, start, limit, step):
    num_topics_values = range(start, limit, step)
    coherence_values = []
    for n_topic in num_topics_values:
        print('Number of Topics: {}'.format(n_topic))
        
        # training model
        model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=n_topic, 
                             random_state=42, passes=10, workers=4)   
        print('LDA Model Trained')
        
        # calculating coherence
        coherencemodel = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        _ = coherencemodel.get_coherence()
        print('Coherence value: {}'.format(_))
        coherence_values.append(_)
    
    return num_topics_values, coherence_values

start = 2
limit = 40
step = 2

num_topics_values, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, 
                                                        texts=df['processed_text'], start=start, 
                                                        limit=limit, step=step)

In [None]:
# visualize results

plt.plot(num_topics_values, coherence_values)
plt.xlabel("Number of Topics")
plt.ylabel("Coherence score")
plt.title("Coherence Scores by Number of Topics")
plt.show()

# no change in coherence score with increasing topics

In [None]:
# analysis per debate

debates = ['2020 Presidential Debate: Biden vs Trump', 
           '2024 Presidential Debate 1: Biden vs Trump',
           '2024 Presidential Debate 2: Harris vs Trump',
           '2024 Vice-presidential Debate 3: Vance vs Walz']