# US Presidential Election 2020

In [1]:
import sys
import os
import pyLDAvis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Make the project's `src` directory importable. The membership check keeps the
# cell idempotent: re-running it no longer appends the same entry to sys.path again.
src_dir = os.path.abspath('src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [5]:
from src import preprocessing, topic_modeling, deepseek, misc_utils, train_model

In [6]:
# Path to the YAML configuration file; it is passed to both the preprocessing
# pipeline and the topic-modeling preprocessing step in the cells below.
config_path = 'conf/config.yaml'

In [None]:
# Run the preprocessing pipeline - takes very long (~30 mins).
# NOTE(review): `english_tweets` is rebound in the EDA section from
# data/english_tweets.csv, replacing the value returned here. Presumably that
# CSV is this pipeline's saved output - confirm before relying on it.
english_tweets = preprocessing.run_preprocessing_pipeline(config_path)

In [None]:
# Build the topic-modeling inputs from the same config file. The cells below
# read dtm_results['vectorizer'] (count / tf-idf vectorizers) and one entry per
# group ('trump', 'biden', 'both'), each providing 'count_dtm' and 'tfidf_dtm'.
dtm_results = topic_modeling.topic_modeling_preprocessing(config_path)

In [42]:
# @Anthony: how to list every key available in dtm_results.
# To build a word cloud, just sum all the values of each DTM to get the word counts.
# Uncomment the prints below to see all the keys.
# print(dtm_results.keys())
# print(dtm_results['vectorizer'].keys())
# print(dtm_results['biden'].keys())
# print(dtm_results['trump'].keys())
# print(dtm_results['both'].keys())

# Topic Modeling

In [None]:
# Run the training pipeline for each DTM / vectorizer pairing, starting with
# the Trump tweets and the count vectorizer.
training_path = 'conf/train_model.yaml'
count_vect = dtm_results['vectorizer']['count_vectorizer']
trump_count_dtm = dtm_results['trump']['count_dtm']
trump_count_display = train_model.training_pipeline(
    training_path,
    dtm=trump_count_dtm,
    vectorizer=count_vect,
    random_seed=1,
    sample_size=10000,
    topn=10,
)

# Render the returned object inline with pyLDAvis
pyLDAvis.enable_notebook()
pyLDAvis.display(trump_count_display)

In [None]:
# Trump tweets with the tf-idf vectorizer and tf-idf DTM
tfidf_vect = dtm_results['vectorizer']['tfidf_vectorizer']
trump_tfidf_dtm = dtm_results['trump']['tfidf_dtm']
trump_tfidf_display = train_model.training_pipeline(
    training_path,
    dtm=trump_tfidf_dtm,
    vectorizer=tfidf_vect,
    random_seed=1,
    sample_size=10000,
    topn=10,
)

# Render the returned object inline with pyLDAvis
pyLDAvis.enable_notebook()
pyLDAvis.display(trump_tfidf_display)

In [None]:
# Biden tweets with the count vectorizer and count DTM
biden_count_dtm = dtm_results['biden']['count_dtm']
biden_count_display = train_model.training_pipeline(
    training_path,
    dtm=biden_count_dtm,
    vectorizer=count_vect,
    random_seed=1,
    sample_size=10000,
    topn=10,
)

# Render the returned object inline with pyLDAvis
pyLDAvis.enable_notebook()
pyLDAvis.display(biden_count_display)

In [None]:
# Biden tweets with the tf-idf vectorizer and tf-idf DTM
biden_tfidf_dtm = dtm_results['biden']['tfidf_dtm']
biden_tfidf_display = train_model.training_pipeline(
    training_path,
    dtm=biden_tfidf_dtm,
    vectorizer=tfidf_vect,
    random_seed=1,
    sample_size=10000,
    topn=10,
)

# Render the returned object inline with pyLDAvis
pyLDAvis.enable_notebook()
pyLDAvis.display(biden_tfidf_display)

In [None]:
# Tweets in the 'both' group with the count vectorizer and count DTM
both_count_dtm = dtm_results['both']['count_dtm']
both_count_display = train_model.training_pipeline(
    training_path,
    dtm=both_count_dtm,
    vectorizer=count_vect,
    random_seed=1,
    sample_size=10000,
    topn=10,
)

# Render the returned object inline with pyLDAvis
pyLDAvis.enable_notebook()
pyLDAvis.display(both_count_display)

In [None]:
# Tweets in the 'both' group with the tf-idf vectorizer and tf-idf DTM
both_tfidf_dtm = dtm_results['both']['tfidf_dtm']
both_tfidf_display = train_model.training_pipeline(
    training_path,
    dtm=both_tfidf_dtm,
    vectorizer=tfidf_vect,
    random_seed=1,
    sample_size=10000,
    topn=10,
)

# Render the returned object inline with pyLDAvis
pyLDAvis.enable_notebook()
pyLDAvis.display(both_tfidf_display)

# EDA

In [6]:
# Load the English tweets from CSV. This rebinds `english_tweets`, so the EDA
# below works from the file on disk rather than the value returned by the
# preprocessing pipeline cell.
english_tweets = pd.read_csv('data/english_tweets.csv')

In [None]:
# Column names, non-null counts and dtypes of the loaded tweets
english_tweets.info()

In [None]:
# Count the distinct raw tweets and the distinct cleaned tweets
for text_column in ('tweet', 'clean_tweet'):
    print(english_tweets[text_column].nunique())

In [None]:
# Load the saved sentiment results and show how many rows fall in each class
results_xlm = pd.read_csv('data/results_xlm.csv')
sentiment_counts = results_xlm['sentiment'].value_counts()
print(sentiment_counts)

In [None]:
# Structure of the sentiment results as loaded from CSV (before merging)
results_xlm.info()

In [13]:
# Attach the sentiment results to every tweet, keyed on the cleaned text.
# From the right-hand side we take only the join key plus the columns that
# english_tweets does not already have, and de-duplicate on the key, so that:
#   * re-running this cell (when `results_xlm` is already the merged frame)
#     does not produce `_x` / `_y` suffixed columns, and
#   * repeated clean_tweet values cannot multiply rows in the left join.
sentiment_cols = ['clean_tweet'] + [
    col for col in results_xlm.columns if col not in english_tweets.columns
]
sentiment_lookup = results_xlm[sentiment_cols].drop_duplicates(subset='clean_tweet')
results_xlm = english_tweets.merge(
    sentiment_lookup,
    on='clean_tweet',
    how='left',
    validate='many_to_one',  # raises if the right-hand key is not unique
)
# A left join against unique keys must keep exactly one row per tweet
assert len(results_xlm) == len(english_tweets), "Merge changed the number of tweets"

In [None]:
# Structure of the merged frame (tweet columns plus the sentiment results)
results_xlm.info()

In [None]:
# Sanity checks on the merged frame to make sure all the rows are present
print(results_xlm['hashtag'].value_counts())
print(results_xlm['sentiment'].unique())

# Every confidence value must lie within [0.0, 1.0]
confidence = results_xlm['confidence']
confidence_in_range = (confidence >= 0.0) & (confidence <= 1.0)
assert confidence_in_range.all(), "Found confidence values outside the range [0.0, 1.0]"

# Distinct cleaned / raw tweets, then the smallest follower count
for text_column in ('clean_tweet', 'tweet'):
    print(results_xlm[text_column].nunique())
print(results_xlm['user_followers_count'].min())

In [20]:
# Replace missing `source` values with the placeholder 'unknown'
source_missing = results_xlm['source'].isna()
results_xlm.loc[source_missing, 'source'] = 'unknown'

In [None]:
# Re-check the frame after filling `source`; its non-null count should now match the row count
results_xlm.info()

In [8]:
# Add an `engagement` column computed by misc_utils.engagement_score from the
# likes, retweet count and follower count of each tweet.
results_xlm['engagement'] = misc_utils.engagement_score(
    results_xlm['likes'],
    results_xlm['retweet_count'],
    results_xlm['user_followers_count'],
)

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
results_xlm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252563 entries, 0 to 252562
Data columns (total 19 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   created_date          252563 non-null  object 
 1   created_time          252563 non-null  object 
 2   tweet_id              252563 non-null  float64
 3   tweet                 252563 non-null  object 
 4   likes                 252563 non-null  int64  
 5   retweet_count         252563 non-null  int64  
 6   source                252563 non-null  object 
 7   user_id               252563 non-null  float64
 8   user_id_post_count    252563 non-null  int64  
 9   user_description      238194 non-null  object 
 10  days_from_join_date   252563 non-null  int64  
 11  user_followers_count  252563 non-null  int64  
 12  state                 252563 non-null  object 
 13  hashtag               252563 non-null  object 
 14  clean_tweet           252563 non-null  object 
 15  

In [9]:
# Largest, then smallest, engagement value
engagement = results_xlm['engagement']
print(engagement.max())
print(engagement.min())

62233.333333333336
0.0


In [14]:
# Show the inputs behind the row(s) with the highest engagement value
top_engagement_mask = results_xlm['engagement'] == results_xlm['engagement'].max()
results_xlm.loc[top_engagement_mask, ['likes', 'retweet_count', 'user_followers_count']]

Unnamed: 0,likes,retweet_count,user_followers_count
210817,3148,580,6


In [11]:
# Add a `normalized_score` column computed by misc_utils.normalization from the
# engagement, sentiment and confidence columns, then re-inspect the frame.
results_xlm['normalized_score'] = misc_utils.normalization(
    results_xlm['engagement'],
    results_xlm['sentiment'],
    results_xlm['confidence'],
)
results_xlm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252563 entries, 0 to 252562
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   created_date          252563 non-null  object 
 1   created_time          252563 non-null  object 
 2   tweet_id              252563 non-null  float64
 3   tweet                 252563 non-null  object 
 4   likes                 252563 non-null  int64  
 5   retweet_count         252563 non-null  int64  
 6   source                252563 non-null  object 
 7   user_id               252563 non-null  float64
 8   user_id_post_count    252563 non-null  int64  
 9   user_description      238194 non-null  object 
 10  days_from_join_date   252563 non-null  int64  
 11  user_followers_count  252563 non-null  int64  
 12  state                 252563 non-null  object 
 13  hashtag               252563 non-null  object 
 14  clean_tweet           252563 non-null  object 
 15  

In [12]:
# Smallest, then largest, normalized score
normalized_score = results_xlm['normalized_score']
print(normalized_score.min())
print(normalized_score.max())

-11481.187641620636
4319.049356877805


### Correlation between user_followers_count and likes / retweets

In [None]:
# How many likes and retweets do tweets get when the author has 0 followers?
zero_follower_tweets = results_xlm[results_xlm['user_followers_count'] == 0]
print(zero_follower_tweets[['likes']].value_counts())
print(zero_follower_tweets[['retweet_count']].value_counts())

In [None]:
# Two side-by-side scatter plots sharing the same x variable (follower count)
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (column to plot on y, panel title, y-axis label) for each panel, left to right
scatter_panels = [
    ('likes', 'User Followers Count vs Likes', 'Likes'),
    ('retweet_count', 'User Followers Count vs Retweet Count', 'Retweet Count'),
]
for panel_ax, (y_column, panel_title, y_label) in zip(axes, scatter_panels):
    panel_ax.scatter(results_xlm['user_followers_count'], results_xlm[y_column], alpha=0.5)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel('User Followers Count', fontsize=12)
    panel_ax.set_ylabel(y_label, fontsize=12)

# Keep the panels from overlapping, then render
plt.tight_layout()
plt.show()

### Sentiment distribution among tweets

In [None]:
# Sentiment counts for each hashtag group
trump_sentiment = results_xlm.loc[results_xlm['hashtag'] == 'trump', 'sentiment'].value_counts()
biden_sentiment = results_xlm.loc[results_xlm['hashtag'] == 'biden', 'sentiment'].value_counts()
both_sentiment = results_xlm.loc[results_xlm['hashtag'] == 'both', 'sentiment'].value_counts()

# One column per hashtag group; a sentiment missing from a group counts as 0
sentiment_by_hashtag = {
    'trump': trump_sentiment,
    'biden': biden_sentiment,
    'both': both_sentiment,
}
sentiment_df = pd.DataFrame(sentiment_by_hashtag).fillna(0)

# Grouped bar chart with upright x tick labels
sentiment_ax = sentiment_df.plot(kind='bar', figsize=(10, 6), width=0.8, rot=0)
sentiment_ax.set_title('Sentiment Distribution by Hashtag', fontsize=16)
sentiment_ax.set_xlabel('Sentiment', fontsize=12)
sentiment_ax.set_ylabel('Count', fontsize=12)
sentiment_ax.legend(title='Hashtag')
plt.show()

### Number of days since join date

In [None]:
# Histogram of the days_from_join_date column, 30 bins
days_fig, days_ax = plt.subplots(figsize=(10, 6))
days_ax.hist(results_xlm['days_from_join_date'], bins=30, edgecolor='black')
days_ax.set_title('Distribution of Days from Join Date')
days_ax.set_xlabel('Days')
days_ax.set_ylabel('Count')
plt.show()

### Sentiment of tweets generated throughout the day

In [None]:
# Cast created_time to int (the column is expected to hold the hour of day, 0-23)
results_xlm['created_time'] = results_xlm['created_time'].astype(int)

# Rows = hour, columns = sentiment, values = number of tweets (0 where none)
hourly_sentiment_distribution = (
    results_xlm
    .groupby(['created_time', 'sentiment'])
    .size()
    .unstack(fill_value=0)
)

# Stacked bars drawn on an explicitly created axes, with upright x tick labels
hourly_fig, hourly_ax = plt.subplots(figsize=(12, 6))
hourly_sentiment_distribution.plot(
    kind='bar', stacked=True, colormap='viridis', rot=0, ax=hourly_ax
)

hourly_ax.set_title('Sentiment Distribution Across Hours of the Day', fontsize=16)
hourly_ax.set_xlabel('Hour of Day (24-hour format)', fontsize=12)
hourly_ax.set_ylabel('Number of Tweets', fontsize=12)

# Horizontal guide lines only
hourly_ax.grid(axis='y', linestyle='--', alpha=0.7)

# Legend placed outside the plotting area
hourly_ax.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')

hourly_fig.tight_layout()
plt.show()

### Distribution of sentiment across states

In [None]:
# Rows = state, columns = sentiment, values = number of tweets (0 where none)
state_sentiment_distribution = (
    results_xlm.groupby(['state', 'sentiment']).size().unstack(fill_value=0)
)

# Create the figure and axes once and hand the axes to pandas. Calling
# plt.figure() and then DataFrame.plot(figsize=...) without `ax=` makes pandas
# open a second figure, which leaves the first one blank in the output.
state_fig, state_ax = plt.subplots(figsize=(15, 8))
state_sentiment_distribution.plot(
    kind='bar', stacked=True, colormap='viridis', ax=state_ax
)

state_ax.set_title('Sentiment Distribution Across States', fontsize=16)
state_ax.set_xlabel('State', fontsize=12)
state_ax.set_ylabel('Count of Sentiments', fontsize=12)
plt.xticks(rotation=90)  # state names are long; keep them vertical
state_ax.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')
state_fig.tight_layout()
plt.show()