# BIDEN vs TRUMP: are tweeters a good proxy for USA elections?


In [1]:
# Import Libraries 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import plotly.express as px 
import plotly.io as pio
from pathlib import Path
import os

# Libraries for Sentiment Analysis 
import re 
import nltk 
from nltk.corpus import stopwords 
from nltk.corpus import wordnet 
from nltk.stem import WordNetLemmatizer 
from textblob import TextBlob 
from wordcloud import WordCloud 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, words
from nltk.probability import FreqDist

## Import dataset

Set the directory

In [3]:
# Define the base path.
# The default is the original author's project folder; set the
# PYTHON_PROJECT_DIR environment variable to run the notebook from another
# location without editing this cell.
base_path = Path(os.environ.get(
    "PYTHON_PROJECT_DIR",
    "C:/Users/Davide/Desktop/Alma Mater/SECOND YEAR/PYTHON/Python_project",
))
# Change the working directory (raises if the folder does not exist)
os.chdir(base_path)

# Define the full path to the CSV file for Trump and Biden
merged_data = base_path / "data" / "data.csv"

# Print the current working directory
print("Current Working Directory:", Path.cwd())

Current Working Directory: C:\Users\Davide\Desktop\Alma Mater\SECOND YEAR\PYTHON\Python_project


Load data

In [4]:
# Load the tweets CSV. Lines the parser cannot read are skipped
# (on_bad_lines='skip') rather than aborting the load.
try:
    data = pd.read_csv(merged_data, encoding="utf-8", engine='python', on_bad_lines='skip')
except Exception as e:
    # Report the problem, then re-raise: the following cells use `data`, so
    # carrying on would only fail later with an unrelated NameError.
    print("Error loading the file:", e)
    raise

print("First 5 rows of the DataFrame:")
print(data.head())

First 5 rows of the DataFrame:
   index           created_at      tweet_id  \
0      0  2020-10-15 00:00:01  1.316529e+18   
1      2  2020-10-15 00:00:02  1.316529e+18   
2      4  2020-10-15 00:00:08  1.316529e+18   
3      5  2020-10-15 00:00:17  1.316529e+18   
4      7  2020-10-15 00:00:18  1.316529e+18   

                                               tweet  likes  retweet_count  \
0  #Elecciones2020 | En #Florida: #JoeBiden dice ...    0.0            0.0   
1  #Trump: As a student I used to hear for years,...    2.0            1.0   
2  You get a tie! And you get a tie! #Trump ‘s ra...    4.0            3.0   
3  @CLady62 Her 15 minutes were over long time ag...    2.0            0.0   
4  @DeeviousDenise @realDonaldTrump @nypost There...    0.0            0.0   

                source       user_id  \
0            TweetDeck  3.606665e+08   
1      Twitter Web App  8.436472e+06   
2   Twitter for iPhone  4.741380e+07   
3  Twitter for Android  1.138416e+09   
4   Twitter for i

### Text Analysis Data Cleaning

Convert the tweets to lowercase letters

In [5]:
# Replace every tweet with its lowercase version.
lowercase_tweets = data['tweet'].str.lower()
data['tweet'] = lowercase_tweets

print("Sample of 5 tweets after converting to lowercase:")
print(data['tweet'].sample(5))  # random spot check

Sample of 5 tweets after converting to lowercase:
316761    @joebiden #democrats #biden https://t.co/ufihk...
355588    "ga and pa" i'm not going to miss all the lies...
266439    this quote is for my family of 4. i can't affo...
106352        #foxnews #maga #trump https://t.co/0q5ylkzjj0
292481    in other news, the biden supporter left strand...
Name: tweet, dtype: object


In [6]:
# Flag, tweet by tweet, whether str.islower() holds (one boolean per row).
all_lowercase = data['tweet'].map(lambda text: text.islower())

# Number of tweets that fail the islower() check (0 when none fail).
lowercase_flag_counts = all_lowercase.value_counts()
non_lowercase_count = lowercase_flag_counts.get(False, 0)
print("Number of tweets not in lowercase:", non_lowercase_count)

# Keep only the rows whose tweet does not satisfy str.isupper().
# NOTE(review): the count above is based on islower() while this filter is
# based on isupper(), so the two do not select the same rows -- confirm which
# rows are actually meant to be dropped.
is_upper_mask = data['tweet'].str.isupper()
data = data[~is_upper_mask]

# Inspect a random sample of the filtered DataFrame.
print("Sample of DataFrame after dropping uppercase tweets:")
print(data.sample(5))

Number of tweets not in lowercase: 135
Sample of DataFrame after dropping uppercase tweets:
         index           created_at      tweet_id  \
276851  175590  2020-10-25 17:04:37  1.320411e+18   
255047  113656  2020-10-23 01:34:00  1.319452e+18   
309817  274377  2020-11-01 13:25:15  1.322892e+18   
109880  350142  2020-10-30 16:00:07  1.322207e+18   
280281  185663  2020-10-26 11:55:22  1.320696e+18   

                                                    tweet  likes  \
276851  #biden answers to his donors. @realdonaldtrump...    0.0   
255047  @joebiden #biden we already know, you cannot d...    5.0   
309817  @morethanmysle @hunterjcullen 🇺🇸💙🇺🇸 i voted #j...    0.0   
109880  9:15am = avg 1st #trump tweet this week (edt)....   10.0   
280281  i wonder who #joebiden is running against today??    7.0   

        retweet_count               source       user_id  \
276851            0.0  Twitter for Android  8.258398e+17   
255047            0.0   Twitter for iPhone  1.698183e+09   


## Tokenization

### Sentence Tokenization

In [7]:
# Split each tweet into sentences: every row of the new column holds the
# list returned by sent_tokenize for that tweet.
sentence_lists = data['tweet'].map(sent_tokenize)
data['tokenized_tweet'] = sentence_lists

print("Sample of tokenized tweets:")
print(data['tokenized_tweet'].sample(5))  # random spot check of the new column

Sample of tokenized tweets:
176502    [[usa] jueces de georgia y michigan, desestima...
240313    [finally the #secretempires\nof #biden crimes ...
33510     [@thehill he just reveals himself to be the "d...
372986    [#biden hablará a las 8pm., https://t.co/hnivz...
373409    [#biden #election2020 is projected the 🏆\nhttp...
Name: tokenized_tweet, dtype: object


Remove punctuations

In [8]:
# Helper that strips punctuation from a list of strings
def remove_punctuation(tokens):
    """Return a new list with punctuation removed from each string in ``tokens``.

    Every character that is neither a word character nor whitespace is
    deleted. The output has the same length and order as the input.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    cleaned = []
    for token in tokens:
        cleaned.append(not_word_or_space.sub('', token))
    return cleaned

# Strip punctuation from every sentence stored in the 'tokenized_tweet' column
punctuation_free = data['tokenized_tweet'].map(remove_punctuation)
data['tokenized_tweet'] = punctuation_free

print("Sample of tokenized tweets after punctuation removal:")
print(data['tokenized_tweet'].sample(5))  # random spot check of the updated column

Sample of tokenized tweets after punctuation removal:
170617    [anncoulter  there is no evidence that any siz...
155500    [even now its close, but its looking trump adv...
198949    [of course he was  not like he still has a job...
64132     [heres the video of most of trumps remarks aft...
390845    [heres what to expect with the new us presiden...
Name: tokenized_tweet, dtype: object


### Word Tokenization

In [9]:
# Word-tokenize every sentence of every tweet.
# Result per tweet: a list of sentences, each sentence a list of word tokens.
tokenized_words = data['tokenized_tweet'].map(
    lambda sentences: list(map(word_tokenize, sentences))
)

print("Sample of word tokenized tweets:")
print(tokenized_words.sample(5))

Sample of word tokenized tweets:
217402    [[newsweek, the, difference, biden, is, not, a...
175167    [[realdonaldtrump], [realdonaldtrump, psycho, ...
73079     [[trump, says, biden, will, burn, down, the, c...
199588    [[panawahpskek, as, notorious, biggie, said, i...
327308    [[mike_pence, realdonaldtrump, 230000, america...
Name: tokenized_tweet, dtype: object


Flatten the tokenized tweets into a single list of words

In [10]:
# Flatten the nested token lists in two steps:
#   1. tweets -> sentences: `all_words_flat` holds one token list per sentence
#   2. sentences -> words:  `all_word_list` is a single flat list of word tokens
all_words_flat = [sentence for tweet in tokenized_words for sentence in tweet]
all_word_list = [word for sentence in all_words_flat for word in sentence]

# The labels now describe what is actually printed: the first preview shows
# 10 token lists (not 10 words) and the second preview shows 100 words.
print("First 10 tokenized sentences from the flattened list:")
print(all_words_flat[:10])
print("First 100 words from the one list:")
print(all_word_list[:100])

First 10 words from the flattened list:
[['elecciones2020', 'en', 'florida', 'joebiden', 'dice', 'que', 'donaldtrump', 'solo', 'se', 'preocupa', 'por', 'él', 'mismo'], ['el', 'demócrata', 'fue', 'anfitrión', 'de', 'encuentros', 'de', 'electores', 'en', 'pembrokepines', 'y', 'miramar'], ['clic', 'aquí', 'httpstcoqhiwpiuxst', '_', 'elsollatino', 'yobrilloconelsol', 'httpstco6flcbwf1mi'], ['trump', 'as', 'a', 'student', 'i', 'used', 'to', 'hear', 'for', 'years', 'for', 'ten', 'years', 'i', 'heard', 'china'], ['in', '2019'], ['and', 'we', 'have', '15', 'and', 'they', 'dont', 'know', 'how', 'many', 'we', 'have', 'and', 'i', 'asked', 'them', 'how', 'many', 'do', 'we', 'have', 'and', 'they', 'said', 'sir', 'we', 'dont', 'know'], ['but', 'we', 'have', 'millions'], ['like', '300', 'million'], ['um'], ['what']]
First 10 words from the one list:
['elecciones2020', 'en', 'florida', 'joebiden', 'dice', 'que', 'donaldtrump', 'solo', 'se', 'preocupa', 'por', 'él', 'mismo', 'el', 'demócrata', 'fue', '

## Stopwords

Remove stop words

In [11]:
# English stop words from the NLTK corpus, kept as a set for membership tests
english_stopword_list = stopwords.words("english")
stop_words = set(english_stopword_list)
print(stop_words)

{"you're", 'shan', 'if', 'here', "hasn't", 'now', 'i', 'whom', "you'll", 'have', 'had', "doesn't", 'being', 'with', 'those', 'by', 'at', 'doesn', "couldn't", "wasn't", 'you', 'out', 'all', 'our', 'into', 'above', 'its', "didn't", 'hers', 'which', "shouldn't", 'in', 'am', 'm', 'to', 'when', 'hadn', 'because', "it's", 're', 'needn', "you'd", 'other', 'ma', 'doing', 'are', 'having', 'off', "hadn't", 'me', 'does', 'him', 'she', 'then', 'he', 'so', "shan't", "that'll", 'there', 'than', 'about', 'yours', 'ain', "mustn't", 'won', 'only', "should've", 'yourselves', 'who', 'yourself', 'just', 'has', 'through', 'until', 'few', 'haven', "won't", 'where', 'nor', 'were', 'isn', 'will', 'itself', 'my', 'what', 'myself', 'own', 'same', 'that', "aren't", 'most', 'did', 'themselves', 'further', 'but', 't', 'they', 'between', 'hasn', "she's", 'as', 'once', 'your', 'should', 'd', 'a', 'don', 'll', 'while', 'weren', 'this', 'any', 'it', 'her', 'an', "isn't", "haven't", 've', 'both', 'his', 'down', 'and', 

In [12]:
# Drop every word that appears in the stop-word set
non_stop_words = [word for word in all_word_list if word not in stop_words]

# Keep only the first 100 filtered words
filtered_word = non_stop_words[:100]

print("\n\nFiltered Sentence:", filtered_word)



Filtered Sentence: ['elecciones2020', 'en', 'florida', 'joebiden', 'dice', 'que', 'donaldtrump', 'solo', 'se', 'preocupa', 'por', 'él', 'mismo', 'el', 'demócrata', 'fue', 'anfitrión', 'de', 'encuentros', 'de', 'electores', 'en', 'pembrokepines', 'miramar', 'clic', 'aquí', 'httpstcoqhiwpiuxst', '_', 'elsollatino', 'yobrilloconelsol', 'httpstco6flcbwf1mi', 'trump', 'student', 'used', 'hear', 'years', 'ten', 'years', 'heard', 'china', '2019', '15', 'dont', 'know', 'many', 'asked', 'many', 'said', 'sir', 'dont', 'know', 'millions', 'like', '300', 'million', 'um', 'get', 'tie', 'get', 'tie', 'trump', 'rally', 'iowa', 'httpstcojjaluumh5d', 'clady62', '15', 'minutes', 'long', 'time', 'ago', 'omarosa', 'never', 'represented', 'black', 'community', 'thereidout', 'cried', 'trump', 'begging', 'job', 'deeviousdenise', 'realdonaldtrump', 'nypost', 'wont', 'many', 'unless', 'voting', 'god', 'prevails', 'bo', 'corrupt', 'president', 'ever', 'dark', 'light', 'lies', 'coming', 'wouldnt', 'last', 'for

# LEMMATIZATION????

# FREQUENCY ANALYSIS ???

## Functions 

We shall only take into account US citizens’ opinions here, as they are a crucial deciding factor in who becomes the US president.

In [13]:
def clean(text):
    """Normalise raw text into a space-separated string of lemmatized words.

    Steps, in order: remove URLs, lowercase, replace every character outside
    a-z with a space, split on whitespace, drop English stop words, lemmatize
    the remaining words and join them with single spaces.

    Parameters
    ----------
    text : object
        Value to clean. It is converted with ``str()`` first, so non-string
        values are processed as their string representation.

    Returns
    -------
    str
        The cleaned text; an empty string when no word survives.
    """
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', str(text))

    # Convert text to lowercase
    text = text.lower()

    # Replace anything other than alphabets a-z with a space
    text = re.sub('[^a-z]', ' ', text)

    # Split the text into single words
    words_in_text = text.split()

    # Build the stop-word set once per call. It used to sit inside the
    # comprehension condition, where it was rebuilt for every single word.
    english_stopwords = set(stopwords.words('english'))

    # Initialize WordNetLemmatizer
    lm = WordNetLemmatizer()

    # Lemmatize the words that are not stop words and join them back together
    return ' '.join(
        lm.lemmatize(word) for word in words_in_text if word not in english_stopwords
    )

**Get polarity, subjectivity, and analysis.** Now, let's create functions that compute the polarity and subjectivity of a text, plus an analysis function that maps a polarity score to a sentiment label.

In [None]:
def getpolarity(text):
    """Return the polarity component of the TextBlob sentiment of ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

def getsubjectivity(text):
    """Return the subjectivity component of the TextBlob sentiment of ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

def getAnalysis(score):
    """Map a numeric score to a sentiment label.

    Returns 'neutral' when the score equals 0, 'negative' when it is below 0,
    and 'positive' in every other case.
    """
    if score == 0:
        return 'neutral'
    return 'negative' if score < 0 else 'positive'

# ADD THE FREQUENCY ANALYSIS FOR BIDEN AND TRUMP