### This file contains the pre - processing for the larger dataset that we run our pre trained model on

In [1]:
import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [8]:
# Load Trump and Biden tweet datasets 
trump_tweets_file = "archive/hashtag_donaldtrump.csv"
biden_tweets_file = "archive/hashtag_joebiden.csv"

# Define hashtags to filter data into pro trump or pro biden dataframes
trump_hashtags = ["#MAGA", "#KAG", "#FourMoreYears", "#SleepyJoe", "#BlacksForTrump", "#Trump2020", "#VoteRed", "#WomenForTrump", "#LatinosForTrump", "#AmericaFirst", "#BuildTheWall"]
biden_hashtags = ["#Biden2020", "#BidenHarris2020", "#VoteBlue", "#NotMyPresident", "#BlueWave2020", "#VoteBiden", "#VoteBlueToSaveAmerica", "#BlacksForBiden", "#WomenForBiden", "#LatinosForBiden"]

# Dictionary containing contentious topics to fliter the tweets with
topic_to_label = {'abortion': 0, 'Abortion': 0,'fetus': 0, 'Fetus': 0,'pro-life': 0, 'pro-choice': 0, 'babies': 0,
                'immigration': 1, 'immigrant': 1, 'immigrants': 1, 'Immigration': 1, 'Immigrant': 1, 'Immigrants': 1,
                'wall': 1, 'border': 1, 'arms': 2, 'gun': 2, 'bullet': 2, 'Gun': 2, 'guns': 2, 'Guns': 2}

# Initialize trump and biden dfs
trump_df = pd.DataFrame()
biden_df = pd.DataFrame()



In [40]:
def filter_tweets_by_hashtags(tweet_df, hashtags, label):
    """
    Filters tweets based on the presence of specified hashtags.

    Parameters:
        tweet_df (pandas.DataFrame): DataFrame containing tweets.
        hashtags (list): List of hashtags to filter tweets.
        label (str): Label to assign to filtered tweets.

    Returns:
        pandas.DataFrame: DataFrame containing filtered tweets with an additional 'label' column.
    """
    # Convert 'tweet' column to string
    tweet_df['tweet'] = tweet_df['tweet'].astype(str)
    
    # Construct a regular expression pattern to match any of the specified hashtags
    pattern = '|'.join([re.escape(tag) for tag in hashtags])
    
    # Filter tweets that contain any of the specified hashtags
    filtered_tweets = tweet_df[tweet_df['tweet'].str.contains(pattern, case=False)]
    
    # Add a new column 'label' with specified label
    filtered_tweets['label'] = label
    
    return filtered_tweets

In [41]:
def label_tweets_by_topic(tweet_df, topic_to_label):
    """
    Labels tweets based on the presence of specified topics.

    Parameters:
        tweet_df (pandas.DataFrame): DataFrame containing tweets.
        topics (dict): Dictionary mapping topics to labels.

    Returns:
        pandas.DataFrame: DataFrame containing labeled tweets with an additional 'topic' column.
    """
    # Initialize a list to store labels for each tweet
    labels = []
    
    # Iterate over each tweet
    for tweet in tweet_df['tweet']:
        # Initialize label to None
        label = None
        
        try:
            # Check if any topic appears in the tweet
            for topic in topic_to_label.keys():
                if topic in str(tweet):  # Convert tweet to string
                    # If topic found, assign the corresponding label
                    label = topic_to_label.get(topic)
                    break  # Exit loop once a topic is found
        except TypeError:  # Catch the TypeError exception when tweet is not a string
            label = -1
        
        # If no topic found, assign a default label (e.g., -1)
        if label is None:
            label = -1
        
        # Append the label to the list of labels
        labels.append(label)
    
    # Add a new column 'topic' to the DataFrame
    tweet_df['topic'] = labels
    
    return tweet_df

In [9]:
# Function to read tweets in chunks
def read_tweets_in_chunks(file_path, chunk_size):
    chunks = pd.read_csv(file_path, chunksize=chunk_size)
    return chunks

### Warning: Next cell takes 30+ minutes to run

In [11]:
# Read and process Trump tweets
trump_chunks = read_tweets_in_chunks(trump_tweets_file, 100)

# Counter to track the number of processed chunks
chunk_count = 0  

# Load in the filtered tweets
for chunk in trump_chunks:
    if chunk_count >= 10000:
        break  # Break out of the loop if 100 chunks have been processed

    #label tweets by topic
    label_df = label_tweets_by_topic(chunk, topic_to_label)
    trump_df = pd.concat([trump_df, label_df])
    chunk_count += 1  # Increment the chunk count

In [12]:
trump_df.shape[0]

1229150

In [15]:
# Check presence of topics, 2 = gun control, 1 = immigration, 0 = abortion, -1 = N/A
print(trump_df['topic'].value_counts())

-1    955763
 2      7733
 1      6175
 0      1415
Name: topic, dtype: int64


In [14]:
# Drop any duplicate tweets, drop data that doesn't contain info about one of these topics
trump_df = trump_df.drop_duplicates()
trump_topics = trump_df[trump_df['topic'] != -1]

In [16]:
# Confirm data is cleaned
print(trump_topics['topic'].value_counts())

2    7733
1    6175
0    1415
Name: topic, dtype: int64


In [17]:
# Run tweets through vader to assign sentiment scores 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Apply VADER sentiment analysis to each tweet in the filtered DataFrame
trump_topics['vader_score'] = trump_topics['tweet'].apply(lambda tweet: sid.polarity_scores(tweet)['compound'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trump_topics['vader_score'] = trump_topics['tweet'].apply(lambda tweet: sid.polarity_scores(tweet)['compound'])


In [19]:
# Create a new dataframe and add all necessary information to feed into the model: tweet, score, topic
new_trump = pd.DataFrame()
new_trump['tweet'] = trump_topics['tweet'] 
new_trump['vader_score'] = trump_topics['vader_score']
new_trump['topic'] = trump_topics['topic']
new_trump.head()

Unnamed: 0,tweet,vader_score,topic
32,@maries_trella @CarmenAColeman @hugoventura @m...,-0.296,2
147,@Acosta Hey #trump you #POS U got a kid home #...,-0.3339,1
281,#Trump outmaneuvers Grandpa Groper #JoeBiden a...,-0.4767,0
355,@TwitterSafety @Amy_Siskind I appreciate it. ...,0.75,2
377,If ya don’t know about...\n⁃@Walmart’s #blackf...,0.25,2


# Repeat the process for the biden file

### Warning: This next cell takes 7 hours to run

In [24]:
# Read and process biden tweets
biden_chunks = read_tweets_in_chunks(biden_tweets_file, 100)
# Counter to track the number of processed chunks
chunk_count = 0  

# Load in the filtered tweets
for chunk in biden_chunks:
    if chunk.empty:  # If the chunk is empty (no more data)
        break  # Break out of the loop

    if chunk_count >= 10000:
        break  # Break out of the loop if 100 chunks have been processed

    #label tweets by topic
    label_df = label_tweets_by_topic(chunk, topic_to_label)
    biden_df = pd.concat([biden_df, label_df])
    chunk_count += 1  # Increment the chunk count

In [25]:
# View data with labeled topics
print(biden_df['topic'].value_counts())

-1    2150028
 2      16408
 1      12173
 0       3210
Name: topic, dtype: int64


In [26]:
# Drop any duplicate tweets and all data not assigned to a topic
biden_df = biden_df.drop_duplicates()
biden_topics = biden_df[biden_df['topic'] != -1]

In [27]:
# Verify data was cleaned correctly
print(biden_topics['topic'].value_counts())

2    13277
1     9845
0     2424
Name: topic, dtype: int64


In [28]:
biden_topics.head()

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,...,user_location,lat,long,city,country,continent,state,state_code,collected_at,topic
32,2020-10-15 00:01:38,1.316529628748673e+18,@maries_trella @CarmenAColeman @hugoventura @m...,5.0,2.0,Twitter for Android,9.777713601802322e+17,ClauMorales,ClauMor32263091,No DM por favor.,...,,,,,,,,,2020-10-21 00:00:11.942928972,2
147,2020-10-15 00:07:43,1.316531159539626e+18,@Acosta Hey #trump you #POS U got a kid home #...,0.0,0.0,Twitter Web App,1.0892621673948404e+18,Actionnow1🇺🇸🌊🌊💧💙🌈🕊🗽🕆🐋🐳 FBR bluewave,ActionnowI,God's babies and children ripped from their pa...,...,Build bridges🌊🌊🌈🗽not walls,,,,,,,,2020-10-21 00:00:56.355696087,1
281,2020-10-15 00:15:53,1.316533217206448e+18,#Trump outmaneuvers Grandpa Groper #JoeBiden a...,1.0,0.0,Twitter Web App,350903254.0,khnfri,khnfri,A free man and successful small business owner...,...,,,,,,,,,2020-10-21 00:01:46.366711158,0
355,2020-10-15 00:19:54,1.3165342260748367e+18,@TwitterSafety @Amy_Siskind I appreciate it. ...,0.0,0.0,Twitter Web App,1.3132165010175263e+18,GrabHimByTheBallot 🗽🌊🌈✊🇺🇸,VoteBh2020,#BidenHarris2020\nRebuilding after being bot r...,...,,,,,,,,,2020-10-21 00:02:13.984734407,2
377,2020-10-15 00:21:36,1.3165346561085604e+18,If ya don’t know about...\n⁃@Walmart’s #blackf...,3.0,0.0,Twitter for iPhone,28619103.0,idobi Radio,idobiradio,the internet's #1 alternative & pop punk stati...,...,the internet,,,,,,,,2020-10-21 00:02:22.195498075,2


In [29]:
# Run vader on tweets to assign a sentiment score
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Apply VADER sentiment analysis to each tweet in the filtered DataFrame
biden_topics['vader_score'] = biden_topics['tweet'].apply(lambda tweet: sid.polarity_scores(tweet)['compound'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  biden_topics['vader_score'] = biden_topics['tweet'].apply(lambda tweet: sid.polarity_scores(tweet)['compound'])


In [30]:
biden_topics.head(10)

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,...,lat,long,city,country,continent,state,state_code,collected_at,topic,vader_score
32,2020-10-15 00:01:38,1.316529628748673e+18,@maries_trella @CarmenAColeman @hugoventura @m...,5.0,2.0,Twitter for Android,9.777713601802322e+17,ClauMorales,ClauMor32263091,No DM por favor.,...,,,,,,,,2020-10-21 00:00:11.942928972,2,-0.296
147,2020-10-15 00:07:43,1.316531159539626e+18,@Acosta Hey #trump you #POS U got a kid home #...,0.0,0.0,Twitter Web App,1.0892621673948404e+18,Actionnow1🇺🇸🌊🌊💧💙🌈🕊🗽🕆🐋🐳 FBR bluewave,ActionnowI,God's babies and children ripped from their pa...,...,,,,,,,,2020-10-21 00:00:56.355696087,1,-0.3339
281,2020-10-15 00:15:53,1.316533217206448e+18,#Trump outmaneuvers Grandpa Groper #JoeBiden a...,1.0,0.0,Twitter Web App,350903254.0,khnfri,khnfri,A free man and successful small business owner...,...,,,,,,,,2020-10-21 00:01:46.366711158,0,-0.4767
355,2020-10-15 00:19:54,1.3165342260748367e+18,@TwitterSafety @Amy_Siskind I appreciate it. ...,0.0,0.0,Twitter Web App,1.3132165010175263e+18,GrabHimByTheBallot 🗽🌊🌈✊🇺🇸,VoteBh2020,#BidenHarris2020\nRebuilding after being bot r...,...,,,,,,,,2020-10-21 00:02:13.984734407,2,0.75
377,2020-10-15 00:21:36,1.3165346561085604e+18,If ya don’t know about...\n⁃@Walmart’s #blackf...,3.0,0.0,Twitter for iPhone,28619103.0,idobi Radio,idobiradio,the internet's #1 alternative & pop punk stati...,...,,,,,,,,2020-10-21 00:02:22.195498075,2,0.25
432,2020-10-15 00:24:52,1.3165354761325404e+18,@dannymoshow @icecube You’re right.\n\n@icecub...,0.0,0.0,Twitter for iPhone,33172759.0,Ozzy 🇺🇸,geminigod,#Science | #Tech | #Business Owner | Urban Dev...,...,,,,,,,,2020-10-21 00:02:42.722407246,1,0.7482
451,2020-10-15 00:25:37,1.3165356650190316e+18,Briton who tried to grab policeman's gun 'to k...,0.0,0.0,Twitter for iPhone,476388891.0,Vote Dem to Save Your Life,mh451,“If we had had confidence the President clearl...,...,,,,,,,,2020-10-21 00:02:49.813521323,2,-0.765
551,2020-10-15 00:31:03,1.316537032869519e+18,St. Louis couple who waved guns at legally sla...,0.0,0.0,dlvr.it,2405189490.0,D.C. Nation News 🌎,DietColaNation,Follow Diet Cola Nation's #1 Noble News networ...,...,31.193277,30.054019,,Egypt,Africa,Alexandria Governorate,,2020-10-21 00:03:27.881607422,2,0.1027
642,2020-10-15 00:38:08,1.3165388161534976e+18,Por favor explíquenme por que esto me hace reí...,2.0,0.0,Twitter Web App,54883778.0,Daniel @ElTraderRoto,DaMedinaR,Muchos años perdí plata en los mercados! pero ...,...,4.653333,-74.083652,Bogota,Colombia,South America,,,2020-10-21 00:04:02.217528218,2,0.4019
697,2020-10-15 00:41:48,1.3165397395252593e+18,"The ""smoking gun"".\nVolano stracci e Facebook ...",1.0,5.0,Twitter Web App,379030092.0,Cinzia,CCKKI,Se un uomo non ha scoperto qualcosa per cui è ...,...,,,,,,,,2020-10-21 00:04:23.117653919,2,0.0


In [31]:
# create a new dataframe with necessary information to feed into model
new_biden = pd.DataFrame()
new_biden['tweet'] = biden_topics['tweet']
new_biden['topic'] = biden_topics['topic']
new_biden['vader_score'] = biden_topics['vader_score']
new_biden.head()

Unnamed: 0,tweet,topic,vader_score
32,@maries_trella @CarmenAColeman @hugoventura @m...,2,-0.296
147,@Acosta Hey #trump you #POS U got a kid home #...,1,-0.3339
281,#Trump outmaneuvers Grandpa Groper #JoeBiden a...,0,-0.4767
355,@TwitterSafety @Amy_Siskind I appreciate it. ...,2,0.75
377,If ya don’t know about...\n⁃@Walmart’s #blackf...,2,0.25


In [49]:
# Displaying data size. 
print("Biden Data size: ")
print(new_biden.shape)
print("Trump Data size: ")
print(new_trump.shape)
print("Biden topics: ")
print(new_biden['topic'].value_counts())
print("trump topics: ")
print(new_trump['topic'].value_counts())

Biden Data size: 
(25546, 4)
Trump Data size: 
(15323, 4)
Biden topics: 
2    13277
1     9845
0     2424
Name: topic, dtype: int64
trump topics: 
2    7733
1    6175
0    1415
Name: topic, dtype: int64


In [50]:
new_biden.head()

Unnamed: 0,tweet,topic,vader_score,source
32,@maries_trella @CarmenAColeman @hugoventura @m...,2,-0.296,biden
147,@Acosta Hey #trump you #POS U got a kid home #...,1,-0.3339,biden
281,#Trump outmaneuvers Grandpa Groper #JoeBiden a...,0,-0.4767,biden
355,@TwitterSafety @Amy_Siskind I appreciate it. ...,2,0.75,biden
377,If ya don’t know about...\n⁃@Walmart’s #blackf...,2,0.25,biden


In [54]:
# Filter the data by  pro trump and biden to give it labels
real_filtered_biden = filter_tweets_by_hashtags(new_biden, biden_hashtags, 'biden')
real_filtered_biden.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tweets['label'] = label


Unnamed: 0,tweet,topic,vader_score,source,label
850,@tackettdc @ZekeJMiller #TrumpRallyIowa \nTrum...,1,-0.8316,biden,biden
1780,#Trump supporters are sending out lies about t...,0,0.0258,biden,biden
3690,@realDonaldTrump Trump condones the killing of...,0,-0.5012,biden,biden
3833,You and I are paying for the damn wall that is...,1,-0.4019,biden,biden
4243,You and I are paying with our hard earned mone...,1,-0.1027,biden,biden


In [56]:
real_filtered_biden.shape
print(real_filtered_biden['topic'].value_counts())

1    852
2    631
0    246
Name: topic, dtype: int64


In [57]:
# Filter the data by  pro trump and biden to give it labels
real_filtered_trump = filter_tweets_by_hashtags(new_trump, trump_hashtags, 'trump')
real_filtered_trump.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tweets['label'] = label


Unnamed: 0,tweet,vader_score,topic,source,label
1326,@LarrySchweikart The talk about Tiffany #Trump...,0.2558,0,trump,trump
1682,#MAGA2020: @realDonaldTrump is lying about who...,-0.3049,1,trump,trump
1887,@IngrahamAngle @politico Rumors about Tiffany ...,0.4069,0,trump,trump
2012,@davenewworld_2 Very typical Trump supporter. ...,0.9068,0,trump,trump
2163,@IngrahamAngle @RaymondArroyo @HawleyMO @RealC...,0.2789,0,trump,trump


In [58]:
real_filtered_trump.shape
print(real_filtered_trump['topic'].value_counts())

1    660
2    529
0    168
Name: topic, dtype: int64


In [59]:
merged_bt = pd.concat([real_filtered_trump, real_filtered_biden])
merged_bt = merged_bt.sample(frac=1, random_state=42).reset_index(drop=True)
merged_bt.head(20)

Unnamed: 0,tweet,vader_score,topic,source,label
0,@SholaMos1 @scottienhughes Methinks the desper...,0.6814,1,biden,biden
1,Voting for #JoeBiden #BidenHarris2020 is votin...,0.0,0,biden,biden
2,And has no respect for human life (hear that p...,0.2263,0,biden,biden
3,Hey #MAGA fools - you know you're paying for #...,-0.6249,1,trump,trump
4,@Jorgeatapia @CNNEE Tu busca las pruebas. Esta...,0.4215,2,biden,biden
5,"Another wall that Mexico won't be paying for, ...",0.0,1,trump,trump
6,There are three basic reasons #MAGAts support ...,0.25,0,trump,trump
7,#DonaldTrump is a showmen...\nDon't be fooled ...,0.7482,0,biden,biden
8,Humpty Trumpty sat on the wall..\n\n#trump #tr...,0.0,1,biden,biden
9,make this fucking asshole famous! #maga #guns ...,0.0,2,trump,trump


In [60]:
# Displaying data size. 

print(real_filtered_biden['topic'].value_counts())

print(real_filtered_trump['topic'].value_counts())

1    852
2    631
0    246
Name: topic, dtype: int64
1    660
2    529
0    168
Name: topic, dtype: int64


In [43]:
# Concatenate the dataframes
biden_trump = pd.concat([new_biden, new_trump])
# Reset index
biden_trump = biden_trump.sample(frac=1, random_state=42).reset_index(drop=True)

In [38]:
biden_trump.head(20)

Unnamed: 0,tweet,topic,vader_score
0,@RealCarolJones @charlottesix6 That’s the spec...,2,-0.6351
1,@JasonMillerinDC @nypost I think I'll end on t...,0,-0.4215
2,Seuls les bulletins legos doivent être comptés...,2,0.0
3,On his first day in office #JoeBiden will use ...,0,-0.836
4,"This family does... with all our heart, all ou...",2,0.9057
5,@realDonaldTrump He’s using our tax payers dol...,2,-0.4404
6,#CHINA destroyed #trump not #biden nor #immigr...,1,-0.4939
7,Donald #Trump eats aborted babies,0,0.0
8,El moderador del #Debates2020 debería ser @lcv...,2,0.0
9,Ok so biden won... fuck YES.. i know he just ...,1,0.6983


In [44]:
biden_trump.shape

(40869, 4)

In [45]:
biden_trump.drop_duplicates
biden_trump.shape

(40869, 4)

In [39]:
# Outwrite to csv to load on different computer
biden_trump.to_csv('final_biden_trump.csv', index=False)