In [28]:
import pandas as pd
import re
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [6]:
# Load Trump and Biden tweet datasets 
trump_tweets_file = "archive/hashtag_donaldtrump.csv"
biden_tweets_file = "archive/hashtag_joebiden.csv"

# Define hashtags to filter data into pro trump or pro biden dataframes
trump_hashtags = ["#MAGA", "#KAG", "#FourMoreYears", "#SleepyJoe", "#BlacksForTrump", "#Trump2020", "#VoteRed", "#WomenForTrump", "#LatinosForTrump", "#AmericaFirst", "#BuildTheWall"]
biden_hashtags = ["#Biden2020", "#BidenHarris2020", "#VoteBlue", "#NotMyPresident", "#BlueWave2020", "#VoteBiden", "#VoteBlueToSaveAmerica", "#BlacksForBiden", "#WomenForBiden", "#LatinosForBiden"]

# Dictionary containing contentious topics to fliter the tweets with
# topic_to_label = {'abortion': 0, 'Abortion': 0,'fetus': 0, 'Fetus': 0,'pro-life': 0, 'pro-choice': 0, 'babies': 0,
#                 'immigration': 1, 'immigrant': 1, 'immigrants': 1, 'Immigration': 1, 'Immigrant': 1, 'Immigrants': 1,
#                 'wall': 1, 'border': 1, 'arms': 2, 'gun': 2, 'bullet': 2, 'Gun': 2, 'guns': 2, 'Guns': 2}

# Initialize trump and biden dfs
trump_df = pd.DataFrame()
biden_df = pd.DataFrame()



In [7]:
def filter_tweets_by_hashtags(tweet_df, hashtags, label):
    """
    Filters tweets based on the presence of specified hashtags.

    Parameters:
        tweet_df (pandas.DataFrame): DataFrame containing tweets.
        hashtags (list): List of hashtags to filter tweets.
        label (str): Label to assign to filtered tweets.

    Returns:
        pandas.DataFrame: DataFrame containing filtered tweets with an additional 'label' column.
    """
    # Convert 'tweet' column to string
    tweet_df['tweet'] = tweet_df['tweet'].astype(str)
    
    # Construct a regular expression pattern to match any of the specified hashtags
    pattern = '|'.join([re.escape(tag) for tag in hashtags])
    
    # Filter tweets that contain any of the specified hashtags
    filtered_tweets = tweet_df[tweet_df['tweet'].str.contains(pattern, case=False)]
    
    # Add a new column 'label' with specified label
    filtered_tweets['label'] = label
    
    return filtered_tweets

In [9]:
# Function to read tweets in chunks
def read_tweets_in_chunks(file_path, chunk_size):
    chunks = pd.read_csv(file_path, chunksize=chunk_size)
    return chunks

In [10]:
# Read and process Trump tweets
trump_chunks = read_tweets_in_chunks(trump_tweets_file, 100)

# Counter to track the number of processed chunks
chunk_count = 0  

# Load in the filtered tweets
for chunk in trump_chunks:
    if chunk_count >= 10000:
        break  # Break out of the loop if 100 chunks have been processed

    #label tweets by topic
    label_df = filter_tweets_by_hashtags(chunk, trump_hashtags, 'trump')
    trump_df = pd.concat([trump_df, label_df])
    chunk_count += 1  # Increment the chunk count

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tweets['label'] = label
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tweets['label'] = label
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tweets['label'] = label
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer]

In [11]:
trump_df.shape[0]

102091

In [12]:
# Read and process biden tweets
biden_chunks = read_tweets_in_chunks(biden_tweets_file, 100)
# Counter to track the number of processed chunks
chunk_count = 0  

# Load in the filtered tweets
for chunk in biden_chunks:
    if chunk.empty:  # If the chunk is empty (no more data)
        break  # Break out of the loop

    if chunk_count >= 10000:
        break  # Break out of the loop if 100 chunks have been processed

    #label tweets by topic
    label_df = filter_tweets_by_hashtags(chunk, biden_hashtags, 'biden')
    biden_df = pd.concat([biden_df, label_df])
    chunk_count += 1  # Increment the chunk count

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tweets['label'] = label
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tweets['label'] = label
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tweets['label'] = label
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer]

In [13]:
print(biden_df.shape)

(85406, 22)


In [14]:
trump_df.head()

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,...,user_location,lat,long,city,country,continent,state,state_code,collected_at,label
12,2020-10-15 00:00:26,1.3165293286084813e+18,#Trump #PresidentTrump #Trump2020LandslideVict...,3.0,5.0,Twitter for Android,1.2433153463979663e+18,Ron Burgundy,Anchorman_USA,"I'm kind of a Big Deal, People know me! I driv...",...,"San Diego, CA",32.717421,-117.162771,San Diego,United States of America,North America,California,CA,2020-10-21 00:00:04.478598364,trump
17,2020-10-15 00:00:44,1.3165294056284815e+18,Now Open! Create a FREE U.S. Election Pick'em ...,0.0,0.0,Hootsuite Inc.,21932049.0,OfficePools.com,officepools_com,Visit https://t.co/CdmWfqRMIS to play now! #fa...,...,,,,,,,,,2020-10-21 00:00:06.344681016,trump
23,2020-10-15 00:01:16,1.3165295378203812e+18,@tedcruz @cc125 #Trump2020 #BLM #obama #VoteRe...,1.0,0.0,Twitter Web App,1.3100400474214031e+18,Infamous One,InfamousOne13,"""The company you keep can have a major impact ...",...,,,,,,,,,2020-10-21 00:00:08.583980198,trump
25,2020-10-15 00:01:20,1.3165295552770455e+18,@RudyGiuliani @Twitter @PressSec You right @Ru...,0.0,0.0,Twitter for Android,1010405455.0,David Goldstein,Amambo12Carlos,We are part of a Symbiotic relationship with s...,...,"Florida, USA",27.756767,-81.463983,,United States of America,North America,Florida,FL,2020-10-21 00:00:09.330413259,trump
34,2020-10-15 00:01:42,1.316529648323289e+18,#BlacksForTrump \n#BlackVoicesForTrump \n#Bide...,1.0,1.0,Twitter Web App,2818208253.0,RickenRich,RickenRich,Rickenbacker guitars and basses are the intere...,...,"California, USA",36.701463,-118.755997,,United States of America,North America,California,CA,2020-10-21 00:00:12.689362032,trump


In [16]:
trump1 = pd.DataFrame()
trump1['tweet']= trump_df['tweet']
trump1['label'] = trump_df['label']
trump1['label'] = 1
trump1.head()

Unnamed: 0,tweet,label
12,#Trump #PresidentTrump #Trump2020LandslideVict...,1
17,Now Open! Create a FREE U.S. Election Pick'em ...,1
23,@tedcruz @cc125 #Trump2020 #BLM #obama #VoteRe...,1
25,@RudyGiuliani @Twitter @PressSec You right @Ru...,1
34,#BlacksForTrump \n#BlackVoicesForTrump \n#Bide...,1


In [20]:
biden1 = pd.DataFrame()
biden1['tweet']= biden_df['tweet']
biden1['label'] = biden_df['label']
biden1['label'] = 0
biden1.head()

Unnamed: 0,tweet,label
14,@tedcruz @cc125 #Trump2020 #BLM #obama #VoteRe...,0
17,"Comments on this? ""Do Democrats Understand how...",0
35,#unflatteringdogphotochallenge #HunterBiden #C...,0
42,"Since Twitter is censoring news, here’s the sc...",0
53,This is from the same night I met the cast of ...,0


In [19]:
print(biden1.shape)
print(trump1.shape)

(85406, 2)
(102091, 2)


In [21]:
# Determine the size of the smaller dataframe
min_size = min(biden1.shape[0], trump1.shape[0])

# Sample a subset of data from the larger dataframe to match the size of the smaller dataframe
trump1_subset = trump1.sample(n=min_size, random_state=42)

In [23]:
print(trump1_subset.shape)
print(biden1.shape)

(85406, 2)
(85406, 2)


In [24]:
# Concatenate the dataframes row-wise
bidentrump = pd.concat([biden1, trump1_subset], ignore_index=True)

# Shuffle the rows of the merged dataframe
bidentrump = bidentrump.sample(frac=1, random_state=42).reset_index(drop=True)

In [26]:
bidentrump.head(15)

Unnamed: 0,tweet,label
0,#Trump #Trump2020 #MAGA #MAGA2020\nWHY CAN'T Y...,1
1,@realDonaldTrump caused America to slip into a...,1
2,"Electoral fraud, @RealDonaldTrump? Why would t...",1
3,@MarcoMNYC @RudyGiuliani @MariaBartiromo I mea...,0
4,@JoeBiden #Biden @KamalaHarris #HarrisBiden #B...,0
5,@realDonaldTrump Arizona Goes for Trump! Penn...,1
6,#VoteForTrump #ElectionDay #Election2020 #Mak...,1
7,Flint City Council’s Maurice Davis: Dems Not H...,1
8,Towns in America boarding up shops in case Tru...,1
9,#Biden #BidenHarris2020 #HunterBiden #HunterBi...,0


In [31]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/daisyabbott/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [32]:
# Run tweets through vader to assign sentiment scores 
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Apply VADER sentiment analysis to each tweet in the filtered DataFrame
bidentrump['vader_score'] = bidentrump['tweet'].apply(lambda tweet: sid.polarity_scores(tweet)['compound'])


In [33]:
bidentrump.head(20)

Unnamed: 0,tweet,label,vader_score
0,#Trump #Trump2020 #MAGA #MAGA2020\nWHY CAN'T Y...,1,0.8225
1,@realDonaldTrump caused America to slip into a...,1,-0.6377
2,"Electoral fraud, @RealDonaldTrump? Why would t...",1,-0.8818
3,@MarcoMNYC @RudyGiuliani @MariaBartiromo I mea...,0,-0.2714
4,@JoeBiden #Biden @KamalaHarris #HarrisBiden #B...,0,0.0
5,@realDonaldTrump Arizona Goes for Trump! Penn...,1,0.0
6,#VoteForTrump #ElectionDay #Election2020 #Mak...,1,0.0
7,Flint City Council’s Maurice Davis: Dems Not H...,1,-0.2235
8,Towns in America boarding up shops in case Tru...,1,0.34
9,#Biden #BidenHarris2020 #HunterBiden #HunterBi...,0,0.0


In [34]:
# Outwrite to csv to load on different computer
bidentrump.to_csv('bidentrump.csv', index=False)