In [1]:
from dateutil import parser
import pandas as pd
import re

# Function to check if a text is mostly English based on ASCII characters
def is_mostly_english(text, threshold=0.8):
    english_chars = sum(1 for c in text if c.isascii())
    return (english_chars / len(text)) >= threshold if len(text) > 0 else False

# Function to clean content: remove hashtags, links, and keep only English text
def clean_content(text):
    text = re.sub(r"#\w+", "", text)  # Remove hashtags
    text = re.sub(r"http\S+", "", text)  # Remove links
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

# Load 'tweets.csv'
tweets_df = pd.read_csv("tweets.csv")

# Convert tweetDate using dateutil parser for flexibility
tweets_df['tweetDate'] = tweets_df['tweetDate'].apply(lambda x: parser.parse(str(x)) if pd.notnull(x) else None)

# Remove duplicate tweets and drop missing content
tweets_df = tweets_df.drop_duplicates(subset=['content'])
tweets_df = tweets_df.dropna(subset=['content'])

# Extract hashtags, mentions, and links from content
tweets_df['hashtags'] = tweets_df['content'].str.findall(r"#\w+")
tweets_df['mentions'] = tweets_df['content'].str.findall(r"@\w+")
tweets_df['links'] = tweets_df['content'].str.findall(r"https?://\S+")

# Apply cleaning function to content
tweets_df['cleaned_content'] = tweets_df['content'].apply(clean_content)

# Detect and filter only mostly English text
tweets_df = tweets_df[tweets_df['cleaned_content'].apply(is_mostly_english)]

# Drop original 'content' column and rename 'cleaned_content' to 'content'
tweets_df = tweets_df.drop(columns=['content']).rename(columns={'cleaned_content': 'content'})

# Display first few rows
print("Cleaned Tweets Dataset:")
print(tweets_df.head())

Cleaned Tweets Dataset:
                  tweetDate                       twitterProfile  \
0 2024-11-01 10:26:18+00:00      https://twitter.com/great_osho_   
2 2024-11-01 14:50:34+00:00      https://twitter.com/harris_wins   
3 2024-11-01 08:44:25+00:00  https://twitter.com/Talhaofficial01   
4 2024-11-01 14:06:59+00:00  https://twitter.com/singhshreya1203   
7 2024-10-31 23:37:02+00:00    https://twitter.com/colinpmcquaid   

                                            tweetUrl  \
0  https://twitter.com/great_osho_/status/1852296...   
2  https://twitter.com/harris_wins/status/1852362...   
3  https://twitter.com/Talhaofficial01/status/185...   
4  https://twitter.com/singhshreya1203/status/185...   
7  https://twitter.com/colinpmcquaid/status/18521...   

                  timestamp            query  \
0  2024-11-01T16:17:38.608Z  #uselection2024   
2  2024-11-01T16:17:38.608Z  #uselection2024   
3  2024-11-01T16:17:38.608Z  #uselection2024   
4  2024-11-01T16:17:38.608Z  #uselecti

In [2]:
tweets_df

Unnamed: 0,tweetDate,twitterProfile,tweetUrl,timestamp,query,hashtags,mentions,links,content
0,2024-11-01 10:26:18+00:00,https://twitter.com/great_osho_,https://twitter.com/great_osho_/status/1852296...,2024-11-01T16:17:38.608Z,#uselection2024,[#USElection2024],[],[https://t.co/fkwi0l4gOn],Who will become the President of America? Like...
2,2024-11-01 14:50:34+00:00,https://twitter.com/harris_wins,https://twitter.com/harris_wins/status/1852362...,2024-11-01T16:17:38.608Z,#uselection2024,[],[],[https://t.co/1eYy7ac78H],BREAKING: New data from CNN shows women are si...
3,2024-11-01 08:44:25+00:00,https://twitter.com/Talhaofficial01,https://twitter.com/Talhaofficial01/status/185...,2024-11-01T16:17:38.608Z,#uselection2024,"[#TrumpVance2024, #DonaldTrump, #Trump, #USEle...",[],[https://t.co/wlQFX8TEA4],Drop a ❤🇺🇸 if this is your Next President!
4,2024-11-01 14:06:59+00:00,https://twitter.com/singhshreya1203,https://twitter.com/singhshreya1203/status/185...,2024-11-01T16:17:38.608Z,#uselection2024,"[#KamalaHarris, #DonaldTrump, #USElection2024]",[],[https://t.co/eTIqUDeW0j],Give your opinion plz American president elect...
7,2024-10-31 23:37:02+00:00,https://twitter.com/colinpmcquaid,https://twitter.com/colinpmcquaid/status/18521...,2024-11-01T16:17:38.608Z,#uselection2024,"[#trump, #maga, #USElection2024, #election]","[@Scaramucci, @jonstewart, @LastWeekTonight]",[https://t.co/4ebFHL06OS],Don't forget to throw out your garbage on Tues...
...,...,...,...,...,...,...,...,...,...
2422,2024-11-08 14:11:29+00:00,https://twitter.com/Trump_GAOA,https://twitter.com/Trump_GAOA/status/18548894...,2024-11-15T21:57:38.065Z,#uselection2024,"[#Solana, #SOL, #Trump2024, #Trump, #USElectio...",[],[],We're still collecting address for $GAOA airdr...
2423,2024-11-07 15:13:10+00:00,https://twitter.com/harry15809,https://twitter.com/harry15809/status/18545426...,2024-11-15T21:57:40.801Z,#uselection2024,"[#nationalgrid, #naijaeconomy, #USElection2024]",[@harry15809],[https://t.co/zqMXbsQioI],Thanks you. To celebrate this I would give 2k ...
2426,2024-11-07 12:30:00+00:00,https://twitter.com/newslaundry,https://twitter.com/newslaundry/status/1854501...,2024-11-15T21:57:48.539Z,#uselection2024,"[#NLHafta, #USElection2024, #LawrenceBishnoi]","[@AbhinandanSekhr, @ramankirpal, @MnshaP, @see...","[https://t.co/eTGDJl3wub, https://t.co/9Seq9rV...","🔒This week on , @AbhinandanSekhr, @ramankirpal..."
2427,2024-11-07 17:40:42+00:00,https://twitter.com/ZaPLMundurem,https://twitter.com/ZaPLMundurem/status/185457...,2024-11-15T21:57:48.539Z,#uselection2024,"[#TuskKłamie, #TuskTchórz, #Election2024, #Wyb...",[@KosiniakKamysz],[https://t.co/0aceci5IXc],"Fikołki Kamysza, czyli Kosiniak @KosiniakKamys..."


In [3]:
# Split data into time periods
tweets_before = tweets_df[tweets_df['tweetDate'] < '2024-11-06']
tweets_after = tweets_df[tweets_df['tweetDate'] > '2024-11-05']

# Sample up to 5000 rows for each period
num_before = min(len(tweets_before), 5000)
num_after = min(len(tweets_after), 5000)
tweets_before_sampled = tweets_before.sample(n=num_before, random_state=42)
tweets_after_sampled = tweets_after.sample(n=num_after, random_state=42)

# Sort tweets by date
tweets_before_sampled = tweets_before_sampled.sort_values(by='tweetDate', ascending=True)
tweets_after_sampled = tweets_after_sampled.sort_values(by='tweetDate', ascending=True)

# Save datasets
tweets_before_sampled.to_csv("tweets_before.csv", index=False)
tweets_after_sampled.to_csv("tweets_after.csv", index=False)

In [4]:
print("Sample Tweets Before Election:")
print(tweets_before_sampled.head())

Sample Tweets Before Election:
                    tweetDate                     twitterProfile  \
345 2024-10-21 19:06:14+00:00      https://twitter.com/Usa2024Jo   
226 2024-10-23 16:31:33+00:00   https://twitter.com/MacroMicroMe   
312 2024-10-24 05:56:57+00:00     https://twitter.com/delphibets   
277 2024-10-24 09:36:31+00:00  https://twitter.com/Terciosdelsol   
99  2024-10-24 10:03:33+00:00   https://twitter.com/TrumpGPTMeme   

                                              tweetUrl  \
345  https://twitter.com/Usa2024Jo/status/184844067...   
226  https://twitter.com/MacroMicroMe/status/184912...   
312  https://twitter.com/delphibets/status/18493292...   
277  https://twitter.com/Terciosdelsol/status/18493...   
99   https://twitter.com/TrumpGPTMeme/status/184939...   

                    timestamp            query  \
345  2024-11-01T21:57:37.002Z  #uselection2024   
226  2024-11-01T21:57:04.856Z  #uselection2024   
312  2024-11-01T21:57:30.628Z  #uselection2024   
277  2024-1

In [5]:
print("\nSample Tweets After Election:")
print(tweets_after_sampled.head())


Sample Tweets After Election:
                     tweetDate                     twitterProfile  \
1203 2024-11-05 00:13:46+00:00       https://twitter.com/RAGrubb2   
1234 2024-11-05 00:27:49+00:00    https://twitter.com/Baraniiraj3   
1219 2024-11-05 00:34:10+00:00    https://twitter.com/andywigmore   
1194 2024-11-05 00:45:45+00:00      https://twitter.com/QGurtrude   
1322 2024-11-05 00:49:15+00:00  https://twitter.com/_LauraKlassen   

                                               tweetUrl  \
1203  https://twitter.com/RAGrubb2/status/1853591500...   
1234  https://twitter.com/Baraniiraj3/status/1853595...   
1219  https://twitter.com/andywigmore/status/1853596...   
1194  https://twitter.com/QGurtrude/status/185359954...   
1322  https://twitter.com/_LauraKlassen/status/18536...   

                     timestamp            query  \
1203  2024-11-05T09:07:14.885Z  #uselection2024   
1234  2024-11-05T09:07:27.777Z  #uselection2024   
1219  2024-11-05T09:07:18.443Z  #uselection202