In [1]:
# python3 -m venv venv
# source venv/bin/activate
# pip install tweepy pandas numpy matplotlib seaborn nltk spacy gensim scikit-learn statsmodels yfinance python-dotenv
# python -m spacy download en_core_web_sm

In [None]:
import re
import spacy
import numpy as np
import pandas as pd
from textblob import TextBlob
from bertopic import BERTopic
from transformers import pipeline

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
# Path to your CSV file
trump_tweets_csv = 'TrumptTweetArchive20160801_2020.csv'

# Define which strings should be treated as True/False
true_vals = ['t', 'true', 'True']
false_vals = ['f', 'false', 'False']

# Read the CSV
trump_tweets_df = pd.read_csv(
    trump_tweets_csv,
    dtype={
        'id': 'int64',
        'device': 'string',
        'favorites': 'int64',
        'retweets': 'int64'
    },
    parse_dates=['date'],        # parse the date column as datetime
    true_values=true_vals,       # map these to True
    false_values=false_vals      # map these to False
)

del trump_tweets_csv

In [5]:
# Quick check
print(trump_tweets_df.dtypes)

id                    int64
text                 object
isRetweet              bool
isDeleted              bool
device       string[python]
favorites             int64
retweets              int64
date         datetime64[ns]
isFlagged              bool
dtype: object


In [6]:
trump_tweets_df.head()

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,Republicans and Democrats have both created ou...,False,False,TweetDeck,49,255,2011-08-02 18:07:48,False
1,1234653427789070336,I was thrilled to be back in the Great city of...,False,False,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,False
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,True,False,Twitter for iPhone,0,7396,2020-01-17 03:22:47,False
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,False,False,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,False
4,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,True,False,Twitter for iPhone,0,9081,2020-01-17 13:13:59,False


In [7]:
trump_tweets_df.id[56569]

1319384118849949702

In [8]:
trump_tweets_df.shape

(56571, 9)

In [9]:
# Path to your second CSV file
biden_tweets_csv = 'JoeBidenTweets.csv'

# Read the CSV
biden_tweets_df = pd.read_csv(
    biden_tweets_csv,
    dtype={
        'id': 'int64',
        'url': 'string',
        'tweet': 'string',
        'replies': 'int64',
        'retweets': 'int64',
        'quotes': 'int64',
        'likes': 'int64'
    },
    parse_dates=['timestamp']   # parse timestamp as datetime
)

# Rename columns to match your first DataFrame
biden_tweets_df = biden_tweets_df.rename(columns={
    'timestamp': 'date',
    'tweet': 'text'
})

del biden_tweets_csv

# Quick sanity check
print(biden_tweets_df.dtypes)


id                   int64
date        datetime64[ns]
url         string[python]
text        string[python]
replies              int64
retweets             int64
quotes               int64
likes                int64
dtype: object


In [None]:
biden_tweets_all_tweets_all_tweets_df.tail()

Unnamed: 0,id,date,url,text,replies,retweets,quotes,likes
6059,1322865972819251200,2020-11-01 11:40:00,https://twitter.com/JoeBiden/status/1322865972...,"As president, I’ll build an economy that rewar...",3085,3654,480,36995
6060,1322871257902145536,2020-11-01 12:01:00,https://twitter.com/JoeBiden/status/1322871257...,Let’s put dogs back in the White House. https:...,2211,4211,1246,34240
6061,1322876039144636417,2020-11-01 12:20:00,https://twitter.com/JoeBiden/status/1322876039...,We have in our hands the ultimate power: the p...,918,1143,122,6954
6062,1322881072363917312,2020-11-01 12:40:00,https://twitter.com/JoeBiden/status/1322881072...,"We need to: Build bridges, not walls. Open ou...",1281,2255,344,16696
6063,1322886357535150081,2020-11-01 13:01:00,https://twitter.com/JoeBiden/status/1322886357...,Jodie –– you have my word: I will always stand...,168,270,37,1648


In [11]:
biden_tweets_df.url[6060]

'https://twitter.com/JoeBiden/status/1322871257902145536'

In [12]:
trump_hashtags_df = pd.read_csv("hashtag_donaldtrump.csv", lineterminator='\n')
trump_hashtags_df.head()

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,user_join_date,user_followers_count,user_location,lat,long,city,country,continent,state,state_code,collected_at
0,2020-10-15 00:00:01,1.316529e+18,#Elecciones2020 | En #Florida: #JoeBiden dice ...,0.0,0.0,TweetDeck,360666500.0,El Sol Latino News,elsollatinonews,🌐 Noticias de interés para latinos de la costa...,2011-08-23 15:33:45,1860.0,"Philadelphia, PA / Miami, FL",25.77427,-80.19366,,United States of America,North America,Florida,FL,2020-10-21 00:00:00
1,2020-10-15 00:00:01,1.316529e+18,"Usa 2020, Trump contro Facebook e Twitter: cop...",26.0,9.0,Social Mediaset,331617600.0,Tgcom24,MediasetTgcom24,Profilo ufficiale di Tgcom24: tutte le notizie...,2011-07-08 13:12:20,1067661.0,,,,,,,,,2020-10-21 00:00:00.373216530
2,2020-10-15 00:00:02,1.316529e+18,"#Trump: As a student I used to hear for years,...",2.0,1.0,Twitter Web App,8436472.0,snarke,snarke,"Will mock for food! Freelance writer, blogger,...",2007-08-26 05:56:11,1185.0,Portland,45.520247,-122.674195,Portland,United States of America,North America,Oregon,OR,2020-10-21 00:00:00.746433060
3,2020-10-15 00:00:02,1.316529e+18,2 hours since last tweet from #Trump! Maybe he...,0.0,0.0,Trumpytweeter,8.283556e+17,Trumpytweeter,trumpytweeter,"If he doesn't tweet for some time, should we b...",2017-02-05 21:32:17,32.0,,,,,,,,,2020-10-21 00:00:01.119649591
4,2020-10-15 00:00:08,1.316529e+18,You get a tie! And you get a tie! #Trump ‘s ra...,4.0,3.0,Twitter for iPhone,47413800.0,Rana Abtar - رنا أبتر,Ranaabtar,"Washington Correspondent, Lebanese-American ,c...",2009-06-15 19:05:35,5393.0,Washington DC,38.894992,-77.036558,Washington,United States of America,North America,District of Columbia,DC,2020-10-21 00:00:01.492866121


In [13]:
biden_hashtags_df = pd.read_csv("hashtag_joebiden.csv", lineterminator='\n')
biden_hashtags_df.head()

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,user_join_date,user_followers_count,user_location,lat,long,city,country,continent,state,state_code,collected_at
0,2020-10-15 00:00:01,1.316529e+18,#Elecciones2020 | En #Florida: #JoeBiden dice ...,0.0,0.0,TweetDeck,360666500.0,El Sol Latino News,elsollatinonews,🌐 Noticias de interés para latinos de la costa...,2011-08-23 15:33:45,1860.0,"Philadelphia, PA / Miami, FL",25.77427,-80.19366,,United States of America,North America,Florida,FL,2020-10-21 00:00:00
1,2020-10-15 00:00:18,1.316529e+18,#HunterBiden #HunterBidenEmails #JoeBiden #Joe...,0.0,0.0,Twitter for iPad,809904400.0,Cheri A. 🇺🇸,Biloximeemaw,"Locked and loaded Meemaw. Love God, my family ...",2012-09-08 01:03:57,6628.0,,,,,,,,,2020-10-21 00:00:00.517827283
2,2020-10-15 00:00:20,1.316529e+18,@IslandGirlPRV @BradBeauregardJ @MeidasTouch T...,0.0,0.0,Twitter Web App,3494182000.0,Flag Waver,Flag_Wavers,,2015-08-30 19:38:21,1536.0,Golden Valley Arizona,46.304036,-109.171431,,United States of America,North America,Montana,MT,2020-10-21 00:00:01.035654566
3,2020-10-15 00:00:21,1.316529e+18,@chrislongview Watching and setting dvr. Let’s...,0.0,0.0,Twitter for iPhone,8.242596e+17,Michelle Ferg,MichelleFerg4,,2017-01-25 14:16:17,27.0,,,,,,,,,2020-10-21 00:00:01.553481849
4,2020-10-15 00:00:22,1.316529e+18,#censorship #HunterBiden #Biden #BidenEmails #...,1.0,0.0,Twitter Web App,1.032807e+18,the Gold State,theegoldstate,A Silicon Valley #independent #News #Media #St...,2018-08-24 01:49:01,390.0,"California, USA",36.701463,-118.755997,,United States of America,North America,California,CA,2020-10-21 00:00:02.071309132


In [14]:
del trump_hashtags_df
del biden_hashtags_df

In [15]:
trump_tweets_clean_df = trump_tweets_df[
    (trump_tweets_df["isRetweet"] == False)  # only keep Trump's original tweets
    & (~trump_tweets_df["text"].str.startswith("https://"))  # delete the tweets that start with https://
    ][['date', 'text']].copy() 
biden_tweets_clean_df = biden_tweets_df[['date', 'text']].copy()
del trump_tweets_df
del biden_tweets_df

In [16]:
print("Min value of date in trump_tweets_clean_df:", trump_tweets_clean_df['date'].min())
print("Max value of date in trump_tweets_clean_df:", trump_tweets_clean_df['date'].max())
print("Min value of date in biden_tweets_clean_df:", biden_tweets_clean_df['date'].min())
print("Max value of date in biden_tweets_clean_df:", biden_tweets_clean_df['date'].max())

Min value of date in trump_tweets_clean_df: 2009-05-04 18:54:25
Max value of date in trump_tweets_clean_df: 2021-01-08 15:44:28
Min value of date in biden_tweets_clean_df: 2007-10-24 22:45:00
Max value of date in biden_tweets_clean_df: 2020-11-01 13:01:00


In [17]:
analysis_start_date = '2019-01-01 00:00:00'
analysis_end_date = '2020-10-31 23:59:59'

In [18]:
trump_tweets_analysis_df = trump_tweets_clean_df[
    (trump_tweets_clean_df["date"] >= analysis_start_date) &
    (trump_tweets_clean_df["date"] <= analysis_end_date)
].copy()
# Resetting the index
trump_tweets_analysis_df = trump_tweets_analysis_df.reset_index()
# Rename 'text' to 'tweet_text'
trump_tweets_analysis_df = trump_tweets_analysis_df.rename(columns={'text': 'tweet_text'})

del trump_tweets_clean_df

In [19]:
biden_tweets_analysis_df = biden_tweets_clean_df[
    (biden_tweets_clean_df["date"] >= analysis_start_date) &
    (biden_tweets_clean_df["date"] <= analysis_end_date)
].copy()
# Resetting the index
biden_tweets_analysis_df = biden_tweets_analysis_df.reset_index()
# Rename 'text' to 'tweet_text'
biden_tweets_analysis_df = biden_tweets_analysis_df.rename(columns={'text': 'tweet_text'})

del biden_tweets_clean_df

In [20]:
print("Min value of date in trump_tweets_analysis_df:", trump_tweets_analysis_df['date'].min())
print("Max value of date in trump_tweets_analysis_df:", trump_tweets_analysis_df['date'].max())
print("Min value of date in biden_tweets_analysis_df:", biden_tweets_analysis_df['date'].min())
print("Max value of date in biden_tweets_analysis_df:", biden_tweets_analysis_df['date'].max())

Min value of date in trump_tweets_analysis_df: 2019-01-01 00:40:26
Max value of date in trump_tweets_analysis_df: 2020-10-31 23:52:33
Min value of date in biden_tweets_analysis_df: 2019-01-12 03:17:00
Max value of date in biden_tweets_analysis_df: 2020-10-31 23:55:00


In [21]:
print("Shape of trump_tweets_analysis_df:", trump_tweets_analysis_df.shape)
print("Shape of biden_tweets_analysis_df:", biden_tweets_analysis_df.shape)

Shape of trump_tweets_analysis_df: (9168, 3)
Shape of biden_tweets_analysis_df: (4755, 3)


In [22]:
trump_tweets_analysis_df.head(5)

Unnamed: 0,index,date,tweet_text
0,1,2020-03-03 01:34:50,I was thrilled to be back in the Great city of...
1,3,2020-09-12 20:10:58,The Unsolicited Mail In Ballot Scam is a major...
2,6,2020-02-01 16:14:02,Getting a little exercise this morning! https:...
3,14,2020-01-09 12:24:31,Thank you Elise! https://t.co/Y4Hb0zf5jk
4,15,2020-10-23 04:09:59,"As per your request, Joe... https://t.co/78mzc..."


In [23]:
biden_tweets_analysis_df.head(5)

Unnamed: 0,index,date,tweet_text
0,1290,2019-01-12 03:17:00,White supremacists and their shameful ideology...
1,1291,2019-01-17 03:07:00,“Equality of rights under the law shall not be...
2,1292,2019-01-17 03:07:00,Congratulations to the two women who led the w...
3,1293,2019-01-21 18:16:00,The spirit of Dr. King is still with us. It ha...
4,1294,2019-01-26 19:15:00,https://t.co/iYah5x11To


In [24]:
trump_tweets_analysis_df['politician'] = 'Trump'
biden_tweets_analysis_df['politician'] = 'Biden'

trump_final_df = trump_tweets_analysis_df[['politician', 'date', 'tweet_text']]
biden_final_df = biden_tweets_analysis_df[['politician', 'date', 'tweet_text']]

# Concatenate the two DataFrames ---
all_tweets_df = pd.concat([trump_final_df, biden_final_df], ignore_index=True)

del trump_tweets_analysis_df
del biden_tweets_analysis_df
del trump_final_df
del biden_final_df
# Save the final DataFrame to a CSV file   
all_tweets_df.to_csv('all_tweets_to_analyze.csv', index=False)

In [25]:
all_tweets_df.shape

(13923, 3)

In [26]:
all_tweets_df.head(5)

Unnamed: 0,politician,date,tweet_text
0,Trump,2020-03-03 01:34:50,I was thrilled to be back in the Great city of...
1,Trump,2020-09-12 20:10:58,The Unsolicited Mail In Ballot Scam is a major...
2,Trump,2020-02-01 16:14:02,Getting a little exercise this morning! https:...
3,Trump,2020-01-09 12:24:31,Thank you Elise! https://t.co/Y4Hb0zf5jk
4,Trump,2020-10-23 04:09:59,"As per your request, Joe... https://t.co/78mzc..."


In [27]:
all_tweets_df.head(5)

Unnamed: 0,politician,date,tweet_text
0,Trump,2020-03-03 01:34:50,I was thrilled to be back in the Great city of...
1,Trump,2020-09-12 20:10:58,The Unsolicited Mail In Ballot Scam is a major...
2,Trump,2020-02-01 16:14:02,Getting a little exercise this morning! https:...
3,Trump,2020-01-09 12:24:31,Thank you Elise! https://t.co/Y4Hb0zf5jk
4,Trump,2020-10-23 04:09:59,"As per your request, Joe... https://t.co/78mzc..."


In [28]:
gen_df = pd.DataFrame({
    "sentiment_score": [0.65, -0.3],          # From BERT
    "economic_term_count": [2, 0],            # Custom economics lexicon
    "policy_term_count": [3, 1],              # Custom policy lexicon
    "currency_mention": [1, 0],               # Regex for "dollar," "euro," etc.
    "fear_score": [0.1, 0.8],                 # NRC Emotion Lexicon
    "subjectivity_score": [0.9, 0.2],         # TextBlob
    "named_entities": [["Fed", "China"], []], # SpaCy NER
    "topic_id": [1, 0],                       # BERTopic clusters
    "sentiment_volatility_7d": [0.4, 0.6],    # Rolling std. dev
    "economic_term_ratio": [0.1, 0.0]         # economic_term_count / word count
})

In [30]:
# Initialize models
nlp = spacy.load("en_core_web_sm")

In [31]:
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="finiteautomata/bertweet-base-sentiment-analysis",
    device=0,  # Use GPU (CUDA device 0)
    # cache_dir="your_custom_model_directory"  # Set your desired model download directory
)




Device set to use cpu


In [32]:
fear_lexicon = {"fear_words": ["panic", "crisis", "collapse", "uncertain", "risk"]}  # NRC subset

In [33]:
# Custom keyword lists
economic_terms = ["inflation", "debt", "gdp", "trade", "deficit", "currency", "market", "economy"]
policy_terms = ["tax", "stimulus", "regulation", "budget", "reform", "policy", "spending"]
currency_terms = ["dollar", "euro", "yen", "yuan", "pound", "currency"]

In [40]:
# --- FEATURE GENERATION ---
# Sentiment & Subjectivity
# def get_sentiment(text):
#     result = sentiment_analyzer(text)[0]
#     return result["score"] if result["label"] == "POS" else -result["score"]

def get_sentiment(text):
    try:
        if not isinstance(text, str) or not text.strip():
            return 0.0  # Neutral for empty/invalid text
        # Truncate to 256 chars to avoid model errors
        # text = text[:256]
        result = sentiment_analyzer(text)[0]
        return result["score"] if result["label"] == "POS" else -result["score"]
    except Exception as e:
        return 0.0  # Neutral if model fails

all_tweets_df["sentiment_score"] = all_tweets_df["tweet_text"].apply(get_sentiment)


In [41]:
all_tweets_df.head(3)

Unnamed: 0,politician,date,tweet_text,sentiment_score
0,Trump,2020-03-03 01:34:50,I was thrilled to be back in the Great city of...,0.992678
1,Trump,2020-09-12 20:10:58,The Unsolicited Mail In Ballot Scam is a major...,-0.980144
2,Trump,2020-02-01 16:14:02,Getting a little exercise this morning! https:...,-0.532646


In [42]:
all_tweets_df["subjectivity_score"] = all_tweets_df["tweet_text"].apply(
    lambda x: TextBlob(x).sentiment.subjectivity)

In [43]:
all_tweets_df.head(3)

Unnamed: 0,politician,date,tweet_text,sentiment_score,subjectivity_score
0,Trump,2020-03-03 01:34:50,I was thrilled to be back in the Great city of...,0.992678,0.483333
1,Trump,2020-09-12 20:10:58,The Unsolicited Mail In Ballot Scam is a major...,-0.980144,0.454762
2,Trump,2020-02-01 16:14:02,Getting a little exercise this morning! https:...,-0.532646,0.5


In [44]:
# Fear Score
def calculate_fear_score(text):
    words = text.lower().split()
    fear_count = sum(1 for word in words if word in fear_lexicon["fear_words"])
    return fear_count / len(words) if len(words) > 0 else 0

In [45]:
all_tweets_df["fear_score"] = all_tweets_df["tweet_text"].apply(calculate_fear_score)

In [47]:
all_tweets_df.tail(3)

Unnamed: 0,politician,date,tweet_text,sentiment_score,subjectivity_score,fear_score
13920,Biden,2020-10-31 23:20:00,Presidents lead by example.\n\nMask up. https:...,-0.93879,0.0,0.0
13921,Biden,2020-10-31 23:42:00,I want to extend my prayers and condolences to...,-0.555184,0.767857,0.0
13922,Biden,2020-10-31 23:55:00,Happy Halloween! https://t.co/i2BT7k0xdI,0.993051,1.0,0.0


In [48]:
# Keyword Counts & Ratios
all_tweets_df["economic_term_count"] = all_tweets_df["tweet_text"].apply(
    lambda x: len(re.findall(r"\b(" + "|".join(economic_terms) + r")\b", x, flags=re.IGNORECASE)))
all_tweets_df["policy_term_count"] = all_tweets_df["tweet_text"].apply(
    lambda x: len(re.findall(r"\b(" + "|".join(policy_terms) + r")\b", x, flags=re.IGNORECASE)))
all_tweets_df["currency_mention"] = all_tweets_df["tweet_text"].apply(
    lambda x: 1 if re.search(r"\b(" + "|".join(currency_terms) + r")\b", x, flags=re.IGNORECASE) else 0)
all_tweets_df["economic_term_ratio"] = all_tweets_df["economic_term_count"] / all_tweets_df["tweet_text"].apply(lambda x: len(x.split()))

In [49]:
all_tweets_df.tail(3)

Unnamed: 0,politician,date,tweet_text,sentiment_score,subjectivity_score,fear_score,economic_term_count,policy_term_count,currency_mention,economic_term_ratio
13920,Biden,2020-10-31 23:20:00,Presidents lead by example.\n\nMask up. https:...,-0.93879,0.0,0.0,0,0,0,0.0
13921,Biden,2020-10-31 23:42:00,I want to extend my prayers and condolences to...,-0.555184,0.767857,0.0,0,0,0,0.0
13922,Biden,2020-10-31 23:55:00,Happy Halloween! https://t.co/i2BT7k0xdI,0.993051,1.0,0.0,0,0,0,0.0


In [50]:
# Named Entities
def extract_entities(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ in ["ORG", "GPE", "LOC"]]

In [51]:
all_tweets_df["named_entities"] = all_tweets_df["tweet_text"].apply(extract_entities)

In [52]:
all_tweets_df.tail(3)

Unnamed: 0,politician,date,tweet_text,sentiment_score,subjectivity_score,fear_score,economic_term_count,policy_term_count,currency_mention,economic_term_ratio,named_entities
13920,Biden,2020-10-31 23:20:00,Presidents lead by example.\n\nMask up. https:...,-0.93879,0.0,0.0,0,0,0,0.0,[]
13921,Biden,2020-10-31 23:42:00,I want to extend my prayers and condolences to...,-0.555184,0.767857,0.0,0,0,0,0.0,"[the Chaldean Assyrian Community, Baghdad]"
13922,Biden,2020-10-31 23:55:00,Happy Halloween! https://t.co/i2BT7k0xdI,0.993051,1.0,0.0,0,0,0,0.0,[]


In [53]:
# Save current DataFrame to a CSV file   
all_tweets_df.to_csv('all_tweets_to_analyze_20250528.csv', index=False)

In [55]:
# Topic Modeling (BERTopic)
topic_model = BERTopic()
topics, _ = topic_model.fit_transform(all_tweets_df["tweet_text"])
all_tweets_df["topic_id"] = topics

No sentence-transformers model found with name sentence-transformers/all-MiniLM-L6-v2. Creating a new one with mean pooling.


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [56]:
all_tweets_df.tail(3)

Unnamed: 0,politician,date,tweet_text,sentiment_score,subjectivity_score,fear_score,economic_term_count,policy_term_count,currency_mention,economic_term_ratio,named_entities,topic_id
13920,Biden,2020-10-31 23:20:00,Presidents lead by example.\n\nMask up. https:...,-0.93879,0.0,0.0,0,0,0,0.0,[],3
13921,Biden,2020-10-31 23:42:00,I want to extend my prayers and condolences to...,-0.555184,0.767857,0.0,0,0,0,0.0,"[the Chaldean Assyrian Community, Baghdad]",-1
13922,Biden,2020-10-31 23:55:00,Happy Halloween! https://t.co/i2BT7k0xdI,0.993051,1.0,0.0,0,0,0,0.0,[],139


In [58]:
all_tweets_df = all_tweets_df.sort_values("date")
all_tweets_df["sentiment_volatility_7d"] = (
    all_tweets_df.rolling(window="7D", on="date")["sentiment_score"].std()
)

In [74]:
all_tweets_df = all_tweets_df.drop("index", axis=1)

In [75]:
all_tweets_df.head(3)

Unnamed: 0,politician,date,tweet_text,sentiment_score,subjectivity_score,fear_score,economic_term_count,policy_term_count,currency_mention,economic_term_ratio,named_entities,topic_id,sentiment_volatility_7d
0,Trump,2019-01-01 00:40:26,MEXICO IS PAYING FOR THE WALL through the many...,0.693526,0.555303,0.0,2,0,0,0.039216,"[MEXICO, U.S.A., USMCA, NAFTA Trade Deal, Coun...",0,
1,Trump,2019-01-01 00:51:43,"The Democrats will probably submit a Bill, bei...",-0.889143,1.0,0.0,0,0,0,0.0,"[Border Security, Tech]",0,1.119116
2,Trump,2019-01-01 01:05:39,...Remember this. Throughout the ages some thi...,-0.970347,0.433333,0.0,0,0,0,0.0,"[NEVER, NEVER, ALWAYS, ALWAYS, NEVER]",0,0.938075


In [65]:
# Resetting the index
all_tweets_df = all_tweets_df.reset_index()

In [66]:
all_tweets_df[[
    "politician",
    "date",
    "tweet_text",
    "sentiment_score",
    "economic_term_count",
    "topic_id"]].tail(10)

Unnamed: 0,politician,date,tweet_text,sentiment_score,economic_term_count,topic_id
13913,Trump,2020-10-31 21:31:03,"Over the next four years, we will stop the rad...",0.977989,0,-1
13914,Trump,2020-10-31 21:36:18,"We are ONE movement, ONE people, ONE family, a...",0.992038,0,20
13915,Trump,2020-10-31 21:47:59,Thank you to the wonderful @RJHarrisWHP580 for...,0.993124,0,20
13916,Trump,2020-10-31 22:07:18,Just signed an order to protect fracking and t...,-0.436006,0,188
13917,Biden,2020-10-31 23:20:00,Presidents lead by example.\n\nMask up. https:...,-0.93879,0,3
13918,Biden,2020-10-31 23:42:00,I want to extend my prayers and condolences to...,-0.555184,0,-1
13919,Trump,2020-10-31 23:47:53,"Thank you Butler, Pennsylvania!\nhttps://t.co/...",0.988103,0,20
13920,Trump,2020-10-31 23:51:29,For the last 4 years you have seen me fight fo...,0.984206,0,-1
13921,Trump,2020-10-31 23:52:33,"Over the next 4 years, we will make America in...",0.960022,0,-1
13922,Biden,2020-10-31 23:55:00,Happy Halloween! https://t.co/i2BT7k0xdI,0.993051,0,139


In [71]:
print("Min date:", all_tweets_df["date"].min())
print("Max date:", all_tweets_df["date"].max())

Min date: 2019-01-01 00:40:26
Max date: 2020-10-31 23:55:00


In [67]:
# Save current DataFrame to a CSV file   
all_tweets_df.to_csv('all_tweets_to_analyze_20250528.csv', index=False)

In [68]:
all_tweets_df.columns

Index(['index', 'politician', 'date', 'tweet_text', 'sentiment_score',
       'subjectivity_score', 'fear_score', 'economic_term_count',
       'policy_term_count', 'currency_mention', 'economic_term_ratio',
       'named_entities', 'topic_id', 'sentiment_volatility_7d'],
      dtype='object')

In [70]:
gen_df.columns

Index(['sentiment_score', 'economic_term_count', 'policy_term_count',
       'currency_mention', 'fear_score', 'subjectivity_score',
       'named_entities', 'topic_id', 'sentiment_volatility_7d',
       'economic_term_ratio'],
      dtype='object')

In [78]:
# Sample exchange rate data (replace with real data)
exchange_rates = pd.DataFrame({
    "date": pd.date_range("2019-01-01", "2020-10-31", freq="D"),
    "dollar_value": np.random.rand(669)*10 + 95  # Mock values
})

# Calculate percentage changes
exchange_rates["dollar_change_24h"] = exchange_rates["dollar_value"].pct_change()
exchange_rates.head(5)

ValueError: All arrays must be of the same length