# Tweet Sentiment Analysis and Currency Market Correlation

This notebook analyzes the correlation between Trump's social media sentiment and currency market movements by:
1. Loading tweet data
2. Extracting country mentions and mapping to currencies
3. Analyzing sentiment using NLTK
4. Correlating with exchange rate movements

## 1. Loading Data

Import required libraries and load the tweets dataset.

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import pycountry
from datetime import datetime, timedelta
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from scipy.ndimage import gaussian_filter1d
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
try:
    nltk.data.find('vader_lexicon')
except LookupError:
    nltk.download('vader_lexicon')

print("Libraries imported successfully")

Libraries imported successfully


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/condad/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Load the tweets dataset
df = pd.read_csv('tweets.csv', on_bad_lines='skip')

# Convert date column to datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce')

print(f"Loaded {len(df)} tweets")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print("\nDataset info:")
print(df.info())
print("\nFirst few tweets:")
print(df[['date', 'text']].head())

Loaded 6896 tweets
Date range: 2024-10-13 04:23:37+00:00 to 2025-10-25 22:15:50+00:00

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6896 entries, 0 to 6895
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   page_number            6896 non-null   int64              
 1   browse_flag            6896 non-null   bool               
 2   date                   6896 non-null   datetime64[ns, UTC]
 3   document_id            6896 non-null   int64              
 4   image_url              6896 non-null   object             
 5   media_type             6896 non-null   object             
 6   sequence               6896 non-null   int64              
 7   speaker                6896 non-null   object             
 8   speaker_id             6896 non-null   object             
 9   subject                6896 non-null   object             
 10  text               

## 2. Adding Country and Currency Columns

Extract country mentions from tweets and map them to their respective currencies.

In [3]:
def find_countries_regex(text):
    """Extract country names from text using regex patterns"""
    if pd.isna(text):
        return []

    # Common country name patterns
    country_patterns = [
        r'\b(?:United States|USA|US|America|American)\b',
        r'\bCanada\b|\bCanadian\b',
        r'\bChina\b|\bChinese\b',
        r'\bJapan\b|\bJapanese\b',
        r'\b(?:United Kingdom|UK|Britain|British|England|English)\b',
        r'\bGermany\b|\bGerman\b',
        r'\bFrance\b|\bFrench\b',
        r'\bItaly\b|\bItalian\b',
        r'\bSpain\b|\bSpanish\b',
        r'\bIndia\b|\bIndian\b',
        r'\bBrazil\b|\bBrazilian\b',
        r'\bMexico\b|\bMexican\b',
        r'\bRussia\b|\bRussian\b',
        r'\bSouth Korea\b|\bKorea\b|\bKorean\b',
        r'\bAustralia\b|\bAustralian\b',
        r'\bSwitzerland\b|\bSwiss\b',
        r'\bNorway\b|\bNorwegian\b',
        r'\bSweden\b|\bSwedish\b',
        r'\bDenmark\b|\bDanish\b',
        r'\bNetherlands\b|\bDutch\b'
    ]

    countries = set()
    text_upper = text.upper()

    for pattern in country_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        for match in matches:
            # Normalize country names
            if re.match(r'(?i)united states|usa|us|america|american', match):
                countries.add('United States')
            elif re.match(r'(?i)canada|canadian', match):
                countries.add('Canada')
            elif re.match(r'(?i)china|chinese', match):
                countries.add('China')
            elif re.match(r'(?i)japan|japanese', match):
                countries.add('Japan')
            elif re.match(r'(?i)united kingdom|uk|britain|british|england|english', match):
                countries.add('United Kingdom')
            elif re.match(r'(?i)germany|german', match):
                countries.add('Germany')
            elif re.match(r'(?i)france|french', match):
                countries.add('France')
            elif re.match(r'(?i)italy|italian', match):
                countries.add('Italy')
            elif re.match(r'(?i)spain|spanish', match):
                countries.add('Spain')
            elif re.match(r'(?i)india|indian', match):
                countries.add('India')
            elif re.match(r'(?i)brazil|brazilian', match):
                countries.add('Brazil')
            elif re.match(r'(?i)mexico|mexican', match):
                countries.add('Mexico')
            elif re.match(r'(?i)russia|russian', match):
                countries.add('Russia')
            elif re.match(r'(?i)south korea|korea|korean', match):
                countries.add('South Korea')
            elif re.match(r'(?i)australia|australian', match):
                countries.add('Australia')
            elif re.match(r'(?i)switzerland|swiss', match):
                countries.add('Switzerland')
            elif re.match(r'(?i)norway|norwegian', match):
                countries.add('Norway')
            elif re.match(r'(?i)sweden|swedish', match):
                countries.add('Sweden')
            elif re.match(r'(?i)denmark|danish', match):
                countries.add('Denmark')
            elif re.match(r'(?i)netherlands|dutch', match):
                countries.add('Netherlands')

    return list(countries)

# Extract countries from tweets
df['countries_found'] = df['text'].apply(find_countries_regex)
df['countries_mentioned'] = df['countries_found'].apply(lambda x: ', '.join(x) if x else '')

# Filter tweets with country mentions
tweets_with_countries = df[df['countries_mentioned'] != ''].copy()

print(f"Found {len(tweets_with_countries)} tweets with country mentions out of {len(df)} total tweets")
print(f"Percentage with countries: {len(tweets_with_countries)/len(df)*100:.1f}%")

# Show country mention frequency
all_countries = []
for countries in tweets_with_countries['countries_found']:
    all_countries.extend(countries)

country_counts = pd.Series(all_countries).value_counts()
print("\nTop 10 mentioned countries:")
print(country_counts.head(10))

Found 1949 tweets with country mentions out of 6896 total tweets
Percentage with countries: 28.3%

Top 10 mentioned countries:
United States     1751
Russia             128
China               98
Canada              55
Mexico              44
United Kingdom      33
India               26
Japan               21
France              12
South Korea         10
Name: count, dtype: int64


In [4]:
def country_to_currency(country_name):
    """Map country names to their currency codes"""
    # Manual mapping for common countries
    country_currency_map = {
        'United States': 'USD',
        'Canada': 'CAD',
        'United Kingdom': 'GBP',
        'Japan': 'JPY',
        'Germany': 'EUR',
        'France': 'EUR',
        'Italy': 'EUR',
        'Spain': 'EUR',
        'Netherlands': 'EUR',
        'China': 'CNY',
        'India': 'INR',
        'Brazil': 'BRL',
        'Mexico': 'MXN',
        'Russia': 'RUB',
        'South Korea': 'KRW',
        'Australia': 'AUD',
        'Switzerland': 'CHF',
        'Norway': 'NOK',
        'Sweden': 'SEK',
        'Denmark': 'DKK'
    }

    if country_name in country_currency_map:
        return country_currency_map[country_name]

    # Try pycountry for other countries
    try:
        country = pycountry.countries.search_fuzzy(country_name)[0]
        currency = pycountry.currencies.get(numeric=country.numeric)
        if currency:
            return currency.alpha_3
    except:
        pass

    return None

# Map countries to currencies
tweets_with_countries['currencies'] = tweets_with_countries['countries_found'].apply(
    lambda countries: [country_to_currency(country) for country in countries if country_to_currency(country)]
)

tweets_with_countries['currency_codes'] = tweets_with_countries['currencies'].apply(
    lambda x: ', '.join(set(x)) if x else ''
)

# Add currencies_found to the main dataframe
df['currencies_found'] = df['countries_found'].apply(
    lambda countries: [country_to_currency(country) for country in countries if country_to_currency(country)]
)

# Add currency_codes to the main dataframe
df['currency_codes'] = df['currencies_found'].apply(
    lambda x: ', '.join(set(x)) if x else ''
)

# Filter tweets with valid currency mappings
tweets_with_currencies = tweets_with_countries[tweets_with_countries['currency_codes'] != ''].copy()

print(f"Successfully mapped {len(tweets_with_currencies)} tweets to currencies")

# Show currency mention frequency
all_currencies = []
for currencies in tweets_with_currencies['currencies']:
    all_currencies.extend(currencies)

currency_counts = pd.Series(all_currencies).value_counts()
print("\nCurrency mention frequency:")
print(currency_counts.head(15))

Successfully mapped 1949 tweets to currencies

Currency mention frequency:
USD    1751
RUB     128
CNY      98
CAD      55
MXN      44
EUR      33
GBP      33
INR      26
JPY      21
KRW      10
BRL       5
AUD       4
SEK       3
CHF       2
NOK       1
Name: count, dtype: int64


## 3. Adding Sentiment Columns

Analyze the sentiment of each tweet using NLTK's VADER sentiment analyzer.

In [None]:
def get_sentiment_scores(text):
    """Get sentiment scores for a given text using VADER"""
    analyzer = SentimentIntensityAnalyzer()

    if pd.isna(text):
        return {
            'sentiment_compound': 0.0,
            'sentiment_positive': 0.0,
            'sentiment_negative': 0.0,
            'sentiment_neutral': 0.0
        }

    scores = analyzer.polarity_scores(text)
    return {
        'sentiment_compound': scores['compound'],  # Overall sentiment (-1 to 1)
        'sentiment_positive': scores['pos'],       # Positive sentiment ratio
        'sentiment_negative': scores['neg'],       # Negative sentiment ratio
        'sentiment_neutral': scores['neu']         # Neutral sentiment ratio
    }

def categorize_sentiment(compound_score):
    """Categorize sentiment based on compound score"""
    if compound_score >= 0.05:
        return 'positive'
    elif compound_score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply sentiment analysis to all tweets
print("Analyzing sentiment for all tweets...")
sentiment_results = df['text'].apply(get_sentiment_scores)

# Extract sentiment scores into separate columns
df['sentiment_compound'] = sentiment_results.apply(lambda x: x['sentiment_compound'])
df['sentiment_positive'] = sentiment_results.apply(lambda x: x['sentiment_positive'])
df['sentiment_negative'] = sentiment_results.apply(lambda x: x['sentiment_negative'])
df['sentiment_neutral'] = sentiment_results.apply(lambda x: x['sentiment_neutral'])

# Add categorical sentiment label
df['sentiment_label'] = df['sentiment_compound'].apply(categorize_sentiment)

print("Sentiment analysis complete!")
print(f"\nSentiment distribution:")
print(df['sentiment_label'].value_counts())
print(f"\nAverage compound sentiment: {df['sentiment_compound'].mean():.3f}")
print(f"Sentiment range: {df['sentiment_compound'].min():.3f} to {df['sentiment_compound'].max():.3f}")

# Show sample tweets with sentiment
print("\nSample tweets with sentiment scores:")
sample_df = df[['text', 'sentiment_compound', 'sentiment_label']].head(10)
for idx, row in sample_df.iterrows():
    print(f"\nTweet: {row['text'][:100]}...")
    print(f"Compound Score: {row['sentiment_compound']:.3f} ({row['sentiment_label']})")

Analyzing sentiment for all tweets...
Sentiment analysis complete!

Sentiment distribution:
sentiment_label
neutral     3507
positive    2430
negative     959
Name: count, dtype: int64

Average compound sentiment: 0.177
Sentiment range: -0.998 to 0.997

Sample tweets with sentiment scores:

Tweet: I am on my way to Malaysia, where I will sign the great Peace Deal, which I proudly brokered between...
Compound Score: 0.968 (positive)

Tweet: RT @realDonaldTrump Canada was caught, red handed, putting up a fraudulent advertisement on Ronald R...
Compound Score: -0.893 (negative)

Tweet: https://www. dailysignal.com/2025/10/22/tru mps-middle-east-triumph-embarrassed-self-proclaimed-expe...
Compound Score: 0.000 (neutral)

Tweet: https://www. foxnews.com/opinion/new-high-t ech-tool-trump-using-secure-our-border...
Compound Score: 0.000 (neutral)

Tweet: https://www. foxnews.com/politics/scoop-tru mps-memphis-crime-crackdown-locates-dozens-missing-kids-...
Compound Score: 0.000 (neutral)

Twe

## 4. Save Metadata

Output file is called 'tweet_metadata.csv'

In [None]:

# Reduce and save only the requested columns to avoid duplicated/unnecessary data
print("Current column order:")
print(list(df.columns))

# Determine ID column (use 'id' if present, else add 'tweet_id' from the index)
if 'id' in df.columns:
    id_col = 'id'
else:
    if 'tweet_id' not in df.columns:
        df = df.reset_index().rename(columns={'index': 'tweet_id'})
    id_col = 'tweet_id'

# Columns to keep
desired_cols = [
    id_col,
    'countries_found',
    'currencies_found',
    'sentiment_compound',
    'sentiment_positive',
    'sentiment_negative',
    'sentiment_neutral',
    'sentiment_label'
]

# Only keep columns that actually exist in the dataframe
present_cols = [c for c in desired_cols if c in df.columns]

df_reduced = df[present_cols].copy()

print(f"\nSaving reduced dataframe with columns: {present_cols}")
output_filename = 'tweet_metadata.csv'
df_reduced.to_csv(output_filename, index=False)

print(f"Saved reduced dataframe to {output_filename}")
print(f"Shape: {df_reduced.shape}")
print(f"\nFirst few rows of reduced data:")
print(df_reduced.head())