In [1]:
!pip install transformers torch pandas tqdm --quiet

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from pathlib import Path
from tqdm import tqdm

In [2]:
# Locations of the cleaned inputs; both CSVs sit in the processed-data folder
PROCESSED_DATA_DIR = Path("../data/processed")

tweets_file = PROCESSED_DATA_DIR.joinpath("tweet_finance_clean.csv")
news_file = PROCESSED_DATA_DIR.joinpath("news_data_clean.csv")

# Hugging Face model id of the financial sentiment classifier (FinBERT)
MODEL_NAME = "ProsusAI/finbert"

In [3]:
# Fetch the tokenizer and the sequence-classification weights for MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Wrap both in a text-classification pipeline.
# Inputs longer than 256 tokens are truncated by the tokenizer.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # -1 selects the CPU
    truncation=True,
    max_length=256,
)

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Device set to use cpu
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [4]:
# Numeric score assigned to each (lower-cased) classifier label
label_to_score = dict(positive=1, neutral=0, negative=-1)

In [5]:
def get_sentiment_score(text):
    """Classify ``text`` with ``sentiment_pipeline`` and return a numeric score.

    Parameters
    ----------
    text : str
        The text to score. Any non-string value (e.g. a NaN read from a CSV)
        or a blank / whitespace-only string is treated as "no text".

    Returns
    -------
    int or None
        The value ``label_to_score`` holds for the predicted label
        (lower-cased), ``0`` when the label is not present in that mapping,
        or ``None`` when there is no text to score.
    """
    if not isinstance(text, str) or not text.strip():
        return None
    # Hand the full text to the pipeline. It is configured with
    # truncation=True / max_length=256, so the tokenizer already caps the
    # input length in *tokens*. Slicing to 256 *characters* first would
    # discard most of a longer text before the model ever sees it.
    result = sentiment_pipeline(text)[0]
    label = result["label"].lower()
    # Unknown labels fall back to 0 (the neutral score).
    return label_to_score.get(label, 0)

In [6]:
# Score every cleaned tweet and write the result next to the input data
if not tweets_file.exists():
    print("No tweet data found")
else:
    tweets_df = pd.read_csv(tweets_file)
    print(f"Processing {tweets_df.shape[0]} tweets...")

    # Register `progress_apply` on pandas objects so scoring shows a progress bar
    tqdm.pandas()
    tweets_df = tweets_df.assign(
        sentiment_score=tweets_df["clean_text"].progress_apply(get_sentiment_score)
    )

    # Persist the tweets together with their sentiment score
    tweets_df.to_csv(PROCESSED_DATA_DIR / "tweet_finance_sentiment.csv", index=False)
    print("Saved tweet sentiment file.")

Processing 100 tweets...


  return forward_call(*args, **kwargs)
  2%|█▋                                                                                | 2/100 [00:00<00:08, 10.92it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00,  6.65it/s]

Saved tweet sentiment file.





In [7]:
# Score every cleaned news headline and write the result next to the input data
if not news_file.exists():
    print("No news data found")
else:
    news_df = pd.read_csv(news_file)
    print(f"Processing {news_df.shape[0]} news articles...")

    # Register `progress_apply` on pandas objects so scoring shows a progress bar
    tqdm.pandas()
    news_df = news_df.assign(
        sentiment_score=news_df["clean_title"].progress_apply(get_sentiment_score)
    )

    # Persist the articles together with their sentiment score
    news_df.to_csv(PROCESSED_DATA_DIR / "news_data_sentiment.csv", index=False)
    print("Saved news sentiment file.")

Processing 1247 news articles...


  return forward_call(*args, **kwargs)
100%|██████████████████████████████████████████████████████████████████████████████| 1247/1247 [02:38<00:00,  7.89it/s]

Saved news sentiment file.





In [8]:
# Aggregate daily sentiment
def _daily_sentiment(scored_df, prefix):
    """Collapse per-item sentiment scores to one row per value of ``date``.

    Parameters
    ----------
    scored_df : pandas.DataFrame
        Must contain a ``date`` column and a ``sentiment_score`` column.
    prefix : str
        Prepended to the output column names, e.g. ``"tweet"`` yields
        ``tweet_sentiment_mean``, ``tweet_sentiment_std`` and ``tweet_count``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``<prefix>_sentiment_mean``, ``<prefix>_sentiment_std``
        and ``<prefix>_count`` (the number of non-null scores per date).
    """
    return scored_df.groupby("date").agg(**{
        f"{prefix}_sentiment_mean": ("sentiment_score", "mean"),
        f"{prefix}_sentiment_std": ("sentiment_score", "std"),
        f"{prefix}_count": ("sentiment_score", "count"),
    }).reset_index()


# Scored CSVs written by the scoring cells; these are the files read below.
tweets_scored_file = PROCESSED_DATA_DIR / "tweet_finance_sentiment.csv"
news_scored_file = PROCESSED_DATA_DIR / "news_data_sentiment.csv"

# Tweets daily sentiment
# Require the scored file as well as the raw input: checking only the raw input
# would crash in read_csv when the scored file has not been produced.
if tweets_file.exists() and tweets_scored_file.exists():
    tweets_df = pd.read_csv(tweets_scored_file)
    # Unparseable dates become NaT and are dropped by the groupby.
    tweets_df["date"] = pd.to_datetime(tweets_df["date"], errors='coerce').dt.date
    tweet_sentiment_daily = _daily_sentiment(tweets_df, "tweet")
else:
    tweet_sentiment_daily = pd.DataFrame()

# News daily sentiment
if news_file.exists() and news_scored_file.exists():
    news_df = pd.read_csv(news_scored_file)
    # Prefer 'publishedAt' and fall back to an existing 'date' column.
    date_source = next(
        (col for col in ("publishedAt", "date") if col in news_df.columns), None
    )
    if date_source is None:
        # Fail with an explicit message instead of an opaque KeyError: 'date'
        # raised from inside groupby.
        raise KeyError(
            f"{news_scored_file} has neither a 'publishedAt' nor a 'date' column"
        )
    news_df["date"] = pd.to_datetime(news_df[date_source], errors='coerce').dt.date
    news_sentiment_daily = _daily_sentiment(news_df, "news")
else:
    news_sentiment_daily = pd.DataFrame()

# Merge both
if not tweet_sentiment_daily.empty and not news_sentiment_daily.empty:
    # Outer join keeps dates that only one of the two sources covers.
    sentiment_features = pd.merge(tweet_sentiment_daily, news_sentiment_daily, on="date", how="outer")
elif not tweet_sentiment_daily.empty:
    sentiment_features = tweet_sentiment_daily
elif not news_sentiment_daily.empty:
    sentiment_features = news_sentiment_daily
else:
    sentiment_features = pd.DataFrame()

# An empty frame has no 'date' column; sorting it would raise KeyError.
if "date" in sentiment_features.columns:
    sentiment_features = sentiment_features.sort_values("date")

In [9]:
# Write the merged daily features to the processed-data folder (no index column)
output_path = PROCESSED_DATA_DIR.joinpath("sentiment_features.csv")
sentiment_features.to_csv(output_path, index=False)
print(f"Updated sentiment features saved to {output_path}")

Updated sentiment features saved to ..\data\processed\sentiment_features.csv


In [10]:
# Sanity check: display the last 15 rows of the feature table
sentiment_features.iloc[-15:]

Unnamed: 0,date,tweet_sentiment_mean,tweet_sentiment_std,tweet_count,news_sentiment_mean,news_sentiment_std,news_count
16,2025-07-18,,,,-0.184211,0.691855,38
17,2025-07-19,,,,0.363636,0.6742,11
18,2025-07-20,,,,-0.125,0.64087,8
19,2025-07-21,,,,-0.019608,0.616123,51
20,2025-07-22,,,,0.0,0.679366,53
21,2025-07-23,,,,-0.098039,0.538699,51
22,2025-07-24,,,,-0.014706,0.680049,68
23,2025-07-25,,,,-0.102041,0.549428,49
24,2025-07-26,,,,0.076923,0.759555,13
25,2025-07-27,,,,-0.3,0.656947,20
