(Run on Google Colab with a GPU runtime)

Performs sentiment analysis on financial news headlines using FinBERT.
It cleans and scores all articles (train and test), then aggregates
daily sentiment by date and ticker.
Finally, it merges the sentiment data with market prices to prepare
datasets for further modeling and analysis.

Purpose:
To generate clean sentiment signals and merge them with price data.

Input:
- articles_merge_of_APIs.csv
- prices_long.csv                          → used only for the final merge step

Outputs:
- sentiment_finbert.csv   → per-article FinBERT probabilities, label and score
- daily_sentiment.csv     → daily average sentiment per date and ticker
- sentiment_prices.csv    → merged prices + sentiment

In [14]:
# Bring the input CSV into the Colab session through the upload widget.
# The recorded run uploaded articles_merge_of_APIs.csv, which the scoring
# cell below reads by relative path.
from google.colab import files
uploaded = files.upload()

Saving articles_merge_of_APIs.csv to articles_merge_of_APIs.csv


In [17]:
!pip -q install transformers==4.44.2 torch pandas tqdm


Traceback (most recent call last):
  File "/usr/local/bin/pip3", line 4, in <module>
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/main.py", line 11, in <module>
    from pip._internal.cli.autocompletion import autocomplete
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/autocompletion.py", line 10, in <module>
    from pip._internal.cli.main_parser import create_main_parser
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/main_parser.py", line 9, in <module>
    from pip._internal.build_env import get_runnable_pip
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/build_env.py", line 19, in <module>
    from pip._internal.cli.spinners import open_spinner
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/cli/spinners.py", line 9, in <module>
    from pip._internal.utils.logging import get_indentation
  File "/usr/local/lib/python3.12/dist-packages/pip/_internal/utils/logging.py", line 13, in <module>
    from

In [16]:
import os, pandas as pd, numpy as np, torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

In [18]:
CSV_PATH = "articles_merge_of_APIs.csv"
# FinBERT model loading
MODEL_NAME = "yiyanghkust/finbert-tone"
BATCH_SIZE = 32    # headlines sent to the model per batch
MAX_LENGTH = 128   # token cap per headline; longer inputs are truncated
# The pipeline takes a CUDA device index, or -1 to run on CPU.
DEVICE = 0 if torch.cuda.is_available() else -1
print(f"GPU dispo: {torch.cuda.is_available()}  |  device={DEVICE}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=DEVICE,
    # top_k=None returns one score per label; it replaces the deprecated
    # return_all_scores=True. Results come back sorted by score, which is fine
    # because scores_to_row looks labels up by name, not by position.
    top_k=None,
    function_to_apply="softmax",
    truncation=True,
)

# Data loading
df = pd.read_csv(CSV_PATH)
required = ["date", "ticker", "title", "domain"]
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Colonnes manquantes: {missing} | Présentes: {list(df.columns)}")

# Keep only rows that actually have a headline. Without this filter,
# astype(str) turns a missing title into the literal text "nan", which the
# model would score as if it were real news.
has_title = df["title"].notna() & df["title"].astype(str).str.strip().ne("")
work_df = df.loc[has_title].copy()
n_skipped = len(df) - len(work_df)
if n_skipped:
    print(f"Skipped {n_skipped} rows with a missing or blank title")

texts = work_df["title"].astype(str).tolist()

# Inference: one list of {label, score} dicts per headline, in the same order
# as work_df (the next step aligns them positionally after reset_index).
results = pipe(texts, batch_size=BATCH_SIZE, max_length=MAX_LENGTH)

def scores_to_row(scores_list):
    """Turn one headline's list of {label, score} dicts into a probability row.

    Labels are matched case-insensitively. The returned Series always has the
    keys prob_negative, prob_neutral and prob_positive (in that order); a label
    absent from ``scores_list`` yields NaN.
    """
    by_label = {}
    for entry in scores_list:
        by_label[entry["label"].lower()] = entry["score"]

    row = {}
    for sentiment in ("negative", "neutral", "positive"):
        row[f"prob_{sentiment}"] = by_label.get(sentiment, np.nan)
    return pd.Series(row)

# One probability row per headline, in the same order as `results`.
score_rows = [scores_to_row(item) for item in results]
scores_df = pd.DataFrame(score_rows)

# Put the article columns and the probability columns side by side; the index
# is reset so both frames line up positionally.
articles = work_df.reset_index(drop=True)
out = pd.concat([articles, scores_df], axis=1)

# Label = class with the highest probability; score = P(positive) - P(negative).
prob_columns = ["prob_negative", "prob_neutral", "prob_positive"]
top_column = out[prob_columns].idxmax(axis=1)
out["sentiment_label"] = top_column.str.replace("prob_", "")
out["sentiment_score"] = out["prob_positive"] - out["prob_negative"]

out.to_csv("sentiment_finbert.csv", index=False)
print("✅ Fini. Fichier enregistré: sentiment_finbert.csv")
out

GPU dispo: True  |  device=0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Device set to use cuda:0


model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

✅ Fini. Fichier enregistré: sentiment_finbert.csv


Unnamed: 0,date,ticker,title,domain,prob_negative,prob_neutral,prob_positive,sentiment_label,sentiment_score
0,2023-01-01,AMZN,2 Stocks Down More Than 50 % to Buy Right Now,www.fool.com,5.080678e-01,5.695687e-02,4.349753e-01,negative,-0.073093
1,2023-01-01,META,"This Company Sales Soared 2 , 250 % in 9 Years...",www.fool.com,1.201365e-06,7.362704e-06,9.999914e-01,positive,0.999990
2,2023-01-01,NVDA,Nvidia Has Promising AI Predictions for 2023,www.fool.com,2.326532e-08,3.247954e-07,9.999996e-01,positive,1.000000
3,2023-01-01,NVDA,Nvidia accidently leaks RTX 4070 Ti GPU specs,news.webindia123.com,7.533405e-01,2.466062e-01,5.332923e-05,negative,-0.753287
4,2023-01-01,NVDA,Nvidia plans to release unlaunched 12GB grap...,www.prokerala.com,4.229196e-05,9.999063e-01,5.133379e-05,neutral,0.000009
...,...,...,...,...,...,...,...,...,...
62454,2025-10-05,TSLA,Is Ford BlueCruise Better Than Tesla Autopilot ?,insideevs.com:443,7.800447e-07,8.003297e-06,9.999912e-01,positive,0.999990
62455,2025-10-05,TSLA,Parents of two college students killed in a Te...,whdh.com,3.121608e-01,6.878299e-01,9.355605e-06,neutral,-0.312151
62456,2025-10-05,TSLA,He Drove His Tesla Model X until It Died to Se...,thenewswheel.com,1.576358e-03,9.984193e-01,4.405362e-06,neutral,-0.001572
62457,2025-10-05,V,Trust Co of Kansas Sells 152 Shares of Visa In...,www.etfdailynews.com,1.609522e-06,9.999980e-01,3.752106e-07,neutral,-0.000001


In [1]:
# Upload prices_long.csv through the Colab widget (the recorded run saved that file).
# NOTE(review): the cell that follows repeats this upload and also brings in
# sentiment_finbert.csv, so this cell looks redundant — confirm before removing.
from google.colab import files
uploaded = files.upload()

Saving prices_long.csv to prices_long.csv


In [1]:
# Upload the two inputs of the aggregation step. The recorded run saved
# prices_long.csv and sentiment_finbert.csv, the files read by the cell that
# builds the daily sentiment and the merged table.
from google.colab import files
uploaded = files.upload()

Saving prices_long.csv to prices_long.csv
Saving sentiment_finbert.csv to sentiment_finbert.csv


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Inputs
SCORED_NEWS_PATH = "sentiment_finbert.csv"  # must contain per-article sentiment
PRICES_PATH = "prices_long.csv"

# Outputs
OUT_DAILY = "daily_sentiment.csv"
OUT_MERGED = "sentiment_prices.csv"


def _clean_ticker(raw):
    """Upper-case and strip ticker symbols while keeping missing values missing.

    A plain ``astype(str)`` would turn NaN into the text "NAN", which the
    ``dropna`` below could no longer detect and which would end up grouped as
    if it were a real ticker.
    """
    tidy = raw.astype(str).str.upper().str.strip()
    return tidy.where(raw.notna())


# --- Load
news = pd.read_csv(SCORED_NEWS_PATH)
prices = pd.read_csv(PRICES_PATH)

# --- Normalize types
# Unparseable dates become NaT (errors="coerce") and are dropped further down.
# Both frames keep plain calendar dates so the later merge keys compare equal.
news["date"] = pd.to_datetime(news["date"], errors="coerce").dt.date
prices["date"] = pd.to_datetime(prices["date"], errors="coerce").dt.date
news["ticker"] = _clean_ticker(news["ticker"])
prices["ticker"] = _clean_ticker(prices["ticker"])

# --- Select sentiment column (first match wins: 'sentiment', then 'sentiment_score')
sent_col = None
for c in ["sentiment", "sentiment_score"]:
    if c in news.columns:
        sent_col = c
        break
if sent_col is None:
    raise ValueError("No sentiment column found. Expected 'sentiment' or 'sentiment_score'.")

# --- Basic filtering: rows missing a key or a value cannot be aggregated or merged
news = news.dropna(subset=["date", "ticker", sent_col])
prices = prices.dropna(subset=["date", "ticker", "adj_close"])

# --- Daily aggregation (mean + majority label if available)
def _majority_label(labels):
    """Return the most frequent label of a group, or NaN if the group has none.

    ``Series.mode()`` returns its values sorted, so a tie resolves to the
    alphabetically first label.
    """
    modes = labels.mode()
    return modes.iloc[0] if len(modes) else np.nan


has_label = "sentiment_label" in news.columns

# Named aggregations: output column -> (input column, reducer).
daily_aggs = {
    "sentiment_mean": (sent_col, "mean"),
    "n_articles": (sent_col, "size"),
}
if has_label:
    daily_aggs["sentiment_label"] = ("sentiment_label", _majority_label)

# One row per (date, ticker), ordered by ticker then date.
daily = (
    news.groupby(["date", "ticker"], as_index=False)
        .agg(**daily_aggs)
        .sort_values(["ticker", "date"])
)

# --- Save daily sentiment
daily.to_csv(OUT_DAILY, index=False)
print(f"[OK] {OUT_DAILY} written ({len(daily)} rows)")

# --- Merge with prices (keep numeric sentiment only for ML)
# Left join: every price row is kept; (date, ticker) pairs with no news that
# day get NaN in sentiment_mean.
sentiment_numeric = daily[["date", "ticker", "sentiment_mean"]]
merged = pd.merge(prices, sentiment_numeric, how="left", on=["date", "ticker"])
merged = merged.sort_values(["ticker", "date"])
merged = merged.reset_index(drop=True)

merged.to_csv(OUT_MERGED, index=False)
print(f"[OK] {OUT_MERGED} written ({len(merged)} rows)")

[OK] daily_sentiment.csv written (9488 rows)
[OK] sentiment_prices.csv written (6910 rows)
