In [1]:
import pathlib, pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Folder holding the processed dataset files.
PROC = pathlib.Path("data/processed")

# Load the test split; reset_index() turns the row index into a column,
# which is then renamed to "id" so every sentence carries an explicit id.
df = (
    pd.read_csv(PROC / "phrasebank_test.csv")
    .reset_index()
    .rename(columns={"index": "id"})
)

# Analyzer used below to score each sentence.
ana = SentimentIntensityAnalyzer()
def to_label(c):
    """Turn a compound sentiment score into a three-way label.

    Args:
        c: Numeric compound score.

    Returns:
        "positive" when ``c >= 0.05``, "negative" when ``c <= -0.05``,
        otherwise "neutral".
    """
    if c >= 0.05:
        label = "positive"
    elif c <= -0.05:
        label = "negative"
    else:
        # Anything strictly inside (-0.05, 0.05) lands here.
        label = "neutral"
    return label

# Score every sentence with the analyzer, keep only the "compound" value,
# and convert it to a label stored in a new "pred" column.
df["pred"] = [
    to_label(scores["compound"])
    for scores in map(ana.polarity_scores, df["sentence"])
]

# Save the labelled frame (without the pandas index) and report the row count.
df.to_csv(PROC / "vader_preds.csv", index=False)
print("Wrote", PROC / "vader_preds.csv", "rows:", len(df))

Wrote data/processed/vader_preds.csv rows: 453


In [3]:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

PROC = pathlib.Path("data/processed")

# Read the test split again so this cell starts from a frame without any
# previously added columns; the row index becomes an explicit "id" column.
df = (
    pd.read_csv(PROC / "phrasebank_test.csv")
    .reset_index()
    .rename(columns={"index": "id"})
)

# Tokenizer and classifier are loaded from the same pretrained checkpoint.
tok = AutoTokenizer.from_pretrained("ProsusAI/finbert")
# eval() returns the module itself, so it can be chained onto construction.
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert").eval()

# Class index -> lower-cased label name, taken from the model's own config.
id2label = {
    int(class_idx): class_name.lower()
    for class_idx, class_name in model.config.id2label.items()
}

def predict_batch(texts, max_length=64):
    """Classify a batch of sentences with the module-level tokenizer and model.

    Args:
        texts: A list (or tuple) of strings. A single string is treated as a
            batch of one.
        max_length: Token limit handed to the tokenizer; longer inputs are
            truncated to this length.

    Returns:
        A list with one label per input text, in input order, looked up in
        the module-level ``id2label`` mapping. An empty batch yields ``[]``.
    """
    if isinstance(texts, str):
        texts = [texts]
    # Nothing to classify: return early instead of handing an empty batch to
    # the tokenizer and model.
    if len(texts) == 0:
        return []

    enc = tok(texts, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
    # Keep inputs on the same device as the model so the helper also works
    # after the model has been moved (e.g. to a GPU).
    enc = enc.to(model.device)
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**enc).logits
    # argmax over the last (class) dimension gives one class index per text.
    return [id2label[i] for i in logits.argmax(dim=-1).tolist()]

batch = 32  # number of sentences sent through the model per call

# Walk the frame in consecutive slices of `batch` rows and flatten the
# per-slice label lists into one list that lines up with df's row order.
preds = [
    label
    for start in range(0, len(df), batch)
    for label in predict_batch(df["sentence"].iloc[start:start + batch].tolist())
]

# Attach the predictions to a copy so `df` itself is left unchanged, then
# save without the pandas index and report the row count.
out = df.copy()
out["pred"] = preds
out.to_csv(PROC / "finbert_preds.csv", index=False)
print("Wrote", PROC / "finbert_preds.csv", "rows:", len(out))


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

  return forward_call(*args, **kwargs)


Wrote data/processed/finbert_preds.csv rows: 453
