## Annotation pipeline: FinBERT sentiment + regex ticker extraction

In [None]:
import re
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# FinBERT tone checkpoint: tokenizer and classifier are loaded from the same hub id
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'
tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

# input tweets; 'created_at' is parsed into datetimes on load
tweets = pd.read_csv(
    'sample_tweets.csv',
    parse_dates=['created_at'],
)

# cashtag matcher: '$' followed by 1-5 uppercase ASCII letters, ending at a
# word boundary (e.g. $AAPL)
ticker_pattern = re.compile(r"\$[A-Z]{1,5}\b")

# Output column -> sentiment label name it must be filled from. Order here is
# the order in which the columns are added to the row.
_SENTIMENT_COLUMNS = (
    ('sent_bear', 'negative'),
    ('sent_neut', 'neutral'),
    ('sent_bull', 'positive'),
)

# Class order documented on the yiyanghkust/finbert-tone model card
# (LABEL_0 neutral, LABEL_1 positive, LABEL_2 negative). Used only when the
# loaded config does not expose usable label names.
_FINBERT_TONE_LABEL_INDEX = {'neutral': 0, 'positive': 1, 'negative': 2}

# Upper bound on tokens fed to the model; 512 is the usual BERT position limit.
_MAX_TOKENS = 512


def annotate_row(row):
    """Add FinBERT sentiment probabilities and cashtags to one tweet row.

    Args:
        row: a row of the tweets frame (as passed by ``DataFrame.apply`` with
            ``axis=1``); its ``'text'`` entry is the tweet body.

    Returns:
        The same row with four entries added: ``sent_bear``, ``sent_neut``,
        ``sent_bull`` (softmax probabilities of the negative, neutral and
        positive classes) and ``tickers`` (list of ``ticker_pattern`` matches
        found in the text).
    """
    text = row['text']

    # sentiment
    # max_length is passed explicitly so truncation does not depend on the
    # tokenizer config carrying a model_max_length.
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=_MAX_TOKENS
    )
    # Inference only: skip autograd bookkeeping for every row.
    with torch.no_grad():
        logits = model(**inputs).logits
    scores = torch.nn.functional.softmax(logits, dim=1)[0]

    # Resolve class indices by label name instead of assuming a position:
    # this checkpoint's class 0 is neutral, not negative.
    id2label = getattr(model.config, 'id2label', None) or {}
    label_index = {str(name).lower(): int(idx) for idx, name in id2label.items()}
    if any(label not in label_index for _, label in _SENTIMENT_COLUMNS):
        # Generic names such as LABEL_0: fall back to the documented order.
        label_index = _FINBERT_TONE_LABEL_INDEX

    for column, label in _SENTIMENT_COLUMNS:
        row[column] = scores[label_index[label]].item()

    # tickers
    row['tickers'] = ticker_pattern.findall(text)
    return row

# annotate every tweet row by row, then write the result without the index column
annotated = tweets.apply(func=annotate_row, axis="columns")
annotated.to_csv(
    'sample_tweets_annotated.csv',
    index=False,
)