In [1]:
!pip install -U transformers datasets accelerate evaluate scikit-learn sentencepiece

Collecting transformers
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Downloading transformers-4.55.4-py3-none-any.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m74.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_learn-1.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m83.1 MB/s[0m eta [36m0:00:00

In [20]:
!pip install datasets==3.6.0

Collecting datasets==3.6.0
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-3.6.0


In [1]:
import os
import random
from dataclasses import dataclass
from typing import Dict, List, Optional


import numpy as np
import evaluate
from sklearn.metrics import f1_score, accuracy_score


from datasets import load_dataset, DatasetDict
from transformers import (
T5TokenizerFast,
T5ForConditionalGeneration,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
Seq2SeqTrainingArguments,
)

In [2]:
# Reproducibility
seed = int(os.environ.get("SEED", 42))
random.seed(seed)
np.random.seed(seed)

In [3]:
# Model & run config
model_name = os.environ.get("MODEL_NAME", "t5-small") # e.g., "t5-base", "google/flan-t5-base"
run_name = os.environ.get("RUN_NAME", f"t5-finance-sentiment-{model_name}")
output_dir = os.environ.get("OUTPUT_DIR", "./outputs/t5_finance_sentiment")
max_input_length = int(os.environ.get("MAX_INPUT", 256))
max_target_length = int(os.environ.get("MAX_TARGET", 8))

In [4]:
# Training hyperparams
num_train_epochs = float(os.environ.get("EPOCHS", 3))
per_device_train_batch_size = int(os.environ.get("TRAIN_BSZ", 16))
per_device_eval_batch_size = int(os.environ.get("EVAL_BSZ", 16))
learning_rate = float(os.environ.get("LR", 3e-4))
weight_decay = float(os.environ.get("WD", 0.01))
warmup_ratio = float(os.environ.get("WARMUP", 0.1))

In [5]:
# Mixed precision
fp16 = os.environ.get("FP16", "true").lower() == "true"


# Task labels (keep lowercase for normalization)
label_list = ["positive", "negative", "neutral"]
label_set = set(label_list)

In [6]:
USE_HF_DATASET = os.environ.get("USE_HF_DATASET", "true").lower() == "true"
HF_SUBSET = os.environ.get("HF_SUBSET", "sentences_allagree")
CSV_TRAIN = os.environ.get("CSV_TRAIN", "")
CSV_VAL = os.environ.get("CSV_VAL", "")
CSV_TEST = os.environ.get("CSV_TEST", "")


if USE_HF_DATASET:
  raw = load_dataset("financial_phrasebank", name=HF_SUBSET)

  full = raw["train"].train_test_split(test_size=0.2, seed=seed)
  test_valid = full["test"].train_test_split(test_size=0.5, seed=seed)


  dataset = DatasetDict(
  train=full["train"],
  validation=test_valid["train"],
  test=test_valid["test"],
  )


  text_column = "sentence"
  label_column = "label"
else:
  assert CSV_TRAIN, "When USE_HF_DATASET=false, provide CSV_TRAIN path."
  tmp = {}
  tmp["train"] = load_dataset("csv", data_files=CSV_TRAIN)["train"]
  if CSV_VAL:
    tmp["validation"] = load_dataset("csv", data_files=CSV_VAL)["train"]
  if CSV_TEST:
    tmp["test"] = load_dataset("csv", data_files=CSV_TEST)["train"]


  dataset = DatasetDict({k: v for k, v in tmp.items()})
  text_column = "text"
  label_column = "label"


print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


The repository for financial_phrasebank contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/financial_phrasebank.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


FinancialPhraseBank-v1.0.zip:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1811
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 226
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 227
    })
})


In [8]:
def normalize_label(x):
  lab = x[label_column]
  # Map numeric or variant forms to canonical lower-case strings.
  if isinstance(lab, (int, np.integer)):
  # Common mapping in financial_phrasebank: 0 negative, 1 neutral, 2 positive
    mapping = {0: "negative", 1: "neutral", 2: "positive"}
    lab_text = mapping.get(int(lab), str(lab).lower())
  else:
    lab_text = str(lab).strip().lower()
  if lab_text not in label_set:
    raise ValueError(f"Unknown label '{lab_text}'. Expected one of {label_list}.")
  return {"target": lab_text}


for split in dataset.keys():
  dataset[split] = dataset[split].map(normalize_label)


# %%
# --- Tokenizer & Prefixing ---
# We frame the task as: input = "sentiment: <text>", target = one of the labels.


tokenizer = T5TokenizerFast.from_pretrained(model_name)


def preprocess(batch):
  inputs = [f"sentiment: {t}" for t in batch[text_column]]
  model_inputs = tokenizer(
  inputs,
  max_length=max_input_length,
  truncation=True,
  padding=False,
  )
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(
    batch["target"],
    max_length=max_target_length,
    truncation=True,
    padding=False,
    )
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs


processed = dataset.map(
  preprocess,
  batched=True,
  remove_columns=[col for col in dataset["train"].column_names if col not in [text_column, "target"]],
)

Map:   0%|          | 0/1811 [00:00<?, ? examples/s]

Map:   0%|          | 0/226 [00:00<?, ? examples/s]

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/1811 [00:00<?, ? examples/s]



Map:   0%|          | 0/226 [00:00<?, ? examples/s]

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

In [17]:
# --- Model & Data Collator ---
model = T5ForConditionalGeneration.from_pretrained(model_name)
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


# %%
# --- Metrics (Accuracy & Macro-F1 on generated labels) ---


metric_acc = evaluate.load("accuracy")


def postprocess_text(preds: List[str]) -> List[str]:
    cleaned = []
    for p in preds:
        p = (p or "").strip().lower()
        if not p:
            cleaned.append("neutral")  # default fallback
            continue
        if p not in label_set:
            p = p.split()[0] if p else "neutral"
            if p not in label_set:
                p = "neutral"
        cleaned.append(p)
    return cleaned




def compute_metrics(eval_pred):
  preds, labels = eval_pred
  # Decode
  pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True)
  label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)


  pred_clean = postprocess_text(pred_str)
  label_clean = postprocess_text(label_str)


  acc = accuracy_score(label_clean, pred_clean)
  f1_macro = f1_score(label_clean, pred_clean, average="macro")


  return {"accuracy": acc, "f1_macro": f1_macro}

In [12]:
# --- Training Arguments ---
args = Seq2SeqTrainingArguments(
output_dir=output_dir,
overwrite_output_dir=True,
num_train_epochs=num_train_epochs,
per_device_train_batch_size=per_device_train_batch_size,
per_device_eval_batch_size=per_device_eval_batch_size,
learning_rate=learning_rate,
weight_decay=weight_decay,
warmup_ratio=warmup_ratio,
eval_strategy="epoch",
save_strategy="epoch",
logging_strategy="steps",
logging_steps=50,
predict_with_generate=True,
fp16=fp16,
gradient_accumulation_steps=1,
load_best_model_at_end=True,
metric_for_best_model="f1_macro",
greater_is_better=True,
report_to=["none"], # set to ["wandb"] or ["tensorboard"] if desired
seed=seed,
)


# %%
# --- Trainer ---
trainer = Seq2SeqTrainer(
model=model,
args=args,
train_dataset=processed.get("train"),
eval_dataset=processed.get("validation"),
tokenizer=tokenizer,
data_collator=collator,
compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [13]:
# --- Train ---
# Uncomment to train
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.1335,0.068518,,


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.1335,0.068518,,
2,0.059,0.067656,,
3,0.0299,0.079222,,


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


TrainOutput(global_step=342, training_loss=0.2684196195407221, metrics={'train_runtime': 1478.2702, 'train_samples_per_second': 3.675, 'train_steps_per_second': 0.231, 'total_flos': 101149757177856.0, 'train_loss': 0.2684196195407221, 'epoch': 3.0})

In [14]:
# %%
# --- Evaluate on Test Split ---
# Uncomment after training
test_metrics = trainer.evaluate(processed.get("test"))
print({k: round(v, 4) for k, v in test_metrics.items()})


# %%
# --- Save ---
# Uncomment after training
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)



  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'eval_loss': 0.0581, 'eval_accuracy': nan, 'eval_f1_macro': nan, 'eval_runtime': 46.6776, 'eval_samples_per_second': 4.863, 'eval_steps_per_second': 0.321, 'epoch': 3.0}


('./outputs/t5_finance_sentiment/tokenizer_config.json',
 './outputs/t5_finance_sentiment/special_tokens_map.json',
 './outputs/t5_finance_sentiment/spiece.model',
 './outputs/t5_finance_sentiment/added_tokens.json',
 './outputs/t5_finance_sentiment/tokenizer.json')

In [18]:
def predict_sentiment(text: str, max_new_tokens: int = 3) -> str:
  model.eval()
  prompt = f"sentiment: {text.strip()}"
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_length)
  outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
  pred = tokenizer.decode(outputs[0], skip_special_tokens=True).strip().lower()
  # Postprocess to canonical label
  labels = postprocess_text([pred])
  return labels[0] if labels else "neutral"



# Example (uncomment once model is trained or if using a checkpoint):
print(predict_sentiment("Company beats earnings expectations but warns about supply chain risks."))

neutral
