# Task 2 — Fine-tuning T5-base for Generative QA on SQuAD

Goal: given `(question, context)`, generate the answer text.

**Outputs to keep for your report**:
- Training config (max lengths, epochs, batch size, LR)
- Final SQuAD metrics (Exact Match, F1)
- A few qualitative examples (good + failure cases)


In [4]:
!pip -q install -U evaluate datasets transformers accelerate sentencepiece sacrebleu rouge-score


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [5]:
# If running on Colab, uncomment:
# !pip -q install -r ../requirements.txt

import os, random
import numpy as np

from datasets import load_dataset
import evaluate

from transformers import (
    T5TokenizerFast,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    set_seed,
)

SEED = 42
set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)


## 1) Load dataset (SQuAD)

We use HuggingFace Datasets `squad`.
- `train` split for training
- `validation` split for evaluation


In [13]:
ds = load_dataset("rajpurkar/squad")


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

Optional: If you are on CPU / limited GPU, you can train on a small subset to validate the pipeline quickly.

In [7]:
# Toggle this if you want a quick run
USE_SMALL_SUBSET = False  # set True for quick debugging

if USE_SMALL_SUBSET:
    train_ds = dataset["train"].shuffle(seed=SEED).select(range(2000))
    val_ds   = dataset["validation"].shuffle(seed=SEED).select(range(500))
else:
    train_ds = dataset["train"]
    val_ds   = dataset["validation"]

len(train_ds), len(val_ds)


(87599, 10570)

## 2) Preprocess: format for T5

Common prompt format for T5 QA:

- **Input**: `question: <q>  context: <c>`
- **Target**: the answer text (for training, we can pick the first reference answer)

SQuAD provides multiple acceptable answers; evaluation uses all references.

In [8]:
MODEL_NAME = "t5-base"
tokenizer = T5TokenizerFast.from_pretrained(MODEL_NAME)

max_source_length = 512
max_target_length = 32

def preprocess(examples):
    questions = examples["question"]
    contexts = examples["context"]
    inputs = [f"question: {q}  context: {c}" for q, c in zip(questions, contexts)]

    # For training target, pick first answer text
    answers = [a["text"][0] if len(a["text"]) > 0 else "" for a in examples["answers"]]

    model_inputs = tokenizer(
        inputs,
        max_length=max_source_length,
        truncation=True
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            answers,
            max_length=max_target_length,
            truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]

    # Keep fields needed for SQuAD metric post-processing
    model_inputs["id"] = examples["id"]
    model_inputs["answers"] = examples["answers"]
    return model_inputs

train_tok = train_ds.map(preprocess, batched=True, remove_columns=train_ds.column_names)
val_tok   = val_ds.map(preprocess, batched=True, remove_columns=val_ds.column_names)

train_tok, val_tok


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]



Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

(Dataset({
     features: ['id', 'answers', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 87599
 }),
 Dataset({
     features: ['id', 'answers', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 10570
 }))

## 3) Load model + trainer

In [9]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

## 4) Metrics: SQuAD (Exact Match & F1)

We will generate answers on the validation set and compute official SQuAD metrics.

Important: `evaluate.load('squad')` expects:
- `predictions`: list of `{id, prediction_text}`
- `references`: list of `{id, answers}`


In [10]:
squad_metric = evaluate.load("squad")

def postprocess_text(text: str) -> str:
    # Basic cleanup; you can add more normalization if desired
    return text.strip()

def compute_metrics(eval_preds):
    pred_ids, label_ids = eval_preds

    # pred_ids from Seq2SeqTrainer with predict_with_generate=True are token ids
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

    # We won't decode labels for SQuAD metric (it uses references from dataset)
    # We'll build predictions/references from val_tok's stored fields.
    # Seq2SeqTrainer passes predictions in the same order as eval dataset.

    predictions = []
    references = []

    for i, pred in enumerate(decoded_preds):
        ex_id = val_tok[i]["id"]
        answers = val_tok[i]["answers"]

        predictions.append({"id": ex_id, "prediction_text": postprocess_text(pred)})
        references.append({"id": ex_id, "answers": answers})

    return squad_metric.compute(predictions=predictions, references=references)


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

## 5) Training setup

If you have GPU, enable fp16. If not, set `fp16=False`.

Tip: If you get out-of-memory, reduce `per_device_train_batch_size` or `max_source_length`.

In [11]:
# Adjust these based on your hardware
output_dir = "../outputs/t5-squad"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    num_train_epochs=1,  # increase to 2-4 for better results
    weight_decay=0.01,
    predict_with_generate=True,
    generation_max_length=max_target_length,
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=True,  # set False if no GPU / fp16 unsupported
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


TypeError: Seq2SeqTrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

## 6) Train

In [None]:
train_result = trainer.train()
train_result


## 7) Evaluate (SQuAD EM/F1)

In [None]:
metrics = trainer.evaluate()
metrics


## 8) Save artifacts for `reports/`

We save:
- metrics JSON
- a small table of qualitative examples (question, gold answers, prediction)

In [None]:
import json
import pandas as pd

os.makedirs("../reports", exist_ok=True)

# Save metrics
with open("../reports/metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

# Generate qualitative examples
def generate_answers(batch):
    inputs = [f"question: {q}  context: {c}" for q, c in zip(batch["question"], batch["context"])]
    enc = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_source_length)
    enc = {k: v.to(model.device) for k, v in enc.items()}
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=max_target_length)
    preds = tokenizer.batch_decode(out, skip_special_tokens=True)
    return preds

import torch

sample_n = 10
sample = val_ds.shuffle(seed=SEED).select(range(sample_n))

# Batched generation for speed
batch_preds = []
batch_size = 4
for i in range(0, sample_n, batch_size):
    b = sample.select(range(i, min(i+batch_size, sample_n)))
    preds = generate_answers(b)
    batch_preds.extend(preds)

rows = []
for ex, pred in zip(sample, batch_preds):
    rows.append({
        "id": ex["id"],
        "question": ex["question"],
        "prediction": postprocess_text(pred),
        "gold_answers": ex["answers"]["text"][:3],  # show up to 3
    })

df = pd.DataFrame(rows)
df.to_csv("../reports/qualitative_examples.csv", index=False)
df


## 9) (Optional) Error analysis ideas

- Check if failures come from context truncation.
- Look at questions with long contexts.
- Try increasing `max_source_length` or using a smaller batch size.
- Train longer (2–4 epochs) and tune LR (1e-4 to 5e-4).