<a href="https://colab.research.google.com/github/fasyabrhns/Fine-tuning-Phi-2-for-Abstractive-Text-Summarization/blob/main/task_2_deep_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install evaluate
import torch
import evaluate
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
import warnings
warnings.filterwarnings('ignore')

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [3]:
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
    print(f"Device count: {torch.cuda.device_count()}")

PyTorch version: 2.9.0+cu126
CUDA available: True
CUDA version: 12.6
Device name: Tesla T4
Device count: 1


In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [5]:
dataset = load_dataset("rajpurkar/squad")
print(f"Train samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Train samples: 87599
Validation samples: 10570


In [6]:
sample = dataset["train"][0]
print("Question:", sample["question"])
print("\nContext:", sample["context"][:200], "...")
print("\nAnswer:", sample["answers"]["text"][0])
print("Answer start position:", sample["answers"]["answer_start"][0])

Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?

Context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper sta ...

Answer: Saint Bernadette Soubirous
Answer start position: 515


In [7]:
df_train = pd.DataFrame(dataset["train"])
df_train = df_train[["question", "context", "answers"]]
df_train["answer_text"] = df_train["answers"].apply(lambda x: x["text"][0] if x["text"] else "")
df_train = df_train.drop(columns=["answers"])
df_train.head()

Unnamed: 0,question,context,answer_text
0,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",Saint Bernadette Soubirous
1,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",a copper statue of Christ
2,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",the Main Building
3,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",a Marian place of prayer and reflection
4,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",a golden statue of the Virgin Mary


In [8]:
df_val = pd.DataFrame(dataset["validation"])
df_val = df_val[["question", "context", "answers"]]
df_val["answer_text"] = df_val["answers"].apply(lambda x: x["text"][0] if x["text"] else "")
df_val = df_val.drop(columns=["answers"])
print(f"Train shape: {df_train.shape}, Val shape: {df_val.shape}")

Train shape: (87599, 3), Val shape: (10570, 3)


In [9]:
train_dataset = Dataset.from_pandas(df_train)
val_dataset = Dataset.from_pandas(df_val)
print(f"Training dataset: {train_dataset}")
print(f"Validation dataset: {val_dataset}")

Training dataset: Dataset({
    features: ['question', 'context', 'answer_text'],
    num_rows: 87599
})
Validation dataset: Dataset({
    features: ['question', 'context', 'answer_text'],
    num_rows: 10570
})


In [10]:
MODEL_NAME = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"Loaded tokenizer: {MODEL_NAME}")
print(f"Vocab size: {len(tokenizer)}")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Loaded tokenizer: t5-base
Vocab size: 32100


In [11]:
def preprocess_function(examples):
    inputs = [
        f"answer question: {q} context: {c}"
        for q, c in zip(examples["question"], examples["context"])
    ]

    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors=None
    )

    labels = tokenizer(
        examples["answer_text"],
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors=None
    )

    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in label_ids]
        for label_ids in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
TRAIN_SIZE = 15000
VAL_SIZE = 2000

tokenized_train = train_dataset.select(range(TRAIN_SIZE)).map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
    desc="Tokenizing train dataset"
)

tokenized_val = val_dataset.select(range(VAL_SIZE)).map(
    preprocess_function,
    batched=True,
    remove_columns=val_dataset.column_names,
    desc="Tokenizing validation dataset"
)

print(f"Tokenized training samples: {len(tokenized_train)}")
print(f"Tokenized validation samples: {len(tokenized_val)}")

Tokenizing train dataset:   0%|          | 0/15000 [00:00<?, ? examples/s]

Tokenizing validation dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenized training samples: 15000
Tokenized validation samples: 2000


In [13]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
print(f"Model loaded: {MODEL_NAME}")
print(f"Total parameters: {model.num_parameters():,}")

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model loaded: t5-base
Total parameters: 222,903,552


In [14]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,769,472 || all params: 224,673,024 || trainable%: 0.7876


In [15]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model moved to: {device}")

Model moved to: cuda


In [16]:
training_args = TrainingArguments(
    output_dir="./flan-t5-lora-squad",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_steps=200,
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,
    warmup_steps=100,
    report_to="none",
    push_to_hub=False
)

print("Training Configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  FP16: {training_args.fp16}")

Training Configuration:
  Epochs: 3
  Batch size: 8
  Learning rate: 0.001
  FP16: True


In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator
)

print("Trainer initialized with LoRA model!")

Trainer initialized with LoRA model!


In [18]:
print("="*60)
print("Starting LoRA Fine-tuning on SQuAD Dataset...")
print("="*60)

train_result = trainer.train()

print("\n" + "="*60)
print("Training Complete!")
print("="*60)
print(f"Training Loss: {train_result.training_loss:.4f}")
print(f"Training Runtime: {train_result.metrics['train_runtime']:.2f}s")
print(f"Training Samples/Second: {train_result.metrics['train_samples_per_second']:.2f}")
print(f"Training Steps/Second: {train_result.metrics['train_steps_per_second']:.2f}")

Starting LoRA Fine-tuning on SQuAD Dataset...


Epoch,Training Loss,Validation Loss
1,0.2424,0.327308
2,0.2163,0.314793
3,0.2679,0.321566



Training Complete!
Training Loss: 0.2421
Training Runtime: 2958.35s
Training Samples/Second: 15.21
Training Steps/Second: 0.95


In [19]:
model.save_pretrained("./flan-t5-lora-squad/final_lora_model")
tokenizer.save_pretrained("./flan-t5-lora-squad/final_lora_model")
print("LoRA adapters and tokenizer saved successfully!")

LoRA adapters and tokenizer saved successfully!


In [20]:
eval_results = trainer.evaluate()

print("\nEvaluation Results:")
print("="*60)
for key, value in eval_results.items():
    print(f"{key}: {value}")


Evaluation Results:
eval_loss: 0.31479308009147644
eval_runtime: 55.6336
eval_samples_per_second: 35.95
eval_steps_per_second: 4.494
epoch: 3.0


In [21]:
def generate_answer(question, context, max_length=128):
    input_text = f"answer question: {question} context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)

    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=4,
        early_stopping=True,
        temperature=0.7
    )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

In [22]:
# Load SQuAD metric
squad_metric = evaluate.load("squad")

def compute_metrics_squad(eval_pred):
    """
    Compute Exact Match and F1 score for question answering.
    """
    predictions, labels = eval_pred

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Format for SQuAD metric (requires specific structure)
    formatted_predictions = [
        {"id": str(i), "prediction_text": pred.strip()}
        for i, pred in enumerate(decoded_preds)
    ]
    formatted_references = [
        {"id": str(i), "answers": {"text": [label.strip()], "answer_start": [0]}}
        for i, label in enumerate(decoded_labels)
    ]

    # Compute metrics
    metrics = squad_metric.compute(
        predictions=formatted_predictions,
        references=formatted_references
    )

    return {
        "exact_match": metrics["exact_match"],
        "f1": metrics["f1"]
    }

print("Metric computation function defined!")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Metric computation function defined!


In [23]:
import numpy as np
import torch
from tqdm.auto import tqdm
from transformers import DataCollatorForSeq2Seq

# Generate predictions on validation set
print("Generating predictions on validation set in batches...")

# Create a DataCollator to handle padding and batching
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Create a DataLoader for the validation set
val_dataloader = torch.utils.data.DataLoader(
    tokenized_val,
    batch_size=training_args.per_device_eval_batch_size,
    collate_fn=data_collator,
    shuffle=False
)

# Changed to store list of lists for generated tokens for variable length handling
all_generated_token_ids_lists = []
all_label_token_ids = []

# Move model to evaluation mode
model.eval()

# Ensure no gradient calculations for efficiency and memory saving
with torch.no_grad():
    for batch in tqdm(val_dataloader, desc="Predicting"):
        # Move batch inputs to the device (GPU)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Generate tokens. model.generate directly returns token IDs, not logits.
        generated_tokens = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=64, # Max length for generated answers, matches labels max_length
            num_beams=4,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id # Explicitly pass pad_token_id
        )

        # Extend the list with generated token lists (variable length)
        all_generated_token_ids_lists.extend(generated_tokens.cpu().tolist())

        # Move labels to CPU and convert to numpy (these are already padded to 64)
        all_label_token_ids.append(batch["labels"].cpu().numpy())

# Pad all generated token IDs to the maximum length observed across all predictions
# This ensures that `concatenated_pred_ids` will have a consistent sequence length
padded_predictions_batch = tokenizer.pad(
    {"input_ids": all_generated_token_ids_lists},
    padding="longest", # Pad to the longest sequence in `all_generated_token_ids_lists`
    return_tensors="np"
)
concatenated_pred_ids = padded_predictions_batch["input_ids"]

# Concatenate all collected label token IDs into a single numpy array.
# These should already be uniformly padded to 64 from the preprocessing step.
concatenated_label_ids = np.concatenate(all_label_token_ids, axis=0)

# Compute metrics using the collected token IDs
metrics = compute_metrics_squad((concatenated_pred_ids, concatenated_label_ids))

print("\n=== Evaluation Results ===")
print(f"Exact Match: {metrics['exact_match']:.2f}")
print(f"F1 Score: {metrics['f1']:.2f}")


Generating predictions on validation set in batches...


Predicting:   0%|          | 0/250 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



=== Evaluation Results ===
Exact Match: 72.05
F1 Score: 83.81


In [24]:
def generate_answer(question, context):
    """
    Generate an answer given a question and context.
    """
    # Format input
    input_text = f"question: {question} context: {context}"

    # Tokenize
    inputs = tokenizer(
        input_text,
        max_length=512,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    # Generate answer
    outputs = model.generate(
        **inputs,
        max_length=64,
        num_beams=4,
        early_stopping=True
    )

    # Decode answer
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Test with examples from validation set
num_examples = 3
for i in range(num_examples):
    sample = val_dataset[i]
    question = sample["question"]
    context = sample["context"]
    true_answer = sample["answer_text"]

    predicted_answer = generate_answer(question, context)

    print(f"Example {i+1}")
    print(f"Question: {question}")
    print(f"\nContext (first 200 chars): {context[:200]}...")
    print(f"\nTrue Answer: {true_answer}")
    print(f"Predicted Answer: {predicted_answer}")

Example 1
Question: Which NFL team represented the AFC at Super Bowl 50?

Context (first 200 chars): Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated...

True Answer: Denver Broncos
Predicted Answer: Denver Broncos
Example 2
Question: Which NFL team represented the NFC at Super Bowl 50?

Context (first 200 chars): Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated...

True Answer: Carolina Panthers
Predicted Answer: Carolina Panthers
Example 3
Question: Where did Super Bowl 50 take place?

Context (first 200 chars): Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion