In [5]:
# Import necessary libraries
import json
import numpy as np
import pandas as pd
import torch
from pathlib import Path
from collections import Counter
from tqdm.auto import tqdm
from evaluate import load
import matplotlib.pyplot as plt
import seaborn as sns

# Transformers and datasets
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    DefaultDataCollator,
)
from datasets import Dataset, DatasetDict

# Set random seeds for reproducibility
import random
# Single source of truth for the seed so every library is seeded with the same value
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Seed every visible CUDA device, not just the current one
    torch.cuda.manual_seed_all(SEED)

# Pick the compute device: CUDA first, then Apple MPS, then CPU
def get_device():
    """Return the preferred torch device.

    Returns:
        torch.device: "cuda" when CUDA is available, otherwise "mps" when the
        MPS backend is available, otherwise "cpu".
    """
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")

device = get_device()
print(f"Using device: {device}")

# Human-readable label for the selected backend
_DEVICE_LABELS = {"mps": "Apple Silicon", "cuda": "NVIDIA GPU", "cpu": "CPU"}
if device.type in _DEVICE_LABELS:
    print(_DEVICE_LABELS[device.type])

Using device: cuda
✅ NVIDIA GPU detected!


In [6]:
# load local squad data from canvas
def load_squad_data(file_path):
    """Load a SQuAD-format JSON file and return the parsed object.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The object produced by ``json.load`` (the callers in this notebook
        read its ``'version'`` and ``'data'`` keys).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON interchange text is UTF-8; decode explicitly instead of relying on
    # the platform default encoding, which is not UTF-8 on every system.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Read both SQuAD splits from the working directory
train_data = load_squad_data('train-v1.1.json')
dev_data = load_squad_data('dev-v1.1.json')

# One print call, one line per fact
print(
    f"Training data version: {train_data['version']}",
    f"Number of articles in training set: {len(train_data['data'])}",
    f"Number of articles in dev set: {len(dev_data['data'])}",
    sep="\n",
)

Training data version: 1.1
Number of articles in training set: 442
Number of articles in dev set: 48


In [7]:
def parse_squad_data(squad_data):
    """Flatten the article -> paragraph -> question nesting into one list.

    Args:
        squad_data: Parsed SQuAD JSON; ``squad_data['data']`` is a list of
            articles, each with ``'title'`` and ``'paragraphs'``; every
            paragraph has ``'context'`` and ``'qas'``.

    Returns:
        list[dict]: One record per question with the keys ``'id'``,
        ``'title'``, ``'context'``, ``'question'`` and ``'answers'``, in
        document order.
    """
    flat_records = []

    for article in squad_data['data']:
        article_title = article['title']
        for paragraph in article['paragraphs']:
            paragraph_text = paragraph['context']
            # Every question in a paragraph shares that paragraph's title and context
            flat_records.extend(
                {
                    'id': qa['id'],
                    'title': article_title,
                    'context': paragraph_text,
                    'question': qa['question'],
                    'answers': qa['answers'],
                }
                for qa in paragraph['qas']
            )

    return flat_records

# Flatten both splits into per-question records
train_examples = parse_squad_data(train_data)
dev_examples = parse_squad_data(dev_data)

print(
    f"\nNumber of training examples: {len(train_examples)}",
    f"Number of dev examples: {len(dev_examples)}",
    sep="\n",
)


Number of training examples: 87599
Number of dev examples: 10570


In [8]:
# Show the first flattened training record as a sanity check
sample = train_examples[0]
print(
    "Sample Training Example:",
    f"ID: {sample['id']}",
    f"Title: {sample['title']}",
    f"\nQuestion: {sample['question']}",
    f"\nContext (first 200 chars): {sample['context'][:200]}...",
    f"\nAnswers: {sample['answers']}",
    sep="\n",
)

Sample Training Example:
ID: 5733be284776f41900661182
Title: University_of_Notre_Dame

Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?

Context (first 200 chars): Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper sta...

Answers: [{'answer_start': 515, 'text': 'Saint Bernadette Soubirous'}]


In [9]:
# compute statistics
def compute_statistics(examples):
    """Collect whitespace-token length statistics for a list of examples.

    Args:
        examples: Sized, re-iterable collection of records with
            ``'question'``, ``'context'`` and ``'answers'`` keys.

    Returns:
        dict: ``'num_examples'`` plus four lists. ``'question_lengths'``,
        ``'context_lengths'`` and ``'num_answers'`` have one entry per
        example; ``'answer_lengths'`` has one entry (length of the first
        answer) only for examples that have at least one answer.
    """
    def word_count(text):
        # Length in whitespace-separated words
        return len(text.split())

    return {
        'num_examples': len(examples),
        'question_lengths': [word_count(ex['question']) for ex in examples],
        'context_lengths': [word_count(ex['context']) for ex in examples],
        'answer_lengths': [
            word_count(ex['answers'][0]['text']) for ex in examples if ex['answers']
        ],
        'num_answers': [len(ex['answers']) for ex in examples],
    }

train_stats = compute_statistics(train_examples)
dev_stats = compute_statistics(dev_examples)

# Same four-line report for each split; the dev heading carries the blank separator line
for split_heading, split_stats in (
    ("Training Set Statistics:", train_stats),
    ("\nDev Set Statistics:", dev_stats),
):
    print(split_heading)
    print(f"  Total examples: {split_stats['num_examples']}")
    print(f"  Avg question length: {np.mean(split_stats['question_lengths']):.2f} words")
    print(f"  Avg context length: {np.mean(split_stats['context_lengths']):.2f} words")
    print(f"  Avg answer length: {np.mean(split_stats['answer_lengths']):.2f} words")

Training Set Statistics:
  Total examples: 87599
  Avg question length: 10.06 words
  Avg context length: 119.76 words
  Avg answer length: 3.16 words

Dev Set Statistics:
  Total examples: 10570
  Avg question length: 10.22 words
  Avg context length: 123.95 words
  Avg answer length: 3.02 words


In [10]:
# initialize tokenizer
# MODEL_NAME is the checkpoint identifier; the same name is later passed to
# AutoModelForQuestionAnswering.from_pretrained so tokenizer and model match.
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# set maximum lengths
# MAX_LENGTH is passed as `max_length` to the tokenizer calls in this notebook;
# DOC_STRIDE is passed as `stride` in the two feature-preparation functions,
# which also request overflowing tokens so long contexts yield several features.
MAX_LENGTH = 384
DOC_STRIDE = 128

print(f"Tokenizer: {MODEL_NAME}")
print(f"Vocabulary size: {tokenizer.vocab_size}")
print(f"Max length: {MAX_LENGTH}")
print(f"Doc stride: {DOC_STRIDE}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer: bert-base-uncased
Vocabulary size: 30522
Max length: 384
Doc stride: 128


In [42]:
# convert examples to huggingface dataset format
def examples_to_dataset(examples):
    """Build a HuggingFace ``Dataset`` from flattened SQuAD records.

    Args:
        examples: Iterable of records with ``'id'``, ``'title'``,
            ``'context'``, ``'question'`` and ``'answers'`` (a list of
            ``{'text', 'answer_start'}`` dicts, possibly empty).

    Returns:
        Dataset: Columns ``id``, ``title``, ``context``, ``question`` and
        ``answers``; each ``answers`` cell is a dict of two parallel lists,
        ``'text'`` and ``'answer_start'`` (both empty when there is no answer).
    """
    scalar_fields = ('id', 'title', 'context', 'question')
    columns = {field: [] for field in scalar_fields}
    columns['answers'] = []

    for record in examples:
        for field in scalar_fields:
            columns[field].append(record[field])

        # List-of-dicts -> dict-of-lists; a falsy answers value becomes two empty lists
        answer_list = record['answers'] or []
        columns['answers'].append({
            'text': [entry['text'] for entry in answer_list],
            'answer_start': [entry['answer_start'] for entry in answer_list],
        })

    return Dataset.from_dict(columns)

# create datasets
# One Dataset per split, built from the flattened per-question records
train_dataset = examples_to_dataset(train_examples)
dev_dataset = examples_to_dataset(dev_examples)

# create DatasetDict
# The split names 'train' and 'validation' are the keys used by the
# tokenization and evaluation cells further down.
raw_datasets = DatasetDict({
    'train': train_dataset,
    'validation': dev_dataset
})

print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})


In [12]:
# preprocessing function for training
def prepare_train_features(examples):
    """Tokenize question/context pairs and label each feature with the answer span.

    A context longer than ``MAX_LENGTH`` tokens is split into several
    overlapping features (overlap ``DOC_STRIDE``), so one example can yield
    more than one feature.

    Args:
        examples: Batch dict with ``"question"``, ``"context"`` and
            ``"answers"`` columns; each ``answers`` entry holds parallel
            ``"text"`` and ``"answer_start"`` lists. Only the first answer of
            each example is used for labelling.

    Returns:
        The tokenizer output with ``"start_positions"`` and
        ``"end_positions"`` added (one token index per feature). Features
        whose window does not fully contain the answer, or whose example has
        no answer, are labelled with the ``[CLS]`` index for both positions.
    """
    # tokenize questions and contexts
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=MAX_LENGTH,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # map from feature back to original example
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Sequence ids: 0 for question tokens, 1 for context tokens, None otherwise
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Answers of the example this feature was cut from
        answers = examples["answers"][sample_mapping[i]]

        # No answer at all: point both labels at [CLS]
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Character span of the (first) answer inside the context string
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First and last token index belonging to the context in this feature
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1

        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # Answer not fully inside this window: label with [CLS]
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Walk forward to the last context token that starts at or before start_char.
        # The scan must stop at context_end: the tokens after the context
        # ([SEP] and padding) are expected to carry (0, 0) offsets, which satisfy
        # `<= start_char`, so an unbounded scan runs to the end of the sequence
        # and mislabels answers that begin on the final context token.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        # Walk backward to the first context token that ends at or after end_char,
        # bounded by context_start for the same reason.
        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

In [60]:
# preprocessing function for validation dataset
def prepare_validation_features(examples):
    """Tokenize question/context pairs for inference-time post-processing.

    Args:
        examples: Batch dict with ``"question"``, ``"context"`` and ``"id"``
            columns.

    Returns:
        The tokenizer output, where ``"offset_mapping"`` keeps character
        offsets only for context tokens (every other position is ``None``)
        and ``"example_id"`` holds, per feature, the id of the example the
        feature was cut from.
    """
    encoded = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=MAX_LENGTH,
        stride=DOC_STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One entry per feature: index of the example it originated from
    feature_to_example = encoded.pop("overflow_to_sample_mapping")

    example_ids = []
    for feature_idx, source_idx in enumerate(feature_to_example):
        seq_ids = encoded.sequence_ids(feature_idx)

        # Blank out the offsets of everything that is NOT context (sequence id 1),
        # so post-processing can recognise and skip those positions
        encoded["offset_mapping"][feature_idx] = [
            span if seq_ids[position] == 1 else None
            for position, span in enumerate(encoded["offset_mapping"][feature_idx])
        ]

        example_ids.append(examples["id"][source_idx])

    encoded["example_id"] = example_ids
    return encoded

In [61]:
# Apply preprocessing: training features get start/end answer labels
# (prepare_train_features); validation features keep offsets and example ids
# for post-processing (prepare_validation_features) and carry no labels.
# NOTE(review): a Trainer evaluating on label-free features presumably cannot
# report eval_loss — confirm which dataset is used for in-training evaluation.
print("Tokenizing training data...")
tokenized_train = raw_datasets["train"].map(
    prepare_train_features,
    batched=True,
    # Drop the raw text columns so only tokenizer outputs and labels remain
    remove_columns=raw_datasets["train"].column_names,
    desc="Running tokenizer on train dataset",
)

print("\nTokenizing validation data...")
tokenized_validation = raw_datasets["validation"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
    desc="Running tokenizer on validation dataset",
)

# Feature counts can exceed example counts because long contexts are split
print(f"\nTraining features: {len(tokenized_train)}")
print(f"Validation features: {len(tokenized_validation)}")

Tokenizing training data...


Running tokenizer on train dataset:   0%|          | 0/87599 [00:00<?, ? examples/s]


Tokenizing validation data...


Running tokenizer on validation dataset:   0%|          | 0/10570 [00:00<?, ? examples/s]


Training features: 88524
Validation features: 10784


In [71]:
# Load pre-trained BERT model for question answering
# NOTE(review): this rebinds `model` to a freshly loaded checkpoint each time the
# cell runs; an already-built Trainer keeps its own reference to the model it was
# given, so re-running this cell after training leaves `model` untrained.
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
model.to(device)

# Print model info
num_params = sum(p.numel() for p in model.parameters())
# Parameters that receive gradients vs. those that do not (computed, not printed)
num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
num_frozen_params = num_params - num_trainable_params

print(f"Model: {MODEL_NAME}")
print(f"Total parameters: {num_params:,}")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: bert-base-uncased
Total parameters: 108,893,186


In [48]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./models/bert-qa-baseline",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1, # 1 epoch bc v slow
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_dir="./logs",
    logging_steps=500,
    # Save once per epoch to match eval_strategy, as load_best_model_at_end requires
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # Lower validation loss is better
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    push_to_hub=False,
    # fp16 mixed precision is only enabled on CUDA; hard-coding True breaks the
    # MPS and CPU paths that get_device() can select
    fp16=device.type == "cuda",
    dataloader_pin_memory=device.type == "cuda",
)


# Data collator
data_collator = DefaultDataCollator()

print("Training Arguments:")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  FP16: {training_args.fp16}")

Training Arguments:
  Batch size: 8
  Learning rate: 3e-05
  Epochs: 1
  FP16: True


In [49]:
# Initialize Trainer
# In-training evaluation must produce eval_loss (it is the metric_for_best_model),
# which needs start/end labels. `tokenized_validation` is built with
# prepare_validation_features and has no labels, so build a labelled copy of the
# validation split here with the same preprocessing as the training data.
tokenized_validation_with_labels = raw_datasets["validation"].map(
    prepare_train_features,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
    desc="Running tokenizer on validation dataset (with labels)",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation_with_labels,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print("Trainer initialized successfully!")

Trainer initialized successfully!


  trainer = Trainer(


In [50]:
# Train the model
print("Starting training...\n")
train_result = trainer.train()

# Save the model
# Written under the same root as TrainingArguments.output_dir, in a "final" subfolder
trainer.save_model("./models/bert-qa-baseline/final")

# Print training metrics
# log_metrics prints the metrics table; save_metrics persists the same dict
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)

print("\nTraining completed!")

Starting training...



Epoch,Training Loss,Validation Loss
1,1.0014,0.967447


***** train metrics *****
  epoch                    =        1.0
  total_flos               = 14541777GF
  train_loss               =     1.2236
  train_runtime            = 0:29:32.51
  train_samples_per_second =      44.95
  train_steps_per_second   =      5.619

Training completed!


In [63]:
# post-process predictions
def postprocess_qa_predictions(
    examples,
    features,
    predictions,
    n_best_size=20,
    max_answer_length=30,
):
    """Turn per-feature start/end logits into one answer string per example.

    Args:
        examples: Iterable of original examples; each provides ``"id"`` and
            ``"context"``.
        features: Indexable collection of tokenized features; each provides
            ``"example_id"`` and ``"offset_mapping"`` (character offsets for
            context tokens, ``None`` elsewhere).
        predictions: Pair ``(all_start_logits, all_end_logits)`` indexed by
            feature position.
        n_best_size: Number of top start and top end positions considered per
            feature.
        max_answer_length: Maximum answer length in tokens.

    Returns:
        dict: Maps each example id to the text of its highest-scoring valid
        span (score = start logit + end logit), or ``""`` when no candidate
        span is valid.

    Raises:
        KeyError: If an example has no feature in ``features``.
    """
    all_start_logits, all_end_logits = predictions

    # Group feature positions by the example they were cut from
    features_per_example = {}
    for i, feature in enumerate(features):
        features_per_example.setdefault(feature["example_id"], []).append(i)

    # Dictionary to store predictions
    predictions_dict = {}

    for example in tqdm(examples, desc="Post-processing"):
        example_id = example["id"]
        context = example["context"]

        # Direct indexing on purpose: a missing id means examples and features
        # do not belong together, which should fail loudly
        feature_indices = features_per_example[example_id]

        valid_answers = []

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            offset_mapping = features[feature_index]["offset_mapping"]

            # Indices of the n_best_size largest logits, best first
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip positions outside the sequence or outside the context
                    # (non-context offsets were set to None during preprocessing)
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue

                    # Skip answers that are too long or have end < start
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    # Slice the original context so the answer keeps its exact
                    # casing and spacing
                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    answer_text = context[start_char:end_char]

                    valid_answers.append({
                        "score": start_logits[start_index] + end_logits[end_index],
                        "text": answer_text,
                    })

        # Select the best answer (first one wins on ties)
        if valid_answers:
            best_answer = max(valid_answers, key=lambda candidate: candidate["score"])
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions_dict[example_id] = best_answer["text"]

    return predictions_dict

In [64]:
# Get predictions on validation set
# raw_predictions.predictions is later unpacked as (start_logits, end_logits)
print("Getting predictions on validation set...")
raw_predictions = trainer.predict(tokenized_validation)

Getting predictions on validation set...


In [65]:
# Post-process predictions
# Arguments: original examples (ids + contexts), the tokenized features they
# were split into, and the raw logits for those features.
final_predictions = postprocess_qa_predictions(
    raw_datasets["validation"],
    tokenized_validation,
    raw_predictions.predictions,
)

print(f"\nGenerated {len(final_predictions)} predictions")

Post-processing:   0%|          | 0/10570 [00:00<?, ?it/s]


Generated 10570 predictions


In [68]:
# Write the {example id: answer text} mapping to predictions.json
Path('predictions.json').write_text(json.dumps(final_predictions))

print("Predictions saved to predictions.json")

Predictions saved to predictions.json


In [66]:
# Load the "squad" metric
metric = load("squad")

# Reshape predictions into one {"id", "prediction_text"} record per example
formatted_predictions = [
    {"id": example_id, "prediction_text": answer_text}
    for example_id, answer_text in final_predictions.items()
]

# References pair each example id with its gold answer texts and start offsets
references = [
    {
        "id": row["id"],
        "answers": {field: row["answers"][field] for field in ("text", "answer_start")},
    }
    for row in raw_datasets["validation"]
]

# Compute metrics
results = metric.compute(predictions=formatted_predictions, references=references)
print(
    "\nEvaluation Results:",
    f"Exact Match: {results['exact_match']:.2f}",
    f"F1 Score: {results['f1']:.2f}",
    sep="\n",
)


Evaluation Results:
Exact Match: 79.45
F1 Score: 87.41


In [70]:
# Function to answer questions
def answer_question(question, context, model, tokenizer):
    """Answer a question by extracting the best-scoring span from the context.

    The context is truncated to ``MAX_LENGTH`` tokens (no sliding window), so
    only the first window of a long context is searched.

    Args:
        question: Question text.
        context: Passage to extract the answer from.
        model: Question-answering model whose output exposes
            ``start_logits`` and ``end_logits``.
        tokenizer: Tokenizer matching ``model``; it must support
            ``return_offsets_mapping`` and ``sequence_ids`` (the same features
            the preprocessing functions in this notebook rely on).

    Returns:
        str: The substring of ``context`` covered by the highest-scoring valid
        span, with original casing and spacing, or ``""`` when no valid span
        exists. A span is valid when start and end are both context tokens,
        end >= start, and the span is at most 30 tokens long.
    """
    # Same candidate limits as the defaults of postprocess_qa_predictions
    n_best_size = 20
    max_answer_length = 30

    # Tokenize input
    encoding = tokenizer(
        question,
        context,
        max_length=MAX_LENGTH,
        truncation="only_second",
        return_offsets_mapping=True,
        return_tensors="pt",
    )

    # Character offsets into `context`; removed because they are not a model input
    offsets = encoding.pop("offset_mapping")[0].tolist()
    # 1 marks context tokens; question and special tokens are 0 / None
    sequence_ids = encoding.sequence_ids(0)

    # Follow the model's own device instead of a notebook-level global
    model_device = next(model.parameters()).device
    inputs = {k: v.to(model_device) for k, v in encoding.items()}

    # Run in eval mode (no dropout) and restore the previous mode afterwards
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    # Get start and end logits
    start_logits = outputs.start_logits[0].float().cpu().numpy()
    end_logits = outputs.end_logits[0].float().cpu().numpy()

    # Top candidates, best first
    start_candidates = np.argsort(start_logits)[::-1][:n_best_size].tolist()
    end_candidates = np.argsort(end_logits)[::-1][:n_best_size].tolist()

    # Independent argmax can yield end < start or land on question/special
    # tokens, so score start/end pairs jointly and keep only valid context spans
    best_span = None
    best_score = None
    for start_idx in start_candidates:
        if sequence_ids[start_idx] != 1:
            continue
        for end_idx in end_candidates:
            if (
                sequence_ids[end_idx] != 1
                or end_idx < start_idx
                or end_idx - start_idx + 1 > max_answer_length
            ):
                continue
            score = start_logits[start_idx] + end_logits[end_idx]
            if best_score is None or score > best_score:
                best_score = score
                best_span = (start_idx, end_idx)

    if best_span is None:
        return ""

    # Slice the original text rather than decoding token ids, so the answer is
    # not lower-cased or re-spaced by the tokenizer
    start_char = offsets[best_span[0]][0]
    end_char = offsets[best_span[1]][1]
    return context[start_char:end_char]

# Test with a custom example
# The passage is a triple-quoted literal, so it begins and ends with a newline
test_context = """
Several commemorative events take place every year. Gatherings of thousands of people on the banks of the Vistula on Midsummer’s Night for a festival called Wianki (Polish for Wreaths) have become a tradition and a yearly event in the programme of cultural events in Warsaw. The festival traces its roots to a peaceful pagan ritual where maidens would float their wreaths of herbs on the water to predict when they would be married, and to whom. By the 19th century this tradition had become a festive event, and it continues today. The city council organize concerts and other events. Each Midsummer’s Eve, apart from the official floating of wreaths, jumping over fires, looking for the fern flower, there are musical performances, dignitaries' speeches, fairs and fireworks by the river bank.
"""

test_question = "How many people gather along the banks of the Vistula for the Wianki festival?"

# Uses whatever `model` is currently bound to — run after training for a meaningful answer
answer = answer_question(test_question, test_context, model, tokenizer)
print(f"Question: {test_question}")
print(f"Answer: {answer}")

Question: How many people gather along the banks of the Vistula for the Wianki festival?
Answer: thousands


In [72]:
# Save the Trainer's model to ./bert_baseline (the directory archived and listed in the next cell)
trainer.save_model("./bert_baseline")

In [73]:
# Archive the saved model directory. The archive is written inside the directory
# it archives (bert_baseline/baseline.zip).
# NOTE(review): re-running may pick up the previous archive — confirm or write the zip elsewhere.
!zip -r bert_baseline/baseline.zip bert_baseline/

  adding: bert_baseline/ (stored 0%)
  adding: bert_baseline/config.json (deflated 47%)
  adding: bert_baseline/model.safetensors (deflated 7%)
  adding: bert_baseline/special_tokens_map.json (deflated 42%)
  adding: bert_baseline/vocab.txt (deflated 53%)
  adding: bert_baseline/tokenizer_config.json (deflated 75%)
  adding: bert_baseline/tokenizer.json (deflated 71%)
  adding: bert_baseline/training_args.bin (deflated 53%)


In [75]:
from huggingface_hub import login
login()  # Login once

# Upload your trained model
# Push the model held by the Trainer — the object that was trained and that
# trainer.save_model writes — rather than the global `model` name, which the
# model-loading cell rebinds to a fresh, untrained checkpoint when re-run.
trainer.model.push_to_hub("G20-CS4248/bert-baseline-qa")
tokenizer.push_to_hub("G20-CS4248/bert-baseline-qa")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...vwb6spr/model.safetensors:   0%|          | 14.2kB /  436MB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/G20-CS4248/bert-baseline-qa/commit/20f692dfb0c4b4afff5fd64fff3ef9b7d3096686', commit_message='Upload tokenizer', commit_description='', oid='20f692dfb0c4b4afff5fd64fff3ef9b7d3096686', pr_url=None, repo_url=RepoUrl('https://huggingface.co/G20-CS4248/bert-baseline-qa', endpoint='https://huggingface.co', repo_type='model', repo_id='G20-CS4248/bert-baseline-qa'), pr_revision=None, pr_num=None)