In [2]:
# Step 1: Install dependencies
!pip install transformers datasets --quiet

In [1]:
# Step 2: Import libraries and check GPU
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
from datasets import load_dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [2]:
# Step 3: Load pre-trained BERT QA model
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name).to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly ident

In [3]:
# Step 4: Load the SQuAD v1.1 dataset
dataset = load_dataset("squad")
print(dataset["train"][0])

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}


In [4]:
# Step 5: Run model on one example
example = dataset["train"][0]
context = example["context"]
question = example["question"]
true_answer = example["answers"]["text"][0]

# Tokenize and move to GPU
inputs = tokenizer.encode_plus(question, context, return_tensors="pt", truncation=True, max_length=512).to(device)

# Forward pass
with torch.no_grad():
    outputs = model(**inputs)
    start_idx = torch.argmax(outputs.start_logits)
    end_idx = torch.argmax(outputs.end_logits)

# Decode predicted span
answer_tokens = inputs["input_ids"][0][start_idx : end_idx + 1]
predicted_answer = tokenizer.decode(answer_tokens)

print(f"\nQuestion: {question}")
print(f"True Answer: {true_answer}")
print(f"Predicted Answer: {predicted_answer}")


Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
True Answer: Saint Bernadette Soubirous
Predicted Answer: saint bernadette soubirous


In [5]:
for i in range(10):
    example = dataset["train"][i]
    context = example["context"]
    question = example["question"]
    true_answer = example["answers"]["text"][0]

    inputs = tokenizer.encode_plus(question, context, return_tensors="pt", truncation=True, max_length=512).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        start_idx = torch.argmax(outputs.start_logits)
        end_idx = torch.argmax(outputs.end_logits)

    answer_tokens = inputs["input_ids"][0][start_idx : end_idx + 1]
    predicted_answer = tokenizer.decode(answer_tokens)

    print(f"\nQ{i+1}: {question}")
    print(f"Predicted: {predicted_answer}")
    print(f"True:      {true_answer}")


Q1: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Predicted: saint bernadette soubirous
True:      Saint Bernadette Soubirous

Q2: What is in front of the Notre Dame Main Building?
Predicted: a copper statue of christ
True:      a copper statue of Christ

Q3: The Basilica of the Sacred heart at Notre Dame is beside to which structure?
Predicted: the main building
True:      the Main Building

Q4: What is the Grotto at Notre Dame?
Predicted: a replica of the grotto at lourdes, france
True:      a Marian place of prayer and reflection

Q5: What sits on top of the Main Building at Notre Dame?
Predicted: a golden statue of the virgin mary
True:      a golden statue of the Virgin Mary

Q6: When did the Scholastic Magazine of Notre dame begin publishing?
Predicted: september 1876
True:      September 1876

Q7: How often is Notre Dame's the Juggler published?
Predicted: twice
True:      twice

Q8: What is the daily student paper at Notre Dame called?
Predicted: the 

In [8]:
import re
import string
from tqdm import tqdm
from datasets import load_dataset
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

# Load model & tokenizer
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name).to("cuda")

# Load SQuAD dataset (fixed path)
dataset = load_dataset("squad")["validation"]
subset = dataset.select(range(100))
# Normalization functions for EM/F1
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        return ''.join(ch for ch in text if ch not in set(string.punctuation))
    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact(a_pred, a_gold):
    return int(normalize_answer(a_pred) == normalize_answer(a_gold))

def compute_f1(a_pred, a_gold):
    pred_tokens = normalize_answer(a_pred).split()
    gold_tokens = normalize_answer(a_gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    if not common:
        return 0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(gold_tokens)
    return 2 * (precision * recall) / (precision + recall)

# Evaluate over first 100 samples
exact_scores = []
f1_scores = []

for example in tqdm(subset):
    context = example["context"]
    question = example["question"]
    true_answers = example["answers"]["text"]

    # Tokenize input
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt", truncation=True, max_length=512).to("cuda")

    # Get start/end logits
    with torch.no_grad():
        outputs = model(**inputs)
        start_idx = torch.argmax(outputs.start_logits)
        end_idx = torch.argmax(outputs.end_logits)

    # Decode predicted answer
    pred_tokens = inputs["input_ids"][0][start_idx : end_idx + 1]
    pred_answer = tokenizer.decode(pred_tokens)

    # Compare against first ground-truth (you can also loop over all if needed)
    gold_answer = true_answers[0]

    # Score
    em = compute_exact(pred_answer, gold_answer)
    f1 = compute_f1(pred_answer, gold_answer)

    exact_scores.append(em)
    f1_scores.append(f1)

# Final Scores
avg_em = sum(exact_scores) / len(exact_scores)
avg_f1 = sum(f1_scores) / len(f1_scores)

print(f"\nEvaluation on 100 SQuAD examples:")
print(f"Exact Match (EM): {avg_em:.2f}")
print(f"F1 Score:         {avg_f1:.2f}")


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 100/100 [00:03<00:00, 28.07it/s]


Evaluation on 100 SQuAD examples:
Exact Match (EM): 0.75
F1 Score:         0.81



