In [1]:
# 📦 Install required packages
!pip install transformers faiss-cpu jsonlines -q


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import json
import random
import torch
from transformers import pipeline
import pickle

# Load paragraph metadata
with open("harry_dense_index.pkl", "rb") as f:
    metadata = pickle.load(f)

# Sample 30 entries to build eval set
sampled = random.sample(metadata, 30)

# Load question generator model
qg = pipeline("text2text-generation", model="valhalla/t5-small-qa-qg-hl", device=0 if torch.cuda.is_available() else -1)

eval_data = []

# For each sampled paragraph
for entry in sampled:
    context = entry['paragraph_text']
    if len(context.split()) < 8:
        continue  # Skip short contexts

    # Format for question generation
    input_text = f"generate question: {context}"
    try:
        question = qg(input_text, max_length=64, do_sample=False)[0]['generated_text']
    except Exception as e:
        continue

    # Build positive and negative context sets
    positive = {"text": context}
    negatives = [
        {"text": random.choice(metadata)["paragraph_text"]}
        for _ in range(3)
    ]

    eval_data.append({
        "query": question,
        "positive_ctxs": [positive],
        "negative_ctxs": negatives
    })

# Save to eval_data.json
with open("eval_data.json", "w") as f:
    json.dump(eval_data, f, indent=2)

print(f"✅ Saved {len(eval_data)} query examples to eval_data.json")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=64) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=64) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_toke

✅ Saved 21 query examples to eval_data.json


In [3]:
!pip install jsonlines -q
import pickle, json, random


In [9]:
# 📚 Harry Potter Dense Retriever Evaluation Notebook
# ✅ Generates eval_data.json with:
#   - Real trivia questions
#   - Multiple correct answers per query
#   - Positive & negative contexts
#   - Retrieval scoring (Hit@1, MRR@10)

!pip install faiss-cpu transformers datasets -q

import json, random, pickle
import faiss
import numpy as np
import torch
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from datasets import Dataset

# === 1. Load FAISS index and metadata ===
with open("harry_dense_index.pkl", "rb") as f:
    metadata = pickle.load(f)
index = faiss.read_index("harry_dense_index.faiss")

# === 2. Load question encoder ===
tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
device = model.device

# === 3. Trivia Questions + Answers ===
questions_answers = {
    "Who killed Dumbledore?": ["Severus Snape", "Snape"],
    "What are the four houses of Hogwarts?": ["Gryffindor", "Slytherin", "Ravenclaw", "Hufflepuff"],
    "Who gave Harry his first birthday cake?": ["Hagrid"],
    "What kind of creature is Hedwig?": ["Owl", "Snowy owl"],
    "What wand does Voldemort use?": ["Yew", "13 and a half inches", "Phoenix feather"],
    "What is a Patronus?": ["Magical guardian", "Charm to repel Dementors"],
    "Who is the Half-Blood Prince?": ["Snape", "Severus Snape"],
    "What does the Marauder's Map show?": ["Everyone's location", "Hogwarts map"],
    "What is a Horcrux?": ["Object containing part of soul"],
    "Who is Harry Potter's godfather?": ["Sirius Black"],
    "What is the name of Harry's owl?": ["Hedwig"],
    "Who teaches Potions at Hogwarts?": ["Severus Snape", "Horace Slughorn"],
    "Who is the headmaster of Hogwarts for most of the series?": ["Albus Dumbledore"],
    "What platform does the Hogwarts Express leave from?": ["Platform 9 and 3/4", "Nine and three-quarters"],
    "What spell is used to disarm an opponent?": ["Expelliarmus"],
    "What spell produces light?": ["Lumos"],
    "What is the name of Hagrid's pet dragon?": ["Norbert", "Norberta"],
    "What position does Harry play in Quidditch?": ["Seeker"],
    "Who is the house-elf that serves the Malfoy family?": ["Dobby"],
    "Who kills Bellatrix Lestrange?": ["Molly Weasley"],
    "What is the name of Voldemort's snake?": ["Nagini"],
    "What object turns back time?": ["Time-Turner"],
    "Who is the author of 'A History of Magic'?": ["Bathilda Bagshot"],
    "Who was Harry's first kiss?": ["Cho Chang"],
    "What is the name of the Weasleys' home?": ["The Burrow"],
    "What vault number was the Philosopher's Stone kept in?": ["713"],
    "Who is the Half-Giant friend of Harry?": ["Hagrid"],
    "What does the spell 'Alohomora' do?": ["Opens locked doors"],
    "Who is the ghost of Gryffindor House?": ["Nearly Headless Nick"],
    "What does the spell 'Expecto Patronum' do?": ["Summons a Patronus", "Repels Dementors"]
}

# === 4. Embed query ===
def encode_question(q):
    inputs = tokenizer(q, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        return model(**inputs).pooler_output.cpu().numpy()

# === 5. Build dataset with all matching positives ===
eval_data = []

for question, answers in questions_answers.items():
    q_vec = encode_question(question)
    _, I = index.search(q_vec, 25)
    top_matches = [metadata[i]["paragraph_text"] for i in I[0]]

    # Collect all passages that contain any correct answer
    positive_ctxs = [
        {"text": text} for text in top_matches if any(ans.lower() in text.lower() for ans in answers)
    ]
    if not positive_ctxs:
        continue

    # Filter out positive matches from top matches, keep rest as hard negatives
    neg_texts = [text for text in top_matches if text not in [ctx["text"] for ctx in positive_ctxs]]
    negative_ctxs = [{"text": text} for text in neg_texts[:3]]  # cap at 3

    eval_data.append({
        "query": question,
        "answers": answers,
        "positive_ctxs": positive_ctxs,
        "negative_ctxs": negative_ctxs
    })

# === 6. Save to JSON ===
with open("eval_data_multi3.json", "w") as f:
    json.dump(eval_data, f, indent=2)

# === 7. Convert to Hugging Face Dataset ===
hf_data = Dataset.from_list(eval_data)
hf_data.save_to_disk("eval_hf_dataset")

# === 8. Evaluate Retriever (Hit@1, MRR@10) ===
def compute_metrics():
    hits, rr = 0, 0
    for item in eval_data:
        q_vec = encode_question(item["query"])
        _, I = index.search(q_vec, 10)
        retrieved = [metadata[i]["paragraph_text"] for i in I[0]]

        found = False
        for rank, para in enumerate(retrieved):
            if any(ans.lower() in para.lower() for ans in item["answers"]):
                if rank == 0:
                    hits += 1
                rr += 1 / (rank + 1)
                found = True
                break
        if not found:
            rr += 0

    n = len(eval_data)
    print(f"Hit@1: {hits}/{n} = {hits/n:.2f}")
    print(f"MRR@10: {rr/n:.2f}")

compute_metrics()


Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Saving the dataset (0/1 shards):   0%|          | 0/17 [00:00<?, ? examples/s]

Hit@1: 6/17 = 0.35
MRR@10: 0.47
