In [1]:
# 📦 Install dependencies###
!pip install transformers datasets faiss-cpu
# 📦 Install jsonlines
!pip install jsonlines
import torch
import faiss
import numpy as np
import pickle
import re
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0
Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [2]:
# 📁 Step 1: Download dataset.jsonl from GitHub
import requests

dataset_url = "https://raw.githubusercontent.com/fubotz/IR_2025S/main/data/processed/dataset.jsonl"
local_path = "dataset.jsonl"

r = requests.get(dataset_url)
with open(local_path, "w", encoding="utf-8") as f:
    f.write(r.text)

print("✅ Downloaded dataset.jsonl")


✅ Downloaded dataset.jsonl


In [3]:
# 📄 Step 2: Load JSONL into Hugging Face Dataset
import jsonlines
from datasets import Dataset

data = []
with jsonlines.open("dataset.jsonl") as reader:
    for obj in reader:
        data.append(obj)

dataset = Dataset.from_list(data)

print(f"✅ Loaded dataset with {len(dataset)} chapters")
print(dataset[0])

✅ Loaded dataset with 198 chapters
{'chapter_id': '1_1', 'book': "HP 1 - Harry Potter and the Sorcerer's Stone", 'book_number': 1, 'chapter_str_number': 'CHAPTER ONE', 'chapter_int_number': 1, 'chapter_title': 'THE BOY WHO LIVED', 'text': 'Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, b

In [4]:
class DenseRetrieverFAISS: #try with spacy to encompass entities split
    def __init__(self,
                 model_name="facebook/dpr-ctx_encoder-single-nq-base",
                 question_model_name="facebook/dpr-question_encoder-single-nq-base"):

        self.model_name = model_name
        self.question_model_name = question_model_name
        self.embedding_dim = 768

        # 🔌 Detect GPU
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"🖥️ Using device: {self.device}")

        # Load models on the selected device
        print("🤖 Loading DPR context encoder...")
        self.ctx_encoder = DPRContextEncoder.from_pretrained(model_name).to(self.device)
        self.ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(model_name)

        print("🤖 Loading DPR question encoder...")
        self.q_encoder = DPRQuestionEncoder.from_pretrained(question_model_name).to(self.device)
        self.q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(question_model_name)

        self.faiss_index = None
        self.paragraph_metadata = []

    def _split_into_paragraphs(self, text):
        import re
        sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z“"])', text.strip())
        return [s.strip().replace('\n', ' ') for s in sentences if len(s.strip()) > 20]

    def _encode_text(self, texts, batch_size=16):
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            inputs = self.ctx_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.device)
##test
            with torch.no_grad():
                outputs = self.ctx_encoder(**inputs)
                batch_embeddings = outputs.pooler_output.cpu().numpy()  # Move back to CPU for FAISS
                embeddings.append(batch_embeddings)
        return np.vstack(embeddings)

    def build_index(self, dataset, top_k=5):
        all_paragraphs = []
        metadata = []

        for example in dataset:
            paragraphs = self._split_into_paragraphs(example["text"])
            for idx, paragraph in enumerate(paragraphs):
                all_paragraphs.append(paragraph)
                metadata.append({
                    "chapter_id": example["chapter_id"],
                    "book": example["book"],
                    "chapter_title": example["chapter_title"],
                    "paragraph_idx": idx,
                    "paragraph_text": paragraph
                })

        print(f"📝 Created {len(all_paragraphs)} paragraphs from {len(dataset)} chapters")
        embeddings = self._encode_text(all_paragraphs)
        faiss.normalize_L2(embeddings)
        self.faiss_index = faiss.IndexFlatIP(self.embedding_dim)
        self.faiss_index.add(embeddings)
        self.paragraph_metadata = metadata
        print(f"✅ FAISS index built with {self.faiss_index.ntotal} vectors")

    def save_index(self, base_path="dense_index"):
        faiss.write_index(self.faiss_index, f"{base_path}.faiss")
        with open(f"{base_path}.pkl", "wb") as f:
            pickle.dump(self.paragraph_metadata, f)
        print(f"💾 Saved FAISS index and metadata to '{base_path}.faiss' and '{base_path}.pkl'")


In [5]:
# 🔧 Step 4: Build and save the dense index
retriever = DenseRetrieverFAISS()
retriever.build_index(dataset)
retriever.save_index("harry_dense_index")


🖥️ Using device: cuda
🤖 Loading DPR context encoder...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


🤖 Loading DPR question encoder...


config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

📝 Created 48956 paragraphs from 198 chapters
✅ FAISS index built with 48956 vectors
💾 Saved FAISS index and metadata to 'harry_dense_index.faiss' and 'harry_dense_index.pkl'


In [6]:
# 📦 Install jsonlines
!pip install jsonlines



In [11]:
import pickle

# Adjust the path as needed
with open("harry_dense_index.pkl", "rb") as f:
    metadata = pickle.load(f)

# View first few entries
for entry in metadata[:20]:
    print(entry)

{'chapter_id': '1_1', 'book': "HP 1 - Harry Potter and the Sorcerer's Stone", 'chapter_title': 'THE BOY WHO LIVED', 'paragraph_idx': 0, 'paragraph_text': 'Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.'}
{'chapter_id': '1_1', 'book': "HP 1 - Harry Potter and the Sorcerer's Stone", 'chapter_title': 'THE BOY WHO LIVED', 'paragraph_idx': 1, 'paragraph_text': 'They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.'}
{'chapter_id': '1_1', 'book': "HP 1 - Harry Potter and the Sorcerer's Stone", 'chapter_title': 'THE BOY WHO LIVED', 'paragraph_idx': 2, 'paragraph_text': 'Dursley was the director of a firm called Grunnings, which made drills.'}
{'chapter_id': '1_1', 'book': "HP 1 - Harry Potter and the Sorcerer's Stone", 'chapter_title': 'THE BOY WHO LIVED', 'paragraph_idx': 3, 'paragraph_text': 'He was a big, beefy man with hardly any neck, althou