In [1]:
!pip install -U transformers sentence-transformers faiss-cpu pandas nltk datasets dspy

# mount to drive
from google.colab import drive
drive.mount('/content/drive')

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting pandas
  Downloading pandas-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dspy
  Downloading dspy-2.6.27-py3-none-any.whl.metadata (7.0 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting backoff>=2.2 (from dspy)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting ujson>=5.8.0 (from dspy)
  Downloading ujson-5.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting optuna>=3.4.0 (from dspy)
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
C

In [2]:
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer, AutoModelForCausalLM, AutoModel
import torch
import os
import json
from tqdm import tqdm
from huggingface_hub import login
import numpy as np
from collections import defaultdict
import random

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === ColBERT setup ===
colbert_tokenizer = AutoTokenizer.from_pretrained("colbert-ir/colbertv2.0")
colbert_model = AutoModel.from_pretrained("colbert-ir/colbertv2.0").to(device)
colbert_model.eval()

# === Hugging Face auth ===
login("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")

# === Dataset ===
dataset = load_dataset("hotpot_qa", "fullwiki", trust_remote_code=True)
train_dataset = dataset['train'][:25000]
# LeReT uses around 25K items from their dataset.


query_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", trust_remote_code=True)
query_tokenizer.pad_token = query_tokenizer.eos_token

query_generator = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
query_generator.eval()


'''
query_tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-base')
query_generator = T5ForConditionalGeneration.from_pretrained('google/flan-t5-base')
query_generator.eval()
'''

fewshot_ex_path = 'drive/MyDrive/c438_project/fewshot_examples.json'

with open(fewshot_ex_path, 'r') as f:
    FEWSHOT_EXAMPLES = json.load(f)

print(f"Loaded {len(FEWSHOT_EXAMPLES)} few-shot examples.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.19k [00:00<?, ?B/s]

hotpot_qa.py:   0%|          | 0.00/6.42k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/566M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/47.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/90447 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7405 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7405 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Loaded 100 few-shot examples.


In [24]:
train_dataset = dataset['train'][:25000]

In [26]:
print((len(train_dataset['question'])))

25000


In [29]:
def build_fewshot_prompt(question, context=""):
    num_fewshots = random.randint(1, 3)
    fewshots = random.sample(FEWSHOT_EXAMPLES, num_fewshots)

    fewshot_str = "Examples:\n"
    for ex in fewshots:
        fewshot_str += f"Question:{ex['question']}\nQuery:{ex['query']}\n\n"

    context_str = f"Context:\n{context}\n\n" if context else ""

    # Avoid pattern that triggers extra completions
    task_str = f"Generate a search query for the following question:\n{question}"

    return f"{fewshot_str}{context_str}{task_str}"


# === Embedding utility ===
def compute_colbert_embeddings(texts):
    encoded = colbert_tokenizer(
        texts,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        output = colbert_model(**encoded).last_hidden_state
    masks = encoded["attention_mask"].bool()
    return [output[i][masks[i]].cpu().numpy() for i in range(len(texts))]

# === Scoring utility ===
def maxsim_score(query_emb, doc_embs):
    return float((torch.matmul(query_emb, doc_embs.T)).max(dim=1).values.sum())

def compute_ap_recall(supporting_pairs, retrieved_ids, sentence_metadata):
    retrieved_pairs = {
        (sentence_metadata[i]["title"], sentence_metadata[i]["sent_idx"]) for i in retrieved_ids
    }
    hits = [1 if (sentence_metadata[i]["title"], sentence_metadata[i]["sent_idx"]) in supporting_pairs else 0 for i in retrieved_ids]
    ap = sum(hits[i] / (i + 1) for i in range(len(hits)) if hits[i]) / max(sum(hits), 1)
    recall = sum(hits) / len(supporting_pairs) if supporting_pairs else 0
    return ap, recall

# === Params ===
NUM_HOPS = 2
NUM_QUERIES = 5
TOP_K = 5

preference_dataset = {}

for idx in tqdm(range(5000)):
    sample = {k: train_dataset[k][idx] for k in train_dataset.keys()}
    question = sample['question']
    supporting_facts = sample['supporting_facts']

    preference_dataset[question] = {
        "question": question,
        "hops": {}
    }

    # Flatten context
    context_titles = sample['context']['title']
    context_sentences_grouped = sample['context']['sentences']
    flattened_sentences = []
    sentence_metadata = []
    for title, sentences in zip(context_titles, context_sentences_grouped):
        for i, sent in enumerate(sentences):
            flattened_sentences.append(sent)
            sentence_metadata.append({"title": title, "sent_idx": i})
    context_embeddings = compute_colbert_embeddings(flattened_sentences)
    vector_store = {
        "id": list(range(len(flattened_sentences))),
        "embeddings": [torch.tensor(emb, dtype=torch.float32) for emb in context_embeddings]
    }

    current_context = ""  # No context for the first hop

    for hop in range(NUM_HOPS):

        query_tokenizer.padding_side = "left"  # Required for decoder-only models

        # Generate few-shot prompts
        prompts = [build_fewshot_prompt(question, context=current_context) for _ in range(NUM_QUERIES)]

        # Tokenize prompts
        inputs = query_tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(query_generator.device)

        # Save input_ids *before* generate() call
        input_ids = inputs["input_ids"]

        # Get the true prompt lengths
        prompt_lengths = inputs["attention_mask"].sum(dim=1).tolist()

        # Generate
        outputs = query_generator.generate(
            input_ids=input_ids,
            attention_mask=inputs["attention_mask"],
            max_new_tokens=12,
            do_sample=True,
            top_p=0.9,
            temperature=0.8,
            num_return_sequences=1,
            pad_token_id=query_tokenizer.eos_token_id,
            return_dict_in_generate=True,
            output_scores=False
        )

        # Decode new tokens only
        generated_sequences = outputs.sequences  # shape: (NUM_QUERIES, total_len)

        queries = []
        for i in range(NUM_QUERIES):
            full_output_ids = generated_sequences[i]
            generated_ids = full_output_ids[prompt_lengths[i]:]  # skip prompt tokens
            decoded = query_tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
            queries.append(decoded)


        # === Batched embedding of queries ===
        query_embeddings = compute_colbert_embeddings(queries)
        query_embeddings = [torch.tensor(q, dtype=torch.float32) for q in query_embeddings]

        # === Per-query MaxSim scoring ===
        scores_per_query = []
        for q_emb in query_embeddings:
            scores = [maxsim_score(q_emb, doc_emb) for doc_emb in vector_store["embeddings"]]
            scores_per_query.append(scores)

        # Score queries and collect preference data
        scored = []
        supporting_pairs = set(zip(supporting_facts['title'], supporting_facts['sent_id']))
        for q, scores in zip(queries, scores_per_query):
            top_indices = np.argsort(scores)[-TOP_K:][::-1]
            ap, recall = compute_ap_recall(supporting_pairs, top_indices, sentence_metadata)
            retrieved_context = [flattened_sentences[i] for i in top_indices]
            scored.append({
                "query": q,
                "ap": ap,
                "recall": recall,
                "top_indices": top_indices,
                "retrieved_context": retrieved_context
            })

        # Sort by AP
        scored.sort(key=lambda x: x["ap"], reverse=True)

        ap_to_indices = defaultdict(list)
        for i, item in enumerate(scored):
            ap_to_indices[item["ap"]].append(i)

        sorted_aps = sorted(ap_to_indices.keys())

        preference_pairs = []
        for i in range(len(scored)):
            for j in range(len(scored)):
                if i != j and scored[i]["ap"] > scored[j]["ap"]:
                    preference_pairs.append((i, j))

        preference_dataset[question]["hops"][f"hop_{hop}"] = {
            "queries": [x["query"] for x in scored],
            "aps": [x["ap"] for x in scored],
            "recalls": [x["recall"] for x in scored],
            "preference_pairs": preference_pairs
        }

        current_context = "\n".join(scored[0]["retrieved_context"])

# Save
with open("preference_dataset_hotpotqa.json", "w") as f:
    json.dump(preference_dataset, f, indent=2)

100%|██████████| 5000/5000 [2:12:16<00:00,  1.59s/it]


# HoVeR dataset

In [None]:
hover_dataset = load_dataset("hover-nlp/hover", "fullwiki", trust_remote_code=True)
hover_train = hover_dataset['train'].select(range(10))

In [7]:
import json
import torch
import numpy as np
from tqdm import tqdm
from collections import defaultdict
import dspy
from transformers import AutoTokenizer, AutoModelForCausalLM
from pprint import pprint


retriever_hover = dspy.ColBERTv2(
    url="http://20.102.90.50:2017/wiki17_abstracts"
)

# === Config ===
NUM_HOPS = 4
NUM_QUERIES = 1
TOP_K = 5

# === Output structure ===
preference_dataset = {}


def compute_ap_score_hover(retrieved_titles, gold_titles):
    relevant = [1 if title in gold_titles else 0 for title in retrieved_titles]
    if sum(relevant) == 0:
        return 0.0
    precisions = []
    num_relevant = 0
    for i, is_rel in enumerate(relevant):
        if is_rel:
            num_relevant += 1
            precisions.append(num_relevant / (i + 1))
    ap = sum(precisions) / len(gold_titles)
    print(f'AP: {ap}')
    return ap

# === Main loop ===

for sample in tqdm(hover_train):  # Reduce range for test runs
    claim = sample['claim']
    question = f"Is the following statement supported: {claim}"
    gold_titles = [sf["key"] for sf in sample["supporting_facts"]]

    preference_dataset[question] = {"question": question, "hops": {}}
    current_context = ""

    for hop in range(NUM_HOPS):

        # Build prompt using current context
        if current_context:
          prompt = f"Context:{current_context}\n\nGenerate a search query for the following question.\nQuestion: {question}".strip()
        else:
          prompt = f"Generate a search query for the following question.\nQuestion: {question}".strip()
        input_tokens = query_tokenizer(prompt, return_tensors="pt").to(query_generator.device)

        # Generate NUM_QUERIES queries
        outputs = query_generator.generate(
            **input_tokens,
            max_new_tokens=16,
            do_sample=False,
            num_return_sequences=NUM_QUERIES,
            pad_token_id=query_tokenizer.eos_token_id
        )

        queries = query_tokenizer.batch_decode(outputs, skip_special_tokens=True)
        queries = [q.strip() for q in queries]
        pprint(f'Queries: {queries}')

        scored = []
        for query in queries:
            try:
                retrieved = retriever_hover(query, k=TOP_K)
                long_texts = [doc["long_text"] for doc in retrieved]
                titles = []
                texts = []
                for item in long_texts:
                  item = item.split(" | ")
                  titles.append(item[0].strip())
                  texts.append(item[1].strip())

                pprint(f'Titles: {titles}')
                pprint(f'Texts: {texts}')

                ap = compute_ap_score_hover(titles, gold_titles)
                print(10*'-')

                scored.append({
                    "query": query,
                    "ap": ap,
                    "retrieved_titles": titles,
                    "retrieved_context": texts
                })
            except Exception as e:
                print(f"Error retrieving for query '{query}': {e}")

        if not scored:
            continue

        scored.sort(key=lambda x: x["ap"])
        ap_to_indices = defaultdict(list)
        for i, item in enumerate(scored):
            ap_to_indices[item["ap"]].append(i)
        sorted_aps = sorted(ap_to_indices.keys())

        preference_pairs = []
        for i in range(len(sorted_aps) - 1):
            lower = ap_to_indices[sorted_aps[i]]
            higher = ap_to_indices[sorted_aps[i + 1]]
            for j in higher:
                for k in lower:
                    preference_pairs.append((j, k))

        preference_dataset[question]["hops"][f"hop_{hop}"] = {
            "queries": [x["query"] for x in scored],
            "aps": [x["ap"] for x in scored],
            "retrieved_titles": [x["retrieved_titles"] for x in scored],
            "retrieved_contexts": [x["retrieved_context"] for x in scored],
            "preference_pairs": preference_pairs
        }

        current_context = "\n".join(scored[-1]["retrieved_context"])  # best one

# === Save to JSON ===
with open("hover_preference_dataset.json", "w") as f:
    json.dump(preference_dataset, f, indent=2)
print("Saved preference dataset to hover_preference_dataset.json")


  0%|          | 0/10 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


("Queries: ['Generate a search query for the following question.\\nQuestion: "
 'Is the following statement supported: Skagen Painter Peder Severin Krøyer '
 'favored naturalism along with Theodor Esbern Philipsen and the artist Ossian '
 'Elgström studied with in the early 1900s.?\\nSearch query: (Skagen Painter '
 "OR Peder Severin Krøyer']")


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


('Titles: ["Summer Evening on Skagen\'s Southern Beach", "P. S. Krøyer\'s '
 'paintings of Marie", \'Hip, Hip, Hurrah!\', \'Peder Severin Krøyer\', '
 "'Skagen Painters']")
('Texts: ["Summer Evening on Skagen\'s Southern Beach (Danish: Sommeraften på '
 'Skagen Sønderstrand ) is a painting by Peder Severin Krøyer (1851–1909), '
 'from 1893, and is counted as one of his masterpieces. Krøyer was one of the '
 'most notable members of the Danish artistic community known as the Skagen '
 'Painters. The works of Krøyer often emphasise the special effects of the '
 'Skagen light, with several memorable works depicting beach scenes.", \'Peder '
 'Severin Krøyer painted various portraits of his wife, Marie Krøyer née '
 'Triepcke, a fellow Danish artist who was said to be one of the most '
 'beautiful women in Copenhagen. Norwegian-born Peder had met and painted '
 'Marie in Copenhagen but fell in love with her when they met in Paris in '
 '1889. After a honeymoon in northern Jutland and Italy

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


('Titles: ["Summer Evening on Skagen\'s Southern Beach", \'Men of Skagen on a '
 "Summer Evening in Fair Weather', 'Midsummer Eve Bonfire on Skagen Beach', "
 "'Inger on the Beach', 'Skagen Painters']")
('Texts: ["Summer Evening on Skagen\'s Southern Beach (Danish: Sommeraften på '
 'Skagen Sønderstrand ) is a painting by Peder Severin Krøyer (1851–1909), '
 'from 1893, and is counted as one of his masterpieces. Krøyer was one of the '
 'most notable members of the Danish artistic community known as the Skagen '
 'Painters. The works of Krøyer often emphasise the special effects of the '
 'Skagen light, with several memorable works depicting beach scenes.", \'Men '
 'of Skagen on a Summer Evening in Fair Weather (Danish: "Mænd af Skagen en '
 'sommeraften i godt vejr" ) is an 1848 painting by Martinus Rørbye, one of '
 "the central figures of the Golden Age of Danish Painting.', 'Midsummer Eve "
 'Bonfire on Skagen Beach (Danish: "Sankt Hansblus på Skagen strand" ) is a '
 '1906 painti

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


('Titles: ["Summer Evening on Skagen\'s Southern Beach", \'Men of Skagen on a '
 "Summer Evening in Fair Weather', 'Midsummer Eve Bonfire on Skagen Beach', "
 "'Inger on the Beach', 'Skagen Painters']")
('Texts: ["Summer Evening on Skagen\'s Southern Beach (Danish: Sommeraften på '
 'Skagen Sønderstrand ) is a painting by Peder Severin Krøyer (1851–1909), '
 'from 1893, and is counted as one of his masterpieces. Krøyer was one of the '
 'most notable members of the Danish artistic community known as the Skagen '
 'Painters. The works of Krøyer often emphasise the special effects of the '
 'Skagen light, with several memorable works depicting beach scenes.", \'Men '
 'of Skagen on a Summer Evening in Fair Weather (Danish: "Mænd af Skagen en '
 'sommeraften i godt vejr" ) is an 1848 painting by Martinus Rørbye, one of '
 "the central figures of the Golden Age of Danish Painting.', 'Midsummer Eve "
 'Bonfire on Skagen Beach (Danish: "Sankt Hansblus på Skagen strand" ) is a '
 '1906 painti

 10%|█         | 1/10 [02:36<23:25, 156.17s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


("Queries: ['Context:Summer Evening on Skagen\\'s Southern Beach (Danish: "
 'Sommeraften på Skagen Sønderstrand ) is a painting by Peder Severin Krøyer '
 '(1851–1909), from 1893, and is counted as one of his masterpieces. Krøyer '
 'was one of the most notable members of the Danish artistic community known '
 'as the Skagen Painters. The works of Krøyer often emphasise the special '
 'effects of the Skagen light, with several memorable works depicting beach '
 'scenes.\\nMen of Skagen on a Summer Evening in Fair Weather (Danish: "Mænd '
 'af Skagen en sommeraften i godt vejr" ) is an 1848 painting by Martinus '
 'Rørbye, one of the central figures of the Golden Age of Danish '
 'Painting.\\nMidsummer Eve Bonfire on Skagen Beach (Danish: "Sankt Hansblus '
 'på Skagen strand" ) is a 1906 painting by P.S. Krøyer. The large work, which '
 'took several years to complete, shows many of the artists in the group known '
 "as the Skagen Painters as well as influential members of Skagen\\'s l

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


('Titles: ["Summer Evening on Skagen\'s Southern Beach", "P. S. Krøyer\'s '
 'paintings of Marie", \'Hip, Hip, Hurrah!\', \'Peder Severin Krøyer\', '
 "'Skagen Painters']")
('Texts: ["Summer Evening on Skagen\'s Southern Beach (Danish: Sommeraften på '
 'Skagen Sønderstrand ) is a painting by Peder Severin Krøyer (1851–1909), '
 'from 1893, and is counted as one of his masterpieces. Krøyer was one of the '
 'most notable members of the Danish artistic community known as the Skagen '
 'Painters. The works of Krøyer often emphasise the special effects of the '
 'Skagen light, with several memorable works depicting beach scenes.", \'Peder '
 'Severin Krøyer painted various portraits of his wife, Marie Krøyer née '
 'Triepcke, a fellow Danish artist who was said to be one of the most '
 'beautiful women in Copenhagen. Norwegian-born Peder had met and painted '
 'Marie in Copenhagen but fell in love with her when they met in Paris in '
 '1889. After a honeymoon in northern Jutland and Italy

 10%|█         | 1/10 [03:45<33:48, 225.44s/it]


KeyboardInterrupt: 

In [29]:

from pprint import pprint
print(type(hover_train))
pprint(hover_train[10])

<class 'datasets.arrow_dataset.Dataset'>
{'claim': 'Red, White & Crüe and Mike Tyson both died.',
 'hpqa_id': '5a7d2ddb554299452d57bb48',
 'id': 10,
 'label': 0,
 'num_hops': 2,
 'supporting_facts': [{'key': 'Red, White &amp; Crüe', 'value': 2},
                      {'key': 'Mike Tyson', 'value': 2}],
 'uid': '1632da2c-c6ca-4d12-9545-6469f24ea3a2'}


# Creating the Preference Dataset for IPO

In [37]:
import dspy


retrieved_docs = retriever("Is the following statement supported: Red, White & Crüe and Mike Tyson both died.")

pprint(retrieved_docs)

[{'long_text': 'Red, White &amp; Crüe | Red, White & Crüe is an anthology '
               'album by the heavy metal band Mötley Crüe, released on '
               'February 1, 2005 by Mötley Records and charted at number 6 on '
               "The Billboard 200. To coincide with the album's release, the "
               'band reunited with drummer Tommy Lee, who left the band in '
               "1999. Bassist Nikki Sixx commented on the band's reunion, "
               'comparing it to "seeing Mike Tyson fight. He probably won’t '
               'bite anyone’s ear off — but you’re there in case he does."',
  'pid': 509331,
  'prob': 0.6821404865833579,
  'rank': 1,
  'score': 17.368511199951172,
  'text': 'Red, White &amp; Crüe | Red, White & Crüe is an anthology album by '
          'the heavy metal band Mötley Crüe, released on February 1, 2005 by '
          'Mötley Records and charted at number 6 on The Billboard 200. To '
          "coincide with the album's release, the band re

In [None]:
class PreferenceDataset(Dataset):
    def __init__(self, json_path):

        with open(json_path, 'r') as f:
            raw_data = json.load(f)

        self.data = []
        for question, entry in raw_data.items():
            self.data.append({
                "question": entry["question"],
                "preferred": entry["preferred"],
                "dispreferred": entry["dispreferred"]
            })

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


def retrieve_docs(query: str, index: int) -> dict[str, list[int]]:
    # TODO: connect to ColBERTv2 retriever
    return {
        "Title1": [11.2, 12.5],
        "Title2": [25.4, 17.6],
    }

def score_retrieval(supporting_facts, retrieval_scores):
    """
    supporting_facts: dict with keys 'title' and 'sent_id'
        (e.g., {'title': ["Arthur's Magazine", 'First for Women'], 'sent_id': [0, 0]})

    retrieval_scores: dict where key = title, value = list of retrieval scores
        (e.g., {"Arthur's Magazine": [11.2, 12.5], "First for Women": [25.4, 17.6]})
    """
    # Step 1: Flatten retrievals
    flattened_retrievals = []
    for title, scores in retrieval_scores.items():
        for idx, score in enumerate(scores):
            flattened_retrievals.append((score, title, idx))  # (score, title, sentence id)

    # Step 2: Sort retrievals by descending score
    flattened_retrievals.sort(reverse=True)

    # Step 3: Build the set of gold facts
    gold_set = set(zip(supporting_facts['title'], supporting_facts['sent_id']))

    # Step 4: Go through the ranked list and compute Precision at each relevant hit
    num_relevant = 0
    precisions = []

    for rank, (score, title, sent_id) in enumerate(flattened_retrievals, start=1):
        if (title, sent_id) in gold_set:
            num_relevant += 1
            precisions.append(num_relevant / rank)

    # Step 5: Compute Average Precision (AP)
    if len(gold_set) == 0:
        ap = 0.0
    else:
        ap = sum(precisions) / len(gold_set)

    # Step 6: Compute Recall
    recall = num_relevant / len(gold_set) if len(gold_set) > 0 else 0.0

    return ap, recall

NameError: name 'Dataset' is not defined

# Training Loop

In [None]:
preference_dataset = PreferenceDataset('preference_dataset.json')

tau = 0.05
learning_rate = 1e-5
batch_size = 2
num_epochs = 3

optimizer = AdamW(query_generator.parameters(), lr=learning_rate)
train_loader = DataLoader(preference_dataset, batch_size=batch_size, shuffle=True)

def ipo_loss(logp_w, logp_l, tau=0.05):
    return ((logp_w - logp_l - 0.5 / tau) ** 2).mean()

for epoch in range(num_epochs):
    query_generator.train()
    total_loss = 0

    for batch in train_loader:
        x = batch["question"]
        yw = batch["preferred"]
        yl = batch["dispreferred"]

        x_tokens = query_tokenizer(list(x), return_tensors="pt", padding=True, truncation=True).to(query_generator.device)
        yw_tokens = query_tokenizer(list(yw), return_tensors="pt", padding=True, truncation=True).to(query_generator.device)
        yl_tokens = query_tokenizer(list(yl), return_tensors="pt", padding=True, truncation=True).to(query_generator.device)

        with torch.no_grad():
            ref_logp_yw = query_generator_ref(**x_tokens, labels=yw_tokens.input_ids).loss
            ref_logp_yl = query_generator_ref(**x_tokens, labels=yl_tokens.input_ids).loss

        logp_yw = query_generator(**x_tokens, labels=yw_tokens.input_ids).loss
        logp_yl = query_generator(**x_tokens, labels=yl_tokens.input_ids).loss

        logp_yw = -logp_yw
        logp_yl = -logp_yl
        ref_logp_yw = -ref_logp_yw
        ref_logp_yl = -ref_logp_yl

        log_ratio_yw = logp_yw - ref_logp_yw
        log_ratio_yl = logp_yl - ref_logp_yl

        loss = ipo_loss(log_ratio_yw, log_ratio_yl, tau=tau)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} - Loss: {total_loss / len(train_loader):.4f}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

ValueError: You are trying to offload the whole model to the disk. Please use the `disk_offload` function instead.