In [None]:
!pip install -U transformers sentence-transformers faiss-cpu pandas nltk datasets dspy



In [None]:
import copy
import json
import os
import random
from collections import defaultdict

import numpy as np
import torch
from datasets import load_dataset
from huggingface_hub import login
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
)

# Seed every RNG the notebook draws from (few-shot sampling, numpy, torch
# sampling during generation) so runs are repeatable.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === ColBERT setup ===
colbert_tokenizer = AutoTokenizer.from_pretrained("colbert-ir/colbertv2.0")
colbert_model = AutoModel.from_pretrained("colbert-ir/colbertv2.0").to(device)
colbert_model.eval()

# === Hugging Face auth ===
# SECURITY: never embed an access token in the notebook. The token is read
# from the HF_TOKEN environment variable; when it is unset, login() falls back
# to its interactive prompt. Any token that was ever committed in this cell
# must be revoked and replaced.
login(token=os.environ.get("HF_TOKEN"))

# === Dataset ===
dataset = load_dataset("hotpot_qa", "fullwiki", trust_remote_code=True)
# LeReT uses around 25K items from their dataset.
# Slicing the split yields a dict of columns, which the generation loop below
# indexes as train_dataset[column][row].
train_dataset = dataset['train'][:25000]

# === Query generator ===
# To experiment with a seq2seq generator instead, load
# T5Tokenizer / T5ForConditionalGeneration from 'google/flan-t5-base' here.
query_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", trust_remote_code=True)
# Reuse EOS as the padding token so batched prompts can be padded.
query_tokenizer.pad_token = query_tokenizer.eos_token

query_generator = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
query_generator.eval()

# === Few-shot examples ===
fewshot_ex_path = 'fewshot_examples/fewshot_examples.json'

with open(fewshot_ex_path, 'r') as f:
    FEWSHOT_EXAMPLES = json.load(f)

print(f"Loaded {len(FEWSHOT_EXAMPLES)} few-shot examples.")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loaded 100 few-shot examples.


# HotPotQA Helper Functions

In [None]:
def build_fewshot_prompt(question, context="", add_fewshot=False):
    """Assemble the query-generation prompt for one question.

    Between one and three entries are drawn at random from FEWSHOT_EXAMPLES
    and rendered as "Question:/Query:" pairs, followed by an optional context
    section and finally the instruction containing ``question``.

    Args:
        question: The question the search query should be generated for.
        context: Text retrieved on an earlier hop; the context section is
            omitted when this is empty.
        add_fewshot: Accepted for call-site compatibility but not consulted;
            few-shot examples are always included.

    Returns:
        The full prompt as a single string.
    """
    shot_count = random.randint(1, 3)
    chosen_shots = random.sample(FEWSHOT_EXAMPLES, shot_count)

    pieces = ["Examples:\n"]
    pieces.extend(
        f"Question:{shot['question']}\nQuery:{shot['query']}\n\n" for shot in chosen_shots
    )
    if context:
        pieces.append(f"Context:\n{context}\n\n")
    # The instruction deliberately avoids a pattern that triggers extra completions.
    pieces.append(f"Generate a search query for the following question:\n{question}")

    return "".join(pieces)


# === Embedding utility ===
def compute_colbert_embeddings(texts):
    """Encode a list of texts into per-token embedding matrices.

    The texts are tokenized as one batch (truncated to 512 tokens), passed
    through ``colbert_model`` without gradients, and the padding positions are
    dropped from each row of the last hidden state.

    Args:
        texts: List of strings to embed.

    Returns:
        A list with one numpy array per text, each holding the hidden states
        of that text's non-padding tokens. An empty input yields ``[]``.
    """
    # Nothing to encode; return before touching the tokenizer.
    if not texts:
        return []

    encoded = colbert_tokenizer(
        texts,
        max_length=512,
        # Pad only to the longest text in the batch. Padding positions are
        # masked out below, so padding every row to 512 tokens only added
        # compute and memory.
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        output = colbert_model(**encoded).last_hidden_state
    masks = encoded["attention_mask"].bool()
    return [output[i][masks[i]].cpu().numpy() for i in range(len(texts))]

# === Scoring utility ===
def maxsim_score(query_emb, doc_embs):
    """Late-interaction (MaxSim) score between a query and one document.

    For every query token the maximum dot product against all document tokens
    is taken, and those maxima are summed.

    Args:
        query_emb: (num_query_tokens, dim) numpy array or torch tensor.
        doc_embs: (num_doc_tokens, dim) numpy array or torch tensor.

    Returns:
        The score as a Python float.
    """
    # Callers pass a mix of numpy arrays and tensors (possibly on different
    # devices); normalise both to float32 tensors on the document's device so
    # the matmul is always valid.
    doc = torch.as_tensor(doc_embs, dtype=torch.float32)
    query = torch.as_tensor(query_emb, dtype=torch.float32, device=doc.device)
    return float(torch.matmul(query, doc.T).max(dim=1).values.sum())

def compute_ap_recall(supporting_pairs, retrieved_ids, sentence_metadata):
    """Average precision and recall of a ranked list of retrieved sentences.

    Args:
        supporting_pairs: Set of gold ``(title, sent_idx)`` pairs.
        retrieved_ids: Ranked indices into ``sentence_metadata``, best first.
        sentence_metadata: List of dicts with ``"title"`` and ``"sent_idx"``.

    Returns:
        ``(ap, recall)``. AP averages precision@rank over the ranks at which a
        gold sentence was retrieved (0.0 when nothing relevant was retrieved);
        recall is the fraction of gold pairs retrieved (0 when there are no
        gold pairs).
    """
    hits = [
        1 if (sentence_metadata[i]["title"], sentence_metadata[i]["sent_idx"]) in supporting_pairs else 0
        for i in retrieved_ids
    ]

    num_hits = 0
    precision_sum = 0.0
    for rank, hit in enumerate(hits, start=1):
        if hit:
            num_hits += 1
            # Precision at this rank counts every hit seen so far, not just
            # the current one (which would reduce to a reciprocal-rank sum).
            precision_sum += num_hits / rank

    ap = precision_sum / max(num_hits, 1)
    recall = num_hits / len(supporting_pairs) if supporting_pairs else 0
    return ap, recall

def calculate_f1(precision, recall):
    """Return the harmonic mean of ``precision`` and ``recall``.

    Yields 0.0 when the two values sum to zero, avoiding a division by zero.
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return (2 * precision * recall) / denominator

In [None]:
# === Params ===
NUM_HOPS = 2
NUM_QUERIES = 5
TOP_K = 5
NUM_TRAIN_QUESTIONS = 5000  # how many of the loaded training questions to process

# Keyed by question text: a repeated question overwrites its earlier entry.
preference_dataset = {}

# Decoder-only models must be left-padded so that generation continues
# directly after the last real prompt token of every row.
query_tokenizer.padding_side = "left"
# Decoder-only models echo the (padded) prompt in front of the new tokens;
# encoder-decoder models return only the generated tokens.
strip_prompt = not query_generator.config.is_encoder_decoder

for idx in tqdm(range(NUM_TRAIN_QUESTIONS)):
    sample = {k: train_dataset[k][idx] for k in train_dataset.keys()}
    question = sample['question']
    supporting_facts = sample['supporting_facts']
    supporting_pairs = set(zip(supporting_facts['title'], supporting_facts['sent_id']))

    preference_dataset[question] = {
        "question": question,
        "hops": {}
    }

    # Flatten the per-title sentence lists into one retrieval corpus, keeping
    # (title, sentence index) so hits can be matched to the supporting facts.
    flattened_sentences = []
    sentence_metadata = []
    for title, sentences in zip(sample['context']['title'], sample['context']['sentences']):
        for i, sent in enumerate(sentences):
            flattened_sentences.append(sent)
            sentence_metadata.append({"title": title, "sent_idx": i})
    context_embeddings = compute_colbert_embeddings(flattened_sentences)
    vector_store = {
        "id": list(range(len(flattened_sentences))),
        "embeddings": [torch.tensor(emb, dtype=torch.float32) for emb in context_embeddings]
    }

    current_context = ""  # No context for the first hop

    for hop in range(NUM_HOPS):

        # One independently sampled few-shot prompt per candidate query.
        prompts = [build_fewshot_prompt(question, context=current_context, add_fewshot=True) for _ in range(NUM_QUERIES)]

        inputs = query_tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(query_generator.device)

        outputs = query_generator.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=12,
            do_sample=True,
            top_p=0.9,
            temperature=0.8,
            num_return_sequences=1,
            pad_token_id=query_tokenizer.eos_token_id,
            return_dict_in_generate=True,
            output_scores=False
        )

        # Decode new tokens only. With left padding every row's prompt spans
        # the full padded input width, so the cut point is that width, not the
        # per-row count of non-padding tokens (which would leak the tail of
        # the prompt into the decoded query for shorter prompts).
        generated_sequences = outputs.sequences  # shape: (NUM_QUERIES, total_len)
        if strip_prompt:
            generated_sequences = generated_sequences[:, inputs["input_ids"].shape[1]:]
        queries = [
            text.strip()
            for text in query_tokenizer.batch_decode(generated_sequences, skip_special_tokens=True)
        ]

        # === Batched embedding of queries ===
        query_embeddings = [
            torch.tensor(q, dtype=torch.float32) for q in compute_colbert_embeddings(queries)
        ]

        # === Per-query MaxSim scoring and retrieval metrics ===
        scored = []
        for q, q_emb in zip(queries, query_embeddings):
            scores = [maxsim_score(q_emb, doc_emb) for doc_emb in vector_store["embeddings"]]
            top_indices = np.argsort(scores)[-TOP_K:][::-1]
            ap, recall = compute_ap_recall(supporting_pairs, top_indices, sentence_metadata)
            scored.append({
                "query": q,
                "ap": ap,
                "recall": recall,
                "top_indices": top_indices,
                "retrieved_context": [flattened_sentences[i] for i in top_indices]
            })

        # Best query first.
        scored.sort(key=lambda x: x["ap"], reverse=True)

        # (preferred, dispreferred) index pairs into the sorted query list:
        # every pair in which the first query has strictly higher AP.
        preference_pairs = [
            (i, j)
            for i in range(len(scored))
            for j in range(len(scored))
            if scored[i]["ap"] > scored[j]["ap"]
        ]

        preference_dataset[question]["hops"][f"hop_{hop}"] = {
            "queries": [x["query"] for x in scored],
            "aps": [x["ap"] for x in scored],
            "recalls": [x["recall"] for x in scored],
            "preference_pairs": preference_pairs
        }

        # The best query's retrieved sentences become the next hop's context.
        current_context = "\n".join(scored[0]["retrieved_context"])

# Save
with open("preference_dataset_hotpotqa.json", "w") as f:
    json.dump(preference_dataset, f, indent=2)

# Evaluation with HotPotQA

In [None]:
def _num_eval_samples(eval_dataset):
    """Row count for either a dict of columns or a row-indexable dataset."""
    if isinstance(eval_dataset, dict):
        return len(next(iter(eval_dataset.values()), []))
    return len(eval_dataset)


def _get_eval_sample(eval_dataset, idx):
    """Return row ``idx`` as a dict for either supported dataset layout."""
    if isinstance(eval_dataset, dict):
        return {key: column[idx] for key, column in eval_dataset.items()}
    return eval_dataset[idx]


def evaluate_hotpotqa(
    eval_dataset,
    query_generator,
    query_tokenizer,
    colbert_model,
    colbert_tokenizer,
    num_hops=2,
    num_queries_per_hop=1, # For evaluation, we typically generate one "best" query per hop
    top_k_retrieval=5,
    max_new_tokens=20, # Allow more tokens for potentially longer queries
    device=None
):
    """Evaluate multi-hop query generation by retrieval quality on HotpotQA.

    For each question the context sentences are embedded once; then, for each
    hop, one query is generated greedily, the top-k sentences are retrieved
    with MaxSim scoring, and AP / recall / precision@k / F1 are computed
    against the supporting facts. The retrieved sentences become the context
    of the next hop. Metrics are averaged over every (question, hop) pair
    that produced a non-empty query.

    Args:
        eval_dataset: Either a dict of columns (``{"question": [...], ...}``)
            or a dataset indexable by row that returns a dict per row.
        query_generator: Model exposing ``generate`` and ``device``.
        query_tokenizer: Tokenizer matching ``query_generator``.
        colbert_model: Not used directly; embeddings are produced by
            ``compute_colbert_embeddings``, which reads the notebook-level
            ColBERT model. Kept for call-site compatibility.
        colbert_tokenizer: Not used directly (see ``colbert_model``).
        num_hops: Number of retrieval hops per question.
        num_queries_per_hop: Currently unused; one query is generated per hop.
        top_k_retrieval: Number of sentences retrieved per hop.
        max_new_tokens: Generation budget per query.
        device: Device used for scoring tensors. Defaults to CUDA when
            available, otherwise CPU.

    Returns:
        Dict with ``average_ap``, ``average_recall``, ``average_precision``,
        ``average_f1`` and ``detailed_results`` (one entry per evaluated
        question).
    """
    num_samples = _num_eval_samples(eval_dataset)
    print(f"Starting evaluation with {num_samples} samples...")

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Encoder-decoder models return only generated tokens; decoder-only models
    # prepend the prompt, which must be cut off before decoding. Models
    # without a config are treated as encoder-decoder (no slicing).
    is_encoder_decoder = getattr(
        getattr(query_generator, "config", None), "is_encoder_decoder", True
    )

    total_ap = 0.0
    total_recall = 0.0
    total_precision = 0.0 # To calculate F1
    num_evaluated_samples = 0 # Counts (question, hop) pairs that were scored

    all_results = [] # To store detailed results for inspection

    for idx in tqdm(range(num_samples), desc="Evaluating Samples"):
        sample = _get_eval_sample(eval_dataset, idx)
        question = sample['question']
        supporting_facts = sample['supporting_facts']

        # Ground truth supporting pairs for the current question
        ground_truth_supporting_pairs = set(zip(supporting_facts['title'], supporting_facts['sent_id']))

        # Skip questions with no supporting facts, as AP/Recall/F1 are
        # ill-defined. Checked before embedding so no work is wasted on them.
        if not ground_truth_supporting_pairs:
            continue

        # Flatten context
        flattened_sentences = []
        sentence_metadata = []
        for title, sentences in zip(sample['context']['title'], sample['context']['sentences']):
            for i, sent in enumerate(sentences):
                flattened_sentences.append(sent)
                sentence_metadata.append({"title": title, "sent_idx": i})

        if not flattened_sentences:
            continue

        # Compute embeddings for the entire context once
        context_embeddings = compute_colbert_embeddings(flattened_sentences)
        doc_embeddings = [
            torch.tensor(emb, dtype=torch.float32).to(device) for emb in context_embeddings
        ]

        current_context = ""  # No context for the first hop

        # Store results for this question
        question_results = {
            "question": question,
            "ground_truth_supporting_pairs": list(ground_truth_supporting_pairs),
            "hops": []
        }

        for hop in range(num_hops):

            prompt = build_fewshot_prompt(question, context=current_context)

            inputs = query_tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True
            ).to(query_generator.device)

            # Greedy decoding: sampling knobs (top_p / temperature) are
            # omitted because they have no effect when do_sample=False.
            outputs = query_generator.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=max_new_tokens,
                do_sample=False,
                num_return_sequences=1,
                pad_token_id=query_tokenizer.eos_token_id,
                return_dict_in_generate=True,
                output_scores=False
            )

            generated_sequence = outputs.sequences[0]
            if not is_encoder_decoder:
                generated_sequence = generated_sequence[inputs["input_ids"].shape[1]:]
            raw_generated_query = query_tokenizer.decode(generated_sequence, skip_special_tokens=True)
            generated_query = raw_generated_query.strip()

            if not generated_query:
                print(f"Warning: Empty query generated for question: '{question}' hop: {hop}.")
                continue # Skip if empty query

            # === Retrieval and Scoring ===
            query_emb_list = compute_colbert_embeddings([generated_query])
            if not query_emb_list:
                print(f"Warning: No embedding generated for query: '{generated_query}' for question: '{question}' hop: {hop}.")
                continue # Skip if embedding fails

            # compute_colbert_embeddings returns numpy arrays; convert so the
            # query and the documents are tensors on the same device.
            query_emb = torch.tensor(query_emb_list[0], dtype=torch.float32).to(device)
            scores = [maxsim_score(query_emb, doc_emb) for doc_emb in doc_embeddings]

            # Get top_k retrieved document indices, best first
            top_indices = np.argsort(scores)[-top_k_retrieval:][::-1]

            # Calculate AP and Recall for the current hop
            ap, recall = compute_ap_recall(ground_truth_supporting_pairs, top_indices, sentence_metadata)

            # Precision@k: share of retrieved sentences that are supporting facts.
            num_hits = sum(
                1
                for i in top_indices
                if (sentence_metadata[i]["title"], sentence_metadata[i]["sent_idx"]) in ground_truth_supporting_pairs
            )
            precision = num_hits / len(top_indices)
            f1 = calculate_f1(precision, recall)

            # Accumulate scores
            total_ap += ap
            total_recall += recall
            total_precision += precision
            num_evaluated_samples += 1 # Increment for each hop where a valid query was generated

            retrieved_context = [flattened_sentences[i] for i in top_indices]

            question_results["hops"].append({
                "hop": hop,
                "generated_query": generated_query,
                "raw_generated_query": raw_generated_query, # For debugging if needed
                "ap": ap,
                "recall": recall,
                "precision": precision,
                "f1": f1,
                "top_k_retrieved_docs": retrieved_context,
                "top_k_retrieved_ids": top_indices.tolist()
            })

            # Update current_context for the next hop
            current_context = "\n".join(retrieved_context)

        all_results.append(question_results)

    # Calculate average metrics
    avg_ap = total_ap / num_evaluated_samples if num_evaluated_samples > 0 else 0.0
    avg_recall = total_recall / num_evaluated_samples if num_evaluated_samples > 0 else 0.0
    avg_precision = total_precision / num_evaluated_samples if num_evaluated_samples > 0 else 0.0
    avg_f1 = calculate_f1(avg_precision, avg_recall) # F1 based on average P and R

    print("\n--- Evaluation Summary ---")
    print(f"Total Samples Evaluated (hops combined): {num_evaluated_samples}")
    print(f"Average AP: {avg_ap:.4f}")
    print(f"Average Recall: {avg_recall:.4f}")
    print(f"Average Precision: {avg_precision:.4f}")
    print(f"Average F1: {avg_f1:.4f}")
    print("--------------------------")

    return {
        "average_ap": avg_ap,
        "average_recall": avg_recall,
        "average_precision": avg_precision,
        "average_f1": avg_f1,
        "detailed_results": all_results
    }

# Model to evaluate
model_path = "path/to/your/finetuned/flan-t5-small-model"
model_to_eval = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
model_to_eval_tokenizer = AutoTokenizer.from_pretrained(model_path)

# Evaluation split. The `dataset` and ColBERT objects loaded in the setup cell
# are reused rather than loaded a second time. The validation split is used
# because evaluate_hotpotqa skips questions without supporting facts, and the
# public HotpotQA test split is distributed without those labels.
eval_dataset = dataset['validation']

# --- Run Evaluation ---
evaluation_metrics = evaluate_hotpotqa(
    eval_dataset=eval_dataset,
    query_generator=model_to_eval,
    query_tokenizer=model_to_eval_tokenizer,
    colbert_model=colbert_model, # Pass the ColBERT model
    colbert_tokenizer=colbert_tokenizer, # Pass the ColBERT tokenizer
    num_hops=2,           # Keep consistent with your training/preference dataset generation
    num_queries_per_hop=1, # Generate only 1 query per hop for evaluation
    top_k_retrieval=5,    # Keep consistent with your preference dataset generation
    max_new_tokens=20     # Adjust as needed for query length
)

# --- Save Results (Optional) ---
output_filename = "hotpotqa_evaluation_results.json"
with open(output_filename, "w") as f:
    json.dump(evaluation_metrics, f, indent=4)
print(f"\nDetailed evaluation results saved to {output_filename}")

# HoVeR dataset

In [None]:
# Load the HoVer "fullwiki" configuration and keep only the first ten
# training rows as a small working subset.
hover_dataset = load_dataset(
    path="hover-nlp/hover",
    name="fullwiki",
    trust_remote_code=True,
)
hover_train = hover_dataset["train"].select(range(10))

README.md:   0%|          | 0.00/4.13k [00:00<?, ?B/s]

hover.py:   0%|          | 0.00/4.48k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.15M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/899k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
import json
import torch
import numpy as np
from tqdm import tqdm
from collections import defaultdict
import dspy
from transformers import AutoTokenizer, AutoModelForCausalLM
from pprint import pprint


# Remote ColBERTv2 endpoint used as the retriever for the HoVer experiments.
retriever_hover = dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts")

# === Config ===
# Hops per claim, queries generated per hop, passages retrieved per query.
NUM_HOPS, NUM_QUERIES, TOP_K = 4, 1, 5

# === Output structure ===
# Filled by the main loop below: question -> {"question", "hops"}.
preference_dataset = dict()


def compute_ap_score_hover(retrieved_titles, gold_titles):
    """Average precision of a ranked list of titles against the gold titles.

    Args:
        retrieved_titles: Retrieved document titles, best first.
        gold_titles: Gold supporting titles. May contain repeats (one entry
            per supporting sentence); each distinct title counts once.

    Returns:
        Sum of precision@rank over the ranks where a gold title is first
        retrieved, divided by the number of distinct gold titles. 0.0 when
        nothing relevant is retrieved or there are no gold titles.
    """
    # Several supporting sentences can share one title; de-duplicate so the
    # denominator is the number of relevant documents, not sentences.
    gold = set(gold_titles)
    if not gold:
        return 0.0

    already_credited = set()
    num_relevant = 0
    precision_sum = 0.0
    for rank, title in enumerate(retrieved_titles, start=1):
        # Credit each gold title at most once so AP cannot exceed 1.0.
        if title in gold and title not in already_credited:
            already_credited.add(title)
            num_relevant += 1
            precision_sum += num_relevant / rank

    return precision_sum / len(gold)

# === Main loop ===
# For every claim: generate queries hop by hop, retrieve passages for each
# query, score them by AP against the gold titles, and record preference pairs.

for sample in tqdm(hover_train):  # Reduce range for test runs
    claim = sample['claim']
    question = f"Is the following statement supported: {claim}"
    gold_titles = [sf["key"] for sf in sample["supporting_facts"]]

    preference_dataset[question] = {"question": question, "hops": {}}
    current_context = ""

    for hop in range(NUM_HOPS):

        # Build prompt using current context
        if current_context:
            prompt = f"Context:{current_context}\n\nGenerate a search query for the following question.\nQuestion: {question}".strip()
        else:
            prompt = f"Generate a search query for the following question.\nQuestion: {question}".strip()
        input_tokens = query_tokenizer(prompt, return_tensors="pt").to(query_generator.device)

        # Generate NUM_QUERIES queries. Greedy decoding can only return one
        # sequence, so sampling is switched on when several are requested.
        outputs = query_generator.generate(
            **input_tokens,
            max_new_tokens=16,
            do_sample=NUM_QUERIES > 1,
            num_return_sequences=NUM_QUERIES,
            pad_token_id=query_tokenizer.eos_token_id
        )

        # Decoder-only models return prompt + continuation; keep only the
        # continuation so the prompt text is not sent to the retriever.
        if not query_generator.config.is_encoder_decoder:
            outputs = outputs[:, input_tokens["input_ids"].shape[1]:]

        queries = query_tokenizer.batch_decode(outputs, skip_special_tokens=True)
        queries = [q.strip() for q in queries]

        scored = []
        for query in queries:
            # Best effort: a failed retrieval drops this query only.
            try:
                retrieved = retriever_hover(query, k=TOP_K)
                titles = []
                texts = []
                for doc in retrieved:
                    # Passages are formatted "<title> | <text>"; split on the
                    # first separator only so text containing " | " survives.
                    title, _, text = doc["long_text"].partition(" | ")
                    titles.append(title.strip())
                    texts.append(text.strip())

                ap = compute_ap_score_hover(titles, gold_titles)

                scored.append({
                    "query": query,
                    "ap": ap,
                    "retrieved_titles": titles,
                    "retrieved_context": texts
                })
            except Exception as e:
                print(f"Error retrieving for query '{query}': {e}")

        if not scored:
            continue

        # Ascending by AP: the best query ends up last.
        scored.sort(key=lambda x: x["ap"])
        ap_to_indices = defaultdict(list)
        for i, item in enumerate(scored):
            ap_to_indices[item["ap"]].append(i)
        sorted_aps = sorted(ap_to_indices.keys())

        # Pair every query at one AP level with every query at the next
        # lower level, as (preferred, dispreferred) indices into `scored`.
        preference_pairs = []
        for i in range(len(sorted_aps) - 1):
            lower = ap_to_indices[sorted_aps[i]]
            higher = ap_to_indices[sorted_aps[i + 1]]
            for j in higher:
                for k in lower:
                    preference_pairs.append((j, k))

        preference_dataset[question]["hops"][f"hop_{hop}"] = {
            "queries": [x["query"] for x in scored],
            "aps": [x["ap"] for x in scored],
            "retrieved_titles": [x["retrieved_titles"] for x in scored],
            "retrieved_contexts": [x["retrieved_context"] for x in scored],
            "preference_pairs": preference_pairs
        }

        current_context = "\n".join(scored[-1]["retrieved_context"])  # best one


# === Save to JSON ===
with open("hover_preference_dataset.json", "w") as f:
    json.dump(preference_dataset, f, indent=2)
print("Saved preference dataset to hover_preference_dataset.json")


  0%|          | 0/10 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


AP: 0.0625


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 10%|█         | 1/10 [00:01<00:16,  1.89s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


AP: 0.0625


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 20%|██        | 2/10 [00:03<00:15,  1.89s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


AP: 0.0625


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 30%|███       | 3/10 [00:05<00:13,  1.87s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 40%|████      | 4/10 [00:07<00:11,  1.84s/it]The following generation flags are not valid and m

AP: 0.0625


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 50%|█████     | 5/10 [00:09<00:09,  1.85s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


AP: 0.06666666666666667


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


AP: 0.05


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


AP: 0.05


 60%|██████    | 6/10 [00:11<00:07,  1.87s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


AP: 0.05


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 70%|███████   | 7/10 [00:13<00:06,  2.01s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


AP: 0.08333333333333333


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 80%|████████  | 8/10 [00:15<00:04,  2.11s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


AP: 0.05


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 90%|█████████ | 9/10 [00:18<00:02,  2.20s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
100%|██████████| 10/10 [00:20<00:00,  2.05s/it]

Saved preference dataset to hover_preference_dataset.json





In [None]:

from pprint import pprint
# Show the container type of the HoVer subset and pretty-print its first
# record to see the available fields.
print(type(hover_train))
pprint(hover_train[0])

<class 'datasets.arrow_dataset.Dataset'>
{'claim': 'Skagen Painter Peder Severin Krøyer favored naturalism along with '
          'Theodor Esbern Philipsen and the artist Ossian Elgström studied '
          'with in the early 1900s.',
 'hpqa_id': '5ab7a86d5542995dae37e986',
 'id': 0,
 'label': 1,
 'num_hops': 3,
 'supporting_facts': [{'key': 'Kristian Zahrtmann', 'value': 0},
                      {'key': 'Kristian Zahrtmann', 'value': 1},
                      {'key': 'Peder Severin Krøyer', 'value': 1},
                      {'key': 'Ossian Elgström', 'value': 2}],
 'uid': '330ca632-e83f-4011-b11b-0d0158145036'}


# Creating the Preference Dataset for IPO

In [None]:
import dspy


# Query the retriever once with a claim-style question and pretty-print the
# raw passages it returns.
retrieved_docs = retriever_hover("Is the following statement supported: Red, White & Crüe and Mike Tyson both died.")

pprint(retrieved_docs)

[{'long_text': 'Red, White &amp; Crüe | Red, White & Crüe is an anthology '
               'album by the heavy metal band Mötley Crüe, released on '
               'February 1, 2005 by Mötley Records and charted at number 6 on '
               "The Billboard 200. To coincide with the album's release, the "
               'band reunited with drummer Tommy Lee, who left the band in '
               "1999. Bassist Nikki Sixx commented on the band's reunion, "
               'comparing it to "seeing Mike Tyson fight. He probably won’t '
               'bite anyone’s ear off — but you’re there in case he does."',
  'pid': 509331,
  'prob': 0.6821404865833579,
  'rank': 1,
  'score': 17.368511199951172,
  'text': 'Red, White &amp; Crüe | Red, White & Crüe is an anthology album by '
          'the heavy metal band Mötley Crüe, released on February 1, 2005 by '
          'Mötley Records and charted at number 6 on The Billboard 200. To '
          "coincide with the album's release, the band re

In [None]:

from torch.utils.data import Dataset

class PreferenceDataset(Dataset):
    """Flat (question, preferred, dispreferred) examples loaded from JSON.

    Two per-question layouts are accepted:

    * flat: ``{"question", "preferred", "dispreferred"}`` gives one example;
    * hop-structured, as written by the preference-generation cells:
      ``{"question", "hops": {name: {"queries": [...],
      "preference_pairs": [[better_idx, worse_idx], ...]}}}`` gives one
      example per preference pair, with the indices resolved into that hop's
      ``queries`` list.

    Only the question text is kept as the prompt; the retrieved context that
    conditioned later hops is not stored in the JSON.
    """

    def __init__(self, json_path):
        """Read ``json_path`` and expand it into a list of example dicts."""
        with open(json_path, 'r') as f:
            raw_data = json.load(f)

        self.data = []
        for question, entry in raw_data.items():
            prompt = entry.get("question", question)

            if "preferred" in entry and "dispreferred" in entry:
                self.data.append({
                    "question": prompt,
                    "preferred": entry["preferred"],
                    "dispreferred": entry["dispreferred"]
                })
                continue

            # Hop-structured layout: this is the shape that raised
            # KeyError: 'preferred' when read as a flat record.
            for hop in entry.get("hops", {}).values():
                queries = hop["queries"]
                for better_idx, worse_idx in hop.get("preference_pairs", []):
                    self.data.append({
                        "question": prompt,
                        "preferred": queries[better_idx],
                        "dispreferred": queries[worse_idx]
                    })

    def __len__(self):
        """Number of preference examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the example dict at position ``idx``."""
        return self.data[idx]


def retrieve_docs(query: str, index: int) -> dict[str, list[int]]:
    """Placeholder retriever returning fixed per-title sentence scores.

    Neither ``query`` nor ``index`` is used yet; the same two-title mapping is
    returned on every call.
    """
    # TODO: connect to ColBERTv2 retriever
    placeholder_scores = {}
    placeholder_scores["Title1"] = [11.2, 12.5]
    placeholder_scores["Title2"] = [25.4, 17.6]
    return placeholder_scores

def score_retrieval(supporting_facts, retrieval_scores):
    """Compute average precision and recall for scored sentences.

    Args:
        supporting_facts: Dict with parallel lists under ``'title'`` and
            ``'sent_id'`` naming the gold sentences, e.g.
            ``{'title': ["Arthur's Magazine"], 'sent_id': [0]}``.
        retrieval_scores: Dict mapping a title to a list of scores, where the
            position in the list is the sentence id, e.g.
            ``{"Arthur's Magazine": [11.2, 12.5]}``.

    Returns:
        ``(ap, recall)``, both normalised by the number of distinct gold
        sentences; ``(0.0, 0.0)`` when there are none.
    """
    # Rank every (score, title, sentence id) triple, highest score first.
    ranked = sorted(
        (
            (score, title, sent_idx)
            for title, scores in retrieval_scores.items()
            for sent_idx, score in enumerate(scores)
        ),
        reverse=True,
    )

    gold_set = set(zip(supporting_facts['title'], supporting_facts['sent_id']))

    # Walk the ranking and record precision at each position that is a hit.
    hits_so_far = 0
    precisions = []
    for position, (_, title, sent_idx) in enumerate(ranked, start=1):
        if (title, sent_idx) not in gold_set:
            continue
        hits_so_far += 1
        precisions.append(hits_so_far / position)

    if not gold_set:
        return 0.0, 0.0
    return sum(precisions) / len(gold_set), hits_so_far / len(gold_set)

# Training Loop

In [None]:
# NOTE(review): the generation cell writes "preference_dataset_hotpotqa.json";
# confirm that this "_final" file is produced by a separate step.
preference_dataset = PreferenceDataset('preference_dataset_hotpotqa_final.json')

tau = 0.05
learning_rate = 1e-5
batch_size = 2
num_epochs = 3

optimizer = AdamW(query_generator.parameters(), lr=learning_rate)
train_loader = DataLoader(preference_dataset, batch_size=batch_size, shuffle=True)

# Frozen snapshot of the policy taken before any update; IPO measures how far
# the trained policy's preferences move relative to this reference. This
# holds a second copy of the weights in memory.
query_generator_ref = copy.deepcopy(query_generator)
query_generator_ref.eval()
for ref_param in query_generator_ref.parameters():
    ref_param.requires_grad_(False)


def ipo_loss(logp_w, logp_l, tau=0.05):
    """IPO objective: squared distance of the log-ratio gap from 1 / (2 * tau).

    Args:
        logp_w: Policy-minus-reference log-probability of the preferred output.
        logp_l: Same quantity for the dispreferred output.
        tau: Regularisation strength; the target gap is ``0.5 / tau``.

    Returns:
        Scalar tensor, the mean over the batch.
    """
    return ((logp_w - logp_l - 0.5 / tau) ** 2).mean()


def mean_token_logprob(model, tokenizer, prompts, completions):
    """Per-example mean token log-probability of each completion given its prompt.

    Handles both encoder-decoder models (prompt to the encoder, completion as
    labels) and decoder-only models (prompt and completion concatenated, only
    completion positions scored). Padding never contributes to the score.

    Args:
        model: The language model to score with.
        tokenizer: Tokenizer matching ``model``.
        prompts: List of prompt strings.
        completions: List of completion strings, parallel to ``prompts``.

    Returns:
        Tensor of shape (batch,).
    """
    if getattr(model.config, "is_encoder_decoder", False):
        encoder_inputs = tokenizer(
            prompts, return_tensors="pt", padding=True, truncation=True
        ).to(model.device)
        target_ids = tokenizer(completions, truncation=True)["input_ids"]
        # Pad labels by hand with -100 so the result does not depend on the
        # tokenizer's padding side or on pad/eos sharing an id.
        labels = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(ids, dtype=torch.long) for ids in target_ids],
            batch_first=True,
            padding_value=-100,
        ).to(model.device)
        logits = model(**encoder_inputs, labels=labels).logits
    else:
        eos_tail = [tokenizer.eos_token_id] if tokenizer.eos_token_id is not None else []
        pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
        prompt_ids = tokenizer(prompts, truncation=True)["input_ids"]
        completion_ids = tokenizer(
            completions, add_special_tokens=False, truncation=True
        )["input_ids"]
        rows = [(p, c + eos_tail) for p, c in zip(prompt_ids, completion_ids)]
        width = max(len(p) + len(c) for p, c in rows)

        input_ids = torch.full((len(rows), width), pad_id, dtype=torch.long)
        attention_mask = torch.zeros((len(rows), width), dtype=torch.long)
        full_labels = torch.full((len(rows), width), -100, dtype=torch.long)
        for row, (p, c) in enumerate(rows):
            end = len(p) + len(c)
            input_ids[row, :end] = torch.tensor(p + c, dtype=torch.long)
            attention_mask[row, :end] = 1
            # Only completion tokens are scored; prompt positions stay -100.
            full_labels[row, len(p):end] = torch.tensor(c, dtype=torch.long)

        # Position t predicts token t + 1, hence the one-step shift.
        logits = model(
            input_ids=input_ids.to(model.device),
            attention_mask=attention_mask.to(model.device),
        ).logits[:, :-1]
        labels = full_labels[:, 1:]

    labels = labels.to(logits.device)
    scored_positions = labels != -100
    gather_index = labels.masked_fill(~scored_positions, 0).unsqueeze(-1)
    token_logps = torch.log_softmax(logits.float(), dim=-1).gather(-1, gather_index).squeeze(-1)
    return (token_logps * scored_positions).sum(dim=-1) / scored_positions.sum(dim=-1).clamp(min=1)


for epoch in range(num_epochs):
    query_generator.train()
    total_loss = 0

    for batch in train_loader:
        x = list(batch["question"])
        yw = list(batch["preferred"])
        yl = list(batch["dispreferred"])

        with torch.no_grad():
            ref_logp_yw = mean_token_logprob(query_generator_ref, query_tokenizer, x, yw)
            ref_logp_yl = mean_token_logprob(query_generator_ref, query_tokenizer, x, yl)

        logp_yw = mean_token_logprob(query_generator, query_tokenizer, x, yw)
        logp_yl = mean_token_logprob(query_generator, query_tokenizer, x, yl)

        log_ratio_yw = logp_yw - ref_logp_yw.to(logp_yw.device)
        log_ratio_yl = logp_yl - ref_logp_yl.to(logp_yl.device)

        loss = ipo_loss(log_ratio_yw, log_ratio_yl, tau=tau)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # max(..., 1) keeps the report from dividing by zero on an empty dataset.
    print(f"Epoch {epoch+1} - Loss: {total_loss / max(len(train_loader), 1):.4f}")

KeyError: 'preferred'