In [None]:
!pip install -U transformers datasets torch tqdm huggingface_hub numpy

## Preference Dataset Creation with Google T5 Flan

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel
import torch
import json
from tqdm import tqdm
import numpy as np
from collections import defaultdict
import random
import datetime

# Captured once so that every artifact written by this run carries the same stamp.
CURRENT_TIME_STAMP = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Seed the stdlib, NumPy and PyTorch generators with the same value for reproducibility
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(42)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# === Query Generator (T5 Flan) ===
model_path = "google/flan-t5-small"
query_tokenizer = AutoTokenizer.from_pretrained(model_path)
query_generator = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
query_generator.eval()  # inference only

# === ColBERT for retrieval scoring ===
colbert_model = AutoModel.from_pretrained("colbert-ir/colbertv2.0").to(device)
colbert_tokenizer = AutoTokenizer.from_pretrained("colbert-ir/colbertv2.0")
colbert_model.eval()  # inference only

# === Dataset ===
dataset = load_dataset("hotpot_qa", "fullwiki", trust_remote_code=True)
# NOTE(review): DATASET_SPLIT is not referenced anywhere else in the visible notebook;
# the train/validation sizes are fixed by the slices below.
DATASET_SPLIT = 0.9  # 90% for training, 10% for validation
hotpot_train = dataset['train']
# The rest of the notebook treats these slices as column dicts
# (e.g. train_dataset['question'], train_dataset.keys()).
train_dataset = hotpot_train[:5000]  # first 5K samples, for faster processing
val_dataset = hotpot_train[5000:6000]  # next 1K samples, for validation

print(f"Loaded {len(train_dataset['question'])} samples for preference dataset creation")

# Dump dataset into JSONL files for future use
def dump_dataset_to_jsonl(dataset, filename):
    """Write `dataset` to `filename` as JSON Lines (one JSON record per line).

    Two input layouts are supported:

    * a column-oriented dict (column name -> list of values), which is the
      layout of the `train_dataset` / `val_dataset` slices used in this
      notebook; it is transposed so each output line is one sample, and
    * any other iterable, whose items are written one per line as-is.

    Iterating a column dict directly would only yield its keys, so without
    the transposition the output file would contain column names, not samples.

    Args:
        dataset: Column dict or iterable of JSON-serialisable records.
        filename: Destination path; an existing file is overwritten.
    """
    is_column_dict = isinstance(dataset, dict) and all(
        isinstance(column, (list, tuple)) for column in dataset.values()
    )
    if is_column_dict:
        column_names = list(dataset.keys())
        # zip(*columns) walks the columns in lockstep, yielding one row at a time
        rows = (dict(zip(column_names, values)) for values in zip(*dataset.values()))
    else:
        rows = dataset

    with open(filename, 'w') as f:
        for item in rows:
            f.write(json.dumps(item) + '\n')
            
# Build file names that encode the split size and the run timestamp.
# Spaces and colons are replaced so the stamp is safe to use in a file name.
timestamp = "_".join(CURRENT_TIME_STAMP.split(" ")).replace(":", "-")
dataset_size = len(train_dataset['question'])
val_size = len(val_dataset['question'])

train_filename = f"hotpot_train_{dataset_size}samples_{timestamp}.jsonl"
val_filename = f"hotpot_val_{val_size}samples_{timestamp}.jsonl"

# Persist both splits (training first, then validation)
for _split, _target in ((train_dataset, train_filename), (val_dataset, val_filename)):
    dump_dataset_to_jsonl(_split, _target)

print(f"Training dataset saved to: {train_filename}")
print(f"Validation dataset saved to: {val_filename}")

# Core Utility Functions

In [None]:
def compute_colbert_embeddings(texts):
    """Encode a batch of texts into per-token embeddings with the ColBERT checkpoint.

    Args:
        texts: List of strings to encode (queries or context sentences).

    Returns:
        A list with one NumPy array per input text. Each array holds the
        model's last-hidden-state rows for that text's non-padding tokens
        (inputs longer than 512 tokens are truncated). An empty input list
        yields an empty list.
    """
    # Nothing to encode: return early instead of calling the tokenizer on an empty batch.
    if len(texts) == 0:
        return []

    encoded = colbert_tokenizer(
        texts,
        max_length=512,  # Upper bound; longer inputs are truncated
        # Pad only to the longest sequence in this batch. Padded positions are
        # dropped via the attention mask below, so padding every batch to 512
        # tokens only adds compute and memory.
        padding="longest",
        truncation=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        output = colbert_model(**encoded).last_hidden_state

    # NOTE(review): the raw hidden states are returned without any projection
    # or normalisation — confirm that is what the MaxSim scoring expects.
    masks = encoded["attention_mask"].bool()
    return [output[i][masks[i]].cpu().numpy() for i in range(len(texts))]

# Main Preference Dataset Creation Loop

## Scoring and Evaluation Functions

In [None]:
def maxsim_score(query_emb, doc_emb):
    """Late-interaction (MaxSim) score between one query and one document.

    For every query token vector, take its largest dot product with any
    document token vector, then sum those maxima over the query tokens.

    Args:
        query_emb: Array-like of query token vectors (one row per token).
        doc_emb: Array-like of document token vectors (one row per token).

    Returns:
        The score as a Python float.
    """
    q = torch.tensor(query_emb, dtype=torch.float32).to(device)
    d = torch.tensor(doc_emb, dtype=torch.float32).to(device)
    token_similarities = q @ d.T  # rows: query tokens, columns: document tokens
    best_match_per_query_token = token_similarities.max(dim=1).values
    return float(best_match_per_query_token.sum())

def compute_ap_recall(supporting_pairs, retrieved_ids, sentence_metadata):
    """Compute Average Precision and Recall for a ranked retrieval list.

    Args:
        supporting_pairs: Collection of (title, sent_idx) pairs that count as relevant.
        retrieved_ids: Indices into `sentence_metadata`, ordered best-first.
        sentence_metadata: List of dicts with "title" and "sent_idx" keys,
            one per candidate sentence.

    Returns:
        (ap, recall) where
        * ap is the mean of precision@rank taken at each rank that holds a
          relevant sentence, normalised by the number of relevant sentences
          actually retrieved (0.0 when none were retrieved);
        * recall is retrieved-relevant / len(supporting_pairs), or 0 when
          `supporting_pairs` is empty.
    """
    hits = [
        1 if (sentence_metadata[i]["title"], sentence_metadata[i]["sent_idx"]) in supporting_pairs else 0
        for i in retrieved_ids
    ]

    # precision@rank = (relevant items seen so far) / rank. Using 1/rank here
    # would under-score every hit after the first one.
    hits_so_far = 0
    precision_sum = 0.0
    for rank, hit in enumerate(hits, start=1):
        if hit:
            hits_so_far += 1
            precision_sum += hits_so_far / rank

    ap = precision_sum / max(hits_so_far, 1)  # max(..., 1) avoids 0/0 when nothing relevant was retrieved
    recall = hits_so_far / len(supporting_pairs) if supporting_pairs else 0

    return ap, recall

## Query Generation with T5 Flan

In [None]:
def generate_query(question, context=""):
    """Sample one search query for `question` from the Flan-T5 generator.

    Args:
        question: The question to write a search query for.
        context: Optional text that is prepended to the prompt when non-empty.

    Returns:
        The decoded query with special tokens removed and surrounding
        whitespace stripped. Sampling is enabled, so repeated calls may
        return different strings.
    """
    prompt = (
        f"Context: {context}\n\nGenerate a search query for: {question}"
        if context
        else f"Generate a search query for: {question}"
    )

    # Prompts longer than 512 tokens are truncated by the tokenizer
    inputs = query_tokenizer(
        prompt, return_tensors="pt", max_length=512, truncation=True
    ).to(device)

    # Nucleus sampling, one short sequence per call
    sampling_kwargs = dict(
        max_new_tokens=15,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
        num_return_sequences=1,
    )
    with torch.no_grad():
        generated = query_generator.generate(**inputs, **sampling_kwargs)

    return query_tokenizer.decode(generated[0], skip_special_tokens=True).strip()

In [None]:
# === Configuration Parameters ===
NUM_HOPS = 2       # retrieval hops performed per question
NUM_QUERIES = 5    # candidate queries sampled per hop (duplicates are dropped later)
TOP_K = 5          # sentences kept from each query's ranking
BATCH_SIZE = 10    # number of sample indices grouped per progress-bar step

# Echo the effective settings so they are recorded in the notebook output
for _line in (
    f"Configuration:",
    f"- Number of hops: {NUM_HOPS}",
    f"- Queries per hop: {NUM_QUERIES}",
    f"- Top-K retrieval: {TOP_K}",
    f"- Batch size: {BATCH_SIZE}",
):
    print(_line)

## Data Processing and Context Preparation

In [None]:
def prepare_sample_context(sample):
    """Flatten a sample's per-title sentence groups into one sentence list.

    Args:
        sample: Dict whose 'context' entry holds parallel 'title' and
            'sentences' sequences (one group of sentences per title).

    Returns:
        (flattened_sentences, sentence_metadata): the sentences in reading
        order, and for each one a dict {"title", "sent_idx"} giving its
        source title and its position within that title's group.
    """
    context = sample['context']
    located = [
        (title, position, sentence)
        for title, group in zip(context['title'], context['sentences'])
        for position, sentence in enumerate(group)
    ]

    flattened_sentences = [sentence for _, _, sentence in located]
    sentence_metadata = [
        {"title": title, "sent_idx": position} for title, position, _ in located
    ]
    return flattened_sentences, sentence_metadata

print("Data processing functions defined")

## Single Hop Processing Function

In [None]:
def process_single_hop(question, current_context, flattened_sentences,
                      context_embeddings, supporting_pairs, sentence_metadata):
    """Run one retrieval hop: sample queries, score them, derive preferences.

    Args:
        question: The question being answered.
        current_context: Text passed to the query generator as context ("" for none).
        flattened_sentences: Candidate sentences, aligned with `context_embeddings`.
        context_embeddings: One embedding per candidate sentence.
        supporting_pairs: Relevant (title, sent_idx) pairs used for AP/recall.
        sentence_metadata: Per-sentence {"title", "sent_idx"} dicts.

    Returns:
        None when no non-empty query was generated. Otherwise a dict with
        "queries", "aps" and "recalls" (all ordered by AP, best first),
        "preference_pairs" (index pairs (i, j) into those lists where query i
        has strictly higher AP than query j) and "best_retrieved_context"
        (the top-ranked query's retrieved sentences joined by newlines).
    """
    # Sample NUM_QUERIES candidates, keeping the first occurrence of each non-empty query
    candidates = []
    for _ in range(NUM_QUERIES):
        candidate = generate_query(question, current_context)
        if not candidate or candidate in candidates:
            continue
        candidates.append(candidate)

    if not candidates:
        return None

    def _evaluate(query):
        """Retrieve the TOP_K sentences for `query` and measure AP/recall."""
        query_emb = compute_colbert_embeddings([query])[0]
        scores = [maxsim_score(query_emb, doc_emb) for doc_emb in context_embeddings]
        # argsort is ascending: take the last TOP_K entries, then flip to best-first
        top_indices = np.argsort(scores)[-TOP_K:][::-1]
        ap, recall = compute_ap_recall(supporting_pairs, top_indices, sentence_metadata)
        return {
            "query": query,
            "ap": ap,
            "recall": recall,
            "top_indices": top_indices.tolist(),
            "retrieved_context": [flattened_sentences[i] for i in top_indices],
        }

    # Stable sort: queries with equal AP keep their generation order
    ranked = sorted(
        [_evaluate(query) for query in candidates],
        key=lambda result: result["ap"],
        reverse=True,
    )

    # Every (better, worse) index pair with a strict AP gap; ties produce no pair
    preference_pairs = [
        (better, worse)
        for better in range(len(ranked))
        for worse in range(better + 1, len(ranked))
        if ranked[better]["ap"] > ranked[worse]["ap"]
    ]

    return {
        "queries": [result["query"] for result in ranked],
        "aps": [result["ap"] for result in ranked],
        "recalls": [result["recall"] for result in ranked],
        "preference_pairs": preference_pairs,
        "best_retrieved_context": "\n".join(ranked[0]["retrieved_context"]) if ranked else ""
    }

## Main Processing Loop

In [None]:
# Build the preference dataset: for every training sample, embed its context
# sentences once, then run NUM_HOPS rounds of query generation and scoring.
# Results are keyed by question text.
preference_dataset = {}
total_processed = 0
total_skipped = 0

print("Starting preference dataset creation...")

# The outer loop only chunks sample indices so the progress bar advances once
# per BATCH_SIZE samples; samples inside a chunk are still handled one by one.
for batch_start in tqdm(range(0, len(train_dataset['question']), BATCH_SIZE), desc="Processing batches"):
    batch_end = min(batch_start + BATCH_SIZE, len(train_dataset['question']))
    
    for idx in range(batch_start, batch_end):
        # train_dataset is column-oriented; rebuild the row at position idx
        sample = {k: train_dataset[k][idx] for k in train_dataset.keys()}
        question = sample['question']
        supporting_facts = sample['supporting_facts']
        
        # Skip if no supporting facts
        if not supporting_facts['title']:
            total_skipped += 1
            continue
            
        # Prepare context: flatten the sentences and embed each of them once per sample
        flattened_sentences, sentence_metadata = prepare_sample_context(sample)
        context_embeddings = compute_colbert_embeddings(flattened_sentences)
        # Relevant sentences as (title, sentence index) pairs for AP/recall lookups
        supporting_pairs = set(zip(supporting_facts['title'], supporting_facts['sent_id']))
        
        # Initialize dataset entry
        # NOTE(review): entries are keyed by question text, so a repeated question
        # replaces the earlier entry while total_processed still counts both.
        preference_dataset[question] = {
            "question": question,
            "hops": {}
        }
        
        # The first hop is generated without any retrieved context
        current_context = ""
        
        # Process each hop
        for hop in range(NUM_HOPS):
            hop_data = process_single_hop(
                question, current_context, flattened_sentences, 
                context_embeddings, supporting_pairs, sentence_metadata
            )
            
            # process_single_hop returns None when no usable query was generated;
            # in that case the hop is not recorded and current_context is left unchanged.
            if hop_data:
                preference_dataset[question]["hops"][f"hop_{hop}"] = hop_data
                # Update context with best retrieval for next hop
                if hop_data["queries"]:
                    best_context = hop_data.get("best_retrieved_context", "")
                    current_context = best_context
        
        total_processed += 1

print(f"Processing completed!")
print(f"- Total processed: {total_processed}")
print(f"- Total skipped: {total_skipped}")
print(f"- Final dataset size: {len(preference_dataset)} questions")

## Save Preference Dataset

In [None]:
import datetime

# Save the preference dataset under a timestamped file name so that repeated
# runs do not overwrite each other. Spaces and colons are replaced to keep the
# name file-system friendly.
timestamp = CURRENT_TIME_STAMP.replace(" ", "_").replace(":", "-")
output_file = f"preference_dataset_t5_flan_{timestamp}.json"
with open(output_file, "w") as f:
    json.dump(preference_dataset, f, indent=2)

print(f"Preference dataset saved to {output_file}")

# Display statistics: count recorded hops and preference pairs over all questions
total_preference_pairs = 0
total_hops = 0

for question, data in preference_dataset.items():
    for hop_key, hop_data in data["hops"].items():
        total_hops += 1
        total_preference_pairs += len(hop_data["preference_pairs"])

# Guard the average: if no hop was recorded (empty dataset, or every hop
# failed to produce a query) the division would raise ZeroDivisionError.
average_pairs_per_hop = total_preference_pairs / total_hops if total_hops else 0.0

print(f"\nDataset Statistics:")
print(f"- Total questions: {len(preference_dataset)}")
print(f"- Total hops: {total_hops}")
print(f"- Total preference pairs: {total_preference_pairs}")
print(f"- Average preference pairs per hop: {average_pairs_per_hop:.2f}")

In [None]:
def format_preference_data_for_training(preference_dataset_path):
    """Flatten a saved preference dataset into one record per preference pair.

    Args:
        preference_dataset_path: Path to a JSON file mapping each question to
            an entry with a "hops" dict; every hop holds "queries", "aps" and
            "preference_pairs" (pairs of indices into the first two lists).

    Returns:
        A list of dicts with keys "question", "preferred", "dispreferred",
        "preferred_ap", "dispreferred_ap" and "hop", in file order.
    """
    with open(preference_dataset_path, 'r') as f:
        data = json.load(f)

    records = []
    for question, entry in data.items():
        for hop_key, hop_data in entry["hops"].items():
            queries = hop_data["queries"]
            aps = hop_data["aps"]
            pairs = hop_data["preference_pairs"]
            records.extend(
                {
                    "question": question,
                    "preferred": queries[winner],
                    "dispreferred": queries[loser],
                    "preferred_ap": aps[winner],
                    "dispreferred_ap": aps[loser],
                    "hop": hop_key,
                }
                for winner, loser in pairs
            )

    return records

print("Training data formatting function defined")

## Format Data for Training

## Generate Training Data

In [None]:
# Format the preference dataset for training.
# Read back the file written by the save step via `output_file`: that file name
# carries a run timestamp, so a hard-coded "preference_dataset_t5_flan.json"
# would not exist on a fresh Restart & Run All.
training_data = format_preference_data_for_training(output_file)

print(f"Created {len(training_data)} preference pairs for training")

# Save training data
with open("preference_training_data.json", "w") as f:
    json.dump(training_data, f, indent=2)

print("Training data saved to preference_training_data.json")

## Display Sample Results

In [None]:
# Preview a few preference pairs (or report that none were produced)
if not training_data:
    print("No training data generated.")
else:
    print("\n" + "="*80)
    print("SAMPLE PREFERENCE PAIRS")
    print("="*80)

    # Only the first three pairs are shown; questions are cut to 100 characters
    for i, sample in enumerate(training_data[:3]):
        print(f"\nSample {i+1}:")
        print(f"Question: {sample['question'][:100]}...")
        print(f"Hop: {sample['hop']}")
        print(f"Preferred Query (AP={sample['preferred_ap']:.3f}): {sample['preferred']}")
        print(f"Dispreferred Query (AP={sample['dispreferred_ap']:.3f}): {sample['dispreferred']}")
        print("-" * 60)

    print(f"\nTotal training samples: {len(training_data)}")

## PyTorch Dataset Class (Optional)

In [None]:
from torch.utils.data import Dataset

class PreferenceDataset(Dataset):
    """Map-style PyTorch dataset backed by a JSON file of preference records.

    The parsed JSON content is kept in `self.data`; length and item lookup
    are delegated to it unchanged.
    """

    def __init__(self, json_path):
        """Load the whole JSON file at `json_path` into memory."""
        with open(json_path, 'r') as handle:
            records = json.load(handle)
        self.data = records

    def __getitem__(self, idx):
        """Return the record stored at position `idx`."""
        return self.data[idx]

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

# Load the formatted training pairs written earlier into a PyTorch dataset
train_dataset_pytorch = PreferenceDataset(json_path="preference_training_data.json")
print(f"PyTorch dataset created with {len(train_dataset_pytorch)} samples")
print("Ready for training with preference learning algorithms (DPO, IPO, etc.)")

## Summary

This notebook has successfully created a preference dataset using Google T5 Flan for query generation. The dataset includes:

- **Multi-hop retrieval**: 2 hops of query generation and retrieval
- **Multiple queries per hop**: 5 queries generated and ranked by AP score
- **Preference pairs**: Automatically generated based on retrieval performance
- **Training ready format**: JSON files ready for preference learning

**Output files:**
- `preference_dataset_t5_flan_<timestamp>.json`: Raw preference dataset (the file name includes the run timestamp)
- `preference_training_data.json`: Formatted training data

**Next steps:**
- Use the training data with preference learning algorithms (DPO, IPO, etc.)
- Fine-tune the T5 model on the preference pairs
- Evaluate the improved model on multi-hop QA tasks