In [None]:
!pip install -U transformers datasets torch tqdm huggingface_hub numpy scikit-learn pylate bitsandbytes accelerate

In [None]:
# Clone the `refactor/adaptive_hop` branch of the project repository into the
# current working directory (creates a sub-directory named after the repo).
!git clone --branch refactor/adaptive_hop https://github.com/erenyavuz02/Trajectory-Aware-RL-for-Efficient-Multi-Hop-Retrieval.git

## Preference Dataset Creation with Google Flan-T5

# Core imports
import torch
import json
import random
import datetime
import numpy as np
import pickle
import os
from tqdm import tqdm
from collections import defaultdict

# Dataset and model imports
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel,
    RobertaTokenizer, RobertaForSequenceClassification,
    AdamW, get_linear_schedule_with_warmup
)
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
import torch.nn.functional as F

# Configuration: run timestamp (filesystem-friendly) and compute device
CURRENT_TIMESTAMP = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Seed the python, numpy and torch RNGs so runs are reproducible
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(42)

print(f"Using device: {device}")
print(f"Timestamp: {CURRENT_TIMESTAMP}")

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel, RobertaTokenizer, RobertaForSequenceClassification
import torch
import json
from tqdm import tqdm
import numpy as np
from collections import defaultdict
import random
import datetime
from torch.utils.data import DataLoader, Dataset
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, classification_report
import torch.nn as nn
import torch.nn.functional as F
from datasets import Dataset
import sys
import os
import importlib
import glob

# Add the functions directory to Python path.
# The helper modules live in <repo>/functions/adaptive_hop_training. Depending on
# where the notebook is launched, <repo> is either the working directory itself
# or the checkout created by the `git clone` cell, so both locations are tried.
_REPO_DIR_NAME = "Trajectory-Aware-RL-for-Efficient-Multi-Hop-Retrieval"
_functions_subpath = os.path.join("functions", "adaptive_hop_training")
_candidate_paths = [
    os.path.join(os.getcwd(), _functions_subpath),
    os.path.join(os.getcwd(), _REPO_DIR_NAME, _functions_subpath),
]
# When neither directory exists, keep the original location so behaviour is
# unchanged and the following import fails in the usual way.
functions_path = next(
    (path for path in _candidate_paths if os.path.isdir(path)),
    _candidate_paths[0],
)
if functions_path not in sys.path:
    sys.path.append(functions_path)

# Import the models module
from models import QueryGenerator, get_colbert_model, get_classifier_model
from classifier import Classifier


# Human-readable timestamp for this run (contains a space and colons)
CURRENT_TIME_STAMP = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Prefer the GPU when torch can see one
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Set seeds for reproducibility (python, numpy and torch RNGs)
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(42)

# Instantiate the models; the constructors/factories come from the project's
# `models` and `classifier` modules imported above.
query_generator = QueryGenerator(device=device)
colbert_tokenizer, colbert_model = get_colbert_model(device)
classifier_tokenizer, classifier_model = get_classifier_model(device)
classifier = Classifier(classifier_model, classifier_tokenizer)

print("All models initialized successfully")


In [None]:


# === Quick test ===
# NOTE(review): is_context_relevant is not defined or imported in any cell
# visible in this notebook — presumably it comes from the project modules;
# confirm before relying on Restart & Run All.
_probe_question = "Who is the father of Michael Jordan?"
_probe_context = "M. Jordan plays basketball and is a famous athlete. "
print(is_context_relevant(_probe_question, _probe_context))  # Expect: yes


# Dataset Loading and Preparation

In [None]:
# Dataset Configuration
DATASET_CONFIG = dict(
    train_size=5000,
    val_size=1000,
    dataset_name='hotpot_qa',
    dataset_config='fullwiki',
    save_dataset=False,
)

# If the dataset already exists on disk, give its file paths here.
# NOTE(review): HotpotDataset is not imported in any visible cell — confirm
# which project module provides it.
hotpot_dataset = HotpotDataset(
    DATASET_CONFIG,
    train_file_path=None,
    val_file_path=None,
)

# Train Classifier Model

In [None]:
# Hyper-parameters passed to Classifier.train as `training_config`
CLASSIFIER_TRAINING_CONFIG = dict(
    learning_rate=2e-5,
    batch_size=16,
    num_epochs=3,
    max_length=512,
    warmup_steps=100,
    weight_decay=0.01,
    save_model=True,
)

# Train on the HotpotQA train split and validate on the validation split
classifier.train(
    training_config=CLASSIFIER_TRAINING_CONFIG,
    train_data=hotpot_dataset.get_train_data(),
    val_data=hotpot_dataset.get_val_data(),
)

# Few-Shot Examples

In [None]:
# Load the detailed few-shot examples and format them as comprehensive strings
def load_fewshot_examples(filepath='datasets/hotpotqa_fewshot_examples_adapt_hop.json'):
    """Read few-shot examples from a JSON file.

    Args:
        filepath: Location of the JSON file holding the examples

    Returns:
        The parsed JSON content, or an empty list (after printing a warning)
        when the file does not exist
    """
    try:
        handle = open(filepath, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"Warning: Few-shot examples file not found at {filepath}")
        return []

    with handle:
        return json.load(handle)

def format_fewshot_examples(examples, num_examples=3):
    """Render randomly chosen few-shot examples as prompt strings.

    Args:
        examples: List of example dicts with 'question', 'query',
            'relevant_contexts' and 'irrelevant_contexts' entries; each context
            is a dict with 'title' and 'sentence'
        num_examples: Maximum number of examples to sample

    Returns:
        List of formatted strings (empty when `examples` is empty). At most two
        contexts per section are shown, each sentence cut to 60 characters.
    """
    if not examples:
        return []

    def _render_section(heading, contexts):
        # Heading line followed by numbered "title: sentence..." lines
        numbered = [
            f"{rank}. {ctx['title']}: {ctx['sentence'][:60]}..."
            for rank, ctx in enumerate(contexts[:2], start=1)
        ]
        return "\n".join([heading] + numbered)

    formatted_examples = []
    for example in random.sample(examples, min(num_examples, len(examples))):
        relevant_block = _render_section("Relevant:", example['relevant_contexts'])
        irrelevant_block = _render_section("Irrelevant:", example['irrelevant_contexts'])
        formatted_examples.append("\n\n".join((
            f"Question: {example['question']}",
            relevant_block,
            irrelevant_block,
            f"Query: {example['query']}",
        )))

    return formatted_examples

# Load the few-shot examples and pre-format three of them
fewshot_data = load_fewshot_examples()
fewshot_examples = format_fewshot_examples(fewshot_data, 3)

if not fewshot_examples:
    print("No few-shot examples available - will use zero-shot generation")
else:
    print(f"Loaded {len(fewshot_examples)} few-shot examples for query generation")

# Scoring and Evaluation Functions

In [None]:
def compute_colbert_embeddings(texts, batch_size=32):
    """Compute token-level embeddings for a list of texts with the ColBERT model

    Texts are encoded in mini-batches and padded only to the longest sequence
    in each batch; padding positions are dropped from the result via the
    attention mask, so only real tokens contribute rows.

    Args:
        texts: List of text strings
        batch_size: Number of texts encoded per forward pass

    Returns:
        List of embeddings (numpy arrays), one per input text, each holding the
        last-hidden-state rows of that text's non-padding tokens. Empty list
        when `texts` is empty.
    """
    texts = list(texts)
    if not texts:
        # The tokenizer cannot encode an empty batch
        return []

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        encoded = colbert_tokenizer(
            batch,
            max_length=512,
            padding=True,  # pad to the longest text in the batch, not to 512
            truncation=True,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            output = colbert_model(**encoded).last_hidden_state

        # Keep only the positions the attention mask marks as real tokens
        masks = encoded["attention_mask"].bool()
        embeddings.extend(output[i][masks[i]].cpu().numpy() for i in range(len(batch)))

    return embeddings

def maxsim_score(query_embedding, document_embedding):
    """Score a document against a query with the MaxSim operator

    For every query token the highest dot product with any document token is
    taken, and these maxima are summed.

    Args:
        query_embedding: Query embedding (numpy array)
        document_embedding: Document embedding (numpy array)

    Returns:
        MaxSim score (float)
    """
    def _as_device_tensor(array):
        # float32 copy of the array on the notebook-wide `device`
        return torch.tensor(array, dtype=torch.float32).to(device)

    token_similarities = _as_device_tensor(query_embedding) @ _as_device_tensor(document_embedding).T
    best_per_query_token, _ = token_similarities.max(dim=1)

    return float(best_per_query_token.sum())

def compute_relevance(supporting_pairs, retrieved_index, sentence_metadata):
    """Tell whether the retrieved sentence is one of the supporting facts

    Args:
        supporting_pairs: Set of (title, sentence_index) tuples for supporting facts
        retrieved_index: Index of the retrieved sentence in `sentence_metadata`
        sentence_metadata: List of dicts with "title" and "sent_idx" per sentence

    Returns:
        True if the sentence's (title, sent_idx) pair is a supporting fact;
        False otherwise, including when the index lies past the end of the metadata
    """
    if retrieved_index < len(sentence_metadata):
        entry = sentence_metadata[retrieved_index]
        return (entry["title"], entry["sent_idx"]) in supporting_pairs

    return False


# Query Generation

In [None]:
# Query Generation Configuration (passed to QueryGenerator.generate_query)
QUERY_GEN_CONFIG = dict(
    max_input_length=512,
    max_new_tokens=15,
    temperature=0.8,
    top_p=0.9,
    do_sample=True,
)



In [None]:
# Preference Dataset Creation Configuration
PREFERENCE_CONFIG = dict(
    num_hops=2,            # Target number of relevant hops
    max_hops=10,           # Maximum hops allowed
    num_queries=5,         # Queries per hop for ranking (unused in current implementation)
    top_k=5,               # Top-K documents to retrieve (unused in current implementation)
    batch_size=10,         # Processing batch size
    save_interval=100,     # Save checkpoint every N samples
    discount_factor=0.9,   # Reward discount for future hops
    hop_penalty=-0.1,      # Penalty for each hop
    big_penalty=-10.0,     # Large penalty for missing relevant context
)

# Echo the settings so the run log records them
print("Preference dataset configuration:")
for name, setting in PREFERENCE_CONFIG.items():
    print(f"  {name}: {setting}")

# Data Processing Functions

In [None]:
def prepare_sample_context(sample):
    """Flatten the per-title sentence lists of one HotpotQA sample

    Args:
        sample: HotpotQA sample dictionary whose 'context' entry holds parallel
            'title' and 'sentences' lists

    Returns:
        tuple: (flattened_sentences, sentence_metadata) where the metadata entry
        at position i records the title and within-title index ("sent_idx") of
        sentence i
    """
    context = sample['context']

    # One (title, index-within-title, sentence) triple per sentence, in document order
    located_sentences = [
        (title, sent_idx, sentence)
        for title, sentences in zip(context['title'], context['sentences'])
        for sent_idx, sentence in enumerate(sentences)
    ]

    flattened_sentences = [sentence for _, _, sentence in located_sentences]
    sentence_metadata = [
        {"title": title, "sent_idx": sent_idx}
        for title, sent_idx, _ in located_sentences
    ]

    return flattened_sentences, sentence_metadata

print("Data processing functions defined")

# Single Hop Processing

In [None]:
def process_single_hop(question, current_context, flattened_sentences, 
                      context_embeddings, supporting_pairs, sentence_metadata, 
                      banned_indices=None):
    """Process a single retrieval hop

    Generates a query, scores it against every sentence with MaxSim, picks the
    best sentence that is not banned, and records both the ground-truth
    relevance and the classifier's decision for it.

    Args:
        question: The question being answered
        current_context: Current accumulated context
        flattened_sentences: List of all available sentences
        context_embeddings: Precomputed embeddings for all sentences
        supporting_pairs: Set of supporting fact (title, sent_idx) pairs
        sentence_metadata: Metadata for each sentence
        banned_indices: Set of indices to exclude from retrieval

    Returns:
        Dictionary with hop results, or None if no query was generated or no
        retrievable sentence remains
    """
    # Generate query for this hop
    query = query_generator.generate_query(question, QUERY_GEN_CONFIG, current_context)

    if not query:
        return None

    # Get query embedding
    query_embedding = compute_colbert_embeddings([query])[0]

    # Score against all documents
    scores = [maxsim_score(query_embedding, doc_emb) for doc_emb in context_embeddings]

    # Exclude already-retrieved sentences by forcing their score to -inf
    for banned_idx in (banned_indices or ()):
        if 0 <= banned_idx < len(scores):
            scores[banned_idx] = float('-inf')

    # With nothing to score, or every candidate banned, argmax would either
    # fail or hand back an already-banned sentence — stop hopping instead.
    if not scores or all(score == float('-inf') for score in scores):
        return None

    # Plain int (not numpy.int64) so the hop record stays JSON-serialisable
    top_index = int(np.argmax(scores))
    retrieved_sentence = flattened_sentences[top_index]

    # Ground-truth relevance from the supporting facts
    relevance = compute_relevance(supporting_pairs, top_index, sentence_metadata)

    # Classifier's own accept/reject decision for the retrieved sentence
    relevance_decision = is_context_relevant(question, retrieved_sentence)

    return {
        "question": question,
        "current_context": current_context,
        "query": query,
        "relevance": relevance,
        "relevance_decision": relevance_decision,
        "retrieved_context": retrieved_sentence,
        "retrieved_index": top_index,
    }


# Main Preference Dataset Creation

In [None]:
import pickle
import os
import json

def create_preference_dataset(dataset, config=None):
    """Create preference dataset from HotpotQA data

    Each sample with supporting facts is run through the multi-hop simulation;
    results are keyed by question. Samples that raise are counted as errors and
    leave no entry behind. Checkpoints are written to "checkpoints/" every
    `save_interval` processed samples and once more at the end.

    Args:
        dataset: HotpotQA dataset supporting len() and integer row indexing
            (e.g. a Hugging Face `Dataset`)
        config: Configuration dictionary (uses PREFERENCE_CONFIG if None)

    Returns:
        tuple: (preference_dataset, stats) — the question-keyed dictionary and
        the processed / skipped / error counters
    """
    if config is None:
        config = PREFERENCE_CONFIG

    preference_dataset = {}
    stats = {
        'total_processed': 0,
        'total_skipped': 0,
        'total_errors': 0
    }

    # Setup checkpointing
    save_dir = "checkpoints"
    os.makedirs(save_dir, exist_ok=True)

    print("Starting preference dataset creation...")

    num_samples = len(dataset)

    # Process in batches (the batch only sets the progress-bar granularity)
    for batch_start in tqdm(range(0, num_samples, config['batch_size']),
                           desc="Processing batches"):
        batch_end = min(batch_start + config['batch_size'], num_samples)

        for idx in range(batch_start, batch_end):
            try:
                # Row access returns one sample as a dict; indexing every
                # column and then the row would touch the whole dataset per sample.
                sample = dataset[idx]
                question = sample['question']
                supporting_facts = sample['supporting_facts']

                # Skip samples without supporting facts
                if not supporting_facts['title']:
                    stats['total_skipped'] += 1
                    continue

                # Prepare context and embeddings
                flattened_sentences, sentence_metadata = prepare_sample_context(sample)
                context_embeddings = compute_colbert_embeddings(flattened_sentences)
                supporting_pairs = set(zip(supporting_facts['title'], supporting_facts['sent_id']))

                # Process hops
                hop_results = process_question_hops(
                    question, flattened_sentences, context_embeddings,
                    supporting_pairs, sentence_metadata, config
                )

                # Register the entry only once the hops succeeded, so a failed
                # sample does not leave an empty record in the dataset.
                preference_dataset[question] = {
                    "question": question,
                    "hops": hop_results
                }
                stats['total_processed'] += 1

            except Exception as e:
                # Best effort: report and move on to the next sample
                print(f"Error processing sample {idx}: {e}")
                stats['total_errors'] += 1
                continue

            # Save checkpoint (only reached after a successfully processed sample)
            if stats['total_processed'] % config['save_interval'] == 0:
                save_checkpoint(preference_dataset, stats, save_dir, stats['total_processed'])

    # Final save
    save_checkpoint(preference_dataset, stats, save_dir, 'final')

    print(f"Dataset creation completed:")
    print(f"  Processed: {stats['total_processed']}")
    print(f"  Skipped: {stats['total_skipped']}")
    print(f"  Errors: {stats['total_errors']}")
    print(f"  Final size: {len(preference_dataset)} questions")

    return preference_dataset, stats

def process_question_hops(question, flattened_sentences, context_embeddings, 
                         supporting_pairs, sentence_metadata, config):
    """Process all hops for a single question

    Runs up to config['max_hops'] retrieval hops. After each hop the retrieved
    sentence is appended to the relevant or irrelevant context according to the
    classifier decision and banned from further retrieval. Hopping ends when
    config['num_hops'] sentences have been accepted, when a hop yields no
    result, or when the hop budget is exhausted.

    Args:
        question: The question being answered
        flattened_sentences: List of all available sentences
        context_embeddings: Precomputed embeddings for all sentences
        supporting_pairs: Set of supporting fact (title, sent_idx) pairs
        sentence_metadata: Metadata for each sentence
        config: Dictionary providing at least 'max_hops' and 'num_hops'

    Returns:
        Dictionary mapping "hop_<n>" to that hop's record. Each record carries
        the fields from process_single_hop plus "query_reward", "stop_hopping"
        and "relevant_context" (the accepted context before the hop).
    """
    hops = {}
    relevant_context = "Retrieved context: \n"
    irrelevant_context = "Irrelevant context: \n"
    banned_indices = set()
    relevant_hops = 0

    # format_fewshot_examples returns a list of strings, so join it before
    # prepending it to the question.
    # NOTE(review): the prefixed question is also what the relevance classifier
    # and the stored hop records see — confirm that this is intended.
    few_shot_examples = format_fewshot_examples(fewshot_data, 3)
    if few_shot_examples:
        question = "\n\n".join(few_shot_examples) + "\n Question: " + question

    for hop_num in range(config['max_hops']):

        current_context = relevant_context + irrelevant_context

        # Process single hop
        hop_data = process_single_hop(
            question, current_context, flattened_sentences,
            context_embeddings, supporting_pairs, sentence_metadata,
            banned_indices if hop_num > 0 else None
        )

        if not hop_data:
            break

        # Calculate reward based on relevance and action
        hop_data["query_reward"] = calculate_hop_reward(
            question, hop_data, hop_num, config
        )
        # Flipped to True below only on the hop that reaches the target
        hop_data["stop_hopping"] = False
        hop_data["relevant_context"] = relevant_context

        hops[f"hop_{hop_num}"] = hop_data

        relevance_decision = hop_data["relevance_decision"]

        if relevance_decision == "yes":  # Accept relevant context
            banned_indices.add(hop_data["retrieved_index"])
            relevant_context += " " + hop_data["retrieved_context"]
            relevant_hops += 1
        elif relevance_decision == "no":  # Accept irrelevant context
            irrelevant_context += " " + hop_data["retrieved_context"]
            banned_indices.add(hop_data["retrieved_index"])

        if relevant_hops >= config['num_hops']:
            # Stop if we reached the required number of relevant hops
            hop_data["query_reward"] += 1.0  # Bonus for reaching target hops
            hop_data["stop_hopping"] = True
            break

    return hops

def calculate_hop_reward(question, hop_data, hop_num, config):
    """Calculate reward for a hop based on relevance and action

    The base reward depends on ground truth vs. classifier action:
      - relevant and accepted:     +1.0
      - irrelevant and accepted:   -1.0
      - relevant but rejected:     config['big_penalty'] (default -10.0)
      - irrelevant and rejected:   -1.0
    It is scaled by discount_factor ** hop_num and the per-hop penalty is added.

    Args:
        question: The question being answered (only used when the classifier
            has to be queried because hop_data carries no decision)
        hop_data: Hop record with 'relevance', 'retrieved_context' and,
            normally, 'relevance_decision'
        hop_num: Zero-based hop index
        config: Dictionary with 'discount_factor', 'hop_penalty' and
            optionally 'big_penalty'

    Returns:
        Reward for the hop (float)
    """
    # Reuse the decision already stored for this hop; only call the classifier
    # when the record does not carry one.
    relevance_decision = hop_data.get("relevance_decision")
    if relevance_decision is None:
        relevance_decision = is_context_relevant(question, hop_data["retrieved_context"])

    ground_truth_relevant = hop_data['relevance']
    action_yes = (relevance_decision == "yes")

    discount_factor = config['discount_factor'] ** hop_num
    hop_penalty = config['hop_penalty']

    if ground_truth_relevant and action_yes:
        # Correct positive: relevant context accepted
        base_reward = 1.0
    elif action_yes:
        # False positive: irrelevant context accepted
        base_reward = -1.0
    elif ground_truth_relevant:
        # False negative: relevant context rejected (big penalty)
        base_reward = config.get('big_penalty', -10.0)
    else:
        # Correct negative: irrelevant context rejected
        # NOTE(review): the notebook summary lists +0.5 for this case while the
        # code has always used -1.0; kept as-is — confirm the intended value.
        base_reward = -1.0

    return (base_reward * discount_factor) + hop_penalty

def save_checkpoint(preference_dataset, stats, save_dir, checkpoint_id):
    """Save checkpoint of preference dataset as JSON

    Writes `preference_dataset_<checkpoint_id>.json` into `save_dir`, creating
    the directory when needed. Numpy scalars/arrays and sets inside the dataset
    are converted to plain Python values on the way out.

    Args:
        preference_dataset: Question-keyed preference dataset
        stats: Run statistics (accepted for call compatibility; not written)
        save_dir: Directory receiving the checkpoint file
        checkpoint_id: Suffix identifying the checkpoint (e.g. a count or 'final')
    """
    def _to_builtin(value):
        # json.dump calls this only for objects it cannot encode natively
        if hasattr(value, 'tolist'):  # numpy scalars and arrays
            return value.tolist()
        if isinstance(value, (set, frozenset)):
            return list(value)
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    os.makedirs(save_dir, exist_ok=True)
    checkpoint_path = os.path.join(save_dir, f'preference_dataset_{checkpoint_id}.json')

    with open(checkpoint_path, 'w', encoding='utf-8') as f:
        json.dump(preference_dataset, f, indent=2, ensure_ascii=False, default=_to_builtin)

    print(f"Preference dataset saved: {checkpoint_path}")

# Run preference dataset creation.
# No cell above assigns `train_dataset`; the training split is held by
# `hotpot_dataset`, so fall back to it when no explicit dataset is present.
# NOTE(review): assumes hotpot_dataset.get_train_data() returns the split in the
# form create_preference_dataset expects — confirm against HotpotDataset.
if globals().get('train_dataset') is None and 'hotpot_dataset' in globals():
    train_dataset = hotpot_dataset.get_train_data()

if globals().get('train_dataset') is not None:
    preference_dataset, creation_stats = create_preference_dataset(train_dataset)
else:
    print("Train dataset not loaded. Please run the dataset loading cell first.")

In [None]:
import pickle
import os
import json

def load_preference_dataset(checkpoint_path='pkl_files/checkpoint_final.pkl'):
    """Load preference dataset from checkpoint file

    Two on-disk layouts are understood:
      - a wrapper dict with a 'preference_dataset' entry and optional 'stats'
      - the bare question-keyed dataset, as written by save_checkpoint
    Files ending in ".json" are read as JSON; everything else as pickle.

    Args:
        checkpoint_path: Path to checkpoint file

    Returns:
        tuple: (preference_dataset, stats) or (None, None) if failed
    """
    if not os.path.exists(checkpoint_path):
        print(f"Checkpoint file not found: {checkpoint_path}")

        # List available checkpoints
        checkpoint_dir = os.path.dirname(checkpoint_path) or 'checkpoints'
        if os.path.isdir(checkpoint_dir):
            available_files = sorted(
                f for f in os.listdir(checkpoint_dir) if f.endswith(('.pkl', '.json'))
            )
            if available_files:
                print("Available checkpoint files:")
                for file in available_files:
                    print(f"  - {file}")
            else:
                print("No checkpoint files found")

        return None, None

    try:
        if checkpoint_path.lower().endswith('.json'):
            with open(checkpoint_path, 'r', encoding='utf-8') as f:
                checkpoint_data = json.load(f)
        else:
            # SECURITY: pickle.load can execute arbitrary code — only open
            # checkpoint files produced by a trusted run.
            with open(checkpoint_path, 'rb') as f:
                checkpoint_data = pickle.load(f)

        if isinstance(checkpoint_data, dict) and 'preference_dataset' in checkpoint_data:
            preference_dataset = checkpoint_data['preference_dataset']
            stats = checkpoint_data.get('stats', {})
        else:
            # Bare dataset without the wrapper dict
            preference_dataset = checkpoint_data
            stats = {}

        print(f"Successfully loaded preference dataset from {checkpoint_path}")
        print(f"Dataset size: {len(preference_dataset)} questions")
        if stats:
            print(f"Statistics: {stats}")

        return preference_dataset, stats

    except Exception as e:
        print(f"Error loading checkpoint: {e}")
        return None, None

def convert_and_save_dataset(preference_dataset, stats=None):
    """Convert preference dataset to JSON and save

    Numpy scalars and arrays are converted to native Python values, the result
    is written to "preference_dataset_adaptive_00000.json" in the working
    directory, and a short sample of the first question is printed.

    Args:
        preference_dataset: Question-keyed preference dataset
        stats: Run statistics (accepted for call compatibility; not used)

    Returns:
        Name of the JSON file written
    """
    def convert_numpy_types(obj):
        """Recursively convert numpy types to native Python types"""
        if isinstance(obj, dict):
            return {key: convert_numpy_types(value) for key, value in obj.items()}
        if isinstance(obj, (list, tuple, set)):
            return [convert_numpy_types(item) for item in obj]
        if hasattr(obj, 'tolist'):  # numpy scalar or array of any size
            return obj.tolist()
        if hasattr(obj, 'item'):  # other scalar wrappers exposing .item()
            return obj.item()
        return obj

    # Convert dataset
    json_dataset = convert_numpy_types(preference_dataset)

    # Save as JSON
    json_filename = f"preference_dataset_adaptive_{'00000'}.json"
    with open(json_filename, 'w', encoding='utf-8') as f:
        json.dump(json_dataset, f, indent=2, ensure_ascii=False)

    print(f"Preference dataset saved as JSON: {json_filename}")

    # Show sample
    if preference_dataset:
        sample_question = next(iter(preference_dataset))
        sample_data = preference_dataset[sample_question]
        print(f"\nSample question: {sample_question[:100]}...")
        print(f"Number of hops: {len(sample_data['hops'])}")

        hops = sample_data['hops']
        if hops:
            first_hop = hops['hop_0'] if 'hop_0' in hops else next(iter(hops.values()))
            print(f"First hop query: {first_hop['query']}")
            print(f"First hop relevance: {first_hop['relevance']}")
            # Hop records store the reward as "query_reward"; "reward" is
            # accepted as well for datasets that use that key.
            reward = first_hop.get('query_reward', first_hop.get('reward'))
            if reward is not None:
                print(f"First hop reward: {reward:.3f}")

    return json_filename

# Use the dataset built earlier in this session when there is one; fall back to
# a checkpoint on disk only otherwise. Loading unconditionally would replace a
# freshly created dataset with None whenever the checkpoint file is missing.
if globals().get('preference_dataset'):
    dataset_stats = globals().get('creation_stats', {})
else:
    preference_dataset, dataset_stats = load_preference_dataset()

if preference_dataset:
    json_file = convert_and_save_dataset(preference_dataset, dataset_stats)
else:
    print("No preference dataset available. Run the creation process first.")

# Training Data Preparation

In [None]:
# Output file for the hop decision preference dataset.
# The run timestamp is made filename-friendly: spaces become "_" and colons "-".
timestamp = CURRENT_TIME_STAMP.translate(str.maketrans(" :", "_-"))
output_file = f"hop_decision_preference_dataset_{timestamp}.json"

def prepare_hop_decision_training_data(preference_dataset):
    """Transform preference dataset into hop decision training format

    One training example is produced per recorded hop. Field lookups follow the
    hop records built by process_question_hops ("query_reward",
    "current_context", "relevance_decision") and fall back to the older names
    ("reward", "context") when those are present instead.

    Args:
        preference_dataset: Raw preference dataset (question -> {"hops": {...}});
            None or empty yields an empty list

    Returns:
        List of training examples for hop decision model
    """
    training_data = []

    if not preference_dataset:
        return training_data

    for question, data in preference_dataset.items():
        for hop_key, hop_data in data.get("hops", {}).items():
            reward = hop_data["query_reward"] if "query_reward" in hop_data else hop_data["reward"]
            training_example = {
                "question": question,
                "context": hop_data.get("context", hop_data.get("current_context", "")),
                "retrieved_context": hop_data["retrieved_context"],
                "query": hop_data["query"],
                "relevance": hop_data["relevance"],
                "reward": reward,
                # A hop without the flag did not end the episode
                "should_continue": not hop_data.get("stop_hopping", False),
                # Hop keys look like "hop_<n>"
                "hop_number": int(hop_key.split("_")[1]),
                # Classifier decision recorded for the hop; "yes" when absent
                "action_taken": hop_data.get("relevance_decision", "yes"),
            }
            training_data.append(training_example)

    return training_data

def save_training_data(training_data, prefix="hop_decision"):
    """Save training data to JSON file

    The file is named "<prefix>_training_data_<CURRENT_TIMESTAMP>.json" and is
    written to the working directory; summary statistics are printed afterwards.

    Args:
        training_data: List of training examples, each with "should_continue"
            and "reward" entries
        prefix: Leading part of the output file name

    Returns:
        Name of the file written
    """
    filename = f"{prefix}_training_data_{CURRENT_TIMESTAMP}.json"

    with open(filename, "w", encoding='utf-8') as f:
        json.dump(training_data, f, indent=2, ensure_ascii=False)

    # Calculate statistics
    total_examples = len(training_data)
    continue_decisions = sum(1 for item in training_data if item["should_continue"])
    stop_decisions = total_examples - continue_decisions
    positive_rewards = sum(1 for item in training_data if item["reward"] > 0)
    # An empty dataset has no meaningful average; report 0.0 instead of dividing by zero
    avg_reward = (
        sum(item['reward'] for item in training_data) / total_examples
        if total_examples else 0.0
    )

    print(f"Training data saved: {filename}")
    print(f"Statistics:")
    print(f"  Total examples: {total_examples}")
    print(f"  Continue decisions: {continue_decisions}")
    print(f"  Stop decisions: {stop_decisions}")
    print(f"  Positive rewards: {positive_rewards}")
    print(f"  Average reward: {avg_reward:.3f}")

    return filename

# Build, save and summarise the hop decision data — only when a preference
# dataset is actually available (it is None when neither the creation cell nor
# the checkpoint loader produced one).
if globals().get('preference_dataset'):
    # Transform data for hop decision training
    hop_decision_dataset = prepare_hop_decision_training_data(preference_dataset)

    # Save the hop decision dataset
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(hop_decision_dataset, f, indent=2)

    print(f"Hop decision preference dataset saved to {output_file}")

    # Display statistics
    continue_decisions = sum(1 for item in hop_decision_dataset if item["should_continue"])
    stop_decisions = len(hop_decision_dataset) - continue_decisions
    positive_rewards = sum(1 for item in hop_decision_dataset if item["reward"] > 0)
    # Guard the average: a dataset whose questions have no hops yields no examples
    average_reward = (
        sum(item['reward'] for item in hop_decision_dataset) / len(hop_decision_dataset)
        if hop_decision_dataset else 0.0
    )

    print(f"\nHop Decision Dataset Statistics:")
    print(f"- Total decisions: {len(hop_decision_dataset)}")
    print(f"- Continue decisions: {continue_decisions}")
    print(f"- Stop decisions: {stop_decisions}")
    print(f"- Positive reward decisions: {positive_rewards}")
    print(f"- Average reward: {average_reward:.3f}")

    # The same examples are also written under the run-timestamped name used by
    # save_training_data; reuse the list instead of rebuilding it.
    hop_training_data = hop_decision_dataset
    if hop_training_data:
        training_data_file = save_training_data(hop_training_data)
    else:
        print("No hop decision examples to save")
else:
    print("No preference dataset available for training data preparation")

# Summary

This notebook implements adaptive hop training for multi-hop retrieval with the following components:

## 1. Model Setup
- **Query Generator**: Flan-T5 for generating search queries
- **Retrieval Scorer**: ColBERT for document ranking
- **Relevance Classifier**: RoBERTa for hop decision making

## 2. Preference Dataset Creation
- Multi-hop retrieval simulation with reward-based learning
- Ground truth relevance evaluation
- Reward calculation based on:
  - Correct decisions: +1.0 (accept relevant) / +0.5 (reject irrelevant)
  - Incorrect decisions: -1.0 (accept irrelevant) / -10.0 (reject relevant)
  - Temporal discounting and hop penalties

## 3. Training Data Generation
- Hop decision training examples
- Query generation preference pairs
- Ready for downstream training with RL/preference learning

## Key Configuration
- **Target hops**: 2 relevant retrievals per question
- **Max hops**: 10 maximum attempts
- **Discount factor**: 0.9 for temporal rewards
- **Dataset**: HotpotQA (5000 train, 1000 val)

## Outputs
- `preference_dataset_adaptive_[timestamp].json`: Raw preference dataset
- `hop_decision_training_data_[timestamp].json`: Training examples
- Model checkpoints in `checkpoints/` directory

## Next Steps
1. Train relevance classifier on generated examples
2. Fine-tune query generator with preference learning (DPO/IPO)
3. Evaluate on multi-hop QA benchmarks