## Step 1: Setup


In [1]:
!nvidia-smi

Thu Oct 30 23:11:31 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.83                 Driver Version: 576.83         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060      WDDM  |   00000000:01:00.0 Off |                  N/A |
| 36%   35C    P8              8W /  170W |     125MiB /  12288MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# === CONFIGURATION (12GB GPU OPTIMIZED) ===

# API Keys (set if available, else will use Ollama)
GEMINI_API_KEY = None  # Set to your API key or leave None for Ollama

# Data paths
DATA_DIR = "data"
MSMARCO_DIR = f"{DATA_DIR}/msmarco"
TYDI_DIR = f"{DATA_DIR}/tydi"
MMARCO_DIR = f"{DATA_DIR}/mmarco/beir"

# Model directories
MODEL_DIR = "./models"
BASE_MODEL = "bert-base-multilingual-cased"

# Training configuration for 12GB GPU
USE_MIXED_PRECISION = True   # FP16 to save memory, but optional with 12GB
GRADIENT_ACCUMULATION_STEPS = 2  # Can use higher batch size, so less accumulation needed
MAX_SEQ_LENGTH = 384         # Safely increase; even 512 should work for most 12GB cards
TRAIN_BATCH_SIZE = 8         # You can set 8, test up to 12 if memory allows

# Sample sizes for development (set to None for full dataset)
DEV_MODE = True              # Set False for full training
DEV_SAMPLE_SIZE = 100 if DEV_MODE else None  # Optionally use more for DEV, or set to full

# Languages for multilingual training
TYDI_LANGUAGES = [
    "arabic", "bengali", "finnish"
]
# , "indonesian", "japanese",
#     "korean", "russian", "swahili", "telugu", "thai"

MMARCO_LANGUAGES = [
    "arabic", "chinese", "dutch", "french", "german",
    "hindi", "indonesian", "italian", "japanese", "portuguese",
    "russian", "spanish", "vietnamese"
]

print("‚úì Configuration loaded for 12GB GPU")
print(f"  Batch size: {TRAIN_BATCH_SIZE}, Max seq: {MAX_SEQ_LENGTH}")
print(f"  Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}")
print(f"  Development Mode: {DEV_MODE} ({DEV_SAMPLE_SIZE if DEV_MODE else 'FULL'})")


‚úì Configuration loaded for 12GB GPU
  Batch size: 8, Max seq: 384
  Gradient accumulation: 2
  Development Mode: True (100)


In [None]:
# Run once to install required packages

import subprocess
import sys

def install_packages():
    packages = [
        "transformers",
        "datasets",
        "pandas",
        "tqdm",
        "simpletransformers",
        "faiss-cpu",  # Use faiss-cpu for 4GB GPU, or faiss-gpu if sufficient
        "rank-bm25",
        "sentence-transformers",
        "torch",
    ]
    
    for package in packages:
        try:
            __import__(package.replace('-', '_'))
            print(f"‚úì {package} already installed")
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])

install_packages()

# For Ollama (if not using Gemini API)
# Install separately: https://ollama.ai/download
# Then: ollama pull llama3.2:3b
print("\n‚úì All dependencies installed")

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


‚úì transformers already installed
‚úì datasets already installed
‚úì pandas already installed
‚úì tqdm already installed
‚úì simpletransformers already installed
Installing faiss-cpu...
Installing rank-bm25...
‚úì sentence-transformers already installed
‚úì torch already installed

‚úì All dependencies installed


In [3]:
import os
import json
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
from typing import List, Dict, Tuple
from dataclasses import dataclass

import torch
from datasets import load_dataset
from transformers import set_seed

# Set seeds for reproducibility
set_seed(42)
np.random.seed(42)
random.seed(42)

# Create directories
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs("./results", exist_ok=True)
os.makedirs("./logs", exist_ok=True)

print("‚úì Libraries imported")
print(f"  PyTorch version: {torch.__version__}")
print(f"  CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"  GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


‚úì Libraries imported
  PyTorch version: 2.5.1
  CUDA available: True
  GPU Memory: 12.88 GB


In [4]:
import pandas as pd

print("=== Loading MS MARCO Training Data ===")

# Load training data
msmarco_train = pd.read_csv(
    f"{MSMARCO_DIR}/msmarco-train.tsv",
    sep="\t",
    nrows=DEV_SAMPLE_SIZE if DEV_MODE else None
)

# CRITICAL: Rename columns to match BEIR format requirements
msmarco_train = msmarco_train.rename(columns={
    "query": "query_text",
    "positive_passage": "gold_passage",
    "negative_passage": "hard_negative"
})

print(f"‚úì MS MARCO Train: {len(msmarco_train):,} samples")
print("Columns:", msmarco_train.columns.tolist())
print("\nSample:")
print(msmarco_train.head(2))

# Statistics
print("\nStatistics:")
print(f"  Avg query length: {msmarco_train['query_text'].str.len().mean():.1f} chars")
print(f"  Avg passage length: {msmarco_train['gold_passage'].str.len().mean():.1f} chars")


=== Loading MS MARCO Training Data ===
‚úì MS MARCO Train: 100 samples
Columns: ['query_text', 'gold_passage', 'hard_negative']

Sample:
                                          query_text  \
0                         what are the liberal arts?   
1  what is the mechanism of action of fibrinolyti...   

                                        gold_passage  \
0  liberal arts. 1. the academic course of instru...   
1  Bailli√É¬®re's Clinical Haematology. 6 Mechanism...   

                                       hard_negative  
0  Liberal Education: An approach to college lear...  
1  Be able to diagram the coagulation and fibrino...  

Statistics:
  Avg query length: 34.5 chars
  Avg passage length: 362.6 chars


In [5]:
import os
import pandas as pd
from tqdm.auto import tqdm

print("\n=== Loading Mr. TyDi (BEIR Format) ===")

tydi_data = []

for lang in TYDI_LANGUAGES:
    base_dir = f"{TYDI_DIR}/{lang}"
    queries_file = os.path.join(base_dir, "queries.jsonl")
    corpus_file = os.path.join(base_dir, "corpus.jsonl")
    qrels_file = os.path.join(base_dir, "qrels/test.tsv")

    # Check required files
    if (os.path.exists(queries_file) and 
        os.path.exists(corpus_file) and 
        os.path.exists(qrels_file)):
        
        queries_df = pd.read_json(queries_file, lines=True)
        corpus_df = pd.read_json(corpus_file, lines=True)
        qrels_df = pd.read_csv(qrels_file, sep='\t')

        # Merge queries and corpus with qrels
        merged = qrels_df.merge(
            queries_df.rename(columns={"_id": "query-id"}),
            on="query-id"
        ).merge(
            corpus_df.rename(columns={"_id": "corpus-id", "text": "gold_passage"}),  # Changed here
            on="corpus-id"
        )

        # CRITICAL: Rename to BEIR format columns
        df = merged[['text', 'gold_passage']].rename(columns={'text': 'query_text'})  # Changed here
        df = df.dropna()

        # Sample if dev mode
        if DEV_SAMPLE_SIZE:
            df = df.sample(min(len(df), 100), random_state=42)
        
        tydi_data.append(df)
        print(f"  ‚úì {lang}: {len(df):,}")
    else:
        print(f"  ‚ö† Missing files for {lang}")

# Combine all languages
tydi_combined = pd.concat(tydi_data, ignore_index=True) if tydi_data else pd.DataFrame(columns=["query_text", "gold_passage"])

print(f"\n‚úì Total Mr. TyDi: {len(tydi_combined):,} samples")
print("Columns:", tydi_combined.columns.tolist())
print("\nSample:")
print(tydi_combined.head(2))



=== Loading Mr. TyDi (BEIR Format) ===
  ‚úì arabic: 100
  ‚úì bengali: 100
  ‚úì finnish: 100

‚úì Total Mr. TyDi: 300 samples
Columns: ['query_text', 'gold_passage']

Sample:
                   query_text  \
0   ŸÖŸÜ ŸÖÿÆÿ™ÿ±ÿπ ÿ≠ÿ®Ÿàÿ® ŸÖŸÜÿπ ÿßŸÑÿ≠ŸÖŸÑ ÿü   
1  ŸÖÿ™Ÿâ ÿ™ÿ£ÿ≥ÿ≥ÿ™ ÿßŸÑÿ¨ÿßŸÖÿπÿ© ÿßŸÑÿπÿ±ÿ®Ÿäÿ©ÿü   

                                        gold_passage  
0  ÿßŸÑÿØŸÉÿ™Ÿàÿ± ÿ¨ÿ±Ÿäÿ¨Ÿàÿ±Ÿä ÿ®ŸÜŸÉŸàÿ≥ (22 ÿ£ÿ∫ÿ≥ÿ∑ÿ≥ 1967 9 ÿ£ÿ®ÿ±ŸäŸÑ 1...  
1  ŸàÿπŸÜÿØŸÖÿß ÿßÿ¨ÿ™ŸÖÿπÿ™ ŸÑÿ¨ŸÜÿ© ÿ™ÿ≠ÿ∂Ÿäÿ±Ÿäÿ© ŸÖŸÜ ŸÖŸÖÿ´ŸÑŸäŸÜ ÿπŸÜ ŸÉŸÑ ŸÖŸÜ ...  


In [35]:
def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Remove nulls, duplicates, and invalid samples"""
    initial_size = len(df)
    
    # Remove nulls
    df = df.dropna()
    
    # Remove duplicates
    df = df.drop_duplicates()
    
    # Remove empty strings
    df = df[
        (df['query_text'].str.strip() != '') & 
        (df['gold_passage'].str.strip() != '')
    ]
    
    # Length constraints (for 12GB GPU)
    df = df[
        (df['query_text'].str.len() >= 10) &
        (df['query_text'].str.len() <= 512) &
        (df['gold_passage'].str.len() >= 20) &
        (df['gold_passage'].str.len() <= 2048)
    ]
    
    # Remove if negative == positive
    if 'hard_negative' in df.columns:
        df = df[df['hard_negative'] != df['gold_passage']]
    
    print(f"  Cleaned: {initial_size:,} ‚Üí {len(df):,} ({len(df)/initial_size*100:.1f}% retained)")
    
    return df.reset_index(drop=True)

print("=== Cleaning Datasets ===")

print("MS MARCO Train:")
msmarco_train = clean_dataset(msmarco_train)

print("\nMr. TyDi:")
tydi_combined = clean_dataset(tydi_combined)

print("\n‚úì Cleaning complete")
print(f"\nFinal counts:")
print(f"  MS MARCO: {len(msmarco_train):,}")
print(f"  Mr. TyDi: {len(tydi_combined):,}")


=== Cleaning Datasets ===
MS MARCO Train:
  Cleaned: 100 ‚Üí 100 (100.0% retained)

Mr. TyDi:
  Cleaned: 298 ‚Üí 298 (100.0% retained)

‚úì Cleaning complete

Final counts:
  MS MARCO: 100
  Mr. TyDi: 298


In [36]:
# Prepare data in format needed for DPR training

@dataclass
class TrainingExample:
    query: str
    positive: str
    negatives: List[str]  # Will be populated by sampling methods

def prepare_training_data(df: pd.DataFrame, has_negatives: bool = True) -> List[TrainingExample]:
    """Convert DataFrame to training examples"""
    examples = []
    
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Preparing"):
        example = TrainingExample(
            query=row['query_text'],
            positive=row['gold_passage'],
            negatives=[row['negative_passage']] if has_negatives and 'negative_passage' in row else []
        )
        examples.append(example)
    
    return examples

print("=== Preparing Training Examples ===")

msmarco_train_examples = prepare_training_data(msmarco_train, has_negatives=True)
tydi_train_examples = prepare_training_data(tydi_combined, has_negatives=False)

print(f"‚úì MS MARCO: {len(msmarco_train_examples):,} examples")
print(f"‚úì TyDi: {len(tydi_train_examples):,} examples")

# Save to disk for later use
import pickle

with open('./data_processed.pkl', 'wb') as f:
    pickle.dump({
        'msmarco_train': msmarco_train_examples,
        'tydi_train': tydi_train_examples
    }, f)

print("\n‚úì Data saved to data_processed.pkl")

=== Preparing Training Examples ===


Preparing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 264624.86it/s]
Preparing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 298/298 [00:00<00:00, 31810.61it/s]

‚úì MS MARCO: 100 examples
‚úì TyDi: 298 examples

‚úì Data saved to data_processed.pkl





In [37]:
# Final statistics

print("=" * 60)
print("DATASET SUMMARY")
print("=" * 60)

print("\nüìä MS MARCO (English):")
print(f"  Training samples: {len(msmarco_train_examples):,}")
print(f"  Has pre-mined negatives: Yes")

print("\nüìä Mr. TyDi (Multilingual):")
print(f"  Total samples: {len(tydi_train_examples):,}")
print(f"  Languages: {len(TYDI_LANGUAGES)}")
print(f"  Has pre-mined negatives: No (will generate)")

print("\nüìä Configuration:")
print(f"  Max sequence length: {MAX_SEQ_LENGTH}")
print(f"  Mixed precision: {USE_MIXED_PRECISION}")
print(f"  Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}")

print("\n‚úÖ Phase 1 Complete: Data Preparation")
print("=" * 60)

DATASET SUMMARY

üìä MS MARCO (English):
  Training samples: 100
  Has pre-mined negatives: Yes

üìä Mr. TyDi (Multilingual):
  Total samples: 298
  Languages: 3
  Has pre-mined negatives: No (will generate)

üìä Configuration:
  Max sequence length: 384
  Mixed precision: True
  Gradient accumulation: 2

‚úÖ Phase 1 Complete: Data Preparation


## Phase 2

In [38]:
# Install BM25 for negative sampling
import subprocess
import sys

try:
    from rank_bm25 import BM25Okapi
    print("‚úì rank_bm25 already installed")
except ImportError:
    print("Installing rank_bm25...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "rank-bm25"])
    from rank_bm25 import BM25Okapi
    print("‚úì rank_bm25 installed")

# Load processed data
import pickle

with open('./data_processed.pkl', 'rb') as f:
    data = pickle.load(f)
    msmarco_train_examples = data['msmarco_train']
    tydi_train_examples = data['tydi_train']

print(f"‚úì Loaded {len(msmarco_train_examples):,} MS MARCO examples")
print(f"‚úì Loaded {len(tydi_train_examples):,} TyDi examples")

‚úì rank_bm25 already installed
‚úì Loaded 100 MS MARCO examples
‚úì Loaded 298 TyDi examples


In [39]:
# BM25-based hard negative mining

from rank_bm25 import BM25Okapi
from typing import List
import numpy as np

class BM25NegativeSampler:
    """Mine hard negatives using BM25"""
    
    def __init__(self, corpus: List[str]):
        print("Building BM25 index...")
        # Tokenize corpus
        tokenized_corpus = [doc.lower().split() for doc in corpus]
        self.bm25 = BM25Okapi(tokenized_corpus)
        self.corpus = corpus
        print(f"‚úì BM25 index built with {len(corpus):,} documents")
    
    def get_hard_negatives(self, query: str, positive_passage: str, top_k: int = 100, n_negatives: int = 1) -> List[str]:
        """Get hard negatives for a query"""
        # Tokenize query
        tokenized_query = query.lower().split()
        
        # Get top-k candidates from BM25
        scores = self.bm25.get_scores(tokenized_query)
        top_indices = np.argsort(scores)[-top_k:][::-1]
        
        # Filter out positive passage and select negatives
        negatives = []
        for idx in top_indices:
            candidate = self.corpus[idx]
            # Skip if it's the positive passage
            if candidate != positive_passage and candidate not in negatives:
                negatives.append(candidate)
            if len(negatives) >= n_negatives:
                break
        
        # If not enough negatives, add random ones
        while len(negatives) < n_negatives:
            random_idx = np.random.randint(0, len(self.corpus))
            candidate = self.corpus[random_idx]
            if candidate != positive_passage and candidate not in negatives:
                negatives.append(candidate)
        
        return negatives[:n_negatives]

# Build corpus from MS MARCO
print("\n=== Building BM25 Corpus ===")
all_passages = set()

for example in tqdm(msmarco_train_examples, desc="Collecting passages"):
    all_passages.add(example.positive)
    all_passages.update(example.negatives)

corpus_list = list(all_passages)
print(f"‚úì Corpus size: {len(corpus_list):,} unique passages")

# Initialize BM25 sampler
bm25_sampler = BM25NegativeSampler(corpus_list)


=== Building BM25 Corpus ===


Collecting passages: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<?, ?it/s]

‚úì Corpus size: 100 unique passages
Building BM25 index...
‚úì BM25 index built with 100 documents





In [40]:
# Mine hard negatives for training examples that don't have them

print("\n=== Mining Hard Negatives with BM25 ===")

# For TyDi examples (no pre-existing negatives)
print("\nMining for TyDi examples...")
for example in tqdm(tydi_train_examples[:min(len(tydi_train_examples), 500)], desc="TyDi"):
    if len(example.negatives) == 0:
        hard_negs = bm25_sampler.get_hard_negatives(
            example.query, 
            example.positive, 
            top_k=100, 
            n_negatives=1
        )
        example.negatives = hard_negs

# For MS MARCO examples (already have negatives, but we can add more)
print("\nAdding BM25 negatives to MS MARCO examples (first 100 for demo)...")
for example in tqdm(msmarco_train_examples[:100], desc="MS MARCO"):
    # Add one more hard negative from BM25
    bm25_negs = bm25_sampler.get_hard_negatives(
        example.query,
        example.positive,
        top_k=100,
        n_negatives=1
    )
    # Avoid duplicates
    for neg in bm25_negs:
        if neg not in example.negatives:
            example.negatives.append(neg)

print("\n‚úì Hard negative mining complete")


=== Mining Hard Negatives with BM25 ===

Mining for TyDi examples...


TyDi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 298/298 [00:00<00:00, 7780.91it/s]



Adding BM25 negatives to MS MARCO examples (first 100 for demo)...


MS MARCO: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 9021.75it/s]


‚úì Hard negative mining complete





In [41]:
def convert_to_training_format(examples: List, limit: int = None) -> pd.DataFrame:
    """Convert training examples to DataFrame for SimpleDPR"""
    data = []
    
    for example in (examples[:limit] if limit else examples):
        for negative in example.negatives:
            data.append({
                "query_text": example.query,
                "gold_passage": example.positive,
                "hard_negative": negative
            })
    
    return pd.DataFrame(data)

print("Preparing Training DataFrames")

train_size = 5000 if DEV_MODE else None
msmarco_train_df = convert_to_training_format(msmarco_train_examples, limit=train_size)

print(f"‚úì MS MARCO training: {len(msmarco_train_df):,} triplets")
print("Columns:", msmarco_train_df.columns.tolist())
print("\nSample:")
print(msmarco_train_df.head(2))


Preparing Training DataFrames
‚úì MS MARCO training: 100 triplets
Columns: ['query_text', 'gold_passage', 'hard_negative']

Sample:
                                          query_text  \
0                         what are the liberal arts?   
1  what is the mechanism of action of fibrinolyti...   

                                        gold_passage  \
0  liberal arts. 1. the academic course of instru...   
1  Bailli√É¬®re's Clinical Haematology. 6 Mechanism...   

                                       hard_negative  
0  Atrophy vs dystrophy. What are atrophy and dys...  
1  Labor Day: What it Means. Labor Day, the first...  


In [42]:
from simpletransformers.retrieval import RetrievalModel, RetrievalArgs
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

print("=== Configuring DPR Model ===")

model_args = RetrievalArgs()

# CRITICAL: Data format must be 'beir'
model_args.data_format = "beir"

# Data processing
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.use_cached_eval_features = False
model_args.use_hf_datasets = True

# Model architecture
model_args.include_title = False  # MS MARCO doesn't use titles
model_args.max_seq_length = 256   # Research paper setting

# Training hyperparameters (from research paper)
model_args.num_train_epochs = 5   # Use 40 for full training
model_args.train_batch_size = 8
model_args.learning_rate = 1e-6
model_args.warmup_steps = 5000
model_args.save_steps = 300000

# CRITICAL: Hard negatives enabled for Phase 1
model_args.hard_negatives = True

# CRITICAL: Evaluation disabled during Phase 1 pretraining
model_args.evaluate_during_training = False
model_args.save_model_every_epoch = False

# Hardware optimization
model_args.n_gpu = 1
model_args.fp16 = USE_MIXED_PRECISION
model_args.dataloader_num_workers = 4

# ANCE disabled for Phase 1
model_args.ance_training = False

# Output directory
model_args.output_dir = f"{MODEL_DIR}/DPR-BM-msmarco"

print("‚úì Model configuration complete")
print(f"  Data format: {model_args.data_format}")
print(f"  Hard negatives: {model_args.hard_negatives}")
print(f"  Evaluation during training: {model_args.evaluate_during_training}")
print(f"  Epochs: {model_args.num_train_epochs}")
print(f"  Batch size: {model_args.train_batch_size}")
print(f"  Learning rate: {model_args.learning_rate}")


=== Configuring DPR Model ===
‚úì Model configuration complete
  Data format: beir
  Hard negatives: True
  Evaluation during training: False
  Epochs: 5
  Batch size: 8
  Learning rate: 1e-06


In [43]:
# Cell 13.5: Aggressive GPU Memory Cleanup

import torch
import gc

def clear_gpu_memory():
    """Aggressively clear GPU memory"""
    if torch.cuda.is_available():
        # Clear PyTorch cache
        torch.cuda.empty_cache()
        
        # Force garbage collection
        gc.collect()
        
        # Clear all variables from previous runs
        import sys
        for obj in gc.get_objects():
            try:
                if torch.is_tensor(obj):
                    del obj
            except:
                pass
        
        # Final cleanup
        torch.cuda.empty_cache()
        gc.collect()
        
        # Report memory
        allocated = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9
        total = torch.cuda.get_device_properties(0).total_memory / 1e9
        
        print("‚úì GPU memory cleared")
        print(f"  Allocated: {allocated:.2f}GB")
        print(f"  Reserved: {reserved:.2f}GB")
        print(f"  Free: {total - reserved:.2f}GB")
        print(f"  Total: {total:.2f}GB")

# Clear before training
clear_gpu_memory()


  return isinstance(obj, torch.Tensor)


‚úì GPU memory cleared
  Allocated: 4.99GB
  Reserved: 5.27GB
  Free: 7.61GB
  Total: 12.88GB


In [44]:
from multiprocessing import set_start_method

print("="*60)
print("PHASE 1: TRAINING DPR WITH BM25 NEGATIVES")
print("="*60)

# Clear GPU memory
clear_gpu_memory()

# Set multiprocessing method
try:
    set_start_method("spawn")
except RuntimeError:
    pass  # Already set

print(f"\nTraining Configuration:")
print(f"  Dataset: MS MARCO")
print(f"  Training samples: {len(msmarco_train):,}")
print(f"  Epochs: {model_args.num_train_epochs}")
print(f"  Batch size: {model_args.train_batch_size}")
print(f"  Learning rate: {model_args.learning_rate}")
print(f"  Max sequence length: {model_args.max_seq_length}")
print(f"  Hard negatives: {model_args.hard_negatives}")
print(f"  Data format: {model_args.data_format}")
print(f"  Evaluation during training: {model_args.evaluate_during_training}")
print(f"  Save model every epoch: {model_args.save_model_every_epoch}")
print()

import time
start_time = time.time()

try:
    # Train for ALL epochs in one call
    # SimpleDPR handles the epoch loop internally
    dpr_model.train_model(
        msmarco_train,
        eval_set="dev"  # Placeholder, not used since evaluation is disabled
    )
    
    training_time = time.time() - start_time
    
    print("\n" + "="*60)
    print("‚úÖ PHASE 1 TRAINING COMPLETE!")
    print("="*60)
    print(f"Total training time: {training_time/60:.1f} minutes")
    print(f"Model saved to: {model_args.output_dir}")
    
    # Clear GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        print("‚úì GPU memory cleared")
    
except RuntimeError as e:
    if "out of memory" in str(e).lower():
        print("\n" + "="*60)
        print("‚ùå OUT OF MEMORY ERROR")
        print("="*60)
        print("\nTry reducing:")
        print(f"  - train_batch_size (currently: {model_args.train_batch_size})")
        print(f"  - max_seq_length (currently: {model_args.max_seq_length})")
        print(f"  - gradient_accumulation_steps (increase to compensate for smaller batch)")
        
        # Clean up
        torch.cuda.empty_cache()
        gc.collect()
    raise
    
except Exception as e:
    print(f"\n‚ùå Training failed with error:")
    print(f"   {type(e).__name__}: {str(e)}")
    raise

print("\n" + "="*60)
print("PHASE 1 COMPLETE - Ready for Phase 2")
print("="*60)


PHASE 1: TRAINING DPR WITH BM25 NEGATIVES
‚úì GPU memory cleared
  Allocated: 4.99GB
  Reserved: 5.27GB
  Free: 7.61GB
  Total: 12.88GB

Training Configuration:
  Dataset: MS MARCO
  Training samples: 100
  Epochs: 5
  Batch size: 8
  Learning rate: 1e-06
  Max sequence length: 256
  Hard negatives: True
  Data format: beir
  Evaluation during training: False
  Save model every epoch: False


‚ùå Training failed with error:
   ValueError: Output directory (outputs/) already exists and is not empty. Set args.overwrite_output_dir = True to overcome.


ValueError: Output directory (outputs/) already exists and is not empty. Set args.overwrite_output_dir = True to overcome.

In [45]:
print("\n" + "="*60)
print("CREATING BEIR FORMAT DEV/TEST DATASETS")
print("="*60)

import os

# Check if already created
eval_data_path = f"{DATA_DIR}/msmarco_dev_beir.tsv"

if os.path.exists(eval_data_path) and os.path.getsize(eval_data_path) > 100:
    print(f"\n‚úì Loading cached evaluation dataset...")
    msmarco_dev = pd.read_csv(eval_data_path, sep="\t")
    print(f"‚úì Loaded: {len(msmarco_dev):,} query-passage pairs")
else:
    print("\nCreating evaluation dataset from MS MARCO files...")
    
    # Load queries with string dtype for IDs
    print("Loading queries...")
    queries_df = pd.read_csv(
        f"{MSMARCO_DIR}/queries.tsv", 
        sep="\t", 
        names=['query_id', 'title', 'query_text'],
        dtype={'query_id': str},
        low_memory=False
    )
    print(f"‚úì Queries: {len(queries_df):,}")
    queries_df['query_id'] = queries_df['query_id'].astype(str)
    
    # Load corpus with string dtype for IDs
    print("Loading corpus...")
    corpus_df = pd.read_csv(
        f"{MSMARCO_DIR}/corpus.tsv", 
        sep="\t", 
        names=['corpus_id', 'title', 'passage'],
        dtype={'corpus_id': str},
        low_memory=False
    )
    print(f"‚úì Corpus: {len(corpus_df):,}")
    corpus_df['corpus_id'] = corpus_df['corpus_id'].astype(str)
    
    # Load qrels with string dtype for IDs
    print("Loading qrels...")
    qrels_df = pd.read_csv(
        f"{MSMARCO_DIR}/devs.tsv", 
        sep="\t", 
        names=['query_id', 'corpus_id', 'score'],
        dtype={'query_id': str, 'corpus_id': str}
    )
    print(f"‚úì Qrels (Dev): {len(qrels_df):,}")
    qrels_df['query_id'] = qrels_df['query_id'].astype(str)
    qrels_df['corpus_id'] = qrels_df['corpus_id'].astype(str)
    
    print("\nMerging into BEIR format...")
    
    # Merge qrels + queries
    print("  Merging qrels with queries...")
    msmarco_dev = qrels_df.merge(
        queries_df[['query_id', 'query_text']],
        on='query_id',
        how='left'
    )
    print(f"    After merge: {len(msmarco_dev):,}")
    
    # Merge with corpus
    print("  Merging with corpus...")
    msmarco_dev = msmarco_dev.merge(
        corpus_df[['corpus_id', 'passage']].rename(columns={'passage': 'gold_passage'}),
        on='corpus_id',
        how='left'
    )
    print(f"    After merge: {len(msmarco_dev):,}")
    
    # Keep only required columns
    msmarco_dev = msmarco_dev[['query_text', 'gold_passage']].drop_duplicates().dropna()
    
    print(f"\n‚úì Created: {len(msmarco_dev):,} query-passage pairs")
    
    if len(msmarco_dev) > 0:
        msmarco_dev.to_csv(eval_data_path, sep="\t", index=False)
        print(f"‚úì Saved to {eval_data_path}")
    else:
        print("‚ùå Merge failed - no data!")


print(f"\nEvaluation dataset:")
print(f"  Size: {len(msmarco_dev):,}")

if len(msmarco_dev) > 0:
    print(f"  Columns: {msmarco_dev.columns.tolist()}")
    print(f"  Sample query: {msmarco_dev.iloc[0]['query_text'][:60]}...")
    print(f"  Sample passage: {msmarco_dev.iloc[0]['gold_passage'][:60]}...")
    
    # Use for evaluation
    test_data = msmarco_dev.head(50).copy()
    
    print("\n" + "="*60)
    print("EVALUATION DATASET READY")
    print("="*60)
    print(f"\n‚úì Total evaluation data: {len(msmarco_dev):,} query-passage pairs")
    print(f"‚úì Test subset (first 50): {len(test_data):,} samples")
    print(f"‚úì Variable 'msmarco_dev' is now available for evaluation")
    print("‚úì Models can now be evaluated on actual MS MARCO dev set")
else:
    print("‚ùå No evaluation data - skipping evaluation")



CREATING BEIR FORMAT DEV/TEST DATASETS

‚úì Loading cached evaluation dataset...
‚úì Loaded: 7,437 query-passage pairs

Evaluation dataset:
  Size: 7,437
  Columns: ['query_text', 'gold_passage']
  Sample query: how many years did william bradford serve as governor of ply...
  Sample passage: http://en.wikipedia.org/wiki/William_Bradford_(Plymouth_Colo...

EVALUATION DATASET READY

‚úì Total evaluation data: 7,437 query-passage pairs
‚úì Test subset (first 50): 50 samples
‚úì Variable 'msmarco_dev' is now available for evaluation
‚úì Models can now be evaluated on actual MS MARCO dev set


In [17]:
# ============================================================
# Evaluation Function
# ============================================================

def evaluate_dpr_model(model, eval_df, model_name, device, top_k=10, max_samples=None):
    """
    Evaluate DPR model on retrieval task
    
    Args:
        model: RetrievalModel instance
        eval_df: DataFrame with columns ['query_text', 'gold_passage']
        model_name: Name for logging
        device: torch device
        top_k: Recall cutoff (default 10)
        max_samples: Limit samples for speed (None = use all)
    
    Returns:
        Dictionary with metrics
    """
    
    print(f"\n{'='*60}")
    print(f"Evaluating: {model_name}")
    print(f"{'='*60}\n")
    
    # Limit samples if specified
    if max_samples:
        eval_subset = eval_df.head(max_samples).copy()
        print(f"Using {len(eval_subset)} samples (limited from {len(eval_df)})")
    else:
        eval_subset = eval_df.copy()
        print(f"Using all {len(eval_subset)} samples")
    
    mrr_scores = []
    ndcg_scores = []
    recall_1 = []
    recall_5 = []
    recall_10 = []
    
    # Get all passages as corpus
    all_passages = eval_subset['gold_passage'].tolist()
    print(f"Corpus size: {len(all_passages)} passages\n")
    
    # Evaluate each query
    for idx, row in tqdm(eval_subset.iterrows(), total=len(eval_subset), desc="Evaluating"):
        query = row['query_text']
        gold_passage = row['gold_passage']
        
        try:
            with torch.no_grad():
                # Encode query
                query_features = model.query_tokenizer(
                    query, 
                    padding='max_length',
                    truncation=True, 
                    max_length=256, 
                    return_tensors='pt'
                )
                
                query_input_ids = query_features['input_ids'].to(device)
                query_attention_mask = query_features['attention_mask'].to(device)
                
                query_emb = model.query_encoder(
                    input_ids=query_input_ids,
                    attention_mask=query_attention_mask
                )[1].cpu().numpy()
                
                # Score all passages
                passage_scores = []
                for passage in all_passages:
                    passage_features = model.context_tokenizer(
                        passage,
                        padding='max_length',
                        truncation=True,
                        max_length=256,
                        return_tensors='pt'
                    )
                    
                    passage_input_ids = passage_features['input_ids'].to(device)
                    passage_attention_mask = passage_features['attention_mask'].to(device)
                    
                    passage_emb = model.context_encoder(
                        input_ids=passage_input_ids,
                        attention_mask=passage_attention_mask
                    )[1].cpu().numpy()
                    
                    # Cosine similarity
                    score = np.dot(query_emb[0], passage_emb[0]) / (
                        np.linalg.norm(query_emb[0]) * np.linalg.norm(passage_emb[0]) + 1e-8
                    )
                    passage_scores.append(score)
                
                # Rank passages by score
                ranked_idx = np.argsort(passage_scores)[::-1]
                
                # Find rank of gold passage
                gold_rank = len(all_passages) + 1
                for rank, pidx in enumerate(ranked_idx):
                    if all_passages[pidx] == gold_passage:
                        gold_rank = rank + 1
                        break
                
                # Compute metrics
                if gold_rank <= top_k:
                    mrr_scores.append(1.0 / gold_rank)
                else:
                    mrr_scores.append(0.0)
                
                if gold_rank <= top_k:
                    ndcg_scores.append(1.0 / np.log2(gold_rank + 1))
                else:
                    ndcg_scores.append(0.0)
                
                recall_1.append(1.0 if gold_rank <= 1 else 0.0)
                recall_5.append(1.0 if gold_rank <= 5 else 0.0)
                recall_10.append(1.0 if gold_rank <= 10 else 0.0)
        
        except Exception as e:
            continue
    
    # Aggregate metrics
    metrics = {
        "MRR@10": np.mean(mrr_scores) if mrr_scores else 0.0,
        "nDCG@10": np.mean(ndcg_scores) if ndcg_scores else 0.0,
        "Recall@1": np.mean(recall_1) if recall_1 else 0.0,
        "Recall@5": np.mean(recall_5) if recall_5 else 0.0,
        "Recall@10": np.mean(recall_10) if recall_10 else 0.0,
    }
    
    return metrics

In [19]:
print("\n" + "="*60)
print("PHASE 2 END: EVALUATE BM25 BASELINE MODEL")
print("="*60)

import torch
from simpletransformers.retrieval import RetrievalModel, RetrievalArgs
import json

# ============================================================
# Load BM25 Baseline Model (Trained in Phase 2)
# ============================================================

print("\n=== Loading BM25 Baseline Model ===\n")

MODEL_TO_EVAL = "./models/dpr_bm25_baseline_epoch5"
print(f"Loading model from: {MODEL_TO_EVAL}")

# Configure evaluation args
eval_args = RetrievalArgs()
eval_args.data_format = "beir"
eval_args.max_seq_length = 256
eval_args.include_title = False
eval_args.hard_negatives = False
eval_args.fp16 = USE_MIXED_PRECISION
eval_args.eval_batch_size = 8

# Load YOUR trained BM25 baseline model
try:
    dpr_model = RetrievalModel(
        model_type="custom",
        model_name=MODEL_TO_EVAL,
        args=eval_args,
        use_cuda=torch.cuda.is_available()
    )
    print(f"‚úì BM25 Baseline Model loaded from {MODEL_TO_EVAL}")
except Exception as e:
    print(f"‚ùå Error loading model: {e}")
    print("Make sure the model was saved during Phase 2 training")
    raise

# Move to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dpr_model.query_encoder = dpr_model.query_encoder.to(device)
dpr_model.context_encoder = dpr_model.context_encoder.to(device)
print(f"‚úì Model moved to device: {device}")

# ============================================================
# Run Evaluation using the reusable function
# ============================================================

print("\n" + "="*60)
print("EVALUATING BM25 BASELINE ON MS MARCO DEV")
print("="*60)

if 'msmarco_dev' in locals() and len(msmarco_dev) > 0:
    print(f"\nDev set size: {len(msmarco_dev):,}")
    
    # Use the function you defined earlier
    metrics = evaluate_dpr_model(
        dpr_model, 
        msmarco_dev, 
        model_name="BM25 Baseline",
        device=device,
        top_k=10, 
        max_samples=50  # Evaluate on 50 samples for speed
    )
    
    # Display results
    print("\n" + "="*60)
    print("PHASE 2 BASELINE RESULTS")
    print("="*60)
    
    print(f"\nStage: Phase 2 - BM25 Negative Sampling")
    print(f"Model: {MODEL_TO_EVAL}")
    print(f"Dataset: MS MARCO Dev")
    print(f"Samples evaluated: 50 (from {len(msmarco_dev):,} total)")
    
    print(f"\n{'Metric':<20} {'Score':<15}")
    print("-" * 35)
    for metric, score in metrics.items():
        print(f"{metric:<20} {score:<15.4f}")
    
    print("\nüìä This is your BASELINE for comparison with:")
    print("   - Phase 3: LLM-enhanced model")
    print("   - Phase 4: RAG-enhanced model")
    print("   - Phase 5: Final multilingual evaluation")
    
    # Save baseline results
    results_path = f"{MODEL_DIR}/phase2_baseline_results.json"
    with open(results_path, 'w') as f:
        json.dump(metrics, f, indent=2)
    print(f"\n‚úì Baseline results saved to {results_path}")
    
    # Store for later comparison
    phase2_baseline = metrics.copy()
    print("\n‚úì Baseline metrics stored in 'phase2_baseline' variable for Phase 3 comparison")
    
else:
    print("‚ö† msmarco_dev not loaded")
    print("Make sure you ran the 'CREATING BEIR FORMAT DEV/TEST DATASETS' cell first")

# Clear GPU memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("\n‚úì GPU memory cleared")

print("\n" + "="*60)
print("‚úÖ PHASE 2 EVALUATION COMPLETE - BASELINE ESTABLISHED")
print("="*60)



PHASE 2 END: EVALUATE BM25 BASELINE MODEL

=== Loading BM25 Baseline Model ===

Loading model from: ./models/dpr_bm25_baseline_epoch5
‚úì BM25 Baseline Model loaded from ./models/dpr_bm25_baseline_epoch5
‚úì Model moved to device: cuda

EVALUATING BM25 BASELINE ON MS MARCO DEV

Dev set size: 7,437

Evaluating: BM25 Baseline

Using 50 samples (limited from 7437)
Corpus size: 50 passages



Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:28<00:00,  1.76it/s]


PHASE 2 BASELINE RESULTS

Stage: Phase 2 - BM25 Negative Sampling
Model: ./models/dpr_bm25_baseline_epoch5
Dataset: MS MARCO Dev
Samples evaluated: 50 (from 7,437 total)

Metric               Score          
-----------------------------------
MRR@10               0.2167         
nDCG@10              0.2792         
Recall@1             0.1200         
Recall@5             0.4000         
Recall@10            0.4800         

üìä This is your BASELINE for comparison with:
   - Phase 3: LLM-enhanced model
   - Phase 4: RAG-enhanced model
   - Phase 5: Final multilingual evaluation

‚úì Baseline results saved to ./models/phase2_baseline_results.json

‚úì Baseline metrics stored in 'phase2_baseline' variable for Phase 3 comparison

‚úì GPU memory cleared

‚úÖ PHASE 2 EVALUATION COMPLETE - BASELINE ESTABLISHED





In [104]:
# Save training metadata

import json
from datetime import datetime

checkpoint_info = {
    "timestamp": datetime.now().isoformat(),
    "stage": "1_msmarco_baseline",
    "model_path": f"{MODEL_DIR}/dpr_bm25_msmarco_final",
    "base_model": BASE_MODEL,
    "negative_sampling": "BM25",
    "training_samples": len(msmarco_train_df),
    "epochs": model_args.num_train_epochs,
    "batch_size_effective": model_args.train_batch_size * model_args.gradient_accumulation_steps,
    "max_seq_length": model_args.max_seq_length,
    "fp16": model_args.fp16,
}

# Save metadata
with open(f"{MODEL_DIR}/checkpoint_stage1.json", "w") as f:
    json.dump(checkpoint_info, f, indent=2)

print("‚úì Checkpoint info saved")
print("\n" + "="*60)
print("‚úÖ PHASE 2 COMPLETE: Baseline DPR Training")
print("="*60)
print(f"\nüìÅ Model saved at: {MODEL_DIR}/dpr_bm25_msmarco_epoch5")
print(f"üìä Training samples: {len(msmarco_train_df):,}")
print(f"üîß Next: Phase 3 - LLM Integration")

‚úì Checkpoint info saved

‚úÖ PHASE 2 COMPLETE: Baseline DPR Training

üìÅ Model saved at: ./models/dpr_bm25_msmarco_epoch5
üìä Training samples: 100
üîß Next: Phase 3 - LLM Integration


## Phase 3

In [20]:
# === LLM CONFIGURATION (SET THIS FIRST) ===

# Choose your LLM backend
USE_OLLAMA = True  # Set True for local Ollama, False for Gemini API
GEMINI_API_KEY = None  # Set if using Gemini

# Ollama model (small model for 4GB GPU)
OLLAMA_MODEL = "llama3.1:8b-instruct-q4_K_M"  # 3B parameters, ~2GB RAM
OLLAMA_URL = "http://localhost:11434"  # Default Ollama endpoint

# LLM parameters
LLM_TEMPERATURE = 0.1  # Low for consistent classification
LLM_MAX_TOKENS = 50    # Short responses only

print("‚úì LLM Configuration:")
print(f"  Backend: {'Ollama (Local)' if USE_OLLAMA else 'Gemini (API)'}")
print(f"  Model: {OLLAMA_MODEL if USE_OLLAMA else 'gemini-2.0-flash-exp'}")

‚úì LLM Configuration:
  Backend: Ollama (Local)
  Model: llama3.1:8b-instruct-q4_K_M


In [21]:
import subprocess
import sys

def install_if_needed(package):
    """Install package if not already installed"""
    try:
        __import__(package.replace("-", "_"))
        print(f"  ‚úì {package} already installed")
    except ImportError:
        print(f"  Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

print("Installing LLM dependencies...")
packages = ["requests", "ollama"] if USE_OLLAMA else ["google-generativeai"]

for pkg in packages:
    install_if_needed(pkg)

print("‚úì All dependencies installed")


Installing LLM dependencies...
  ‚úì requests already installed
  ‚úì ollama already installed
‚úì All dependencies installed


In [22]:
import requests

if USE_OLLAMA:
    print("Testing Ollama connection...")
    try:
        response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=2)
        if response.status_code == 200:
            models = response.json().get("models", [])
            print(f"‚úì Ollama is running at {OLLAMA_URL}")
            print(f"  Available models: {len(models)}")
            
            # Check if our model is available
            model_names = [m.get("name", "") for m in models]
            if any(OLLAMA_MODEL in name for name in model_names):
                print(f"  ‚úì {OLLAMA_MODEL} is available")
            else:
                print(f"  ‚ö† {OLLAMA_MODEL} not found!")
                print(f"  Run: ollama pull {OLLAMA_MODEL}")
        else:
            print("‚ùå Ollama not responding")
    except Exception as e:
        print(f"‚ùå Ollama not running!")
        print("Steps to start Ollama:")
        print("  1. Download: https://ollama.ai/download")
        print("  2. Run: ollama serve")
        print(f"  3. In another terminal: ollama pull {OLLAMA_MODEL}")


Testing Ollama connection...
‚úì Ollama is running at http://localhost:11434
  Available models: 2
  ‚úì llama3.1:8b-instruct-q4_K_M is available


In [53]:
import ollama
import requests
import json
from typing import Dict, List
from tqdm.auto import tqdm

class LLMHardNegativeClassifier:
    """Classify negatives as HARD or EASY using LLM"""
    
    def __init__(self, use_ollama=True, model=None, apikey=None):
        self.use_ollama = use_ollama
        
        if use_ollama:
            self.model = model or OLLAMA_MODEL
            self.url = OLLAMA_URL
        else:
            import google.generativeai as genai
            genai.configure(api_key=apikey or GEMINI_API_KEY)
            self.model = genai.GenerativeModel("gemini-2.0-flash-exp")
    
    def call_ollama(self, prompt: str) -> str:
        """Call Ollama API"""
        try:
            response = requests.post(
                f"{self.url}/api/generate",
                json={
                    "model": self.model,
                    "prompt": prompt,
                    "stream": False,
                    "options": {
                        "temperature": 0.3,
                        "num_predict": 50  # Increased for scoring
                    }
                },
                timeout=30
            )
            
            if response.status_code == 200:
                return response.json()["response"].strip()
            else:
                return "ERROR"
        except Exception as e:
            print(f"Ollama error: {e}")
            return "ERROR"
    
    def _extract_hardness_score(self, response: str) -> float:
        """Extract hardness score (0-1) from LLM response"""
        
        response_upper = response.upper()
        
        # Check for explicit HARD/EASY classification
        if "HARD" in response_upper:
            if "VERY HARD" in response_upper or "EXTREMELY HARD" in response_upper:
                return 0.9
            elif "MODERATELY HARD" in response_upper or "FAIRLY HARD" in response_upper:
                return 0.7
            else:
                return 0.75  # Default HARD
        
        elif "EASY" in response_upper:
            if "VERY EASY" in response_upper or "EXTREMELY EASY" in response_upper:
                return 0.1
            elif "MODERATELY EASY" in response_upper or "FAIRLY EASY" in response_upper:
                return 0.3
            else:
                return 0.25  # Default EASY
        
        # Try to extract numeric score
        import re
        match = re.search(r'(\d+(?:\.\d+)?)\s*(?:%|out of 100|/100)', response)
        if match:
            score = float(match.group(1))
            if '%' in response or 'out of 100' in response:
                return score / 100.0
            else:
                return min(score, 1.0)
        
        # Default
        return 0.5
    
    def classify_negative(self, query: str, gold_passage: str, negative_passage: str) -> Dict:
        """Classify negative and return hardness score (0-1)"""
        
        # Truncate for context
        query = query[:200]
        gold_passage = gold_passage[:300]
        negative_passage = negative_passage[:300]
        
        prompt = f"""Rate how HARD this negative passage is for training (0-100).

Query: {query}

Gold Passage: {gold_passage}

Negative Passage: {negative_passage}

HARD negatives (70-100):
- Topically related to query
- Use similar keywords/entities
- BUT contain different/incorrect information
- Require semantic understanding to distinguish

EASY negatives (0-30):
- Clearly unrelated to query
- Different topic/domain
- No semantic confusion

Rate this negative as a hardness score from 0-100:"""
        
        if self.use_ollama:
            response = self.call_ollama(prompt)
        else:
            response = self.model.generate_content(prompt).text.strip()
        
        # Extract hardness score (0-1)
        hardness_score = self._extract_hardness_score(response)
        is_hard = hardness_score > 0.5
        
        return {
            "is_hard": is_hard,
            "hardness_score": hardness_score,  # ‚Üê NEW: continuous score
            "classification": "HARD" if is_hard else "EASY",
            "response": response
        }
    
    def classify_batch(self, examples: List[Dict], max_samples: int = None) -> List[Dict]:
        """Classify a batch of negatives"""
        
        samples = examples[:max_samples] if max_samples else examples
        results = []
        
        print(f"\nClassifying {len(samples)} negatives with LLM...")
        
        for example in tqdm(samples, desc="LLM Classification"):
            result = self.classify_negative(
                example['query_text'],
                example['gold_passage'],
                example['hard_negative']
            )
            
            example_with_classification = {
                **example,
                'llm_classification': result['classification'],
                'is_hard': result['is_hard'],
                'hardness_score': result['hardness_score']  # ‚Üê NEW
            }
            results.append(example_with_classification)
        
        # Statistics
        hard_count = sum(1 for r in results if r['is_hard'] == True)
        easy_count = sum(1 for r in results if r['is_hard'] == False)
        avg_score = np.mean([r['hardness_score'] for r in results])
        
        print(f"\n‚úì Classification complete!")
        print(f"  HARD: {hard_count} ({hard_count/len(results)*100:.1f}%)")
        print(f"  EASY: {easy_count} ({easy_count/len(results)*100:.1f}%)")
        print(f"  Average hardness: {avg_score:.2f}")
        
        return results

# Initialize
print("Initializing LLM Classifier...")
llm_classifier = LLMHardNegativeClassifier(
    use_ollama=USE_OLLAMA,
    model=OLLAMA_MODEL if USE_OLLAMA else None,
    apikey=GEMINI_API_KEY
)
print("‚úì LLM Classifier ready")


Initializing LLM Classifier...
‚úì LLM Classifier ready


In [None]:
print("\n" + "="*60)
print("COMPREHENSIVE LLM CLASSIFIER TEST")
print("="*60)

# Create diverse test cases with known difficulty levels

test_cases = [
    {
        "name": "Test 1: EASY - Completely Unrelated",
        "query": "what is photosynthesis",
        "gold": "Photosynthesis is the process by which plants convert light energy into chemical energy in the form of glucose. It occurs in the chloroplasts of plant cells.",
        "negative": "The Great Wall of China is one of the most impressive architectural feats in human history, stretching over 13,000 miles across northern China.",
        "expected": "EASY"
    },
    {
        "name": "Test 2: HARD - Topically Similar but Wrong Answer",
        "query": "what is photosynthesis",
        "gold": "Photosynthesis is the process by which plants convert light energy into chemical energy in the form of glucose.",
        "negative": "Chemosynthesis is the process by which certain organisms use chemical energy instead of light energy to produce organic compounds from carbon dioxide.",
        "expected": "HARD"
    },
    {
        "name": "Test 3: EASY - Different Topic",
        "query": "how do vaccines work",
        "gold": "Vaccines work by introducing a weakened or inactive form of a pathogen to stimulate the immune system to produce antibodies.",
        "negative": "Python is a high-level programming language known for its simple syntax and readability, used in web development, data science, and artificial intelligence.",
        "expected": "EASY"
    },
    {
        "name": "Test 4: HARD - Similar Keywords but Different Meaning",
        "query": "how do vaccines work",
        "gold": "Vaccines work by introducing a weakened or inactive form of a pathogen to stimulate the immune system.",
        "negative": "Antibiotics work by killing bacteria or preventing their growth, interfering with bacterial cell walls or protein synthesis.",
        "expected": "HARD"
    },
    {
        "name": "Test 5: EASY - Sports vs Science",
        "query": "what is diabetes",
        "gold": "Diabetes is a metabolic disorder characterized by high blood sugar levels due to insufficient insulin production or insulin resistance.",
        "negative": "Basketball is a team sport where two teams of five players compete to shoot a ball through the opposing team's elevated hoop.",
        "expected": "EASY"
    },
    {
        "name": "Test 6: HARD - Similar Medical Terms but Different",
        "query": "what is diabetes",
        "gold": "Diabetes is a metabolic disorder characterized by high blood sugar levels due to insufficient insulin production or insulin resistance.",
        "negative": "Hypertension is a condition characterized by elevated blood pressure, which can lead to heart disease and stroke if left untreated.",
        "expected": "HARD"
    },
    {
        "name": "Test 7: EASY - Historical vs Current",
        "query": "what is artificial intelligence",
        "gold": "Artificial intelligence is the field of computer science dedicated to creating intelligent machines capable of performing tasks that typically require human intelligence.",
        "negative": "The Roman Empire was one of the largest and most influential empires in history, lasting over 400 years and spanning three continents.",
        "expected": "EASY"
    },
    {
        "name": "Test 8: HARD - Related ML Concepts",
        "query": "what is artificial intelligence",
        "gold": "Artificial intelligence is the field of computer science dedicated to creating intelligent machines capable of performing tasks that typically require human intelligence.",
        "negative": "Machine learning is a subset of artificial intelligence that focuses on enabling computers to learn and improve from experience without being explicitly programmed.",
        "expected": "HARD"
    }
]

# Run tests
print("\nRunning comprehensive classifier tests...\n")

results_summary = {
    "EASY": {"correct": 0, "incorrect": 0},
    "HARD": {"correct": 0, "incorrect": 0}
}

detailed_results = []

for test in test_cases:
    print(f"\n{'‚îÄ'*70}")
    print(f"{test['name']}")
    print(f"{'‚îÄ'*70}")
    
    print(f"\nQuery: {test['query']}")
    print(f"\nGold Passage (correct answer):")
    print(f"  {test['gold'][:100]}...")
    print(f"\nNegative Passage (candidate):")
    print(f"  {test['negative'][:100]}...")
    
    # Classify
    result = llm_classifier.classify_negative(
        test['query'],
        test['gold'],
        test['negative']
    )
    
    classification = result['classification']
    expected = test['expected']
    is_correct = (classification == expected)
    
    # Display result
    print(f"\nüìä Result:")
    print(f"  Expected: {expected}")
    print(f"  Classified: {classification}")
    print(f"  Status: {'‚úÖ CORRECT' if is_correct else '‚ùå INCORRECT'}")
    print(f"  LLM Response: {result['response'][:80]}...")
    
    # Track statistics
    if expected in results_summary:
        if is_correct:
            results_summary[expected]["correct"] += 1
        else:
            results_summary[expected]["incorrect"] += 1
    
    detailed_results.append({
        "test_name": test['name'],
        "expected": expected,
        "classified": classification,
        "correct": is_correct
    })

# Summary statistics
print(f"\n\n{'='*70}")
print("TEST SUMMARY")
print(f"{'='*70}\n")

total_tests = len(test_cases)
total_correct = sum(r["correct"] for r in results_summary.values())
total_incorrect = sum(r["incorrect"] for r in results_summary.values())
accuracy = (total_correct / total_tests) * 100 if total_tests > 0 else 0

print(f"Total Tests: {total_tests}")
print(f"Correct: {total_correct} ({accuracy:.1f}%)")
print(f"Incorrect: {total_incorrect} ({100-accuracy:.1f}%)")

print(f"\nPer-Category Performance:")
for category, stats in results_summary.items():
    total = stats["correct"] + stats["incorrect"]
    if total > 0:
        cat_accuracy = (stats["correct"] / total) * 100
        print(f"  {category}: {stats['correct']}/{total} ({cat_accuracy:.1f}%)")

# Create results DataFrame
results_df = pd.DataFrame(detailed_results)

print(f"\n\nDetailed Results Table:")
print(results_df.to_string(index=False))

# Evaluation
print(f"\n{'='*70}")
print("CLASSIFIER EVALUATION")
print(f"{'='*70}\n")

if accuracy >= 87.5:  # 7/8 correct
    print("‚úÖ EXCELLENT - Classifier is working correctly!")
    print("   It can distinguish between EASY and HARD negatives reliably.")
elif accuracy >= 75:  # 6/8 correct
    print("‚úÖ GOOD - Classifier is working reasonably well.")
    print("   Some edge cases may need adjustment.")
elif accuracy >= 62.5:  # 5/8 correct
    print("‚ö†Ô∏è  FAIR - Classifier has room for improvement.")
    print("   Consider tuning the LLM temperature or prompt.")
else:
    print("‚ùå POOR - Classifier needs significant improvement.")
    print("   Try using a larger LLM model or different prompt.")

print("\n" + "="*70)



COMPREHENSIVE LLM CLASSIFIER TEST

Running comprehensive classifier tests...


‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
Test 1: EASY - Completely Unrelated
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

Query: what is photosynthesis

Gold Passage (correct answer):
  Photosynthesis is the process by which plants convert light energy into chemical energy in the form ...

Negative Passage (candidate):
  The Great Wall of China is one of the most impressive architectural feats in human history, stretchi...

üìä Result:
  Expected: EASY
  Classified: HARD
  Status: ‚ùå INCORRECT
  LLM Response: I would rate the hardness of this negative passage as **80**.

Her

In [46]:
print("\n" + "="*60)
print("STEP 1: CLASSIFY TRAINING DATA WITH LLM")
print("="*60)

# Classify samples
LLM_CLASSIFY_SIZE = 100 if DEV_MODE else 1000

print(f"\nClassifying {LLM_CLASSIFY_SIZE} negatives...")

# Convert to list of dicts - FIX column names
train_examples = msmarco_train_df.head(LLM_CLASSIFY_SIZE).to_dict('records')

# Classify with LLM
llm_classified = llm_classifier.classify_batch(train_examples, max_samples=LLM_CLASSIFY_SIZE)

# Convert back to DataFrame
llm_classified_df = pd.DataFrame(llm_classified)

print(f"\n‚úì Classification complete!")
print(f"  Total: {len(llm_classified_df):,}")
print(f"  HARD: {llm_classified_df['is_hard'].sum():,}")
print(f"  EASY: {(~llm_classified_df['is_hard']).sum():,}")

# Show sample
print("\nSample classified data:")
print(llm_classified_df[['query_text', 'llm_classification']].head(3))



STEP 1: CLASSIFY TRAINING DATA WITH LLM

Classifying 100 negatives...

Classifying 100 negatives with LLM...


LLM Classification: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [04:19<00:00,  2.59s/it]


‚úì Classification complete!
  HARD: 96 (96.0%)
  EASY: 4 (4.0%)

‚úì Classification complete!
  Total: 100
  HARD: 96
  EASY: 4

Sample classified data:
                                          query_text llm_classification
0                         what are the liberal arts?               HARD
1  what is the mechanism of action of fibrinolyti...               HARD
2                          what is normal plat count               HARD





In [None]:
print("\n" + "="*60)
print("STEP 2: GENERATE LLM HARD NEGATIVES")
print("="*60)

# Filter to keep only hard negatives from classification
hard_negatives_df = llm_classified_df[llm_classified_df['is_hard'] == True].copy()

# Keep only training columns
train_columns = ['query_text', 'gold_passage', 'hard_negative']
hard_negatives_df = hard_negatives_df[train_columns]

print(f"\n‚úì Filtered HARD negatives: {len(hard_negatives_df):,}")

# Generate additional hard negatives to increase dataset
print("\nGenerating additional hard negatives with LLM...")

# Initialize generator
class LLMHardNegativeGenerator:
    """Generate hard negatives using LLM"""
    
    def __init__(self, model: str):
        self.model = model
    
    def generate_negatives(self, query: str, positive_passage: str, num_negatives: int = 1) -> List[str]:
        """Generate hard negatives"""
        
        prompt = f"""Generate {num_negatives} HARD NEGATIVE passage(s) for this query-passage pair.

Query: {query}

Correct Passage: {positive_passage}

Requirements:
1. Topically related to query
2. Similar keywords as correct passage
3. BUT does NOT answer the query
4. 50-150 words each

Output ONLY passages, numbered 1., 2., etc."""
        
        try:
            response = ollama.generate(
                model=self.model,
                prompt=prompt,
                options={
                    "temperature": 0.7,
                    "num_predict": 400
                }
            )
            
            text = response['response'].strip()
            negatives = self._parse_response(text, num_negatives)
            return negatives
            
        except Exception as e:
            print(f"Error: {str(e)}")
            return []
    
    def _parse_response(self, text: str, num_negatives: int) -> List[str]:
        """Parse LLM response"""
        negatives = []
        lines = text.split('\n')
        current_negative = []
        
        for line in lines:
            line = line.strip()
            if line and any(line.startswith(f"{i}.") for i in range(1, 6)):
                if current_negative:
                    negatives.append(' '.join(current_negative))
                    current_negative = []
                line = line.split('.', 1)[1].strip() if '.' in line else line
            
            if line:
                current_negative.append(line)
        
        if current_negative:
            negatives.append(' '.join(current_negative))
        
        return negatives[:num_negatives] if negatives else [text[:500]]

# Initialize generator
llm_generator = LLMHardNegativeGenerator(OLLAMA_MODEL)

# Generate for subset to increase data
GENERATION_SIZE = 200 if DEV_MODE else 500

print(f"Generating {GENERATION_SIZE} samples...")

import time
start_time = time.time()
failed_count = 0
generated_data = []

for idx, row in tqdm(msmarco_train_df.head(GENERATION_SIZE).iterrows(), total=GENERATION_SIZE, desc="Generating"):
    query = row['query_text']
    positive = row['gold_passage']
    
    # Generate 1-2 hard negatives
    hard_negatives = llm_generator.generate_negatives(query, positive, num_negatives=2)
    
    if hard_negatives:
        for neg in hard_negatives:
            generated_data.append({
                'query_text': query,
                'gold_passage': positive,
                'hard_negative': neg
            })
    else:
        failed_count += 1

generated_df = pd.DataFrame(generated_data)

print(f"\n‚úì Generated {len(generated_df):,} new samples")
print(f"  Failed: {failed_count}")
print(f"  Time: {(time.time() - start_time)/60:.1f} minutes")

# Combine classified hard negatives with newly generated ones
final_train_df = pd.concat([hard_negatives_df, generated_df], ignore_index=True)

print(f"\n‚úì Final dataset size: {len(final_train_df):,}")
print(f"  From classification: {len(hard_negatives_df):,}")
print(f"  From generation: {len(generated_df):,}")

# Save
final_train_df.to_csv(f"{DATA_DIR}/llm_classified_generated.tsv", sep="\t", index=False)
print(f"\n‚úì Saved to {DATA_DIR}/llm_classified_generated.tsv")
pickle_path = f"{DATA_DIR}/hard_negatives_df.pkl"

with open(pickle_path, 'wb') as f:
    pickle.dump(final_train_df, f)

print(f"‚úì Saved pickle: {pickle_path}")




STEP 2: GENERATE LLM HARD NEGATIVES

‚úì Filtered HARD negatives: 98

Generating additional hard negatives with LLM...
Generating 200 samples...


Generating:   0%|          | 0/200 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
Generating:   0%|          | 1/200 [00:04<14:41,  4.43s/it]INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
Generating:   1%|          | 2/200 [00:07<12:46,  3.87s/it]INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
Generating:   2%|‚ñè         | 3/200 [00:12<13:24,  4.08s/it]INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
Generating:   2%|‚ñè         | 4/200 [00:16<13:22,  4.09s/it]INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
Generating:   2%|‚ñé         | 5/200 [00:20<13:09,  4.05s/it]INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
Generating:   3%|‚ñé         | 6/200 [00:22<11:21,  3.51s/it]INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


‚úì Generated 200 new samples
  Failed: 0
  Time: 6.5 minutes

‚úì Final dataset size: 298
  From classification: 98
  From generation: 200

‚úì Saved to data/llm_classified_generated.tsv





In [None]:
print("\n" + "="*60)
print("STEP 3: TRAIN DPR WITH LLM-ENHANCED DATA")
print("="*60)

# Clear GPU memory
clear_gpu_memory()

# Configuration
phase3_args = RetrievalArgs()
phase3_args.data_format = "beir"
phase3_args.hard_negatives = True
phase3_args.num_train_epochs = 3
phase3_args.train_batch_size = 8
phase3_args.learning_rate = 5e-7
phase3_args.max_seq_length = 256
phase3_args.output_dir = f"{MODEL_DIR}/dpr_llm_enhanced"
phase3_args.fp16 = USE_MIXED_PRECISION
phase3_args.evaluate_during_training = False
phase3_args.save_model_every_epoch = False
phase3_args.overwrite_output_dir = True
phase3_args.include_title = False  # CRITICAL: No title column in our data

print(f"\nTraining on {len(final_train_df):,} LLM-enhanced samples...")
print(f"  Epochs: {phase3_args.num_train_epochs}")
print(f"  Batch size: {phase3_args.train_batch_size}")
print(f"  Learning rate: {phase3_args.learning_rate}")
print(f"  Include title: {phase3_args.include_title}")

try:
    # Load pretrained model from Phase 1
    print("\nLoading Phase 1 model...")
    
    # Find the latest Phase 1 checkpoint
    phase1_base_path = f"{MODEL_DIR}/dpr_bm25_baseline_epoch5"  # Your latest epoch
    
    if os.path.exists(phase1_base_path):
        print(f"  Found Phase 1 model at: {phase1_base_path}")
        
        # Load with the context encoder path
        dpr_model = RetrievalModel(
            model_type="custom",
            model_name=phase1_base_path,  # Load the full checkpoint
            args=phase3_args,
            use_cuda=torch.cuda.is_available()
        )
        print("‚úì Loaded Phase 1 checkpoint")
    else:
        print(f"  ‚ö† Phase 1 model not found at {phase1_base_path}")
        print("  Initializing fresh model instead...")
        
        dpr_model = RetrievalModel(
            model_type="custom",
            model_name=None,
            context_encoder_name="bert-base-multilingual-cased",
            query_encoder_name="bert-base-multilingual-cased",
            args=phase3_args,
            use_cuda=torch.cuda.is_available()
        )
        print("‚úì Initialized fresh model")
    
    # Verify training data
    print(f"\nTraining data verification:")
    print(f"  Columns: {final_train_df.columns.tolist()}")
    print(f"  Shape: {final_train_df.shape}")
    print(f"  Sample row:")
    print(f"    {final_train_df.iloc[0]}")
    
    # Train
    print(f"\nStarting training...")
    start_time = time.time()
    
    dpr_model.train_model(final_train_df)
    
    training_time = time.time() - start_time
    
    print("\n" + "="*60)
    print("‚úÖ PHASE 3 COMPLETE!")
    print("="*60)
    print(f"Model saved to: {phase3_args.output_dir}")
    print(f"Training time: {training_time/60:.1f} minutes")
    print(f"Samples processed: {len(final_train_df):,}")
    print(f"Avg time per sample: {(training_time/len(final_train_df)):.2f} seconds")
    
    # Clear GPU
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        print("‚úì GPU memory cleared")
        
except Exception as e:
    print(f"‚ùå Training failed: {str(e)}")
    import traceback
    traceback.print_exc()
    raise

print("\n" + "="*60)
print("PHASE 3 SUMMARY")
print("="*60)
print("  ‚úì Classified training data with LLM")
print("  ‚úì Generated additional hard negatives")
print("  ‚úì Trained DPR with LLM-enhanced data")
print("="*60)



STEP 3: TRAIN DPR WITH LLM-ENHANCED DATA
‚úì GPU memory cleared
  Allocated: 1.44GB
  Reserved: 1.61GB
  Free: 11.28GB
  Total: 12.88GB

Training on 298 LLM-enhanced samples...
  Epochs: 3
  Batch size: 8
  Learning rate: 5e-07
  Include title: False

Loading Phase 1 model...
  Found Phase 1 model at: ./models/dpr_bm25_baseline_epoch5
‚úì Loaded Phase 1 checkpoint

Training data verification:
  Columns: ['query_text', 'gold_passage', 'hard_negative']
  Shape: (298, 3)
  Sample row:
    query_text                              what are the liberal arts?
gold_passage     liberal arts. 1. the academic course of instru...
hard_negative    Atrophy vs dystrophy. What are atrophy and dys...
Name: 0, dtype: object

Starting training...


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 298/298 [00:00<00:00, 1492.50 examples/s]
INFO:simpletransformers.retrieval.retrieval_model: Training started
Epoch:   0%|          | 0/3 [00:00<?, ?it/s]INFO:simpletransformers.retrieval.retrieval_model:   Starting fine-tuning.
  scaler = amp.GradScaler()
  with amp.autocast():
  (max_idxs == torch.tensor(labels)).sum().cpu().detach().numpy().item()
Epochs 0/3. Running Loss:   21.7188 Correct count: 0: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 38/38 [01:11<00:00,  1.88s/it]
Epochs 1/3. Running Loss:    7.4645 Correct count: 0: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 38/38 [01:12<00:00,  1.92s/it]
Epochs 2/3. Running Loss:    1.9789 Correct count: 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 38/38 [00:14<00:00,  2.60it/s]
Epoch 3 of 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [02:39<00:00, 53.06s/it]
INFO:simpletransformers.retrieval.retrieval_model:Saving model into ./models/dpr-llm-enhanced
INFO:simpletransformers.retrieval.retrieval_model: Training of ./


‚úÖ PHASE 3 COMPLETE!
Model saved to: ./models/dpr-llm-enhanced
Training time: 2.7 minutes
Samples processed: 298
Avg time per sample: 0.54 seconds
‚úì GPU memory cleared

PHASE 3 SUMMARY
  ‚úì Classified training data with LLM
  ‚úì Generated additional hard negatives
  ‚úì Trained DPR with LLM-enhanced data


In [29]:
print("\n" + "="*60)
print("MODEL EVALUATION ON MS MARCO DEV SET")
print("="*60)

import numpy as np
from tqdm import tqdm

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")

# Model paths
bm25_model_path = f"{MODEL_DIR}/dpr_bm25_baseline_epoch5"
llm_model_path = f"{MODEL_DIR}/dpr_llm_enhanced"

# Load eval args
eval_args = RetrievalArgs()
eval_args.data_format = "beir"
eval_args.max_seq_length = 256
eval_args.include_title = False
eval_args.hard_negatives = False
eval_args.fp16 = USE_MIXED_PRECISION

print(f"\nLoading models...")

# Load models
bm25_model = RetrievalModel(
    model_type="custom",
    model_name=bm25_model_path,
    args=eval_args,
    use_cuda=torch.cuda.is_available()
)
bm25_model.query_encoder = bm25_model.query_encoder.to(device)
bm25_model.context_encoder = bm25_model.context_encoder.to(device)
print(f"‚úì BM25 Model loaded")

clear_gpu_memory()

llm_model = RetrievalModel(
    model_type="custom",
    model_name=llm_model_path,
    args=eval_args,
    use_cuda=torch.cuda.is_available()
)
llm_model.query_encoder = llm_model.query_encoder.to(device)
llm_model.context_encoder = llm_model.context_encoder.to(device)
print(f"‚úì LLM Model loaded")

# ============================================================
# EVALUATE BOTH MODELS 
# ============================================================

try:
    print(f"Evaluation dataset: {len(msmarco_dev):,} query-passage pairs\n")
    
    # Evaluate BM25 Model
    bm25_results = evaluate_dpr_model(
        bm25_model, 
        msmarco_dev, 
        "BM25 Baseline", 
        device,
        max_samples=50  # Evaluate on 50 samples for speed
    )
    
    clear_gpu_memory()
    
    # Evaluate LLM-Enhanced Model
    llm_results = evaluate_dpr_model(
        llm_model, 
        msmarco_dev, 
        "LLM-Enhanced Model", 
        device,
        max_samples=50  # Same 50 samples for fair comparison
    )
    
    # ============================================================
    # Display Results
    # ============================================================
    
    print("\n" + "="*60)
    print("EVALUATION RESULTS - COMPARISON")
    print("="*60)
    
    results_df = pd.DataFrame({
        "Metric": list(bm25_results.keys()),
        "BM25 Baseline": [f"{bm25_results[k]:.4f}" for k in bm25_results.keys()],
        "LLM-Enhanced": [f"{llm_results[k]:.4f}" for k in llm_results.keys()],
        "Improvement %": [
            f"{((llm_results[k] - bm25_results[k])/max(bm25_results[k], 0.0001) * 100):.2f}%"
            for k in bm25_results.keys()
        ]
    })
    
    print("\n" + results_df.to_string(index=False))
    
    # Summary
    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    
    avg_improvement = np.mean([
        ((llm_results[k] - bm25_results[k])/max(bm25_results[k], 0.0001) * 100)
        for k in bm25_results.keys()
    ])
    
    print(f"\nAverage Improvement: {avg_improvement:.2f}%")
    
    if avg_improvement > 0:
        print(f"‚úÖ LLM-Enhanced model is BETTER ({avg_improvement:.2f}% improvement)")
    else:
        print(f"‚ö†Ô∏è  BM25 model performs better ({abs(avg_improvement):.2f}% better)")
    
    # Save results
    import json
    results_summary = {
        "BM25": bm25_results,
        "LLM-Enhanced": llm_results,
        "Improvement %": {
            k: ((llm_results[k] - bm25_results[k])/max(bm25_results[k], 0.0001) * 100)
            for k in bm25_results.keys()
        },
        "Average Improvement %": avg_improvement
    }
    
    results_path = f"{MODEL_DIR}/phase3_comparison_results.json"
    with open(results_path, 'w') as f:
        json.dump(results_summary, f, indent=2)
    print(f"\n‚úì Results saved to {results_path}")
    
    # Clear GPU
    clear_gpu_memory()
    
    print("\n" + "="*60)
    print("‚úÖ PHASE 3 EVALUATION COMPLETE")
    print("="*60)
    
except Exception as e:
    print(f"\n‚ùå Error: {str(e)}")
    import traceback
    traceback.print_exc()



MODEL EVALUATION ON MS MARCO DEV SET

Using device: cuda

Loading models...
‚úì BM25 Model loaded
‚úì GPU memory cleared
  Allocated: 2.86GB
  Reserved: 3.01GB
  Free: 9.87GB
  Total: 12.88GB
‚úì LLM Model loaded
Evaluation dataset: 7,437 query-passage pairs


Evaluating: BM25 Baseline

Using 50 samples (limited from 7437)
Corpus size: 50 passages



Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:38<00:00,  1.31it/s]


‚úì GPU memory cleared
  Allocated: 4.28GB
  Reserved: 4.53GB
  Free: 8.36GB
  Total: 12.88GB

Evaluating: LLM-Enhanced Model

Using 50 samples (limited from 7437)
Corpus size: 50 passages



Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:40<00:00,  1.24it/s]



EVALUATION RESULTS - COMPARISON

   Metric BM25 Baseline LLM-Enhanced Improvement %
   MRR@10        0.2167       0.1125       -48.07%
  nDCG@10        0.2792       0.1426       -48.93%
 Recall@1        0.1200       0.0600       -50.00%
 Recall@5        0.4000       0.1800       -55.00%
Recall@10        0.4800       0.2400       -50.00%

SUMMARY

Average Improvement: -50.40%
‚ö†Ô∏è  BM25 model performs better (50.40% better)

‚úì Results saved to ./models/phase3_comparison_results.json


  return isinstance(obj, torch.Tensor)


‚úì GPU memory cleared
  Allocated: 4.28GB
  Reserved: 4.53GB
  Free: 8.36GB
  Total: 12.88GB

‚úÖ PHASE 3 EVALUATION COMPLETE


## Phase 4 - RAG Integration

In [30]:
# Install FAISS for dense retrieval

import subprocess
import sys

try:
    import faiss
    print("‚úì FAISS already installed")
except ImportError:
    print("Installing FAISS...")
    # Use CPU version for compatibility
    subprocess.check_call([sys.executable, "-m", "pip", "install", "faiss-cpu"])
    import faiss
    print("‚úì FAISS installed")

print(f"  FAISS version: {faiss.__version__}")

‚úì FAISS already installed
  FAISS version: 1.12.0


In [47]:
print("\n" + "="*60)
print("Building RAG Retrieval Index from LLM-Processed Negatives")
print("="*60)

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import pickle
import json

# ============================================================
# LOAD FROM SAVED FILES (No re-processing needed!)
# ============================================================

print("\n=== Loading LLM-Processed Dataset ===\n")

# Try pickle first (faster)
pickle_path = f"{DATA_DIR}/hard_negatives_df.pkl"
tsv_path = f"{DATA_DIR}/llm_classified_generated.tsv"

if os.path.exists(pickle_path):
    print(f"Loading from pickle: {pickle_path}")
    with open(pickle_path, 'rb') as f:
        hard_negatives_df = pickle.load(f)
    print(f"‚úì Loaded: {len(hard_negatives_df):,} samples")

elif os.path.exists(tsv_path):
    print(f"Loading from TSV: {tsv_path}")
    hard_negatives_df = pd.read_csv(tsv_path, sep="\t")
    print(f"‚úì Loaded: {len(hard_negatives_df):,} samples")

else:
    raise FileNotFoundError(
        f"LLM-processed data not found!\n"
        f"Expected: {pickle_path} or {tsv_path}\n"
        f"Please run Phase 3 first and save the data."
    )

print(f"  Columns: {hard_negatives_df.columns.tolist()}")
print(f"  Sample query: {hard_negatives_df.iloc[0]['query_text'][:60]}...")

# ============================================================
# Build Corpus from LLM-Processed Negatives
# ============================================================

print("\n=== Building Passage Corpus ===\n")

# Use hard_negative column (already classified as HARD by LLM)
corpus_passages = hard_negatives_df['hard_negative'].unique().tolist()
print(f"‚úì Corpus size: {len(corpus_passages):,} unique HARD negatives")

# ============================================================
# Encode Corpus Using Sentence Transformers
# ============================================================

print("\n=== Encoding Passage Corpus ===\n")

rag_encoder = SentenceTransformer(BASE_MODEL)
rag_encoder.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rag_encoder = rag_encoder.to(device)
print(f"‚úì Encoder loaded: {BASE_MODEL}")

# Encode in batches
batch_size = 32
all_embeddings = []

for i in tqdm(range(0, len(corpus_passages), batch_size), desc="Encoding"):
    batch_end = min(i + batch_size, len(corpus_passages))
    batch = corpus_passages[i:batch_end]
    
    with torch.no_grad():
        embeddings = rag_encoder.encode(
            batch,
            convert_to_numpy=True,
            show_progress_bar=False,
            batch_size=batch_size
        )
    
    all_embeddings.append(embeddings)

corpus_embeddings = np.vstack(all_embeddings)
print(f"\n‚úì Encoded: {corpus_embeddings.shape[0]} passages")
print(f"  Dimension: {corpus_embeddings.shape[1]}")

# ============================================================
# Build and Save FAISS Index
# ============================================================

print("\n=== Building FAISS Index ===\n")

embedding_dim = corpus_embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)

# Normalize for cosine similarity
faiss.normalize_L2(corpus_embeddings)
index.add(corpus_embeddings.astype('float32'))

print(f"‚úì Index built: {index.ntotal:,} passages")

# ============================================================
# Save Index and Corpus
# ============================================================

print("\n=== Saving RAG Index ===\n")

# Save FAISS index
index_path = f"{MODEL_DIR}/rag_corpus_index.faiss"
faiss.write_index(index, index_path)
print(f"‚úì FAISS index: {index_path}")

# Save corpus passages
corpus_path = f"{MODEL_DIR}/rag_corpus_passages.pkl"
with open(corpus_path, "wb") as f:
    pickle.dump(corpus_passages, f)
print(f"‚úì Corpus: {corpus_path}")

# Save metadata
metadata = {
    "corpus_type": "LLM-processed hard negatives",
    "corpus_size": len(corpus_passages),
    "embedding_model": BASE_MODEL,
    "embedding_dim": embedding_dim,
    "source_file": tsv_path if os.path.exists(tsv_path) else pickle_path,
    "timestamp": pd.Timestamp.now().isoformat()
}

metadata_path = f"{MODEL_DIR}/rag_index_metadata.json"
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"‚úì Metadata: {metadata_path}")

print("\n" + "="*60)
print("‚úÖ RAG INDEX READY FOR PHASE 4")
print("="*60)

clear_gpu_memory()


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: bert-base-multilingual-cased



Building RAG Retrieval Index from LLM-Processed Negatives

=== Loading LLM-Processed Dataset ===

Loading from TSV: data/llm_classified_generated.tsv
‚úì Loaded: 298 samples
  Columns: ['query_text', 'gold_passage', 'hard_negative']
  Sample query: what are the liberal arts?...

=== Building Passage Corpus ===

‚úì Corpus size: 162 unique HARD negatives

=== Encoding Passage Corpus ===



INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda


‚úì Encoder loaded: bert-base-multilingual-cased


Encoding: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:00<00:00,  6.90it/s]



‚úì Encoded: 162 passages
  Dimension: 768

=== Building FAISS Index ===

‚úì Index built: 162 passages

=== Saving RAG Index ===

‚úì FAISS index: ./models/rag_corpus_index.faiss
‚úì Corpus: ./models/rag_corpus_passages.pkl
‚úì Metadata: ./models/rag_index_metadata.json

‚úÖ RAG INDEX READY FOR PHASE 4


  return isinstance(obj, torch.Tensor)


‚úì GPU memory cleared
  Allocated: 4.99GB
  Reserved: 5.31GB
  Free: 7.57GB
  Total: 12.88GB


In [50]:
print("\n" + "="*60)
print("Cell 27: Initializing RAG Context Retriever")
print("="*60)

import os
import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

class RAGContextRetriever:
    """Retrieve context passages for negatives using FAISS"""
    
    def __init__(self, index_path, corpus_path, model_name=None, device=None):
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        print(f"\n=== Loading FAISS Index ===\n")
        
        # Validate files
        if not os.path.exists(index_path):
            raise FileNotFoundError(f"Index not found: {index_path}")
        if not os.path.exists(corpus_path):
            raise FileNotFoundError(f"Corpus not found: {corpus_path}")
        
        # Load FAISS index
        self.index = faiss.read_index(index_path)
        print(f"‚úì FAISS index loaded: {os.path.basename(index_path)}")
        
        # Load corpus
        with open(corpus_path, 'rb') as f:
            self.corpus = pickle.load(f)
        print(f"‚úì Corpus loaded: {os.path.basename(corpus_path)}")
        
        # Load encoder
        if model_name is None:
            model_name = BASE_MODEL
        
        print(f"‚úì Loading encoder: {model_name}")
        self.encoder = SentenceTransformer(model_name)
        self.encoder.eval()
        self.encoder = self.encoder.to(self.device)
        
        print(f"\n=== Retriever Initialized ===")
        print(f"  Index size: {self.index.ntotal:,} passages")
        print(f"  Corpus size: {len(self.corpus):,} passages")
        print(f"  Device: {self.device}")
        print(f"  Embedding dim: 768")
    
    def retrieve_context(self, query, top_k=5):
        """Retrieve top-k context passages for a query"""
        
        with torch.no_grad():
            query_emb = self.encoder.encode(
                query,
                convert_to_numpy=True,
                show_progress_bar=False
            )
        
        # Normalize
        query_emb_normalized = query_emb / (np.linalg.norm(query_emb) + 1e-8)
        query_emb_normalized = query_emb_normalized.reshape(1, -1).astype('float32')
        
        # Search
        scores, indices = self.index.search(query_emb_normalized, top_k)
        
        contexts = []
        for idx, score in zip(indices[0], scores[0]):
            if idx < len(self.corpus):
                contexts.append({
                    'passage': self.corpus[idx],
                    'score': float(score)
                })
        
        return contexts

# ============================================================
# Initialize Retriever
# ============================================================

print("\n" + "="*60)
print("Initializing RAG Context Retriever")
print("="*60)

try:
    # Use CORRECT file names from Cell 26
    index_path = f"{MODEL_DIR}/rag_corpus_index.faiss"  # ‚Üê Correct name
    corpus_path = f"{MODEL_DIR}/rag_corpus_passages.pkl"  # ‚Üê Correct name
    
    rag_retriever = RAGContextRetriever(
        index_path=index_path,
        corpus_path=corpus_path,
        model_name=BASE_MODEL,
        device=device
    )
    
    print("\n‚úÖ RAG Retriever initialized successfully!")
    
    # Test retrieval
    print("\n=== Testing Retrieval ===\n")
    test_query = "What is machine learning?"
    test_contexts = rag_retriever.retrieve_context(test_query, top_k=3)
    
    print(f"Query: {test_query}")
    print(f"Retrieved {len(test_contexts)} context passages:")
    for i, ctx in enumerate(test_contexts, 1):
        print(f"  {i}. Score: {ctx['score']:.4f}")
        print(f"     Text: {ctx['passage'][:80]}...")
    
except Exception as e:
    print(f"\n‚ùå Error: {e}")
    import traceback
    traceback.print_exc()
    raise


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: bert-base-multilingual-cased



Cell 27: Initializing RAG Context Retriever

Initializing RAG Context Retriever

=== Loading FAISS Index ===

‚úì FAISS index loaded: rag_corpus_index.faiss
‚úì Corpus loaded: rag_corpus_passages.pkl
‚úì Loading encoder: bert-base-multilingual-cased


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda



=== Retriever Initialized ===
  Index size: 162 passages
  Corpus size: 162 passages
  Device: cuda
  Embedding dim: 768

‚úÖ RAG Retriever initialized successfully!

=== Testing Retrieval ===

Query: What is machine learning?
Retrieved 3 context passages:
  1. Score: 0.5318
     Text: Formative assessment has been a topic of debate among education scholars for yea...
  2. Score: 0.4977
     Text: The concept of reinsurance is often misunderstood and has led to numerous miscon...
  3. Score: 0.4771
     Text: The liberal arts have long been a staple of higher education, but their relevanc...


In [51]:

print("\n" + "="*60)
print("Initializing RAG Negative Ranker")
print("="*60)

class RAGNegativeRanker:
    """Score negatives using LLM with RAG context"""
    
    def __init__(self, rag_retriever, llm_classifier):
        self.rag_retriever = rag_retriever
        self.llm_classifier = llm_classifier  # Your LLMHardNegativeClassifier from Phase 3
        self.score_cache = {}  # Cache LLM scores
        
        print("‚úì RAG Negative Ranker initialized with caching")
    
    def _get_cache_key(self, query, gold, negative):
        """Create cache key from text"""
        return hash((query[:80], gold[:80], negative[:80]))
    
    def score_negative_with_context(self, query, gold_passage, negative_passage, top_k_context=3):
        """
        Score a negative using LLM with retrieved context
        
        Uses your LLMHardNegativeClassifier public method
        """
        
        # Check cache first
        cache_key = self._get_cache_key(query, gold_passage, negative_passage)
        if cache_key in self.score_cache:
            return self.score_cache[cache_key]
        
        # Retrieve context using RAG
        contexts = self.rag_retriever.retrieve_context(query, top_k=top_k_context)
        context_str = "\n".join([f"- {c['passage'][:150]}" for c in contexts])
        
        # Create enhanced prompt with context
        enhanced_query = f"{query}\n\nRelevant Context:\n{context_str}"
        
        # Use YOUR public LLMClassifier method
        result = self.llm_classifier.classify_negative(
            query=enhanced_query,
            gold_passage=gold_passage,
            negative_passage=negative_passage
        )
        
        # Convert classification to score
        is_hard = result.get('is_hard', False)
        score = 0.8 if is_hard else 0.2
        
        # Cache result
        self.score_cache[cache_key] = score
        
        return score
    
    def score_batch(self, negatives_df, top_k_context=3):
        """Score a batch of negatives"""
        results = []
        
        print(f"\nScoring {len(negatives_df)} negatives with RAG...")
        
        for idx, row in tqdm(negatives_df.iterrows(), total=len(negatives_df), desc="RAG Scoring"):
            query = row['query_text']
            gold = row['gold_passage']
            negative = row['hard_negative']
            
            try:
                score = self.score_negative_with_context(
                    query, gold, negative,
                    top_k_context=top_k_context
                )
                
                results.append({
                    'query_text': query,
                    'gold_passage': gold,
                    'hard_negative': negative,
                    'rag_score': score
                })
            except Exception as e:
                print(f"Error scoring: {e}")
                continue
        
        print(f"‚úì Scored {len(results)} negatives")
        print(f"  Cache hit rate: {len(self.score_cache)} cached scores")
        
        return pd.DataFrame(results)

# Initialize ranker
rag_ranker = RAGNegativeRanker(rag_retriever, llm_classifier)


Initializing RAG Negative Ranker
‚úì RAG Negative Ranker initialized with caching


In [52]:

print("\n" + "="*60)
print("Applying RAG Ranking")
print("="*60)

# Limit to subset for speed
RAG_SAMPLE_SIZE = 50 if DEV_MODE else 200
rag_input = hard_negatives_df.head(RAG_SAMPLE_SIZE).copy()

print(f"\nRanking {len(rag_input)} hard negatives...")

# Apply RAG scoring
rag_ranked = rag_ranker.score_batch(rag_input, top_k_context=3)

# Analyze score distribution
print(f"\n=== RAG Score Distribution ===")
print(f"  Mean: {rag_ranked['rag_score'].mean():.4f}")
print(f"  Std:  {rag_ranked['rag_score'].std():.4f}")
print(f"  Min:  {rag_ranked['rag_score'].min():.4f}")
print(f"  Max:  {rag_ranked['rag_score'].max():.4f}")

# Select high-quality negatives
QUALITY_PERCENTILE = 0.5  # Keep top 50%
selection_threshold = rag_ranked['rag_score'].quantile(1 - QUALITY_PERCENTILE)

print(f"\n=== Negative Selection ===")
print(f"  Selection threshold: {selection_threshold:.4f}")
print(f"  Keeping quality percentile: {QUALITY_PERCENTILE*100:.0f}%")

rag_final = rag_ranked[rag_ranked['rag_score'] >= selection_threshold].copy()

print(f"‚úì Selected {len(rag_final)} high-quality negatives")
print(f"  Original: {len(rag_input)}")
print(f"  Selected: {len(rag_final)} ({len(rag_final)/len(rag_input)*100:.1f}%)")



Applying RAG Ranking

Ranking 50 hard negatives...

Scoring 50 negatives with RAG...


RAG Scoring: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [02:21<00:00,  2.83s/it]

‚úì Scored 50 negatives
  Cache hit rate: 50 cached scores

=== RAG Score Distribution ===
  Mean: 0.8000
  Std:  0.0000
  Min:  0.8000
  Max:  0.8000

=== Negative Selection ===
  Selection threshold: 0.8000
  Keeping quality percentile: 50%
‚úì Selected 50 high-quality negatives
  Original: 50
  Selected: 50 (100.0%)





In [None]:

print("\n" + "="*60)
print("Training DPR with RAG-Selected Negatives")
print("="*60)

# Prepare training data
train_df = rag_final[['query_text', 'gold_passage', 'hard_negative']].copy()
train_df.columns = ['query_text', 'gold_passage', 'hard_negative']

print(f"\n=== Training Data ===")
print(f"  Samples: {len(train_df)}")
print(f"  Columns: {train_df.columns.tolist()}")

# Configure training args
phase4_args = RetrievalArgs()
phase4_args.data_format = "beir"
phase4_args.max_seq_length = 256
phase4_args.include_title = False
phase4_args.hard_negatives = True
phase4_args.num_train_epochs = 3
phase4_args.train_batch_size = 8
phase4_args.learning_rate = 5e-7
phase4_args.output_dir = f"{MODEL_DIR}/dpr_rag_phase4"
phase4_args.fp16 = USE_MIXED_PRECISION
phase4_args.evaluate_during_training = False
phase4_args.save_model_every_epoch = False
phase4_args.overwrite_output_dir = True
phase4_args.use_cached_eval_features = False

print(f"\n=== Training Configuration ===")
print(f"  Epochs: {phase4_args.num_train_epochs}")
print(f"  Batch size: {phase4_args.train_batch_size}")
print(f"  Learning rate: {phase4_args.learning_rate}")
print(f"  Max sequence length: {phase4_args.max_seq_length}")
print(f"  FP16: {phase4_args.fp16}")

# Load Phase 3 model as starting point
print(f"\n=== Loading Phase 3 Model ===")

phase3_model_path = f"{MODEL_DIR}/dpr-llm-enhanced"

try:
    dpr_rag_model = RetrievalModel(
        model_type="custom",
        model_name=phase3_model_path,
        args=phase4_args,
        use_cuda=torch.cuda.is_available()
    )
    print(f"‚úì Loaded Phase 3 model from {phase3_model_path}")
except Exception as e:
    print(f"‚ö† Could not load Phase 3 model: {e}")
    print("Initializing fresh model from HuggingFace...")
    
    dpr_rag_model = RetrievalModel(
        model_type="dpr",
        model_name="facebook/dpr-ctx_encoder-single-nq-base",
        query_encoder_name="facebook/dpr-question_encoder-single-nq-base",
        args=phase4_args,
        use_cuda=torch.cuda.is_available()
    )
    print("‚úì Initialized fresh DPR model")

# Train
print(f"\n=== Starting Training ===\n")

clear_gpu_memory()

import time
start_time = time.time()

try:
    dpr_rag_model.train_model(train_df)
    training_time = (time.time() - start_time) / 60
    
    print(f"\n‚úì Training complete: {training_time:.2f} minutes")
    print(f"‚úì Model saved to {phase4_args.output_dir}")
    
except Exception as e:
    print(f"‚ùå Training failed: {e}")
    import traceback
    traceback.print_exc()

clear_gpu_memory()


In [None]:

print("\n" + "="*60)
print("Saving Phase 4 Metadata")
print("="*60)

metadata = {
    "stage": "phase_4_rag_integration",
    "timestamp": pd.Timestamp.now().isoformat(),
    "model_path": phase4_args.output_dir,
    "training_samples": len(train_df),
    "rag_samples_scored": len(rag_ranked),
    "rag_samples_selected": len(rag_final),
    "quality_threshold": float(selection_threshold),
    "quality_percentile": QUALITY_PERCENTILE,
    "epochs": phase4_args.num_train_epochs,
    "batch_size": phase4_args.train_batch_size,
    "learning_rate": phase4_args.learning_rate,
    "rag_context_retrieval": "FAISS IndexFlatIP",
    "rag_scorer": "LLMClassifier with context",
    "training_time_minutes": training_time if 'training_time' in locals() else None
}

metadata_path = f"{MODEL_DIR}/phase4_metadata.json"
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"‚úì Metadata saved to {metadata_path}")


In [None]:

print("\n" + "="*60)
print("Phase 4 Evaluation")
print("="*60)

# Load RAG model
print("\n=== Loading RAG Model ===")

eval_args = RetrievalArgs()
eval_args.data_format = "beir"
eval_args.max_seq_length = 256
eval_args.include_title = False
eval_args.hard_negatives = False
eval_args.eval_batch_size = 8
eval_args.fp16 = USE_MIXED_PRECISION

rag_eval_model = RetrievalModel(
    model_type="custom",
    model_name=f"{MODEL_DIR}/dpr_rag_phase4",
    args=eval_args,
    use_cuda=torch.cuda.is_available()
)
rag_eval_model.query_encoder = rag_eval_model.query_encoder.to(device)
rag_eval_model.context_encoder = rag_eval_model.context_encoder.to(device)

print("‚úì RAG model loaded")

# Evaluate using existing function
print("\n=== Evaluating RAG Model ===")

if 'msmarco_dev' in locals() and len(msmarco_dev) > 0:
    rag_metrics = evaluate_dpr_model(
        rag_eval_model,
        msmarco_dev,
        model_name="RAG-Enhanced Model",
        device=device,
        top_k=10,
        max_samples=50
    )
    
    # Compare with baseline
    print("\n" + "="*60)
    print("PHASE 4 COMPARISON: BM25 vs LLM vs RAG")
    print("="*60)
    
    if 'phase2_baseline' in locals():
        comparison_df = pd.DataFrame({
            "Metric": list(phase2_baseline.keys()),
            "Phase 2 (BM25)": [f"{phase2_baseline[k]:.4f}" for k in phase2_baseline.keys()],
            "Phase 4 (RAG)": [f"{rag_metrics[k]:.4f}" for k in rag_metrics.keys()],
            "Improvement %": [
                f"{((rag_metrics[k] - phase2_baseline[k])/max(phase2_baseline[k], 0.0001) * 100):.2f}%"
                for k in phase2_baseline.keys()
            ]
        })
        
        print("\n" + comparison_df.to_string(index=False))
        
        # Calculate average improvement
        avg_improvement = np.mean([
            ((rag_metrics[k] - phase2_baseline[k])/max(phase2_baseline[k], 0.0001) * 100)
            for k in phase2_baseline.keys()
        ])
        
        print(f"\nAverage Improvement: {avg_improvement:.2f}%")
        
        if avg_improvement > 0:
            print(f"‚úÖ RAG model shows {avg_improvement:.2f}% improvement!")
        else:
            print(f"‚ö† RAG model performs {abs(avg_improvement):.2f}% worse than baseline")

clear_gpu_memory()

print("\n" + "="*60)
print("‚úÖ PHASE 4 COMPLETE - RAG INTEGRATION FINISHED")
print("="*60)