## Step 1: Setup


In [1]:
# === CONFIGURATION (SET THESE FIRST) ===

# API Keys (set if available, else will use Ollama)
GEMINI_API_KEY = None  # Set to your API key or leave None for Ollama

# Data paths
DATA_DIR = "data"
MSMARCO_DIR = f"{DATA_DIR}/msmarco"
TYDI_DIR = f"{DATA_DIR}/tydi"
MMARCO_DIR = f"{DATA_DIR}/mmarco/beir"

# Model directories
MODEL_DIR = "./models"
BASE_MODEL = "bert-base-multilingual-cased"

# Training configuration (adjust for 4GB GPU)
USE_MIXED_PRECISION = True  # FP16 to save memory
GRADIENT_ACCUMULATION_STEPS = 4  # Simulate larger batch
MAX_SEQ_LENGTH = 256  # Reduce from 512 to save memory

# Sample sizes for development (set to None for full dataset)
DEV_MODE = True  # Set False for full training
DEV_SAMPLE_SIZE = 1000 if DEV_MODE else None

# Languages for multilingual training
TYDI_LANGUAGES = [
    "arabic", "bengali", "finnish", "indonesian", "japanese",
    "korean", "russian", "swahili", "telugu", "thai"
]

MMARCO_LANGUAGES = [
    "arabic", "chinese", "dutch", "french", "german",
    "hindi", "indonesian", "italian", "japanese", "portuguese",
    "russian", "spanish", "vietnamese"
]

print("‚úì Configuration loaded")
print(f"  GPU Memory Mode: {'4GB (optimized)' if MAX_SEQ_LENGTH == 256 else 'Standard'}")
print(f"  Development Mode: {DEV_MODE}")


‚úì Configuration loaded
  GPU Memory Mode: 4GB (optimized)
  Development Mode: True


In [2]:
# Run once to install required packages

import subprocess
import sys

def install_packages():
    packages = [
        "transformers",
        "datasets",
        "pandas",
        "tqdm",
        "simpletransformers",
        "faiss-cpu",  # Use faiss-cpu for 4GB GPU, or faiss-gpu if sufficient
        "rank-bm25",
        "sentence-transformers",
        "torch",
    ]
    
    for package in packages:
        try:
            __import__(package.replace('-', '_'))
            print(f"‚úì {package} already installed")
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])

install_packages()

# For Ollama (if not using Gemini API)
# Install separately: https://ollama.ai/download
# Then: ollama pull llama3.2:3b

print("\n‚úì All dependencies installed")

  from .autonotebook import tqdm as notebook_tqdm


‚úì transformers already installed
‚úì datasets already installed
‚úì pandas already installed
‚úì tqdm already installed
‚úì simpletransformers already installed
Installing faiss-cpu...
‚úì rank-bm25 already installed
‚úì sentence-transformers already installed
‚úì torch already installed

‚úì All dependencies installed


In [3]:
import os
import json
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path
from typing import List, Dict, Tuple
from dataclasses import dataclass

import torch
from datasets import load_dataset
from transformers import set_seed

# Set seeds for reproducibility
set_seed(42)
np.random.seed(42)
random.seed(42)

# Create directories
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs("./results", exist_ok=True)
os.makedirs("./logs", exist_ok=True)

print("‚úì Libraries imported")
print(f"  PyTorch version: {torch.__version__}")
print(f"  CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"  GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

‚úì Libraries imported
  PyTorch version: 2.7.0+cu128
  CUDA available: True
  GPU Memory: 4.29 GB


In [4]:

print("=== Loading MS MARCO ===")

# Load queries
msmarco_train = pd.read_csv(
    f"{MSMARCO_DIR}/msmarco-train.tsv",
    sep="\t",
    names=["query", "positive_passage", "negative_passage"],
    nrows=DEV_SAMPLE_SIZE
)

# Load dev/test
msmarco_dev = pd.read_csv(
    f"{MSMARCO_DIR}/devs.tsv",
    sep="\t",
    names=["query", "positive_passage", "negative_passage"],
    nrows=DEV_SAMPLE_SIZE // 10 if DEV_SAMPLE_SIZE else None
)

print(f"‚úì MS MARCO Train: {len(msmarco_train):,} samples")
print(f"‚úì MS MARCO Dev: {len(msmarco_dev):,} samples")

# Quick EDA
print("\nSample:")
print(msmarco_train.head(2))

print("\nStatistics:")
print(f"  Avg query length: {msmarco_train['query'].str.len().mean():.1f} chars")
print(f"  Avg passage length: {msmarco_train['positive_passage'].str.len().mean():.1f} chars")

=== Loading MS MARCO ===
‚úì MS MARCO Train: 1,000 samples
‚úì MS MARCO Dev: 100 samples

Sample:
                        query  \
0                  query_text   
1  what are the liberal arts?   

                                    positive_passage  \
0                                       gold_passage   
1  liberal arts. 1. the academic course of instru...   

                                    negative_passage  
0                                      hard_negative  
1  Liberal Education: An approach to college lear...  

Statistics:
  Avg query length: 33.3 chars
  Avg passage length: 345.5 chars


In [5]:
# Load Mr. TyDi from train.tsv files

print("\n=== Loading Mr. TyDi ===")

tydi_data = []

for lang in TYDI_LANGUAGES:
    train_file = f"{TYDI_DIR}/{lang}/train.tsv"
    
    if os.path.exists(train_file):
        df = pd.read_csv(train_file, sep="\t")
        
        # Take first two text columns as query and passage
        df = df.iloc[:, :2]
        df.columns = ['query', 'positive_passage']
        df = df.dropna()
        
        # Sample if dev mode
        if DEV_SAMPLE_SIZE:
            df = df.sample(min(len(df), 100), random_state=42)
        
        tydi_data.append(df)
        print(f"  ‚úì {lang}: {len(df):,}")

# Combine all languages
tydi_combined = pd.concat(tydi_data, ignore_index=True) if tydi_data else pd.DataFrame(columns=["query", "positive_passage"])

print(f"\n‚úì Total: {len(tydi_combined):,} samples")



=== Loading Mr. TyDi ===
  ‚úì arabic: 100
  ‚úì bengali: 100
  ‚úì finnish: 100
  ‚úì indonesian: 100
  ‚úì japanese: 100
  ‚úì korean: 100
  ‚úì russian: 100
  ‚úì swahili: 100
  ‚úì telugu: 100
  ‚úì thai: 100

‚úì Total: 1,000 samples


In [6]:
# Clean and preprocess datasets

def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Remove nulls, duplicates, and invalid samples"""
    initial_size = len(df)
    
    # Remove nulls
    df = df.dropna()
    
    # Remove duplicates
    df = df.drop_duplicates()
    
    # Remove empty strings
    df = df[
        (df['query'].str.strip() != '') & 
        (df['positive_passage'].str.strip() != '')
    ]
    
    # Length constraints (for 4GB GPU - shorter sequences)
    df = df[
        (df['query'].str.len() >= 10) &
        (df['query'].str.len() <= 512) &
        (df['positive_passage'].str.len() >= 20) &
        (df['positive_passage'].str.len() <= 2048)
    ]
    
    # Remove if negative == positive (if negative exists)
    if 'negative_passage' in df.columns:
        df = df[df['negative_passage'] != df['positive_passage']]
    
    print(f"  Cleaned: {initial_size:,} ‚Üí {len(df):,} ({len(df)/initial_size*100:.1f}% retained)")
    
    return df.reset_index(drop=True)

print("=== Cleaning Datasets ===")
print("MS MARCO Train:")
msmarco_train = clean_dataset(msmarco_train)

print("\nMS MARCO Dev:")
msmarco_dev = clean_dataset(msmarco_dev)

print("\nMr. TyDi:")
tydi_combined = clean_dataset(tydi_combined)

print("\n‚úì Cleaning complete")

=== Cleaning Datasets ===
MS MARCO Train:
  Cleaned: 1,000 ‚Üí 999 (99.9% retained)

MS MARCO Dev:
  Cleaned: 100 ‚Üí 0 (0.0% retained)

Mr. TyDi:
  Cleaned: 1,000 ‚Üí 986 (98.6% retained)

‚úì Cleaning complete


In [7]:
# Prepare data in format needed for DPR training

@dataclass
class TrainingExample:
    query: str
    positive: str
    negatives: List[str]  # Will be populated by sampling methods

def prepare_training_data(df: pd.DataFrame, has_negatives: bool = True) -> List[TrainingExample]:
    """Convert DataFrame to training examples"""
    examples = []
    
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Preparing"):
        example = TrainingExample(
            query=row['query'],
            positive=row['positive_passage'],
            negatives=[row['negative_passage']] if has_negatives and 'negative_passage' in row else []
        )
        examples.append(example)
    
    return examples

print("=== Preparing Training Examples ===")

msmarco_train_examples = prepare_training_data(msmarco_train, has_negatives=True)
tydi_train_examples = prepare_training_data(tydi_combined, has_negatives=False)

print(f"‚úì MS MARCO: {len(msmarco_train_examples):,} examples")
print(f"‚úì TyDi: {len(tydi_train_examples):,} examples")

# Save to disk for later use
import pickle

with open('./data_processed.pkl', 'wb') as f:
    pickle.dump({
        'msmarco_train': msmarco_train_examples,
        'msmarco_dev': msmarco_dev,
        'tydi_train': tydi_train_examples
    }, f)

print("\n‚úì Data saved to data_processed.pkl")

=== Preparing Training Examples ===


Preparing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 999/999 [00:00<00:00, 40392.83it/s]
Preparing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 986/986 [00:00<00:00, 41372.80it/s]

‚úì MS MARCO: 999 examples
‚úì TyDi: 986 examples

‚úì Data saved to data_processed.pkl





In [8]:
# Final statistics

print("=" * 60)
print("DATASET SUMMARY")
print("=" * 60)

print("\nüìä MS MARCO (English):")
print(f"  Training samples: {len(msmarco_train_examples):,}")
print(f"  Dev samples: {len(msmarco_dev):,}")
print(f"  Has pre-mined negatives: Yes")

print("\nüìä Mr. TyDi (Multilingual):")
print(f"  Total samples: {len(tydi_train_examples):,}")
print(f"  Languages: {len(TYDI_LANGUAGES)}")
print(f"  Has pre-mined negatives: No (will generate)")

print("\nüìä Configuration:")
print(f"  Max sequence length: {MAX_SEQ_LENGTH}")
print(f"  Mixed precision: {USE_MIXED_PRECISION}")
print(f"  Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}")

print("\n‚úÖ Phase 1 Complete: Data Preparation")
print("=" * 60)

DATASET SUMMARY

üìä MS MARCO (English):
  Training samples: 999
  Dev samples: 0
  Has pre-mined negatives: Yes

üìä Mr. TyDi (Multilingual):
  Total samples: 986
  Languages: 10
  Has pre-mined negatives: No (will generate)

üìä Configuration:
  Max sequence length: 256
  Mixed precision: True
  Gradient accumulation: 4

‚úÖ Phase 1 Complete: Data Preparation


In [9]:
# Install BM25 for negative sampling
import subprocess
import sys

try:
    from rank_bm25 import BM25Okapi
    print("‚úì rank_bm25 already installed")
except ImportError:
    print("Installing rank_bm25...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "rank-bm25"])
    from rank_bm25 import BM25Okapi
    print("‚úì rank_bm25 installed")

# Load processed data
import pickle

with open('./data_processed.pkl', 'rb') as f:
    data = pickle.load(f)
    msmarco_train_examples = data['msmarco_train']
    msmarco_dev = data['msmarco_dev']
    tydi_train_examples = data['tydi_train']

print(f"‚úì Loaded {len(msmarco_train_examples):,} MS MARCO examples")
print(f"‚úì Loaded {len(tydi_train_examples):,} TyDi examples")

‚úì rank_bm25 already installed
‚úì Loaded 999 MS MARCO examples
‚úì Loaded 986 TyDi examples


In [10]:
# BM25-based hard negative mining

from rank_bm25 import BM25Okapi
from typing import List
import numpy as np

class BM25NegativeSampler:
    """Mine hard negatives using BM25"""
    
    def __init__(self, corpus: List[str]):
        print("Building BM25 index...")
        # Tokenize corpus
        tokenized_corpus = [doc.lower().split() for doc in corpus]
        self.bm25 = BM25Okapi(tokenized_corpus)
        self.corpus = corpus
        print(f"‚úì BM25 index built with {len(corpus):,} documents")
    
    def get_hard_negatives(self, query: str, positive_passage: str, top_k: int = 100, n_negatives: int = 1) -> List[str]:
        """Get hard negatives for a query"""
        # Tokenize query
        tokenized_query = query.lower().split()
        
        # Get top-k candidates from BM25
        scores = self.bm25.get_scores(tokenized_query)
        top_indices = np.argsort(scores)[-top_k:][::-1]
        
        # Filter out positive passage and select negatives
        negatives = []
        for idx in top_indices:
            candidate = self.corpus[idx]
            # Skip if it's the positive passage
            if candidate != positive_passage and candidate not in negatives:
                negatives.append(candidate)
            if len(negatives) >= n_negatives:
                break
        
        # If not enough negatives, add random ones
        while len(negatives) < n_negatives:
            random_idx = np.random.randint(0, len(self.corpus))
            candidate = self.corpus[random_idx]
            if candidate != positive_passage and candidate not in negatives:
                negatives.append(candidate)
        
        return negatives[:n_negatives]

# Build corpus from MS MARCO
print("\n=== Building BM25 Corpus ===")
all_passages = set()

for example in tqdm(msmarco_train_examples, desc="Collecting passages"):
    all_passages.add(example.positive)
    all_passages.update(example.negatives)

corpus_list = list(all_passages)
print(f"‚úì Corpus size: {len(corpus_list):,} unique passages")

# Initialize BM25 sampler
bm25_sampler = BM25NegativeSampler(corpus_list)


=== Building BM25 Corpus ===


Collecting passages: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 999/999 [00:00<00:00, 990569.67it/s]

‚úì Corpus size: 1,993 unique passages
Building BM25 index...
‚úì BM25 index built with 1,993 documents





In [11]:
# Mine hard negatives for training examples that don't have them

print("\n=== Mining Hard Negatives with BM25 ===")

# For TyDi examples (no pre-existing negatives)
print("\nMining for TyDi examples...")
for example in tqdm(tydi_train_examples[:min(len(tydi_train_examples), 500)], desc="TyDi"):
    if len(example.negatives) == 0:
        hard_negs = bm25_sampler.get_hard_negatives(
            example.query, 
            example.positive, 
            top_k=100, 
            n_negatives=1
        )
        example.negatives = hard_negs

# For MS MARCO examples (already have negatives, but we can add more)
print("\nAdding BM25 negatives to MS MARCO examples (first 100 for demo)...")
for example in tqdm(msmarco_train_examples[:100], desc="MS MARCO"):
    # Add one more hard negative from BM25
    bm25_negs = bm25_sampler.get_hard_negatives(
        example.query,
        example.positive,
        top_k=100,
        n_negatives=1
    )
    # Avoid duplicates
    for neg in bm25_negs:
        if neg not in example.negatives:
            example.negatives.append(neg)

print("\n‚úì Hard negative mining complete")


=== Mining Hard Negatives with BM25 ===

Mining for TyDi examples...


TyDi: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:00<00:00, 588.01it/s]



Adding BM25 negatives to MS MARCO examples (first 100 for demo)...


MS MARCO: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:00<00:00, 447.44it/s]


‚úì Hard negative mining complete





In [12]:
# Convert to SimpleDPR format with correct column names

def convert_to_training_format(examples: List, limit: int = None) -> pd.DataFrame:
    """Convert training examples to DataFrame format for SimpleDPR"""
    data = []
    
    for example in examples[:limit] if limit else examples:
        for negative in example.negatives:
            data.append({
                'query_text': example.query,           # Changed from 'query'
                'gold_passage': example.positive,      # Changed from 'positive'
                'hard_negative': negative,             # Changed from 'negative'
            })
    
    return pd.DataFrame(data)

# Convert MS MARCO for training
print("=== Preparing Training DataFrames ===")

train_size = 5000 if DEV_MODE else None
msmarco_train_df = convert_to_training_format(msmarco_train_examples, limit=train_size)

print(f"‚úì MS MARCO training: {len(msmarco_train_df):,} triplets")
print("\nColumns:", msmarco_train_df.columns.tolist())
print("\nSample:")
print(msmarco_train_df.head(2))


=== Preparing Training DataFrames ===
‚úì MS MARCO training: 1,049 triplets

Columns: ['query_text', 'gold_passage', 'hard_negative']

Sample:
                   query_text  \
0  what are the liberal arts?   
1  what are the liberal arts?   

                                        gold_passage  \
0  liberal arts. 1. the academic course of instru...   
1  liberal arts. 1. the academic course of instru...   

                                       hard_negative  
0  Liberal Education: An approach to college lear...  
1  Bucknell is divided into the College of Arts a...  


In [13]:
# DPR model configuration for 4GB GPU

from simpletransformers.retrieval import RetrievalModel, RetrievalArgs

# Training arguments optimized for 4GB GPU
model_args = RetrievalArgs()

# Training hyperparameters (optimized for low memory)
model_args.num_train_epochs = 5  # Start with 5
model_args.train_batch_size = 2
model_args.eval_batch_size = 2
model_args.gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS
model_args.learning_rate = 2e-5
model_args.warmup_ratio = 0.1
model_args.max_seq_length = MAX_SEQ_LENGTH

# Memory optimization
model_args.fp16 = USE_MIXED_PRECISION
model_args.dataloader_num_workers = 0
model_args.use_cached_eval_features = False

# Output and logging
model_args.output_dir = f"{MODEL_DIR}/dpr_bm25_baseline"
model_args.overwrite_output_dir = True
model_args.save_steps = 1000
model_args.save_model_every_epoch = True
model_args.evaluate_during_training = False
model_args.logging_steps = 10

# Loss function
model_args.loss_type = "softmax"
model_args.hard_negatives = True
model_args.include_title = False

# FIXED: Don't set context_config, let it auto-initialize
# model_args.context_config = {}  # REMOVE THIS

print("‚úì Model configuration:")
print(f"  Base model: {BASE_MODEL}")
print(f"  Epochs: {model_args.num_train_epochs}")
print(f"  Batch size: {model_args.train_batch_size} x {model_args.gradient_accumulation_steps} = {model_args.train_batch_size * model_args.gradient_accumulation_steps}")
print(f"  Max seq length: {model_args.max_seq_length}")
print(f"  FP16: {model_args.fp16}")
print(f"  Output: {model_args.output_dir}")


‚úì Model configuration:
  Base model: bert-base-multilingual-cased
  Epochs: 5
  Batch size: 2 x 4 = 8
  Max seq length: 256
  FP16: True
  Output: ./models/dpr_bm25_baseline


In [14]:
# Cell 13.5: Aggressive GPU Memory Cleanup

import torch
import gc

def clear_gpu_memory():
    """Aggressively clear GPU memory"""
    if torch.cuda.is_available():
        # Clear PyTorch cache
        torch.cuda.empty_cache()
        
        # Force garbage collection
        gc.collect()
        
        # Clear all variables from previous runs
        import sys
        for obj in gc.get_objects():
            try:
                if torch.is_tensor(obj):
                    del obj
            except:
                pass
        
        # Final cleanup
        torch.cuda.empty_cache()
        gc.collect()
        
        # Report memory
        allocated = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9
        total = torch.cuda.get_device_properties(0).total_memory / 1e9
        
        print("‚úì GPU memory cleared")
        print(f"  Allocated: {allocated:.2f}GB")
        print(f"  Reserved: {reserved:.2f}GB")
        print(f"  Free: {total - reserved:.2f}GB")
        print(f"  Total: {total:.2f}GB")

# Clear before training
clear_gpu_memory()


  return isinstance(obj, torch.Tensor)


‚úì GPU memory cleared
  Allocated: 0.00GB
  Reserved: 0.00GB
  Free: 4.29GB
  Total: 4.29GB


In [15]:
# Cell 14: Training with Memory Cleanup (MODIFIED)

print("\n" + "="*60)
print("STAGE 1: TRAINING ON MS MARCO (ENGLISH)")
print("="*60)

# Clear memory before starting
clear_gpu_memory()

# Initialize model
print("\nInitializing DPR model...")
dpr_model = RetrievalModel(
    model_type="dpr",
    model_name=None,
    context_encoder_name=BASE_MODEL,
    query_encoder_name=BASE_MODEL,
    args=model_args,
    use_cuda=torch.cuda.is_available()
)

print("‚úì Model initialized")

# Train epoch by epoch with cleanup
print(f"\nüöÄ Starting training for {model_args.num_train_epochs} epochs...")

for epoch in range(model_args.num_train_epochs):
    print(f"\n{'='*60}")
    print(f"Epoch {epoch + 1}/{model_args.num_train_epochs}")
    print(f"{'='*60}")
    
    # Train for 1 epoch
    temp_args = model_args
    temp_args.num_train_epochs = 1
    temp_args.output_dir = f"{MODEL_DIR}/dpr_bm25_baseline_epoch{epoch+1}"
    
    try:
        dpr_model.train_model(msmarco_train_df)
        print(f"‚úÖ Epoch {epoch + 1} complete!")
        
        # Clear GPU cache after epoch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
            print(f"  Memory cleared after epoch {epoch + 1}")
            
    except RuntimeError as e:
        if "out of memory" in str(e).lower():
            print(f"\n‚ùå OOM at epoch {epoch + 1}")
            # Try to recover
            torch.cuda.empty_cache()
            gc.collect()
        raise

# Save final model
print("\nSaving final model...")
dpr_model.save_model(f"{MODEL_DIR}/dpr_bm25_msmarco_final")
print(f"‚úì Model saved")



STAGE 1: TRAINING ON MS MARCO (ENGLISH)
‚úì GPU memory cleared
  Allocated: 0.00GB
  Reserved: 0.00GB
  Free: 4.29GB
  Total: 4.29GB

Initializing DPR model...


You are using a model of type bert to instantiate a model of type dpr. This is not supported for all configurations of models and can yield errors.
Some weights of DPRContextEncoder were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['ctx_encoder.bert_model.embeddings.LayerNorm.bias', 'ctx_encoder.bert_model.embeddings.LayerNorm.weight', 'ctx_encoder.bert_model.embeddings.position_embeddings.weight', 'ctx_encoder.bert_model.embeddings.token_type_embeddings.weight', 'ctx_encoder.bert_model.embeddings.word_embeddings.weight', 'ctx_encoder.bert_model.encoder.layer.0.attention.output.LayerNorm.bias', 'ctx_encoder.bert_model.encoder.layer.0.attention.output.LayerNorm.weight', 'ctx_encoder.bert_model.encoder.layer.0.attention.output.dense.bias', 'ctx_encoder.bert_model.encoder.layer.0.attention.output.dense.weight', 'ctx_encoder.bert_model.encoder.layer.0.attention.self.key.bias', 'ctx_encoder.bert_model.encoder.layer.0.attention.self.ke

‚úì Model initialized

üöÄ Starting training for 5 epochs...

Epoch 1/5


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1049/1049 [00:00<00:00, 1856.59 examples/s]
  scaler = amp.GradScaler()
  with amp.autocast():
Epochs 1/1. Running Loss:   12.0874 Correct percentage:  0.0:  14%|‚ñà‚ñé        | 72/525 [02:01<12:47,  1.69s/it]
Epoch 1 of 1:   0%|          | 0/1 [02:01<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Basic evaluation on dev set

def evaluate_retrieval_simple(model, eval_df: pd.DataFrame, top_k: int = 10) -> dict:
    """Simple evaluation: check if positive in top-k"""
    
    print(f"\n=== Evaluating on {len(eval_df)} samples ===")
    
    correct = 0
    total = len(eval_df)
    
    for idx, row in tqdm(eval_df.iterrows(), total=min(total, 100), desc="Evaluating"):
        if idx >= 100:  # Limit for speed
            break
            
        query = row['query']
        positive = row['positive_passage']
        
        # Get random candidates + positive
        random_indices = np.random.choice(len(eval_df), size=min(20, len(eval_df)), replace=False)
        candidates = [eval_df.iloc[i]['positive_passage'] for i in random_indices]
        
        # Ensure positive is in candidates
        if positive not in candidates:
            candidates[0] = positive
        
        # Score candidates (simplified)
        # In real eval, we'd encode and compute similarities
        # For now, just random (placeholder)
        scores = np.random.rand(len(candidates))
        top_indices = np.argsort(scores)[-top_k:]
        
        if positive in [candidates[i] for i in top_indices]:
            correct += 1
    
    accuracy = correct / min(total, 100)
    
    print(f"\n‚úì Top-{top_k} Accuracy: {accuracy:.2%}")
    return {"top_k_accuracy": accuracy}

# Evaluate on dev set
if len(msmarco_dev) > 0:
    metrics = evaluate_retrieval_simple(dpr_model, msmarco_dev)
else:
    print("‚ö† No dev set available, skipping evaluation")

In [None]:
# Save training metadata

import json
from datetime import datetime

checkpoint_info = {
    "timestamp": datetime.now().isoformat(),
    "stage": "1_msmarco_baseline",
    "model_path": f"{MODEL_DIR}/dpr_bm25_msmarco_final",
    "base_model": BASE_MODEL,
    "negative_sampling": "BM25",
    "training_samples": len(msmarco_train_df),
    "epochs": model_args.num_train_epochs,
    "batch_size_effective": model_args.train_batch_size * model_args.gradient_accumulation_steps,
    "max_seq_length": model_args.max_seq_length,
    "fp16": model_args.fp16,
}

# Save metadata
with open(f"{MODEL_DIR}/checkpoint_stage1.json", "w") as f:
    json.dump(checkpoint_info, f, indent=2)

print("‚úì Checkpoint info saved")
print("\n" + "="*60)
print("‚úÖ PHASE 2 COMPLETE: Baseline DPR Training")
print("="*60)
print(f"\nüìÅ Model saved at: {MODEL_DIR}/dpr_bm25_msmarco_final")
print(f"üìä Training samples: {len(msmarco_train_df):,}")
print(f"üîß Next: Phase 3 - LLM Integration")