# Predictor Agent Training Notebook

This notebook trains a transformer-based predictor using processed datasets.
The model learns to assess query complexity and predict hallucination risk scores or specialization scores.

## Features:
- Loads hallucination_risk_dataset.json and creates train/val/test splits from it
- Supports multiple architectures (DeBERTa, RoBERTa, DistilBERT, BERT)
- Comprehensive evaluation metrics and model checkpointing
- Configurable training parameters with early stopping
- Detailed logging and regression metrics

## Alternative model suggestions:
- microsoft/deberta-v3-base: Best performance, recommended for production
- microsoft/deberta-v3-small: Faster training, good balance
- roberta-base: Excellent general-purpose performance
- distilbert-base-uncased: Fastest training, good for development
- bert-base-uncased: Classic baseline, reliable performance

## Setup and Imports

In [None]:
import os
import sys
import json
import warnings
import logging
import shutil
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from datetime import datetime
import gc

# Kaggle dataset integration (already available on Kaggle)
import kagglehub

# Environment setup.
# These variables are assigned before torch/transformers are imported below;
# presumably the libraries read them at import/initialisation time, so keep
# this block above those imports.
os.environ["WANDB_DISABLED"] = "true"  # presumably turns off Weights & Biases reporting
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # presumably disables tokenizer thread parallelism
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"  # NOTE(review): MPS memory-limit knob; confirm 0.0 is intended
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"  # presumably hides Hugging Face Hub download progress bars
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # CUDA allocator setting; presumably to limit fragmentation

import torch
import numpy as np
import pandas as pd
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
)

from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed,
    AutoModelForSequenceClassification,
)
from datasets import Dataset, DatasetDict

# Free cached CUDA memory and run a garbage-collection pass before any model
# is created in this session.
torch.cuda.empty_cache()
gc.collect()

# Silence FutureWarning messages for the rest of the notebook.
warnings.filterwarnings("ignore", category=FutureWarning)
print("✅ All imports successful!")

## Utility Functions

In [None]:
def setup_logging():
    """Configure root logging for the notebook and return this module's logger.

    Records are emitted at INFO level and above through a single stream
    handler, formatted as ``time - level - message``.
    """
    stream_handler = logging.StreamHandler()
    record_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(
        level=logging.INFO,
        format=record_format,
        handlers=[stream_handler],
    )
    notebook_logger = logging.getLogger(__name__)
    return notebook_logger

def log_device_info():
    """Print which accelerator is available for training.

    Checks CUDA first, then Apple's MPS backend, and otherwise reports that
    training will run on the CPU. Returns nothing; output goes to stdout.
    """
    if torch.cuda.is_available():
        name = torch.cuda.get_device_name(0)
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        major = torch.cuda.get_device_capability()[0]  # only the major version is reported

        report = (
            f"🚀 CUDA GPU detected: {name}",
            f"   GPU Memory: {total_gb:.1f} GB",
            f"   Compute Capability: {major}.x",
        )
        for line in report:
            print(line)
        return

    # Older torch builds may not expose the mps backend at all, hence hasattr.
    mps_ready = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
    if mps_ready:
        print("🍎 Apple Silicon GPU (MPS) detected")
        return

    print("⚠️  No GPU detected - training will use CPU (much slower)")

def create_data_splits(
    data: List[Dict],
    train_size: float = 0.7,
    val_size: float = 0.15,
    test_size: float = 0.15,
    stratify: bool = True,
    label_key: str = 'label',
    seed: int = 42
) -> Tuple[List[Dict], List[Dict], List[Dict]]:
    """Create train/validation/test splits from a list of records.

    The three sizes must either sum to 1.0 (normal mode) or sum to at most
    0.1 (testing mode). In testing mode that fraction of the data (at least
    10 rows when available) is sampled first and then divided in the same
    proportions, without stratification.

    Only the train vs. (validation + test) split is stratified; the
    validation/test split is a plain random split.

    Args:
        data: Records to split; each must contain ``label_key`` when
            stratification is used.
        train_size: Fraction of the data for training.
        val_size: Fraction of the data for validation.
        test_size: Fraction of the data for testing.
        stratify: Stratify the first split on quantile bins of the label.
        label_key: Name of the numeric label field used for stratification.
        seed: Random seed for sampling and splitting.

    Returns:
        ``(train, validation, test)`` as lists of record dicts.

    Raises:
        ValueError: If ``data`` is empty, a size is not positive, or the
            sizes neither sum to 1.0 nor qualify as testing mode.
    """
    log = logging.getLogger(__name__)

    if not data:
        raise ValueError("Cannot create splits from an empty dataset")
    if min(train_size, val_size, test_size) <= 0:
        raise ValueError("Split sizes must all be positive")

    # Validate split sizes
    total_size = train_size + val_size + test_size
    testing_mode = total_size <= 0.1
    if testing_mode:
        log.info(f"[TESTING MODE] Using small dataset splits: {total_size:.3f} of total data")
    elif abs(total_size - 1.0) > 1e-6:
        raise ValueError("Split sizes must sum to 1.0")

    df = pd.DataFrame(data)

    # Sub-sample only in testing mode. Gating on `total_size < 1.0` would also
    # fire for sums like 0.9999999999999999 and silently drop a row.
    if testing_mode:
        sample_size = min(max(10, int(len(df) * total_size)), len(df))
        if sample_size < 10:
            log.warning(f"Dataset too small for splits. Using all {len(df)} samples.")

        df = df.sample(n=sample_size, random_state=seed).reset_index(drop=True)

        # Rescale the fractions so they sum to 1.0 over the sampled rows.
        train_size, val_size, test_size = (
            size / total_size for size in (train_size, val_size, test_size)
        )

        # Tiny samples rarely have enough rows per bin to stratify.
        stratify = False

    # Stratify on up to 5 quantile bins of the (continuous) label.
    stratify_col = None
    if stratify:
        try:
            stratify_col = pd.qcut(df[label_key], q=5, labels=False, duplicates='drop')
        except (ValueError, TypeError):
            stratify_col = None

    holdout_size = val_size + test_size

    # First split: train vs (val + test)
    try:
        train_df, temp_df = train_test_split(
            df,
            test_size=holdout_size,
            random_state=seed,
            stratify=stratify_col,
            shuffle=True
        )
    except ValueError:
        if stratify_col is None:
            raise
        # e.g. a bin with a single member; a random split is still usable.
        log.warning("Stratified split failed; falling back to a random split.")
        train_df, temp_df = train_test_split(
            df,
            test_size=holdout_size,
            random_state=seed,
            stratify=None,
            shuffle=True
        )

    # Second split: val vs test (not stratified)
    val_df, test_df = train_test_split(
        temp_df,
        test_size=test_size / holdout_size,
        random_state=seed,
        stratify=None,
        shuffle=True
    )

    # Convert to records
    train_data = train_df.to_dict('records')
    val_data = val_df.to_dict('records')
    test_data = test_df.to_dict('records')

    # Log split info (percentages are relative to the full input, not the sample)
    print(f"📊 Dataset splits created:")
    print(f"  Train: {len(train_data)} ({len(train_data)/len(data)*100:.1f}%)")
    print(f"  Validation: {len(val_data)} ({len(val_data)/len(data)*100:.1f}%)")
    print(f"  Test: {len(test_data)} ({len(test_data)/len(data)*100:.1f}%)")

    return train_data, val_data, test_data

# Create the module-level logger. create_data_splits() looks up `logger` at
# call time, so this must run before that function is first called.
logger = setup_logging()
print("✅ Utility functions loaded!")

## Configuration

In [None]:
# Training configuration for Kaggle.
# This dict is passed to PredictorAgent and read again by train_predictor().
CONFIG = {
    # Model settings
    "model_name": "microsoft/deberta-v3-base",  # Change this to try different models
    "num_labels": 1,  # Regression output
    "max_length": 512,  # Tokenizer truncation/padding length
    "seed": 512,  # NOTE(review): same value as max_length — confirm this is intentional
    
    # Training parameters (adjust for your needs)
    "learning_rate": 1e-5,
    "num_epochs": 15,  # Upper bound; training also uses early stopping (see early_stopping_patience)
    "batch_size": 8,  # Per-device train batch size; train_predictor() also accumulates gradients over 4 steps
    "eval_batch_size": 16,  # Per-device eval batch size
    "warmup_ratio": 0.1,
    "weight_decay": 0.01,
    "max_grad_norm": 1.0,
    "lr_scheduler_type": "cosine",
    
    # Evaluation settings (all measured in optimizer steps)
    # NOTE(review): with load_best_model_at_end, save_steps presumably has to stay
    # a multiple of eval_steps — keep the two in sync when changing them.
    "save_steps": 100,
    "eval_steps": 50,
    "logging_steps": 25,
    "early_stopping_patience": 3,  # Passed to EarlyStoppingCallback
    
    # Data splits (fractions; must sum to 1.0 outside testing mode)
    "train_size": 0.7,
    "val_size": 0.15,
    "test_size": 0.15,
    
    # Kaggle-specific paths
    "datasets_dir": Path('/kaggle/input/arag-set/hallucination_risk_dataset.json'),  # Despite the key name, this is the dataset JSON file itself; it is opened directly
    "output_dir": Path("/kaggle/working/hallucination_predictor_model"),  # Kaggle working directory
}

# Quick testing mode: set TESTING_MODE = True for faster iterations.
TESTING_MODE = False
if TESTING_MODE:
    print("🧪 TESTING MODE: Using smaller dataset and fewer epochs")
    CONFIG.update({
        "num_epochs": 1,
        "batch_size": 4,
        "eval_batch_size": 4,
        "save_steps": 20,
        "eval_steps": 20,
        "logging_steps": 5,
        # The three sizes sum to 0.07 (<= 0.1), which puts create_data_splits()
        # into its testing-mode path: sub-sample first, no stratification.
        "train_size": 0.05,  # Use only small portion of data
        "val_size": 0.01,
        "test_size": 0.01,
    })

# Create output directory
CONFIG["output_dir"].mkdir(parents=True, exist_ok=True)

print(f"📋 Configuration loaded for model: {CONFIG['model_name']}")
print(f"📂 Dataset directory: {CONFIG['datasets_dir']}")
print(f"📁 Output directory: {CONFIG['output_dir']}")
log_device_info()

## Hallucination Predictor Class

In [None]:
class PredictorAgent:
    """
    Prediction Agent model trainer.

    Trains transformer models to predict a score from query text.
    Supports single-label regression with comprehensive evaluation.

    Scores are min-max normalized to [0, 1] before training; the original
    range is kept on the instance so predictions can be mapped back with
    ``denormalize_score``.
    """

    def __init__(self, config: Dict):
        """Initialize the predictor agent trainer.

        Args:
            config: Settings dict. ``model_name``, ``num_labels``,
                ``max_length``, ``seed``, ``output_dir`` and ``datasets_dir``
                are read here; ``train_size``, ``val_size`` and ``test_size``
                are read later by ``prepare_datasets``.
        """
        self.config = config
        self.model_name = config["model_name"]
        self.num_labels = config["num_labels"]
        self.max_length = config["max_length"]
        self.seed = config["seed"]

        # Set random seeds
        set_seed(self.seed)

        # Setup directories
        self.output_dir = Path(config["output_dir"])
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Despite the name, this path is opened directly as the dataset JSON file.
        self.datasets_dir = Path(config["datasets_dir"])

        # Model components
        self.tokenizer = None
        self.model = None
        self.trainer = None
        # Stays None: inputs are padded to max_length at tokenization time.
        self.data_collator = None

        # Original score range, filled in by normalize_score().
        self.min_score = None
        self.max_score = None

        print(f"🤗 [PREDICTOR AGENT] Initialized")
        print(f"  Model: {self.model_name}")
        print(f"  Labels: {self.num_labels}")
        print(f"  Max Length: {self.max_length}")
        print(f"  Output: {self.output_dir}")

    def load_processed_dataset(self) -> List[Dict]:
        """Load the processed dataset from the configured JSON file.

        Returns:
            The parsed JSON content (expected to be a list of record dicts).

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        dataset_path = self.datasets_dir

        with open(dataset_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        print(f"📚 [DATASET] Loaded {len(data)} samples ")
        return data

    def setup_tokenizer(self):
        """Initialize the tokenizer once; later calls are no-ops."""
        if self.tokenizer is not None:
            return

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name, use_fast=True
        )

        # Padding to max_length needs a pad token; fall back to EOS if missing.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print(f"🔤 [TOKENIZER] Initialized: {self.tokenizer.__class__.__name__}")
        print(f"  Vocab size: {self.tokenizer.vocab_size}")
        print(f"  Max length: {self.max_length}")

    def tokenize_data(self, data):
        """Tokenize a batch of examples for model input.

        Args:
            data: Batch mapping with a ``"question"`` column (texts) and a
                ``"score"`` column (regression targets), as supplied by
                ``Dataset.map(batched=True)``.

        Returns:
            Dict with ``input_ids``, ``attention_mask`` and ``labels``.
        """
        # Plain lists are returned (no return_tensors): Dataset.map stores the
        # result in its own format either way, so building torch tensors here
        # would only add a conversion round-trip.
        tokenized_data = self.tokenizer(
            data["question"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_attention_mask=True,
            add_special_tokens=True
        )
        return {
            "input_ids": tokenized_data["input_ids"],
            "attention_mask": tokenized_data["attention_mask"],
            "labels": data["score"]
        }

    def normalize_score(self, data: List[Dict]) -> List[Dict]:
        """Min-max normalize each record's ``"score"`` to the 0-1 range, in place.

        The original minimum and maximum are stored on the instance for
        ``denormalize_score``. If every score is identical, all normalized
        scores are set to 0.0 instead of dividing by zero.

        Args:
            data: Records with a numeric ``"score"`` field; modified in place.

        Returns:
            The same list that was passed in.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if not data:
            raise ValueError("Cannot normalize scores of an empty dataset")

        scores = [item["score"] for item in data]
        min_score, max_score = min(scores), max(scores)
        self.min_score = min_score
        self.max_score = max_score
        print(f"Original score range: {min_score} to {max_score}")

        score_range = max_score - min_score
        for item in data:
            if score_range:
                item["score"] = (item["score"] - min_score) / score_range
            else:
                item["score"] = 0.0
        return data

    def denormalize_score(self, norm_score: float) -> float:
        """Map a normalized score back to the original score range.

        Raises:
            RuntimeError: If ``normalize_score`` has not been called yet.
        """
        min_score = getattr(self, "min_score", None)
        max_score = getattr(self, "max_score", None)
        if min_score is None or max_score is None:
            raise RuntimeError(
                "normalize_score() must be called before denormalize_score()"
            )
        return norm_score * (max_score - min_score) + min_score

    def prepare_datasets(self) -> "DatasetDict":
        """Load, normalize, split and tokenize the data.

        Returns:
            A DatasetDict with ``train``, ``validation`` and ``test`` splits
            containing ``input_ids``, ``attention_mask`` and ``labels``.
        """
        # Load and preprocess data
        raw_data = self.normalize_score(self.load_processed_dataset())

        # Create splits
        train_data, val_data, test_data = create_data_splits(
            data=raw_data,
            train_size=self.config["train_size"],
            val_size=self.config["val_size"],
            test_size=self.config["test_size"],
            seed=self.seed,
            label_key="score",
            stratify=True
        )

        # Setup tokenizer
        self.setup_tokenizer()

        split_specs = (
            ("train", "Tokenizing train", train_data),
            ("validation", "Tokenizing validation", val_data),
            ("test", "Tokenizing test", test_data),
        )

        print("🔄 [TOKENIZATION] Tokenizing datasets...")
        tokenized = {}
        for split_name, progress_desc, records in split_specs:
            split = Dataset.from_list(records)
            # Drop every raw column so only the tokenizer outputs remain; using
            # the split's own column names avoids failing when the file lacks
            # (or adds) a field compared with a hard-coded list.
            tokenized[split_name] = split.map(
                self.tokenize_data,
                batched=True,
                desc=progress_desc,
                remove_columns=split.column_names,
            )

        datasets = DatasetDict(tokenized)

        print("✅ [DATASETS] Prepared tokenized datasets")
        return datasets

    def compute_metrics(self, eval_pred):
        """Compute regression metrics for 0-1 score prediction.

        Args:
            eval_pred: ``(predictions, labels)`` pair; both are flattened to
                1-D before comparison.

        Returns:
            Dict with ``mse``, ``mae``, ``rmse`` and ``r2`` as Python floats.
            When the labels have zero variance, ``r2`` is 1.0 for a perfect
            fit and 0.0 otherwise, rather than NaN/inf.
        """
        predictions, labels = eval_pred
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        predictions = np.asarray(predictions, dtype=np.float64).reshape(-1)
        labels = np.asarray(labels, dtype=np.float64).reshape(-1)

        errors = labels - predictions
        squared_errors = errors ** 2
        mse = float(np.mean(squared_errors))

        ss_res = float(np.sum(squared_errors))
        ss_tot = float(np.sum((labels - np.mean(labels)) ** 2))
        if ss_tot > 0.0:
            r2 = 1.0 - ss_res / ss_tot
        else:
            r2 = 1.0 if ss_res == 0.0 else 0.0

        return {
            "mse": mse,
            "mae": float(np.mean(np.abs(errors))),
            "rmse": float(np.sqrt(mse)),
            "r2": r2,
        }

    def setup_model(self):
        """Initialize the regression model once; later calls are no-ops."""
        if self.model is not None:
            return

        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
            problem_type="regression"
        )

        # Keep the embedding matrix in step with the tokenizer's vocabulary.
        if self.tokenizer is not None:
            self.model.resize_token_embeddings(len(self.tokenizer))

        print(f"🤖 [MODEL] Loaded regression model: {self.model_name}")
        print(f"  Parameters: {self.model.num_parameters():,}")
        print(f"  Score outputs: {self.num_labels} (regression [0-1])")

# Printed once the PredictorAgent class definition in this cell has executed.
print("✅ PredictorAgent class loaded!")

## Custom Trainer

In [None]:
class RegressionTrainer(Trainer):
    """Trainer that optimizes mean-squared error on a single regression output."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the MSE between the model's logits and the float labels.

        Args:
            model: Model called as ``model(**inputs)``; its output must expose
                ``logits``.
            inputs: Batch dict that includes ``"labels"``.
            return_outputs: If true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted for Trainer compatibility; unused.
        """
        labels = inputs.get("labels").float().reshape(-1)
        outputs = model(**inputs)
        # Flatten explicitly instead of a bare squeeze(): with a batch of one,
        # squeeze() yields a 0-d tensor and MSELoss then broadcasts it against
        # the 1-d labels with a shape-mismatch warning.
        logits = outputs.get('logits').reshape(-1)
        loss_fn = torch.nn.MSELoss()
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Printed once the RegressionTrainer class definition in this cell has executed.
print("Trainer loaded!")

## Training Function

In [None]:
def train_predictor(predictor: PredictorAgent, datasets: DatasetDict) -> Dict:
    """Train the prediction model, evaluate it on the test split and save everything.

    Args:
        predictor: Agent whose config, tokenizer and model are used. Its
            ``trainer`` attribute is set by this function.
        datasets: Tokenized splits with ``"train"``, ``"validation"`` and
            ``"test"`` keys.

    Returns:
        Results dict (also written to ``training_results.json`` in the output
        directory) with the training arguments, train/test metrics, dataset
        sizes, the score normalization range and run metadata.
    """
    config = predictor.config

    # Setup model and tokenizer
    predictor.setup_tokenizer()
    predictor.setup_model()

    # Auto-detect fp16 capability (enabled for CUDA compute capability >= 7)
    fp16 = False
    if torch.cuda.is_available():
        cuda_major = torch.cuda.get_device_capability()[0]
        fp16 = cuda_major >= 7
        # Report the actual decision rather than an unconditional check mark.
        print(f" FP16: {'✅' if fp16 else '❌'}, Cap: {cuda_major}")

    # Training arguments
    training_args = TrainingArguments(
        output_dir=str(predictor.output_dir),
        learning_rate=config["learning_rate"],
        num_train_epochs=config["num_epochs"],
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["eval_batch_size"],
        warmup_ratio=config["warmup_ratio"],
        weight_decay=config["weight_decay"],
        logging_steps=config["logging_steps"],
        eval_steps=config["eval_steps"],
        save_steps=config["save_steps"],
        eval_strategy="steps",
        save_strategy="steps",
        save_total_limit=3,
        metric_for_best_model="loss",
        greater_is_better=False,
        load_best_model_at_end=True,
        report_to="none",
        push_to_hub=False,
        fp16=fp16,
        dataloader_num_workers=2,
        dataloader_pin_memory=False,
        remove_unused_columns=True,
        seed=config["seed"],
        data_seed=config["seed"],
        gradient_checkpointing=True,
        optim="adafactor",
        # Overridable via config; defaults to the previous fixed value.
        gradient_accumulation_steps=config.get("gradient_accumulation_steps", 4),
        max_grad_norm=config["max_grad_norm"],
        lr_scheduler_type=config["lr_scheduler_type"]
    )

    # Initialize trainer
    predictor.trainer = RegressionTrainer(
        model=predictor.model,
        args=training_args,
        train_dataset=datasets["train"],
        eval_dataset=datasets["validation"],
        compute_metrics=predictor.compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=config["early_stopping_patience"])]
    )

    # Log training setup
    print("🚀 [TRAINING] Starting model training")
    print(f"  Model: {predictor.model_name}")
    print(f"  Device: {training_args.device}")
    print(f"  Learning rate: {config['learning_rate']}")
    print(f"  Epochs: {config['num_epochs']}")
    print(f"  Batch size: {config['batch_size']} (train) / {config['eval_batch_size']} (eval)")
    print(f"  Warmup ratio: {config['warmup_ratio']}")
    print(f"  Weight decay: {config['weight_decay']}")

    # Train model
    train_result = predictor.trainer.train()

    # Save model and tokenizer
    print("💾 [TRAINING] Saving model...")
    predictor.trainer.save_model()
    predictor.tokenizer.save_pretrained(predictor.output_dir)

    # Evaluate on test set
    print("📊 [EVALUATION] Evaluating on test set...")
    test_results = predictor.trainer.evaluate(datasets["test"], metric_key_prefix="test")

    # Compile results
    results = {
        "model_name": predictor.model_name,
        "task_type": "regression",
        "num_labels": predictor.num_labels,
        "training_args": training_args.to_dict(),
        "train_results": train_result.metrics,
        "test_results": test_results,
        "dataset_sizes": {
            "train": len(datasets["train"]),
            "validation": len(datasets["validation"]),
            "test": len(datasets["test"]),
        },
        # The model predicts normalized scores; persist the original range so
        # predictions can be mapped back after the notebook session ends.
        "score_range": {
            "min": getattr(predictor, "min_score", None),
            "max": getattr(predictor, "max_score", None),
        },
        "training_time": train_result.metrics.get("train_runtime", 0),
        "trained_at": datetime.now().isoformat(),
        "seed": config["seed"],
    }

    # Save results (default=str stringifies values json cannot encode natively)
    results_path = predictor.output_dir / "training_results.json"
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)

    # Log final results
    print("🎉 [TRAINING] Training completed successfully!")
    print(f"  Test MSE: {test_results.get('test_mse', 0):.4f}")
    print(f"  Test MAE: {test_results.get('test_mae', 0):.4f}")
    print(f"  Test RMSE: {test_results.get('test_rmse', 0):.4f}")
    print(f"  Test R2: {test_results.get('test_r2', 0):.4f}")
    print(f"  Training time: {train_result.metrics.get('train_runtime', 0):.1f}s")
    print(f"  Results saved: {results_path}")

    return results

# Printed once the train_predictor definition in this cell has executed.
print("✅ Training function loaded!")

## Run Training

In [None]:
# Initialize predictor (seeds the RNGs and creates the output directory)
print("🚀 Initializing Predictor Agent...")
predictor = PredictorAgent(CONFIG)

# Prepare datasets: load the JSON file, normalize scores, split and tokenize
print("\n📚 Preparing datasets...")
datasets = predictor.prepare_datasets()

# Start training; this also saves the model and evaluates on the test split
print("\n🏋️ Starting training...")
print("=" * 70)
results = train_predictor(predictor, datasets)
print("=" * 70)

# Summarize the test metrics returned by train_predictor(); a metric missing
# from the results is shown as 0.
print("\n🎯 Training Summary:")
print(f"Model: {results['model_name']}")
print(f"Final Test MSE: {results['test_results'].get('test_mse', 0):.4f}")
print(f"Final Test MAE: {results['test_results'].get('test_mae', 0):.4f}")
print(f"Final Test R2: {results['test_results'].get('test_r2', 0):.4f}")
print(f"Final Test RMSE: {results['test_results'].get('test_rmse', 0):.4f}")
print(f"Training Time: {results['training_time']:.1f}s")
print(f"\n📁 Model saved to: {predictor.output_dir}")

## Display Final Model Information

In [None]:
# Display model information (read-only; this cell does not write any files)
print("\n🤖 Model Information:")
print(f"Architecture: {predictor.model_name}")
print(f"Parameters: {predictor.model.num_parameters():,}")
print(f"Vocabulary Size: {predictor.tokenizer.vocab_size:,}")
print(f"Max Sequence Length: {predictor.max_length}")
print(f"Problem Type: Regression (0-1 risk scores)")

# Echo every CONFIG entry except the two path settings
print("\n📈 Training Configuration:")
for key, value in CONFIG.items():
    if key not in ['datasets_dir', 'output_dir']:
        print(f"  {key}: {value}")

print("\n✅ Predictor Agent Training Complete!")
print(f"📂 All files saved to: {predictor.output_dir}")
# Static reminder of the expected artifacts; the output directory is not
# actually inspected here.
print("\nFiles created:")
print("  - pytorch_model.bin (or model.safetensors)")
print("  - config.json")
print("  - tokenizer files")
print("  - training_results.json")