In [None]:
!pip install sentence-transformers --quiet

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import warnings
warnings.filterwarnings('ignore')

import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge
from collections import Counter
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, InputExample, losses
import random
import copy
import pickle

print("DA5401 - Data Challenge")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")


In [None]:
# ============================================================================
# CONFIGURATION
# ============================================================================

# Single source of truth for every tunable in the notebook. Later cells read
# these keys directly, so renaming a key requires updating its readers too.
CONFIG = {
    'data_dir': '/kaggle/input/da5401-2025-data-challenge',
    'output_dir': '/kaggle/working',
    'encoder_name': 'paraphrase-multilingual-mpnet-base-v2',

    # Training
    'batch_size': 64,
    'learning_rate': 1e-4,  # LR for the matching head (the only parameters the head optimizer sees)
    'epochs': 100,
    'dropout': 0.4,
    'patience': 20,
    'weight_decay': 1e-5,

    # Model
    'model_type': 'heteroscedastic',  # 'standard', 'ordinal' or 'heteroscedastic'; must agree with 'loss_type'
    'embedding_dim': 768,
    'hidden_dims': [512, 256, 128],

    # Advanced Features
    'fine_tune_encoder': True,
    'encoder_ft_epochs': 3,  # Epochs for encoder fine-tuning
    'encoder_ft_lr': 2e-5,  # Encoder LR (much smaller than the head LR)
    'encoder_ft_batch_size': 64,  # NOTE(review): the visible fine-tuning call passes batch_size=32, not this key

    'use_ensemble': False,
    'n_cv_folds': 5,
    'cv_random_state': 42,  # Random state for consistent folds
    'ensemble_configs': [],

    # Data Augmentation (synthetic negatives)
    'use_synthetic_negatives': True,  # Adds mismatched metric/text pairs with low scores
    'synthetic_ratio': 0.10,  # Fraction of the training set to add as synthetic samples
    'synthetic_score_range': (1, 6),  # Inclusive bounds passed to random.randint, i.e. scores 1..6

    # Loss
    'loss_type': 'heteroscedastic',  # 'ordinal', 'hybrid', 'focal', 'ce', or 'heteroscedastic'
    'focal_gamma': 2.0,
    'hybrid_mse_weight': 0.15,  # Weight of the MSE term inside HybridLoss
    'use_clipped_weights': True,
    'use_weights': True,  # Apply sample weights to loss
    # NOTE(review): the visible weight computation hard-codes a cap of 8.0 and an
    # exponent of 0.5; confirm whether max_weight / min_weight / sample_weight_temp
    # are read by a later cell.
    'max_weight': 3.0,
    'min_weight': 0.5,
    # Special weighting for high scores to prevent under-prediction
    'boost_high_scores': True,  # Additional weight boost for scores 9-10
    'sample_weight_temp': 0.5,

    # Checkpointing
    'resume_from_checkpoint': True,
    'checkpoint_dir': '/kaggle/working/checkpoints',
    'save_interval': 10,
    'auto_save': True,

    # Advanced
    'use_amp': True,
    'gradient_clip': 1.0,
    'min_lr': 1e-6,
    'lr_patience': 5,
}

# Make sure the checkpoint directory exists before any cell tries to write to it.
os.makedirs(CONFIG['checkpoint_dir'], exist_ok=True)

In [None]:
# ============================================================================
# ADVANCED LOSSES
# ============================================================================

class OrdinalRegressionLoss(nn.Module):
    """Cumulative-link ordinal loss for ordered classes.

    Column ``k`` of ``logits`` is trained (via binary cross-entropy) to predict
    ``P(score <= k)``, so the target for a sample with score ``s`` is 0 for
    ``k < s`` and 1 for ``k >= s``. This is the convention that
    ``OrdinalMetricMatchingModel.predict_score`` decodes.
    """

    def __init__(self, reduction='mean'):
        super().__init__()
        self.reduction = reduction

    def forward(self, logits, target, sample_weights=None, return_per_sample=False):
        """
        Args:
            logits: (batch_size, K) logits for the cumulative probabilities
                P(score <= k), k = 0..K-1 (K is 11 in this notebook).
            target: (batch_size,) integer targets in [0, K-1].
            sample_weights: (batch_size,) optional per-sample weights.
            return_per_sample: if True, return the per-sample losses without reduction.

        Returns:
            A (batch_size,) tensor when ``return_per_sample`` is True or
            ``reduction`` is neither 'mean' nor 'sum'; otherwise a scalar tensor.
        """
        # Build the cumulative targets in one vectorised comparison instead of a
        # Python loop with .item() (which forces a device sync per sample):
        # targets_cum[i, k] = 1  iff  k >= target[i].
        thresholds = torch.arange(logits.shape[1], device=logits.device)
        target_col = target.to(logits.device).view(-1, 1)
        targets_cum = (thresholds.unsqueeze(0) >= target_col).to(logits.dtype)

        bce_per_threshold = F.binary_cross_entropy_with_logits(
            logits, targets_cum, reduction='none'
        )
        # Average over the thresholds so every sample contributes one value.
        loss_per_sample = bce_per_threshold.mean(dim=1)

        if sample_weights is not None:
            loss_per_sample = loss_per_sample * sample_weights

        if return_per_sample:
            return loss_per_sample

        if self.reduction == 'mean':
            return loss_per_sample.mean()
        elif self.reduction == 'sum':
            return loss_per_sample.sum()
        else:
            return loss_per_sample


class FocalLoss(nn.Module):
    """Focal loss: cross-entropy down-weighted for examples the model already gets right."""

    def __init__(self, alpha=1.0, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, target):
        """Return the batch-mean focal loss for class ``logits`` against integer ``target``."""
        per_sample_ce = F.cross_entropy(logits, target, reduction='none')
        # exp(-CE) is the probability the model assigned to the true class.
        prob_true = torch.exp(-per_sample_ce)
        modulating = (1 - prob_true) ** self.gamma
        focal = self.alpha * modulating * per_sample_ce
        return focal.mean()


class WeightedFocalLoss(nn.Module):
    """Focal loss whose combined (sample x focal) weight is clipped to a fixed range."""

    def __init__(self, gamma=2.0, max_weight=5.0, min_weight=0.5):
        super().__init__()
        self.gamma = gamma
        self.max_weight = max_weight
        self.min_weight = min_weight

    def forward(self, logits, target, sample_weights):
        """Return the batch mean of CE scaled by clamp(sample_weight * focal_weight)."""
        per_sample_ce = F.cross_entropy(logits, target, reduction='none')
        prob_true = torch.exp(-per_sample_ce)
        focal_term = (1 - prob_true) ** self.gamma

        # Clip after combining so neither factor alone can push the weight out of range.
        scale = torch.clamp(sample_weights * focal_term, self.min_weight, self.max_weight)
        weighted = per_sample_ce * scale
        return weighted.mean()


class HybridLoss(nn.Module):
    """Cross-entropy over the 11 score classes plus an MSE penalty on the expected score."""

    def __init__(self, ce_weight=1.0, mse_weight=0.1):
        super().__init__()
        self.ce_loss = nn.CrossEntropyLoss(reduction='none')
        self.mse_weight = mse_weight
        self.ce_weight = ce_weight

    def forward(self, logits, target, sample_weights=None, return_per_sample=False):
        """
        Args:
            logits: (batch_size, 11) class logits
            target: (batch_size,) integer targets in [0, 10]
            sample_weights: (batch_size,) optional per-sample multipliers
            return_per_sample: when True, skip the mean and return one loss per sample

        Returns:
            (batch_size,) tensor if ``return_per_sample`` else a scalar tensor.
        """
        # Classification term, one value per sample.
        ce_term = self.ce_loss(logits, target)

        # Regression term: squared distance between the softmax-expected score and the label.
        class_probs = F.softmax(logits, dim=1)
        score_grid = torch.arange(11, device=logits.device).float()
        expected = torch.sum(class_probs * score_grid, dim=1)
        mse_term = (expected - target.float()) ** 2

        per_sample = self.ce_weight * ce_term + self.mse_weight * mse_term
        if sample_weights is not None:
            per_sample = per_sample * sample_weights

        return per_sample if return_per_sample else per_sample.mean()


class HeteroscedasticLoss(nn.Module):
    """Regression loss that scores a predicted mean together with a predicted log-variance."""

    def __init__(self):
        super().__init__()

    def forward(self, mean, logvar, target, sample_weights=None, return_per_sample=False):
        """
        Args:
            mean: (batch_size,) predicted mean scores
            logvar: (batch_size,) predicted log-variance
            target: (batch_size,) integer targets in [0, 10]
            sample_weights: (batch_size,) optional per-sample multipliers
            return_per_sample: when True, skip the mean and return one loss per sample

        Returns:
            (batch_size,) tensor if ``return_per_sample`` else a scalar tensor.
        """
        labels = target.float()
        sigma_sq = torch.exp(logvar)

        # Squared error is scaled by the inverse variance; adding logvar stops the
        # model from driving the loss down by predicting unbounded variance.
        inv_var = 1.0 / (sigma_sq + 1e-6)
        sq_err = (mean - labels) ** 2
        per_sample = inv_var * sq_err + logvar

        if sample_weights is not None:
            per_sample = per_sample * sample_weights

        return per_sample if return_per_sample else per_sample.mean()

In [None]:
# ============================================================================
# UNIFIED LOSS COMPUTATION
# ============================================================================

def compute_weighted_loss(criterion, logits_or_tuple, target, sample_weights):
    """
    Compute per-sample losses for any supported criterion, apply sample weights
    exactly once, and reduce to a scalar with a batch mean.

    Args:
        criterion: Loss instance (HybridLoss, OrdinalRegressionLoss, FocalLoss,
                   WeightedFocalLoss, HeteroscedasticLoss, nn.CrossEntropyLoss,
                   or any other callable ``criterion(input, target)``).
        logits_or_tuple: ``(mean, logvar)`` for HeteroscedasticLoss, otherwise a
                   logits tensor.
        target: LongTensor (B,)
        sample_weights: FloatTensor (B,) or None

    Returns:
        Scalar loss tensor. For an unknown criterion that cannot produce
        per-sample values, the criterion's own (unweighted) output is returned.
    """
    if isinstance(criterion, nn.CrossEntropyLoss):
        # The module reduces to a mean by default; call the functional form with
        # reduction='none' but keep the module's own configuration.
        loss_per_sample = F.cross_entropy(
            logits_or_tuple,
            target,
            weight=criterion.weight,
            ignore_index=criterion.ignore_index,
            reduction='none',
            label_smoothing=getattr(criterion, 'label_smoothing', 0.0),
        )
    elif isinstance(criterion, HeteroscedasticLoss):
        mean, logvar = logits_or_tuple
        loss_per_sample = criterion(mean, logvar, target, sample_weights=None, return_per_sample=True)
    elif isinstance(criterion, (HybridLoss, OrdinalRegressionLoss)):
        # Weights are applied below, so ask for the raw per-sample values here.
        loss_per_sample = criterion(logits_or_tuple, target, sample_weights=None, return_per_sample=True)
    elif isinstance(criterion, FocalLoss):
        ce = F.cross_entropy(logits_or_tuple, target, reduction='none')
        pt = torch.exp(-ce)
        loss_per_sample = criterion.alpha * (1 - pt) ** criterion.gamma * ce
    elif isinstance(criterion, WeightedFocalLoss):
        ce = F.cross_entropy(logits_or_tuple, target, reduction='none')
        pt = torch.exp(-ce)
        focal_weight = (1 - pt) ** criterion.gamma
        combined_weights = sample_weights * focal_weight if sample_weights is not None else focal_weight
        combined_weights = torch.clamp(combined_weights, criterion.min_weight, criterion.max_weight)
        # The sample weights are already folded into the clipped combined weight,
        # matching WeightedFocalLoss.forward; multiplying by them again would
        # square their effect and bypass the clipping.
        return (ce * combined_weights).mean()
    elif hasattr(criterion, 'reduction'):
        # Generic torch-style loss: temporarily switch off its reduction and
        # always restore the caller's setting, even if the call fails.
        old_reduction = criterion.reduction
        criterion.reduction = 'none'
        try:
            loss_per_sample = criterion(logits_or_tuple, target)
        except Exception:
            loss_per_sample = None
        finally:
            criterion.reduction = old_reduction
        if loss_per_sample is None:
            # Best effort: fall back to the criterion's own reduction (unweighted).
            return criterion(logits_or_tuple, target)
    else:
        # Last resort: broadcast the scalar loss so weighting still type-checks.
        loss_scalar = criterion(logits_or_tuple, target)
        try:
            loss_per_sample = loss_scalar.expand(logits_or_tuple.shape[0])
        except Exception:
            return loss_scalar

    if sample_weights is not None:
        loss_per_sample = loss_per_sample * sample_weights

    return loss_per_sample.mean()

In [None]:
# ============================================================================
# ORDINAL MODEL
# ============================================================================

class OrdinalMetricMatchingModel(nn.Module):
    """Ordinal regression head over a (metric embedding, text embedding) pair.

    The two embeddings are concatenated with a bilinear interaction term and
    passed through a stack of Linear -> LayerNorm -> ReLU -> Dropout blocks.
    The 11 outputs are cumulative logits, read as P(score <= k) for k = 0..10.
    """

    def __init__(self, embedding_dim=768, hidden_dims=[512, 256, 128], dropout=0.4):
        # NOTE: ``hidden_dims`` is only iterated, never mutated, so the shared
        # default list is safe here.
        super().__init__()
        self.embedding_dim = embedding_dim
        self.bilinear = nn.Bilinear(embedding_dim, embedding_dim, embedding_dim)

        self.fc_layers = nn.ModuleList()
        self.layer_norms = nn.ModuleList()
        input_dim = embedding_dim * 3  # metric + text + bilinear interaction

        for hidden_dim in hidden_dims:
            self.fc_layers.append(nn.Linear(input_dim, hidden_dim))
            self.layer_norms.append(nn.LayerNorm(hidden_dim))
            input_dim = hidden_dim

        self.output = nn.Linear(input_dim, 11)
        self.dropout = nn.Dropout(dropout)

    def forward(self, metric_emb, text_emb):
        """Return (batch, 11) cumulative logits for the given embedding pair."""
        bilinear_out = self.bilinear(metric_emb, text_emb)
        x = torch.cat([metric_emb, text_emb, bilinear_out], dim=1)

        # Every hidden block is identical, so one loop covers them all and an
        # empty ``hidden_dims`` simply feeds the features straight to the output.
        for fc, ln in zip(self.fc_layers, self.layer_norms):
            x = self.dropout(F.relu(ln(fc(x))))

        return self.output(x)

    def predict_score(self, metric_emb, text_emb):
        """Decode cumulative logits into an expected score in [0, 10].

        sigmoid(logits)[:, k] is read as P(score <= k); adjacent differences
        give the per-class probabilities P(score = k).
        """
        logits = self.forward(metric_emb, text_emb)
        cum_probs = torch.sigmoid(logits)

        # P(score = 0) = P(score <= 0); P(score = k) = P(score <= k) - P(score <= k-1).
        probs = torch.cat([cum_probs[:, :1], cum_probs[:, 1:] - cum_probs[:, :-1]], dim=1)

        # The cumulative outputs are not forced to be monotone, so clip negative
        # differences and renormalise before taking the expectation.
        probs = torch.clamp(probs, min=0.0)
        probs = probs / (probs.sum(dim=1, keepdim=True) + 1e-8)

        score_values = torch.arange(probs.shape[1], device=probs.device).float()
        expected_score = torch.sum(probs * score_values, dim=1)
        return torch.clamp(expected_score, 0.0, 10.0)


class StandardMetricMatchingModel(nn.Module):
    """Classification head over a (metric embedding, text embedding) pair.

    The two embeddings are concatenated with a bilinear interaction term and
    passed through a stack of Linear -> LayerNorm -> ReLU -> Dropout blocks.
    The 11 outputs are ordinary class logits for the scores 0..10.
    """

    def __init__(self, embedding_dim=768, hidden_dims=[512, 256, 128], dropout=0.4):
        # NOTE: ``hidden_dims`` is only iterated, never mutated, so the shared
        # default list is safe here.
        super().__init__()
        self.embedding_dim = embedding_dim
        self.bilinear = nn.Bilinear(embedding_dim, embedding_dim, embedding_dim)

        self.fc_layers = nn.ModuleList()
        self.layer_norms = nn.ModuleList()
        input_dim = embedding_dim * 3  # metric + text + bilinear interaction

        for hidden_dim in hidden_dims:
            self.fc_layers.append(nn.Linear(input_dim, hidden_dim))
            self.layer_norms.append(nn.LayerNorm(hidden_dim))
            input_dim = hidden_dim

        self.output = nn.Linear(input_dim, 11)
        self.dropout = nn.Dropout(dropout)

    def forward(self, metric_emb, text_emb):
        """Return (batch, 11) class logits for the given embedding pair."""
        bilinear_out = self.bilinear(metric_emb, text_emb)
        x = torch.cat([metric_emb, text_emb, bilinear_out], dim=1)

        # Every hidden block is identical, so one loop covers them all and an
        # empty ``hidden_dims`` simply feeds the features straight to the output.
        for fc, ln in zip(self.fc_layers, self.layer_norms):
            x = self.dropout(F.relu(ln(fc(x))))

        return self.output(x)

    def predict_score(self, metric_emb, text_emb):
        """Return the softmax-expected score (probability-weighted mean of 0..10)."""
        logits = self.forward(metric_emb, text_emb)
        probs = F.softmax(logits, dim=1)
        score_values = torch.arange(11, device=metric_emb.device).float()
        expected_score = torch.sum(probs * score_values, dim=1)
        return expected_score


class HeteroscedasticMatchingModel(nn.Module):
    """Heteroscedastic regression head: predicts a mean score and its log-variance.

    The two embeddings are concatenated with a bilinear interaction term and
    passed through a stack of Linear -> LayerNorm -> ReLU -> Dropout blocks,
    followed by two scalar heads (mean and log-variance).
    """

    def __init__(self, embedding_dim=768, hidden_dims=[512, 256, 128], dropout=0.4):
        # NOTE: ``hidden_dims`` is only iterated, never mutated, so the shared
        # default list is safe here.
        super().__init__()
        self.embedding_dim = embedding_dim
        self.bilinear = nn.Bilinear(embedding_dim, embedding_dim, embedding_dim)

        self.fc_layers = nn.ModuleList()
        self.layer_norms = nn.ModuleList()
        input_dim = embedding_dim * 3  # metric + text + bilinear interaction

        for hidden_dim in hidden_dims:
            self.fc_layers.append(nn.Linear(input_dim, hidden_dim))
            self.layer_norms.append(nn.LayerNorm(hidden_dim))
            input_dim = hidden_dim

        # Two scalar outputs: mean and log-variance
        self.mean_head = nn.Linear(input_dim, 1)
        self.logvar_head = nn.Linear(input_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, metric_emb, text_emb):
        """Return ``(mean, logvar)``, each of shape (batch,)."""
        bilinear_out = self.bilinear(metric_emb, text_emb)
        x = torch.cat([metric_emb, text_emb, bilinear_out], dim=1)

        # Every hidden block is identical, so one loop covers them all and an
        # empty ``hidden_dims`` simply feeds the features straight to the heads.
        for fc, ln in zip(self.fc_layers, self.layer_norms):
            x = self.dropout(F.relu(ln(fc(x))))

        mean = self.mean_head(x)
        logvar = self.logvar_head(x)

        # Keep the log-variance in [-10, 10] so exp(logvar) cannot overflow or
        # collapse to zero inside the loss.
        logvar = torch.clamp(logvar, -10.0, 10.0)

        return mean.squeeze(-1), logvar.squeeze(-1)

    def predict_score(self, metric_emb, text_emb):
        """Return the predicted mean score (the log-variance is discarded, no clamping)."""
        mean, _ = self.forward(metric_emb, text_emb)
        return mean

In [None]:
# ============================================================================
# SIMCSE-STYLE CONTRASTIVE LEARNING
# ============================================================================

def combine_text_fields(record):
    """Join a record's system prompt, prompt and response into one space-separated string.

    ``user_prompt`` takes precedence over ``prompt`` and ``response`` over
    ``expected_response``; missing or empty fields are skipped.
    """
    prompt_text = record.get('user_prompt') or record.get('prompt')
    answer_text = record.get('response') or record.get('expected_response')
    pieces = (record.get('system_prompt'), prompt_text, answer_text)
    return ' '.join(piece for piece in pieces if piece)


def fine_tune_encoder_simcse(text_encoder, train_data, device, epochs=3, batch_size=64, lr=2e-5):
    """
    SimCSE-style contrastive fine-tuning using the SentenceTransformer.fit() API.

    Each unique combined text is paired with itself; dropout inside the encoder
    yields two different views, and the other texts in the batch act as
    negatives (MultipleNegativesRankingLoss).

    Args:
        text_encoder: SentenceTransformer to fine-tune in place.
        train_data: list of record dicts accepted by ``combine_text_fields``.
        device: unused; kept for interface compatibility (fit() trains on the
            device the encoder already lives on).
        epochs: number of passes over the unique texts.
        batch_size: pairs per step (also the number of in-batch negatives + 1).
        lr: optimizer learning rate.

    Returns:
        The same ``text_encoder`` object. If fit() raises, the failure is
        printed and the encoder is returned in whatever state fit() left it.

    References:
    - https://sbert.net/examples/training/unsupervised_learning/README.html#simcse
    - https://www.pinecone.io/learn/series/nlp/fine-tune-sentence-transformers-mnr/
    """
    print(f"\n{'='*60}")
    print("SimCSE-Style Contrastive Fine-tuning of Text Encoder")
    print(f"{'='*60}")

    # Keep the trainer from trying to log to Weights & Biases and quiet transformers.
    os.environ['WANDB_DISABLED'] = 'true'
    os.environ['TRANSFORMERS_VERBOSITY'] = 'error'

    # De-duplicate (order-preserving): with in-batch negatives, a second copy of
    # the same text would be treated as a negative of itself. Duplicates are
    # expected here because synthetic negatives reuse the text of real samples.
    texts = list(dict.fromkeys(combine_text_fields(rec) for rec in train_data))
    if not texts:
        print("Warning: no training texts available; skipping encoder fine-tuning")
        return text_encoder
    print(f"Unique texts for fine-tuning: {len(texts)} (from {len(train_data)} records)")

    # SimCSE positive pair = the same sentence twice (different dropout masks).
    examples = [InputExample(texts=[t, t], label=1.0) for t in texts]

    from torch.utils.data import DataLoader as TorchDataLoader
    train_dataloader = TorchDataLoader(examples, shuffle=True, batch_size=batch_size, drop_last=False)

    # Warm up for ~10% of training with a floor of 100 steps, but never for
    # longer than the run itself (otherwise the LR would never reach ``lr``).
    num_steps = len(train_dataloader) * epochs
    warmup_steps = min(num_steps, max(100, num_steps // 10))

    # Prefer MultipleNegativesRankingLoss; fall back to ContrastiveLoss only if
    # it cannot be constructed.
    try:
        train_loss = losses.MultipleNegativesRankingLoss(model=text_encoder)
        loss_name = "MultipleNegativesRankingLoss"
        print(f"Using {loss_name} (recommended for large batches)")
    except Exception as e:
        train_loss = losses.ContrastiveLoss(model=text_encoder)
        loss_name = "ContrastiveLoss"
        print(f"Using {loss_name} (fallback): {e}")

    print(f"Total steps: {num_steps}, Warmup steps: {warmup_steps}")

    # fit() writes the fine-tuned encoder here when training finishes.
    output_dir_ft = os.path.join(CONFIG['output_dir'], 'encoder_finetune')
    os.makedirs(output_dir_ft, exist_ok=True)

    try:
        text_encoder.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=epochs,
            warmup_steps=warmup_steps,
            optimizer_params={'lr': lr},
            show_progress_bar=True,
            output_path=output_dir_ft,
            use_amp=CONFIG.get('use_amp', True)
        )
    except Exception as e:
        # Deliberate best effort: a failed fine-tune must not abort the notebook.
        print(f"Warning: SentenceTransformer.fit() failed: {e}")
        print("Skipping encoder fine-tuning, using pre-trained encoder as-is")
        return text_encoder

    print(f"Encoder fine-tuning completed!")
    print("="*60)

    return text_encoder

In [None]:
# ============================================================================
# DATASET WITH WEIGHTS
# ============================================================================

class WeightedDataset(Dataset):
    """Map-style dataset pairing each record's metric embedding with its text embedding.

    Labelled records (those with a ``'score'`` key) yield
    ``(metric_emb, text_emb, score, weight, idx)``; unlabelled records yield
    ``(metric_emb, text_emb, idx)``.
    """

    def __init__(self, data, metric_embeddings, metric_names_map, text_embeddings, score_weights):
        self.data = data
        self.metric_embeddings = torch.FloatTensor(metric_embeddings)
        self.metric_names_map = metric_names_map
        self.text_embeddings = torch.FloatTensor(text_embeddings)
        self.score_weights = score_weights

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        # Metric embeddings are looked up by name; text embeddings are row-aligned with ``data``.
        metric_vec = self.metric_embeddings[self.metric_names_map[sample['metric_name']]]
        text_vec = self.text_embeddings[idx]

        if 'score' not in sample:
            return metric_vec, text_vec, idx

        label = int(float(sample['score']))
        # An empty or missing weight table means uniform weighting.
        if self.score_weights:
            sample_weight = self.score_weights[label]
        else:
            sample_weight = 1.0
        return metric_vec, text_vec, label, sample_weight, idx



In [None]:
# ============================================================================
# CHECKPOINT MANAGEMENT
# ============================================================================

def save_checkpoint(model, optimizer, epoch, best_rmse, filepath):
    """Write model and optimizer state, training progress and CONFIG to ``filepath``."""
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_rmse': best_rmse,
            # Stored so a checkpoint records the settings it was trained with.
            'config': CONFIG,
        },
        filepath,
    )
    print(f"Checkpoint saved: {filepath}")

def load_checkpoint(filepath, model, optimizer=None):
    """Restore ``model`` (and optionally ``optimizer``) from a checkpoint file.

    Returns:
        ``(model, next_epoch, best_rmse)``, where ``next_epoch`` is the saved
        epoch plus one. When ``filepath`` does not exist nothing is loaded and
        ``(None, 0, inf)`` is returned.
    """
    if os.path.exists(filepath):
        # Load onto CPU first; load_state_dict copies into the model's own device.
        state = torch.load(filepath, map_location='cpu')
        model.load_state_dict(state['model_state_dict'])
        if optimizer:
            optimizer.load_state_dict(state['optimizer_state_dict'])
        next_epoch = state['epoch'] + 1
        return model, next_epoch, state['best_rmse']

    return None, 0, float('inf')

def find_latest_checkpoint(checkpoint_dir):
    """Return the checkpoint path to resume from, or None if there is none.

    Two fixed Kaggle input paths are tried first; otherwise the most recently
    modified ``.pth`` file inside ``checkpoint_dir`` is returned.
    """
    preferred = (
        '/kaggle/input/best-model-final/checkpoints/best_model_synth.pth',
        '/kaggle/input/best-model-final/best_model_synth.pth',
    )
    for candidate in preferred:
        if os.path.exists(candidate):
            print(f"Found checkpoint at: {candidate}")
            return candidate

    # No uploaded checkpoint: look in the regular checkpoint directory.
    if not os.path.exists(checkpoint_dir):
        return None

    names = [entry for entry in os.listdir(checkpoint_dir) if entry.endswith('.pth')]
    if not names:
        return None

    def modified_at(name):
        return os.path.getmtime(os.path.join(checkpoint_dir, name))

    # Oldest -> newest; the last entry is the most recently written file.
    newest = sorted(names, key=modified_at)[-1]
    return os.path.join(checkpoint_dir, newest)



In [None]:
# ============================================================================
# LOAD DATA
# ============================================================================

print("\nLoading data...")

# All inputs live in the configured dataset directory. (This was previously
# None, which made every os.path.join below raise TypeError.)
data_dir = CONFIG['data_dir']

# Training records: a list of dicts (metric_name, score, prompt/response fields).
with open(os.path.join(data_dir, 'train_data.json'), encoding='utf-8') as f:
    train_data = json.load(f)

# Metric names; their order defines the row index into the embedding matrix below.
with open(os.path.join(data_dir, 'metric_names.json'), encoding='utf-8') as f:
    metric_names = json.load(f)

metric_embeddings = np.load(os.path.join(data_dir, 'metric_name_embeddings.npy'))

print(f"Loaded {len(train_data)} training samples")


In [None]:
# ============================================================================
# OPTION 2: ADD SYNTHETIC NEGATIVES (BEFORE ENCODING)
# ============================================================================

if CONFIG.get('use_synthetic_negatives', False):
    print("\n" + "="*60)
    print("Generating Synthetic Negatives (Option 2)")
    print("="*60)

    # Get all metric names
    with open(os.path.join(CONFIG['data_dir'], 'metric_names.json'), encoding='utf-8') as f:
        metric_names_list = json.load(f)

    # Dedicated, seeded RNG: the synthetic samples (and therefore the saved CV
    # folds, which index into the extended list) are identical on every run,
    # and the global ``random`` state is left untouched.
    synthetic_rng = random.Random(CONFIG.get('cv_random_state', 42))
    synthetic_score_low, synthetic_score_high = CONFIG.get('synthetic_score_range', (1, 5))

    # Create synthetic negatives by misaligning metrics with prompts
    num_synthetic = int(len(train_data) * CONFIG.get('synthetic_ratio', 0.08))
    synthetic_negatives = []

    # NOTE: re-running this cell without reloading the data appends another
    # batch of synthetic samples on top of the previous one.
    print(f"Creating {num_synthetic} synthetic negative samples...")
    print("  (Misaligning metrics with prompt-response pairs to create low-fitness examples)")
    for _ in tqdm(range(num_synthetic), desc="Generating"):
        # Pick a random training sample
        base_sample = synthetic_rng.choice(train_data)
        synthetic = copy.deepcopy(base_sample)

        # Replace metric with a random OTHER metric (misalignment)
        # This creates low-fitness pairs that the model should predict low scores for
        other_metrics = [m for m in metric_names_list if m != synthetic['metric_name']]
        if not other_metrics:
            # Only one metric exists, so no mismatched pair can be built.
            continue
        synthetic['metric_name'] = synthetic_rng.choice(other_metrics)

        # Assign a low score drawn uniformly from the configured inclusive range,
        # stored as a float-formatted string like the real records.
        synthetic['score'] = str(float(synthetic_rng.randint(synthetic_score_low, synthetic_score_high)))

        synthetic_negatives.append(synthetic)

    # Add synthetic samples to training data
    train_data_extended = train_data + synthetic_negatives
    print(f"✓ Added {len(synthetic_negatives)} synthetic negatives")
    print(f"  Total training samples: {len(train_data)} → {len(train_data_extended)}")

    # Show distribution before/after
    scores_original = [int(float(rec['score'])) for rec in train_data]
    scores_extended = [int(float(rec['score'])) for rec in train_data_extended]
    counts_original = Counter(scores_original)
    counts_extended = Counter(scores_extended)

    print("\nDistribution BEFORE synthetic negatives:")
    for score in sorted(counts_original.keys()):
        count = counts_original[score]
        pct = 100 * count / len(scores_original)
        print(f"  Score {score}: {count:5d} ({pct:5.2f}%)")

    print("\nDistribution AFTER synthetic negatives:")
    for score in sorted(counts_extended.keys()):
        count = counts_extended[score]
        pct = 100 * count / len(scores_extended)
        print(f"  Score {score}: {count:5d} ({pct:5.2f}%)")

    # Use extended data
    train_data = train_data_extended
    print("="*60 + "\n")



In [None]:
# ============================================================================
# OPTION 1: COMPUTE IMPROVED SAMPLE WEIGHTS (SQRT INVERSE FREQUENCY)
# ============================================================================

print("\nComputing improved sample weights (sqrt inverse frequency)...")
# Integer score of every record currently in train_data. If the synthetic
# negatives cell ran, these counts already include the synthetic samples, even
# though the label printed below says "Original".
scores = [int(float(rec['score'])) for rec in train_data]
score_counts = Counter(scores)

# Maps each integer score 0..10 to its loss weight; consumed by WeightedDataset.
score_weights = {}
max_count = max(score_counts.values())

print("Original training distribution:")
for score in sorted(score_counts.keys()):
    count = score_counts[score]
    pct = 100 * count / len(scores)
    print(f"  Score {score}: {count:5d} ({pct:5.2f}%)")

print("\nComputed weights (sqrt inverse frequency, max 8.0):")
for score in range(11):
    # Scores that never occur are treated as if they occurred once.
    count = score_counts.get(score, 1)
    # Square root of the inverse relative frequency: the most common score gets
    # 1.0 and rarer scores get more, but less steeply than plain inverse frequency.
    weight = (max_count / count) ** 0.5
    # Cap at 8.0 so very rare scores cannot dominate the loss.
    # NOTE(review): CONFIG also defines 'max_weight' / 'min_weight'; they are not used here.
    weight = min(8.0, weight)

    # Boost high scores (9-10) if enabled to prevent under-prediction
    if CONFIG.get('boost_high_scores', False):
        if score == 9:
            # Give score 9 a moderate boost
            weight *= 1.5  # Boost by 50%
        elif score == 10:
            # Give score 10 a smaller boost
            weight *= 1.3  # Boost by 30%
        # Re-apply the cap after boosting
        weight = min(8.0, weight)

    score_weights[score] = weight
    boost_note = " [boosted]" if (CONFIG.get('boost_high_scores', False) and score in [9, 10]) else ""
    print(f"  Score {score}: count={count:4d}, weight={weight:.3f}{boost_note}")



In [None]:
# ============================================================================
# ENCODE TEXTS
# ============================================================================

print("\nLoading text encoder...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reuse a previously fine-tuned encoder when resuming, so the embeddings match
# the ones any resumed head checkpoint was trained on. SentenceTransformer.save()
# writes modules.json, so its presence distinguishes a complete save from the
# empty directory that a failed fine-tuning run leaves behind.
encoder_finetune_path = os.path.join(CONFIG['output_dir'], 'encoder_finetune')
encoder_path_to_use = None
if (
    CONFIG.get('resume_from_checkpoint', False)
    and CONFIG.get('fine_tune_encoder', False)
    and os.path.isfile(os.path.join(encoder_finetune_path, 'modules.json'))
):
    encoder_path_to_use = encoder_finetune_path
encoder_exists = encoder_path_to_use is not None

if encoder_exists:
    print(f"Found existing fine-tuned encoder at: {encoder_path_to_use}")
    print("Loading saved encoder (skipping fine-tuning)...")
    text_encoder = SentenceTransformer(encoder_path_to_use)
    text_encoder = text_encoder.to(device)
    print(" Loaded fine-tuned encoder from checkpoint")
else:
    # Load base encoder
    print(f"Note: Fine-tuned encoder not found at: {encoder_finetune_path}")
    print("  (This is normal for fresh runs - encoder will be fine-tuned)")
    print("  To skip fine-tuning, upload encoder_finetune/ as a Kaggle dataset")
    text_encoder = SentenceTransformer(CONFIG['encoder_name'])
    text_encoder = text_encoder.to(device)

    # SimCSE-style contrastive fine-tuning if enabled
    if CONFIG.get('fine_tune_encoder', False):
        print("\nStarting encoder fine-tuning (this will take a few minutes)...")
        # NOTE(review): batch_size is fixed at 32 here although CONFIG defines
        # 'encoder_ft_batch_size' (64); left unchanged in case 32 was chosen to
        # avoid running out of GPU memory.
        text_encoder = fine_tune_encoder_simcse(
            text_encoder, train_data, device,
            epochs=CONFIG.get('encoder_ft_epochs', 3),
            lr=CONFIG.get('encoder_ft_lr', 2e-5),
            batch_size=32
        )
    else:
        print(" Loaded base encoder (fine-tuning disabled)")

print("Encoding text sequences...")
combined_texts = [combine_text_fields(rec) for rec in train_data]

# Use explicit device string ('cuda' or 'cpu') for consistent behavior
device_str = 'cuda' if torch.cuda.is_available() else 'cpu'
text_embeddings = text_encoder.encode(
    combined_texts,
    show_progress_bar=True,
    batch_size=64,  # Reduced from 128 to prevent OOM
    device=device_str  # Explicit 'cuda' or 'cpu' for clarity and version compatibility
)

# Embeddings are precomputed, so release GPU memory before the head is trained.
# The base encoder object is dropped entirely; a fine-tuned encoder is kept in
# memory and only the allocator cache is cleared.
if not CONFIG.get('fine_tune_encoder', False):
    del text_encoder
    torch.cuda.empty_cache()
    print(" Text encoder freed from GPU (not needed for training)")
else:
    torch.cuda.empty_cache()
    print(" GPU cache cleared after encoding")


In [None]:
# ============================================================================
# PREPARE DATASET
# ============================================================================

# Metric name -> row index into metric_embeddings (rows follow the order of metric_names).
metric_names_map = {name: idx for idx, name in enumerate(metric_names)}
full_dataset = WeightedDataset(train_data, metric_embeddings, metric_names_map, text_embeddings, score_weights)
# Integer scores, used only to stratify the folds.
targets = [int(float(rec['score'])) for rec in train_data]

# Stratified folds with a fixed random_state so the split is repeatable for a
# given train_data ordering.
skf = StratifiedKFold(
    n_splits=CONFIG.get('n_cv_folds', 5),
    shuffle=True,
    random_state=CONFIG.get('cv_random_state', 42)
)
# Materialise every (train_idx, val_idx) pair up front (needed for ensemble training)
all_folds = list(skf.split(range(len(train_data)), targets))

# For single model: use first fold
train_idx, val_idx = all_folds[0]

# Persist the fold indices. They index into the current train_data list, so they
# are only meaningful together with the same (possibly augmented) data.
fold_indices_path = os.path.join(CONFIG['output_dir'], 'cv_folds.pkl')
with open(fold_indices_path, 'wb') as f:
    pickle.dump(all_folds, f)
print(f"✓ Saved CV fold indices to: {fold_indices_path}")

train_dataset = torch.utils.data.Subset(full_dataset, train_idx)
val_dataset = torch.utils.data.Subset(full_dataset, val_idx)

# NOTE(review): earlier comments recommended num_workers=0 for Kaggle, but the code uses 2 worker processes.
train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True, num_workers=2)  # shuffled every epoch
val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], shuffle=False, num_workers=2)  # fixed order for evaluation

print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}")



In [None]:
# ============================================================================
# INITIALIZE MODEL
# ============================================================================

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _resolve_model_type(model_type, loss_type):
    """Return the model architecture whose outputs the given loss can consume.

    The loss dictates the output format: 'heteroscedastic' needs (mean, logvar),
    'ordinal' needs cumulative logits, and every other loss ('hybrid', 'ce',
    'focal', or an unknown name that create_loss maps to cross-entropy) needs
    plain class logits from the standard model. A warning is printed whenever
    the configured ``model_type`` has to be overridden.
    """
    required_by_loss = {
        'heteroscedastic': 'heteroscedastic',
        'ordinal': 'ordinal',
    }
    required = required_by_loss.get(loss_type, 'standard')
    if model_type != required:
        print("  Warning: model_type '{}' is incompatible with '{}' loss; using '{}' model instead".format(
            model_type, loss_type, required))
    return required


# Select the model type from the loss type so the two are always compatible.
effective_model_type = _resolve_model_type(CONFIG['model_type'], CONFIG['loss_type'])

def create_model(model_type, seed=None):
    """
    Instantiate one matching model.

    Args:
        model_type: 'ordinal' or 'heteroscedastic' select the corresponding
            model class; any other value selects the standard model.
        seed: When not None, seeds torch, NumPy and `random` (in that order)
            before the model is built, so weight initialisation is repeatable.

    Returns:
        A new model constructed from CONFIG's 'embedding_dim', 'hidden_dims'
        and 'dropout' entries.
    """
    if seed is not None:
        for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
            seed_fn(seed)

    # All three architectures take the same constructor arguments.
    model_kwargs = {key: CONFIG[key] for key in ('embedding_dim', 'hidden_dims', 'dropout')}

    if model_type == 'ordinal':
        model_cls = OrdinalMetricMatchingModel
    elif model_type == 'heteroscedastic':
        model_cls = HeteroscedasticMatchingModel
    else:
        model_cls = StandardMetricMatchingModel
    return model_cls(**model_kwargs)

def create_loss(loss_type):
    """
    Build the training criterion named by `loss_type`.

    Recognised names are 'ordinal', 'hybrid', 'focal' and 'heteroscedastic';
    every other value yields `nn.CrossEntropyLoss`. The hybrid MSE weight is
    read from CONFIG['hybrid_mse_weight'] (default 0.1) and the focal gamma
    from CONFIG['focal_gamma'].
    """
    if loss_type == 'ordinal':
        return OrdinalRegressionLoss()

    if loss_type == 'hybrid':
        mse_weight = CONFIG.get('hybrid_mse_weight', 0.1)
        return HybridLoss(ce_weight=1.0, mse_weight=mse_weight)

    if loss_type == 'focal':
        return FocalLoss(gamma=CONFIG['focal_gamma'])

    if loss_type == 'heteroscedastic':
        return HeteroscedasticLoss()

    # Fallback for 'ce' and any unrecognised name.
    return nn.CrossEntropyLoss()

# Training objects for single-model mode. In ensemble mode they all stay None
# because each ensemble member builds its own model/optimizer later on.
model = None
criterion = None
optimizer = None
scheduler = None
scaler = None

if CONFIG.get('use_ensemble', False):
    print("Ensemble mode: Will train multiple models")
else:
    model = create_model(effective_model_type).to(device)
    criterion = create_loss(CONFIG['loss_type'])

    # Embeddings are precomputed, so the model only contains head parameters
    # and a single learning rate (the head LR) covers all of them.
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=CONFIG['learning_rate'],
        weight_decay=CONFIG['weight_decay'],
    )

    # Halve the LR when the monitored value (lower is better) stops improving.
    # NOTE(review): `verbose` is deprecated in recent PyTorch releases - confirm
    # against the installed version.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.5,
        patience=CONFIG['lr_patience'],
        min_lr=CONFIG['min_lr'],
        verbose=True,
    )

    # Gradient scaler is only created when mixed precision is requested.
    if CONFIG['use_amp']:
        scaler = torch.cuda.amp.GradScaler()

    print(f"Model: {effective_model_type} (configured: {CONFIG['model_type']})")
    print(f"Loss: {CONFIG['loss_type']}")
    print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
    print(f"Learning rate: {CONFIG['learning_rate']:.2e} (head LR)")



In [None]:
# ============================================================================
# RESUME FROM CHECKPOINT
# ============================================================================

# Defaults for a fresh run; overwritten below when a checkpoint is restored.
start_epoch = 0
best_rmse = float('inf')

# Resuming only applies to single-model training.
if not CONFIG.get('use_ensemble', False):
    if not CONFIG['resume_from_checkpoint']:
        print("Starting fresh training")
    else:
        checkpoint_path = find_latest_checkpoint(CONFIG['checkpoint_dir'])
        if not checkpoint_path:
            print("No checkpoint found, starting fresh")
        else:
            # load_checkpoint hands back the model together with the epoch to
            # continue from and the best RMSE recorded so far.
            model, start_epoch, best_rmse = load_checkpoint(checkpoint_path, model, optimizer)
            print(f"Resumed from epoch {start_epoch}")


In [None]:
# ============================================================================
# TRAINING FUNCTIONS
# ============================================================================

def train_epoch(model, dataloader, criterion, optimizer, device, scaler, gradient_clip):
    """
    Run one optimisation pass over `dataloader` and return the mean batch loss.

    Every batch is unpacked as (metric_emb, text_emb, target, weights, _). The
    model output (logits, or whatever tuple the model returns) is passed
    unchanged to `compute_weighted_loss`, which applies the per-sample weights.

    Args:
        model: Module called as `model(metric_emb, text_emb)`.
        dataloader: Iterable of 5-element batches.
        criterion: Loss object forwarded to `compute_weighted_loss`.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scaler: Optional gradient scaler; when truthy the loss is scaled before
            backward and the optimizer is stepped through the scaler.
        gradient_clip: Max gradient norm; falsy disables clipping.

    Returns:
        float: Sum of per-batch losses divided by the number of batches seen.

    Raises:
        ValueError: If `dataloader` yields no batches.
    """
    model.train()
    use_weights = CONFIG.get('use_weights', True)
    total_loss = 0.0
    num_batches = 0

    for batch in tqdm(dataloader, desc="Training"):
        metric_emb, text_emb, target, weights, _ = batch
        metric_emb = metric_emb.to(device)
        text_emb = text_emb.to(device)
        target = target.to(device)
        weights = weights.to(device) if use_weights else None

        optimizer.zero_grad()

        # The forward call is the same for every criterion; the loss helper is
        # responsible for interpreting the model output.
        outputs = model(metric_emb, text_emb)
        loss = compute_weighted_loss(criterion, outputs, target, weights)

        if scaler:
            # NOTE(review): no autocast context is entered in this function, so
            # the scaler only rescales the loss/gradients.
            scaler.scale(loss).backward()
            if gradient_clip:
                # Gradients must be unscaled before their norm is measured.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            if gradient_clip:
                torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)
            optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Count batches instead of calling len(): this also works for loaders
    # without __len__ and turns the empty case into an explicit error.
    if num_batches == 0:
        raise ValueError("train_epoch received an empty dataloader")

    return total_loss / num_batches

def validate(model, dataloader, criterion, device, verbose=True):
    """
    Score `model` on `dataloader` and return (rmse, mae).

    Predictions come from `model.predict_score`, are clamped to [0, 10] and
    then rounded before the metrics are computed. With `verbose=True` a
    per-score RMSE table and a predicted-vs-actual score histogram are printed.
    `criterion` is accepted for signature compatibility but not used.
    """
    model.eval()
    collected_preds, collected_targets = [], []

    with torch.no_grad():
        for metric_emb, text_emb, target, _, _ in tqdm(dataloader, desc="Validating", disable=not verbose):
            batch_scores = model.predict_score(metric_emb.to(device), text_emb.to(device))
            # Keep predictions inside the valid score range [0, 10].
            batch_scores = torch.clamp(batch_scores, 0.0, 10.0)
            collected_preds.extend(batch_scores.cpu().numpy())
            collected_targets.extend(target.cpu().numpy())

    all_preds = np.array(collected_preds)
    all_targets = np.array(collected_targets)

    # Metrics are computed on rounded predictions.
    all_preds_rounded = np.round(all_preds).clip(0, 10)

    rmse = np.sqrt(mean_squared_error(all_targets, all_preds_rounded))
    mae = mean_absolute_error(all_targets, all_preds_rounded)

    if not verbose:
        return rmse, mae

    banner = "=" * 60

    # ---- RMSE broken down by true score ---------------------------------
    print("\n" + banner)
    print("PER-BIN RMSE ANALYSIS")
    print(banner)
    for score in range(11):
        in_bin = all_targets == score
        count = np.sum(in_bin)
        if not count > 0:
            continue
        bin_rmse = np.sqrt(mean_squared_error(all_targets[in_bin], all_preds_rounded[in_bin]))
        print(f"  Score {score:2d}: RMSE={bin_rmse:5.2f}, Count={count:4d}")

    # ---- How often each score is predicted vs. how often it occurs ------
    print("\n" + banner)
    print("PREDICTION DISTRIBUTION")
    print(banner)
    pred_dist = Counter(np.round(all_preds).clip(0, 10).astype(int))
    target_dist = Counter(all_targets.astype(int))
    n_preds = len(all_preds)
    n_targets = len(all_targets)

    print(f"{'Score':<8} {'Predicted':<12} {'Actual':<12} {'Diff':<8}")
    print("-" * 60)
    for score in range(11):
        pred_count = pred_dist.get(score, 0)
        target_count = target_dist.get(score, 0)
        if n_preds > 0:
            pred_pct = 100 * pred_count / n_preds
        else:
            pred_pct = 0
        if n_targets > 0:
            target_pct = 100 * target_count / n_targets
        else:
            target_pct = 0
        diff = pred_pct - target_pct
        print(f"{score:<8} {pred_count:5d} ({pred_pct:5.1f}%) {target_count:5d} ({target_pct:5.1f}%) {diff:+6.1f}%")
    print(banner)

    return rmse, mae


In [None]:
# ============================================================================
# TRAINING LOOP
# ============================================================================

# ============================================================================
# ENSEMBLE TRAINING
# ============================================================================

def compute_engineered_features(metric_embeddings, text_embeddings, metric_indices, standardize=True, mu=None, sigma=None):
    """
    Compute the nine hand-crafted pair features used by the meta-learner.

    Column order: cosine similarity, dot product, L1 distance, L2 distance,
    mean of the element-wise product, then mean / std / max / min of the text
    embedding.

    Args:
        metric_embeddings: (M, D) array of metric embeddings.
        text_embeddings: (N, D) array of text embeddings.
        metric_indices: (N,) sequence of row indices into `metric_embeddings`,
            one per sample (list or array).
        standardize: If True, z-score each feature column.
        mu: Optional (F,) per-column means to standardise with. When None the
            means of the current batch are used.
        sigma: Optional (F,) per-column scales to standardise with. When None
            the batch standard deviation (+ eps) is used. Pass training-set
            `mu`/`sigma` at inference time so train and test share one scale.

    Returns:
        features: (N, 9) float array, standardised if requested.
    """
    eps = 1e-8
    # Accept plain Python lists as well as arrays.
    metric_indices = np.asarray(metric_indices, dtype=np.intp)
    metric_embs = metric_embeddings[metric_indices]  # (N, D)

    dot_prod = np.sum(metric_embs * text_embeddings, axis=1)
    # eps keeps the cosine denominator away from zero for all-zero embeddings.
    metric_norm = np.linalg.norm(metric_embs, axis=1) + eps
    text_norm = np.linalg.norm(text_embeddings, axis=1) + eps
    diff = metric_embs - text_embeddings

    columns = [
        dot_prod / (metric_norm * text_norm),             # 1. cosine similarity
        dot_prod,                                         # 2. raw dot product
        np.sum(np.abs(diff), axis=1),                     # 3. L1 distance
        np.linalg.norm(diff, axis=1),                     # 4. L2 distance
        np.mean(metric_embs * text_embeddings, axis=1),   # 5. element-wise product mean
        np.mean(text_embeddings, axis=1),                 # 6. text embedding statistics
        np.std(text_embeddings, axis=1),
        np.max(text_embeddings, axis=1),
        np.min(text_embeddings, axis=1),
    ]
    feats = np.concatenate([col.reshape(-1, 1) for col in columns], axis=1)

    # Standardize features (important for meta-learner stability)
    if standardize:
        if mu is None:
            mu = np.nanmean(feats, axis=0)
        else:
            mu = np.asarray(mu, dtype=float)
        if sigma is None:
            sigma = np.nanstd(feats, axis=0) + eps
        else:
            # Guard against a zero scale in caller-supplied statistics.
            sigma = np.maximum(np.asarray(sigma, dtype=float), eps)
        feats = (feats - mu) / sigma

    return feats


def train_single_fold_model(model, train_loader, val_loader, device, model_type, loss_type, config):
    """
    Train `model` on one fold with early stopping and return it with its
    out-of-fold predictions.

    Args:
        model: Model already placed on `device`.
        train_loader: Loader for the fold's training split.
        val_loader: Loader for the fold's validation split; must not shuffle,
            because the returned predictions follow its iteration order.
        device: Device used for training and prediction.
        model_type: Unused here; kept for call compatibility.
        loss_type: 'hybrid', 'heteroscedastic', 'ordinal' or 'focal'; any other
            value falls back to cross-entropy.
        config: Per-member config dict; unused here (optimisation settings are
            read from the global CONFIG).

    Returns:
        trained_model: `model` with the best-validation-RMSE weights loaded.
        val_preds: 1-D array of clamped [0, 10] predictions for the validation
            split, in `val_loader` order.
    """
    # Create loss
    if loss_type == 'hybrid':
        # NOTE(review): the default MSE weight here (0.15) differs from the one
        # used by create_loss (0.1) when CONFIG has no 'hybrid_mse_weight'.
        criterion = HybridLoss(ce_weight=1.0, mse_weight=CONFIG.get('hybrid_mse_weight', 0.15))
    elif loss_type == 'heteroscedastic':
        criterion = HeteroscedasticLoss()
    elif loss_type == 'ordinal':
        criterion = OrdinalRegressionLoss()
    elif loss_type == 'focal':
        # Mirrors create_loss; without this branch a member configured with
        # focal loss was silently trained with plain cross-entropy.
        criterion = FocalLoss(gamma=CONFIG['focal_gamma'])
    else:
        criterion = nn.CrossEntropyLoss()

    # Create optimizer
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=CONFIG['learning_rate'],
        weight_decay=CONFIG['weight_decay']
    )

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=CONFIG['lr_patience'],
        min_lr=CONFIG['min_lr'], verbose=False
    )

    scaler = torch.cuda.amp.GradScaler() if CONFIG['use_amp'] else None

    # Train with early stopping on validation RMSE
    best_rmse = float('inf')
    patience_counter = 0
    best_model_state = None

    for epoch in range(CONFIG['epochs']):
        train_epoch(model, train_loader, criterion, optimizer, device, scaler, CONFIG.get('gradient_clip'))
        val_rmse, _ = validate(model, val_loader, criterion, device, verbose=False)

        scheduler.step(val_rmse)

        if val_rmse < best_rmse:
            best_rmse = val_rmse
            patience_counter = 0
            # Snapshot the weights on CPU; clone() detaches the copy from the
            # live parameters, which keep changing in later epochs.
            best_model_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        else:
            patience_counter += 1
            if patience_counter >= CONFIG['patience']:
                break

    # Restore the best snapshot (absent only if no epoch ever improved on inf)
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    # Out-of-fold predictions from the restored model
    model.eval()
    val_preds = []

    with torch.no_grad():
        for batch in val_loader:
            metric_emb, text_emb, _, _, _ = batch
            metric_emb = metric_emb.to(device)
            text_emb = text_emb.to(device)

            scores = model.predict_score(metric_emb, text_emb)
            scores = torch.clamp(scores, 0.0, 10.0)
            val_preds.extend(scores.cpu().numpy())

    val_preds = np.array(val_preds)

    return model, val_preds


def _dump_pickle(obj, path):
    """Pickle `obj` to `path`, overwriting any existing file."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def train_ensemble_pipeline(all_folds, full_dataset, train_data, metric_embeddings, text_embeddings, metric_names_map, device):
    """
    Complete ensemble training pipeline:
    1. Collect OOF predictions for all models across all folds
    2. Compute engineered features
    3. Train meta-learner on OOF predictions + features
    4. Train calibrator
    5. Print diagnostics and save everything for inference

    Args:
        all_folds: Sequence of (train_idx, val_idx) index arrays.
        full_dataset: Dataset indexed by the fold indices.
        train_data: Sequence of records; 'score' and 'metric_name' are read.
        metric_embeddings: (M, D) array of metric embeddings.
        text_embeddings: (N, D) array of text embeddings, one row per record.
        metric_names_map: Mapping from metric name to row in `metric_embeddings`.
        device: Device models are trained on.

    Returns:
        trained_models: Dict model name -> list of per-fold dicts
            ('fold', 'train_idx', 'val_idx', 'model_state' on CPU)
        meta_learner: Trained Ridge meta-learner
        calibrator: IsotonicRegression or LogisticRegression calibrator, or
            None when CONFIG['use_calibration'] is false
        oof_predictions: Dict model name -> (N,) OOF predictions (NaN = none)
        oof_targets: (N,) true targets (NaN = sample in no validation fold)
        engineered_features: (N, F) standardised features, or None

    Raises:
        ValueError: If CONFIG['ensemble_configs'] is empty, if
            CONFIG['meta_method'] is not 'ridge', if a member's dropout is
            outside [0, 1], or if no sample has both a target and a prediction.
    """
    from torch.utils.data import DataLoader, Subset

    print("\n" + "="*60)
    print("ENSEMBLE TRAINING PIPELINE")
    print("="*60)

    n_samples = len(train_data)
    ensemble_configs = CONFIG.get('ensemble_configs', [])
    meta_method = CONFIG.get('meta_method', 'ridge')

    # Fail fast on configurations that could only fail after all folds have
    # been trained (empty member list, unimplemented meta-learner).
    if not ensemble_configs:
        raise ValueError("CONFIG['ensemble_configs'] is empty: at least one ensemble member is required")
    if meta_method != 'ridge':
        raise ValueError(f"Unsupported meta_method {meta_method!r}: only 'ridge' is implemented")

    oof_predictions = {}
    oof_targets = np.full(n_samples, np.nan, dtype=float)  # Use NaN sentinel
    trained_models = {}

    # Initialize OOF arrays for each model with NaN sentinel (not zeros - zero is valid prediction)
    for config in ensemble_configs:
        model_name = config['name']
        oof_predictions[model_name] = np.full(n_samples, np.nan, dtype=float)
        trained_models[model_name] = []

    # Get metric indices for engineered features
    metric_indices = [metric_names_map[rec['metric_name']] for rec in train_data]

    # Step 1: Collect OOF predictions for each fold
    print("\n" + "="*60)
    print("STEP 1: Collecting OOF Predictions")
    print("="*60)

    for fold_idx, (train_idx, val_idx) in enumerate(all_folds):
        print(f"\n{'='*60}")
        print(f"Fold {fold_idx+1}/{len(all_folds)}")
        print(f"{'='*60}")
        print(f"Train: {len(train_idx)}, Val: {len(val_idx)}")

        # Create train/val datasets for this fold
        train_dataset = Subset(full_dataset, train_idx)
        val_dataset = Subset(full_dataset, val_idx)

        train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)
        # shuffle=False keeps predictions aligned with val_idx
        val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], shuffle=False)

        # Store targets for this fold
        oof_targets[val_idx] = np.array([int(float(train_data[i]['score'])) for i in val_idx])

        # Train each model type on this fold
        for config in ensemble_configs:
            model_name = config['name']
            model_type = config['type']
            loss_type = config.get('loss_type', 'hybrid')
            seed = config.get('seed', 42)
            dropout = config.get('dropout', CONFIG['dropout'])

            print(f"\nTraining {model_name} (seed={seed}, dropout={dropout}) on fold {fold_idx+1}...")

            # Create model
            model = create_model(model_type, seed=seed)

            # Apply the member-specific dropout rate to every nn.Dropout layer.
            # Replacing a top-level `model.dropout` attribute would not reach
            # dropout layers nested in other containers.
            # NOTE(review): assumes the model classes implement dropout with
            # nn.Dropout modules - confirm against the model definitions.
            if not 0.0 <= dropout <= 1.0:
                raise ValueError(f"dropout for {model_name} must be in [0, 1], got {dropout}")
            for module in model.modules():
                if isinstance(module, nn.Dropout):
                    module.p = dropout

            model = model.to(device)

            # Train model on this fold and get OOF predictions
            trained_model, val_preds = train_single_fold_model(
                model, train_loader, val_loader, device,
                model_type, loss_type, config
            )

            # Store OOF predictions (aligned by original indices)
            oof_predictions[model_name][val_idx] = val_preds

            # Store trained model (for inference later)
            # Store state_dict on CPU to save GPU memory
            trained_models[model_name].append({
                'fold': fold_idx,
                'train_idx': train_idx,
                'val_idx': val_idx,
                'model_state': {k: v.cpu().clone() for k, v in trained_model.state_dict().items()}  # Move to CPU
            })

            # Free GPU memory after storing state_dict
            del trained_model
            torch.cuda.empty_cache()

            # Calculate RMSE for this fold
            val_rmse = np.sqrt(mean_squared_error(oof_targets[val_idx], val_preds))
            print(f"✓ {model_name} Fold {fold_idx+1} RMSE: {val_rmse:.4f}")

    # Step 2: Compute engineered features
    print("\n" + "="*60)
    print("STEP 2: Computing Engineered Features")
    print("="*60)

    engineered_features = None
    train_feature_stats = None
    if CONFIG.get('use_engineered_features', True):
        print("Computing engineered features...")
        # Compute RAW features first so the statistics saved for inference
        # describe the raw feature scale. Taking them from the standardised
        # features would always yield mean ~0 / std ~1.
        raw_features = compute_engineered_features(
            metric_embeddings, text_embeddings, metric_indices, standardize=False
        )
        train_feature_stats = {
            'mu': np.nanmean(raw_features, axis=0),
            'sigma': np.nanstd(raw_features, axis=0) + 1e-8
        }
        engineered_features = (raw_features - train_feature_stats['mu']) / train_feature_stats['sigma']
        print(f" Engineered features shape: {engineered_features.shape}")

        # Save for inference
        features_path = os.path.join(CONFIG['output_dir'], 'engineered_features.npy')
        np.save(features_path, engineered_features)
        print(f" Saved engineered features to: {features_path}")

    # Step 3: Train meta-learner
    print("\n" + "="*60)
    print("STEP 3: Training Meta-Learner")
    print("="*60)

    from sklearn.preprocessing import StandardScaler

    # Stack OOF predictions
    model_names = list(oof_predictions.keys())
    pred_matrix = np.column_stack([oof_predictions[name] for name in model_names])

    # Add engineered features if provided
    X_oof = pred_matrix
    if engineered_features is not None:
        X_oof = np.column_stack([pred_matrix, engineered_features])

    y_oof = oof_targets

    # NaN-aware filtering: keep rows where at least one model predicted AND a
    # target exists. Only the prediction columns are inspected; engineered
    # features are never NaN-sentinelled, so including them would make the
    # filter always pass.
    has_prediction = ~(np.isnan(pred_matrix).all(axis=1))
    has_target = ~np.isnan(y_oof)
    valid_mask = has_prediction & has_target
    if not valid_mask.any():
        raise ValueError("No sample has both a target and at least one model prediction")
    X = X_oof[valid_mask].copy()
    y = y_oof[valid_mask].copy()

    print(f"Meta-learner training samples: {len(X)} (after NaN filtering)")
    print(f"Meta-learner features: {X.shape[1]} ({len(model_names)} models + {engineered_features.shape[1] if engineered_features is not None else 0} engineered)")

    # Fill NaNs with column means (train-time imputation)
    col_mean = np.nanmean(X, axis=0)
    nan_mask = np.isnan(X)
    if nan_mask.any():
        print(f"  Imputing {nan_mask.sum()} NaN values with column means")
        X[nan_mask] = np.take(col_mean, np.where(nan_mask)[1])

    # Standardize features (important for Ridge stability)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    print(f"✓ Features standardized using StandardScaler")

    # Train Ridge meta-learner (the only supported method, validated above).
    # Predictions below are made on the same rows the Ridge was fitted on.
    meta_learner = Ridge(alpha=1.0)
    meta_learner.fit(X_scaled, y)
    meta_oof_preds = meta_learner.predict(X_scaled)

    print(f"\nMeta-learner (Ridge) coefficients:")
    for i, name in enumerate(model_names):
        print(f"  {name}: {meta_learner.coef_[i]:.4f}")
    if engineered_features is not None:
        eng_coef_sum = np.sum(meta_learner.coef_[len(model_names):])
        print(f"  Engineered features (sum): {eng_coef_sum:.4f}")

    # Calculate meta-learner RMSE (raw and rounded)
    meta_oof_rmse_raw = np.sqrt(mean_squared_error(y, meta_oof_preds))
    meta_oof_preds_rounded = np.round(meta_oof_preds).clip(0, 10)
    meta_oof_rmse_rounded = np.sqrt(mean_squared_error(y, meta_oof_preds_rounded))

    print(f"\nMeta-learner RMSE (raw): {meta_oof_rmse_raw:.4f}")
    print(f"Meta-learner RMSE (rounded): {meta_oof_rmse_rounded:.4f}")

    # Compare with best base model (using raw RMSE for comparison)
    base_rmses = {}
    for name in model_names:
        mask = ~np.isnan(oof_predictions[name]) & has_target  # NaN-aware masking
        if mask.sum() > 0:
            base_rmses[name] = np.sqrt(mean_squared_error(oof_targets[mask], oof_predictions[name][mask]))

    if base_rmses:
        best_base_rmse = min(base_rmses.values())
        best_base_name = min(base_rmses, key=base_rmses.get)
        print(f"\nBest base model ({best_base_name}) RMSE (raw): {best_base_rmse:.4f}")
        print(f"Meta-learner RMSE (raw): {meta_oof_rmse_raw:.4f}")
        print(f"Meta-learner improvement: {best_base_rmse - meta_oof_rmse_raw:.4f}")

        if meta_oof_rmse_raw < best_base_rmse:
            print(" Meta-learner improves over best base model!")
        else:
            print(" Meta-learner does not improve, consider simpler blending")

    # Step 4: Train calibrator
    calibrator = None
    calibrated_oof_preds = None
    if CONFIG.get('use_calibration', True):
        print("\n" + "="*60)
        print("STEP 4: Training Calibrator")
        print("="*60)

        from sklearn.isotonic import IsotonicRegression

        # Use raw meta-learner predictions (not rounded) for calibration
        X_cal = meta_oof_preds.reshape(-1, 1)
        y_cal = y  # Raw targets (already filtered to valid_mask)

        calibration_method = CONFIG.get('calibration_method', 'isotonic')
        if calibration_method == 'isotonic':
            calibrator = IsotonicRegression(out_of_bounds='clip')
            calibrator.fit(X_cal.flatten(), y_cal)
            calibrated_oof_preds = calibrator.transform(meta_oof_preds)
        else:
            # Platt-style scaling (LogisticRegression) over the integer scores.
            from sklearn.linear_model import LogisticRegression
            calibrator = LogisticRegression()
            calibrator.fit(X_cal, y_cal.astype(int))
            # Expected score = sum_k P(class k) * class value. Using classes_
            # keeps this correct when some scores never occur in the data and
            # leaves the result on the same 0-10 scale as the targets.
            calibrated_oof_preds = calibrator.predict_proba(X_cal).dot(calibrator.classes_)

        # Calculate calibrated RMSE (raw and rounded)
        calibrated_rmse_raw = np.sqrt(mean_squared_error(y_cal, calibrated_oof_preds))
        calibrated_oof_preds_rounded = np.round(calibrated_oof_preds).clip(0, 10)
        calibrated_rmse_rounded = np.sqrt(mean_squared_error(y_cal, calibrated_oof_preds_rounded))

        print(f"Meta-learner OOF RMSE (raw): {meta_oof_rmse_raw:.4f}")
        print(f"Meta-learner OOF RMSE (rounded): {meta_oof_rmse_rounded:.4f}")
        print(f"Calibrated OOF RMSE (raw): {calibrated_rmse_raw:.4f}")
        print(f"Calibrated OOF RMSE (rounded): {calibrated_rmse_rounded:.4f}")
        print(f"Calibration improvement (raw): {meta_oof_rmse_raw - calibrated_rmse_raw:.4f}")
        print(f"Calibration improvement (rounded): {meta_oof_rmse_rounded - calibrated_rmse_rounded:.4f}")

        if calibrated_rmse_raw < meta_oof_rmse_raw:
            print("  Calibration improves predictions (raw RMSE)!")
        else:
            print("  Calibration does not improve (raw RMSE), may skip in production")

    # Step 5: Diagnostic report
    print("\n" + "="*60)
    print("STEP 5: Diagnostic Report")
    print("="*60)

    # Per-bin RMSE
    print("\nPer-bin RMSE (by score):")
    print(f"{'Score':<8} " + " ".join([f"{name[:10]:<12}" for name in model_names]) + " Meta")
    print("-" * (8 + 13 * (len(model_names) + 1)))

    for score in range(11):
        mask = oof_targets == score
        if mask.sum() > 0:
            row = f"{score:<8} "
            for name in model_names:
                # Skip samples this model never predicted (NaN sentinel).
                member_mask = mask & ~np.isnan(oof_predictions[name])
                if member_mask.sum() > 0:
                    preds = oof_predictions[name][member_mask]
                    targets = oof_targets[member_mask]
                    bin_rmse = np.sqrt(mean_squared_error(targets, preds))
                    row += f"{bin_rmse:>12.3f} "
                else:
                    row += f"{'N/A':>12} "

            # Meta-learner per-bin RMSE: meta_oof_preds / y only cover the rows
            # in valid_mask, so translate the score mask into that row space.
            score_indices_in_valid = np.where(mask[valid_mask])[0]
            if len(score_indices_in_valid) > 0:
                meta_preds_subset = meta_oof_preds[score_indices_in_valid]
                targets_subset = y[score_indices_in_valid]
                bin_rmse = np.sqrt(mean_squared_error(targets_subset, meta_preds_subset))
                row += f"{bin_rmse:>12.3f} "
            else:
                row += f"{'N/A':>12} "
            print(row)

    # Correlation matrix of errors
    print("\nCorrelation matrix (lower = more diverse, better):")
    errors = {}
    common_mask = valid_mask.copy()
    for name in model_names:
        # Keep error vectors full-length (NaN where there is no prediction) so
        # the full-length common_mask can index them.
        errors[name] = oof_predictions[name] - oof_targets
        common_mask &= ~np.isnan(oof_predictions[name])

    # A correlation needs at least two common samples.
    if common_mask.sum() > 1:
        error_matrix = np.column_stack([errors[name][common_mask] for name in model_names])
        # corrcoef returns a scalar for a single model; force a 2-D matrix.
        corr_matrix = np.atleast_2d(np.corrcoef(error_matrix.T))

        print(f"{'':<12} " + " ".join([f"{name[:10]:<12}" for name in model_names]))
        print("-" * (12 + 13 * len(model_names)))
        for i, name in enumerate(model_names):
            row = f"{name[:12]:<12} "
            for j in range(len(model_names)):
                row += f"{corr_matrix[i, j]:>12.3f} "
            print(row)

    print("="*60)

    # Save everything
    print("\n" + "="*60)
    print("SAVING ENSEMBLE")
    print("="*60)

    # Save OOF predictions (with NaN sentinels)
    oof_path = os.path.join(CONFIG['output_dir'], 'oof_predictions.npz')
    np.savez(oof_path, **{name: oof_predictions[name] for name in model_names}, targets=oof_targets)
    print(f" Saved OOF predictions to: {oof_path}")

    # Save meta-learner
    meta_path = os.path.join(CONFIG['output_dir'], 'meta_learner.pkl')
    _dump_pickle(meta_learner, meta_path)
    print(f"Saved meta-learner to: {meta_path}")

    # Save scaler
    scaler_path = os.path.join(CONFIG['output_dir'], 'meta_scaler.pkl')
    _dump_pickle(scaler, scaler_path)
    print(f" Saved meta-learner scaler to: {scaler_path}")

    # Save calibrator
    if calibrator is not None:
        cal_path = os.path.join(CONFIG['output_dir'], 'calibrator.pkl')
        _dump_pickle(calibrator, cal_path)
        print(f" Saved calibrator to: {cal_path}")

    # Save column means for NaN imputation
    col_mean_path = os.path.join(CONFIG['output_dir'], 'meta_col_means.npy')
    np.save(col_mean_path, col_mean)
    print(f" Saved column means for NaN imputation to: {col_mean_path}")

    # Save trained models
    for model_name, fold_models in trained_models.items():
        for fold_info in fold_models:
            fold = fold_info['fold']
            model_path = os.path.join(CONFIG['output_dir'], f'{model_name}_fold{fold+1}.pth')
            torch.save(fold_info['model_state'], model_path)
        print(f" Saved {len(fold_models)} models for {model_name}")

    # Save ensemble configuration for inference
    ensemble_config_path = os.path.join(CONFIG['output_dir'], 'ensemble_configs.pkl')
    _dump_pickle(ensemble_configs, ensemble_config_path)
    print(f"Saved ensemble configuration to: {ensemble_config_path}")

    # Save training feature statistics (for test set standardization). These
    # are the statistics of the RAW engineered features computed in step 2.
    if train_feature_stats is not None:
        train_stats_path = os.path.join(CONFIG['output_dir'], 'train_feature_stats.pkl')
        _dump_pickle(train_feature_stats, train_stats_path)
        print(f" Saved training feature statistics to: {train_stats_path}")

    print("="*60)

    return trained_models, meta_learner, calibrator, oof_predictions, oof_targets, engineered_features


In [None]:
# ============================================================================
# MAIN TRAINING LOOP
# ============================================================================


if CONFIG.get('use_ensemble', False):
    # Ensemble training mode - use comprehensive pipeline
    print("\n" + "="*60)
    print("ENSEMBLE TRAINING MODE")
    print("="*60)
    print(f"Using {len(CONFIG.get('ensemble_configs', []))} ensemble members")
    print(f"CV folds: {CONFIG.get('n_cv_folds', 5)}")

    # Run comprehensive ensemble pipeline
    trained_models, meta_learner, calibrator, oof_predictions, oof_targets, engineered_features = train_ensemble_pipeline(
        all_folds, full_dataset, train_data,
        metric_embeddings, text_embeddings, metric_names_map, device
    )

    print(f"\n{'='*60}")
    print("ENSEMBLE TRAINING COMPLETED!")
    print(f"{'='*60}")
    print("All models, meta-learner, and calibrator saved!")
    print("Ready for test_data.json inference! ")

else:
    # Local stdlib import: only this branch needs it, for wall-clock reporting.
    import time

    # Single model training mode
    print(f"\n{'='*60}")
    print("SINGLE MODEL TRAINING MODE")
    print(f"{'='*60}")
    print(f"Model: {effective_model_type} (configured: {CONFIG['model_type']})")
    print(f"Loss: {CONFIG['loss_type']}")
    print(f"Weight decay: {CONFIG['weight_decay']}")
    print(f"Patience: {CONFIG['patience']}")
    print(f"Batch size: {CONFIG['batch_size']}")
    print(f"Epochs: {CONFIG['epochs']}")
    print("="*60)

    patience_counter = 0
    best_epoch = start_epoch
    metrics_history = []

    # Defaults so the summary and final checkpoint below are well-defined even
    # if the loop body never runs (e.g. resuming a run that already reached
    # CONFIG['epochs']).
    # NOTE(review): assumes start_epoch - 1 is the index of the last completed
    # epoch - confirm against load_checkpoint/save_checkpoint.
    epoch = start_epoch - 1
    current_lr = optimizer.param_groups[0]['lr']
    training_start = time.time()

    for epoch in range(start_epoch, CONFIG['epochs']):
        # Measured wall-clock time since this cell started training.
        time_minutes = (time.time() - training_start) / 60
        time_hours = time_minutes / 60

        print(f"\n{'='*60}")
        print(f"Epoch {epoch+1}/{CONFIG['epochs']} | Time: {time_hours:.1f}h ({time_minutes:.0f}m)")
        print(f"{'='*60}")

        train_loss = train_epoch(model, train_loader, criterion, optimizer, device, scaler, CONFIG.get('gradient_clip'))

        val_rmse, val_mae = validate(model, val_loader, criterion, device)

        # The plateau scheduler monitors validation RMSE.
        scheduler.step(val_rmse)
        current_lr = optimizer.param_groups[0]['lr']

        print(f"Train Loss: {train_loss:.4f}")
        print(f"Val RMSE: {val_rmse:.4f}, Val MAE: {val_mae:.4f}, LR: {current_lr:.2e}")

        metrics_history.append({
            'epoch': epoch+1,
            'train_loss': train_loss,
            'val_rmse': val_rmse,
            'val_mae': val_mae,
            'lr': current_lr
        })

        if val_rmse < best_rmse:
            best_rmse = val_rmse
            best_epoch = epoch + 1
            patience_counter = 0
            print(f">>> NEW BEST RMSE: {best_rmse:.4f} (Epoch {best_epoch})")

            # Best weights are written twice: a resumable checkpoint and a
            # bare state_dict in the output directory.
            save_checkpoint(model, optimizer, epoch, best_rmse,
                           os.path.join(CONFIG['checkpoint_dir'], 'best_model_synth.pth'))
            torch.save(model.state_dict(), os.path.join(CONFIG['output_dir'], 'best_model_synth.pth'))
            print("    Model saved!")
        else:
            patience_counter += 1
            print(f"    Patience: {patience_counter}/{CONFIG['patience']}")

        if CONFIG['auto_save'] and (epoch + 1) % CONFIG['save_interval'] == 0:
            checkpoint_path = os.path.join(CONFIG['checkpoint_dir'], f'checkpoint_epoch_{epoch+1}.pth')
            save_checkpoint(model, optimizer, epoch, best_rmse, checkpoint_path)

        if patience_counter >= CONFIG['patience']:
            print(f"Early stopping at epoch {epoch+1}")
            break

        # Periodic memory cleanup to prevent OOM
        if (epoch + 1) % 5 == 0:
            torch.cuda.empty_cache()

    print(f"\n{'='*60}")
    print("TRAINING COMPLETED!")
    print(f"{'='*60}")
    print(f"Best RMSE: {best_rmse:.4f}")
    print(f"Best epoch: {best_epoch}/{epoch+1}")
    print(f"Total epochs: {epoch+1}")
    print(f"Final LR: {current_lr:.2e}")
    print(f"{'='*60}")

    # This checkpoint holds the LAST-epoch weights, not necessarily the best.
    final_checkpoint = os.path.join(CONFIG['checkpoint_dir'], 'final_model.pth')
    save_checkpoint(model, optimizer, epoch, best_rmse, final_checkpoint)

    metrics_path = os.path.join(CONFIG['output_dir'], 'training_metrics.csv')
    metrics_df = pd.DataFrame(metrics_history)
    metrics_df.to_csv(metrics_path, index=False)

    # Report the locations actually written, derived from CONFIG.
    best_model_path = os.path.join(CONFIG['output_dir'], 'best_model_synth.pth')
    checkpoints_path = os.path.join(CONFIG['checkpoint_dir'], '')

    print("\n" + "="*60)
    print("SAVED FILES")
    print("="*60)
    print(f"Best model: {best_model_path}")
    print(f"Checkpoints: {checkpoints_path}")
    print(f"Metrics: {metrics_path}")
    print("="*60)
    print(f"\n Model achieved RMSE: {best_rmse:.4f}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m85.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

2025-10-31 19:34:06.936124: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761939247.328688      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761939247.441959      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


DA5401 - Data Challenge
PyTorch: 2.6.0+cu124
CUDA: True
GPU: Tesla T4

Loading data...
Found data at: /kaggle/input/da5401
Loaded 5000 training samples

Generating Synthetic Negatives (Option 2)
Creating 500 synthetic negative samples...
  (Misaligning metrics with prompt-response pairs to create low-fitness examples)


Generating: 100%|██████████| 500/500 [00:00<00:00, 55010.15it/s]

✓ Added 500 synthetic negatives
  Total training samples: 5000 → 5500

Distribution BEFORE synthetic negatives:
  Score 0:    13 ( 0.26%)
  Score 1:     6 ( 0.12%)
  Score 2:     5 ( 0.10%)
  Score 3:     7 ( 0.14%)
  Score 4:     3 ( 0.06%)
  Score 5:     1 ( 0.02%)
  Score 6:    45 ( 0.90%)
  Score 7:    95 ( 1.90%)
  Score 8:   259 ( 5.18%)
  Score 9:  3124 (62.48%)
  Score 10:  1442 (28.84%)

Distribution AFTER synthetic negatives:
  Score 0:    13 ( 0.24%)
  Score 1:   113 ( 2.05%)
  Score 2:    72 ( 1.31%)
  Score 3:    83 ( 1.51%)
  Score 4:    90 ( 1.64%)
  Score 5:    83 ( 1.51%)
  Score 6:   126 ( 2.29%)
  Score 7:    95 ( 1.73%)
  Score 8:   259 ( 4.71%)
  Score 9:  3124 (56.80%)
  Score 10:  1442 (26.22%)


Computing improved sample weights (sqrt inverse frequency)...
Original training distribution:
  Score 0:    13 ( 0.24%)
  Score 1:   113 ( 2.05%)
  Score 2:    72 ( 1.31%)
  Score 3:    83 ( 1.51%)
  Score 4:    90 ( 1.64%)
  Score 5:    83 ( 1.51%)
  Score 6:   126 ( 2.




✓ Loaded fine-tuned encoder from checkpoint
Encoding text sequences...


Batches:   0%|          | 0/86 [00:00<?, ?it/s]

✓ GPU cache cleared after encoding
✓ Saved CV fold indices to: /kaggle/working/cv_folds.pkl
Train: 4400, Val: 1100
Model: heteroscedastic (configured: heteroscedastic)
Loss: heteroscedastic
Parameters: 454,332,034
Learning rate: 1.00e-04 (head LR)
Found checkpoint at: /kaggle/input/best-model-final/checkpoints/best_model_synth.pth


UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL numpy.core.multiarray.scalar was not an allowed global by default. Please use `torch.serialization.add_safe_globals([scalar])` or the `torch.serialization.safe_globals([scalar])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.