In [None]:
!pip install --upgrade transformers sentence-transformers

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m960.4 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metada

In [None]:
import os
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Notebook banner: a 60-character rule above and below the title.
print("=" * 60, "Inference - Generate Predictions", "=" * 60, sep="\n")

# Suppress every Python warning for the rest of the session and set
# TOKENIZERS_PARALLELISM to 'false' in the process environment.
import warnings
warnings.filterwarnings(action='ignore')
os.environ.update({'TOKENIZERS_PARALLELISM': 'false'})

In [None]:
# ============================================================================
# CONFIGURATION
# ============================================================================

CONFIG = {
    # Trained weights handed to torch.load() further down. It has to be a real
    # path (None cannot be opened); this is the checkpoint location reported by
    # the last recorded run of this notebook.
    'model_path': '/kaggle/input/da5401-data-challenge-1/best_model.pth',

    # Model architecture (must match the checkpoint being loaded).
    # model_type: 'heteroscedastic' selects HeteroscedasticMatchingModel;
    # 'standard' / 'default' select StandardMetricMatchingModel; any other value
    # falls back to StandardMetricMatchingModel with a warning.
    'model_type': 'heteroscedastic',
    'embedding_dim': 768,
    'hidden_dims': [512, 256, 128],
    'dropout': 0.4,

    # Inference settings
    'batch_size': 64,  # DataLoader batch size for the prediction loop
    'encoder_name': 'paraphrase-multilingual-mpnet-base-v2',  # SentenceTransformer model name

    # Paths
    'data_dir': '/kaggle/input/da5401-2025-data-challenge',
    'output_dir': '/kaggle/working',  # submission.csv is written here
}

In [None]:
# ============================================================================
# MODEL ARCHITECTURE
# ============================================================================

class StandardMetricMatchingModel(nn.Module):
    """Classification model over (metric embedding, text embedding) pairs.

    The two embeddings are combined with a bilinear interaction term, passed
    through a stack of Linear -> LayerNorm -> ReLU -> Dropout blocks, and
    projected to ``num_classes`` logits (one per integer score).

    Args:
        embedding_dim: Size of both the metric and the text embedding.
        hidden_dims: Output width of each hidden block, in order. May be
            empty, in which case the logits are computed directly from the
            concatenated features. The default list is only read, never
            mutated.
        dropout: Dropout probability applied after every hidden block.
        num_classes: Number of score classes; class ``k`` stands for score
            ``k``. Defaults to 11 (scores 0..10).
    """

    def __init__(self, embedding_dim=768, hidden_dims=[512, 256, 128], dropout=0.4, num_classes=11):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_classes = num_classes
        self.bilinear = nn.Bilinear(embedding_dim, embedding_dim, embedding_dim)

        # Submodule names (fc_layers / layer_norms / output) define the
        # state_dict keys, so they must stay as they are for saved weights
        # to load.
        self.fc_layers = nn.ModuleList()
        self.layer_norms = nn.ModuleList()
        # Input to the first block: metric emb + text emb + bilinear term.
        input_dim = embedding_dim * 3

        for hidden_dim in hidden_dims:
            self.fc_layers.append(nn.Linear(input_dim, hidden_dim))
            self.layer_norms.append(nn.LayerNorm(hidden_dim))
            input_dim = hidden_dim

        self.output = nn.Linear(input_dim, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, metric_emb, text_emb):
        """Return class logits of shape (batch, num_classes).

        Both inputs are concatenated along dim 1, so they must be 2-D with
        ``embedding_dim`` columns.
        """
        interaction = self.bilinear(metric_emb, text_emb)
        x = torch.cat([metric_emb, text_emb, interaction], dim=1)

        # Every hidden block, including the last one, is treated identically.
        for fc, ln in zip(self.fc_layers, self.layer_norms):
            x = self.dropout(F.relu(ln(fc(x))))

        return self.output(x)

    def predict_score(self, metric_emb, text_emb):
        """Return the expected score under the softmax distribution.

        The result has shape (batch,) and is clamped to [0, num_classes - 1].
        """
        logits = self(metric_emb, text_emb)
        probs = F.softmax(logits, dim=1)
        class_values = torch.arange(self.num_classes, device=metric_emb.device).float()
        expected_score = torch.sum(probs * class_values, dim=1)
        # Clamp to the valid score range ([0, 10] for the default 11 classes).
        return torch.clamp(expected_score, 0.0, float(self.num_classes - 1))


class HeteroscedasticMatchingModel(nn.Module):
    """Heteroscedastic regression model: predicts mean (mu) and log-variance (logvar).

    The metric and text embeddings are combined with a bilinear interaction
    term, passed through a stack of Linear -> LayerNorm -> ReLU -> Dropout
    blocks, and fed to two single-unit heads.

    Args:
        embedding_dim: Size of both the metric and the text embedding.
        hidden_dims: Output width of each hidden block, in order. May be
            empty, in which case both heads read the concatenated features
            directly. The default list is only read, never mutated.
        dropout: Dropout probability applied after every hidden block.
    """

    def __init__(self, embedding_dim=768, hidden_dims=[512, 256, 128], dropout=0.4):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.bilinear = nn.Bilinear(embedding_dim, embedding_dim, embedding_dim)

        # Submodule names define the state_dict keys, so they must stay as
        # they are for saved weights to load.
        self.fc_layers = nn.ModuleList()
        self.layer_norms = nn.ModuleList()
        # Input to the first block: metric emb + text emb + bilinear term.
        input_dim = embedding_dim * 3

        for hidden_dim in hidden_dims:
            self.fc_layers.append(nn.Linear(input_dim, hidden_dim))
            self.layer_norms.append(nn.LayerNorm(hidden_dim))
            input_dim = hidden_dim

        # Two outputs: mean and log-variance
        self.mean_head = nn.Linear(input_dim, 1)
        self.logvar_head = nn.Linear(input_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, metric_emb, text_emb):
        """Return ``(mean, logvar)``, each of shape (batch,).

        Both inputs are concatenated along dim 1, so they must be 2-D with
        ``embedding_dim`` columns. ``logvar`` is clamped to [-10, 10];
        ``mean`` is returned unclamped.
        """
        interaction = self.bilinear(metric_emb, text_emb)
        x = torch.cat([metric_emb, text_emb, interaction], dim=1)

        # Every hidden block, including the last one, is treated identically.
        for fc, ln in zip(self.fc_layers, self.layer_norms):
            x = self.dropout(F.relu(ln(fc(x))))

        mean = self.mean_head(x)
        # Clip logvar to prevent extremes.
        logvar = torch.clamp(self.logvar_head(x), -10.0, 10.0)

        return mean.squeeze(-1), logvar.squeeze(-1)

    def predict_score(self, metric_emb, text_emb):
        """Predict the mean score, clamped to the valid range [0, 10]."""
        mean, _ = self(metric_emb, text_emb)
        return torch.clamp(mean, 0.0, 10.0)

# Alias for backward compatibility: MetricMatchingModel is the very same class
# object as StandardMetricMatchingModel (the classification variant), not a
# subclass or a copy.
MetricMatchingModel = StandardMetricMatchingModel

# ============================================================================
# DATASET CLASS
# ============================================================================

class MetricMatchingDataset(Dataset):
    """Pairs each record with its metric embedding and its text embedding.

    Args:
        data: Indexable sequence of records; each record is looked up with
            the key ``'metric_name'``.
        metric_embeddings: Array-like of metric embeddings, one row per metric.
        metric_names_map: Mapping from metric name to its row index in
            ``metric_embeddings``.
        text_embeddings: Array-like of text embeddings; row ``i`` belongs to
            ``data[i]``.

    Raises:
        ValueError: If ``text_embeddings`` does not have exactly one row per
            record. A mismatch would otherwise pair records with the wrong
            embedding or fail part-way through iteration.
    """

    def __init__(self, data, metric_embeddings, metric_names_map, text_embeddings):
        self.data = data
        self.metric_embeddings = torch.as_tensor(metric_embeddings, dtype=torch.float32)
        self.metric_names_map = metric_names_map
        self.text_embeddings = torch.as_tensor(text_embeddings, dtype=torch.float32)

        if len(self.text_embeddings) != len(self.data):
            raise ValueError(
                f"text_embeddings has {len(self.text_embeddings)} rows "
                f"but data has {len(self.data)} records"
            )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(metric_emb, text_emb, idx)`` for the record at ``idx``.

        ``idx`` is returned as well so callers can restore the original
        record order after batching.
        """
        record = self.data[idx]
        metric_idx = self.metric_names_map[record['metric_name']]
        metric_emb = self.metric_embeddings[metric_idx]
        text_emb = self.text_embeddings[idx]
        return metric_emb, text_emb, idx



In [None]:
# ============================================================================
# LOAD DATA
# ============================================================================

# Competition files are read from the configured input directory.
data_dir = CONFIG['data_dir']
if not os.path.isdir(data_dir):
    raise FileNotFoundError(f"Data directory not found: {data_dir}")
print(f"Found data at: {data_dir}")

# Find model
model_path = CONFIG.get('model_path')
if not model_path:
    # No explicit path configured: look for best_model.pth under the parent of
    # the data directory, i.e. among the inputs mounted next to it.
    input_root = os.path.dirname(data_dir.rstrip('/'))
    for root, dirs, files in os.walk(input_root):
        dirs.sort()  # deterministic traversal order
        if 'best_model.pth' in files:
            model_path = os.path.join(root, 'best_model.pth')
            print(f"Found model at: {model_path}")
            break
if not model_path or not os.path.isfile(model_path):
    raise FileNotFoundError(f"Could not find model weights: {model_path}")
# The weight-loading code reads the path from CONFIG, so store the resolved
# value back there.
CONFIG['model_path'] = model_path
print(f"Using model: {model_path}")

print("\nLoading data...")
with open(os.path.join(data_dir, 'metric_names.json'), encoding='utf-8') as f:
    metric_names = json.load(f)

metric_embeddings = np.load(os.path.join(data_dir, 'metric_name_embeddings.npy'))
# Metric name -> row index into metric_embeddings (rows follow metric_names order).
metric_names_map = {name: idx for idx, name in enumerate(metric_names)}

if len(metric_embeddings) != len(metric_names):
    raise ValueError(
        f"metric_name_embeddings.npy has {len(metric_embeddings)} rows "
        f"but metric_names.json lists {len(metric_names)} names"
    )

print(f"Loaded {len(metric_names)} metrics")

test_data_path = os.path.join(data_dir, 'test_data.json')

if not os.path.isfile(test_data_path):
    raise FileNotFoundError("Could not find test_data.json")

print(f"Loading test data from: {test_data_path}")
with open(test_data_path, encoding='utf-8') as f:
    test_data = json.load(f)

print(f"Loaded {len(test_data)} test samples")



In [None]:
# ============================================================================
# ENCODE TEXTS
# ============================================================================

def combine_text_fields(record):
    """Join a record's prompt and response texts into one space-separated string.

    Up to three parts are used, in this order:
      1. ``system_prompt``
      2. ``user_prompt``, or ``prompt`` when ``user_prompt`` is missing/empty
      3. ``response``, or ``expected_response`` when ``response`` is missing/empty

    Missing or empty parts are skipped; a record with none of them yields ''.
    """
    def first_present(*keys):
        # First truthy value among the given keys, or None if there is none.
        for key in keys:
            value = record.get(key)
            if value:
                return value
        return None

    candidates = (
        first_present('system_prompt'),
        first_present('user_prompt', 'prompt'),
        first_present('response', 'expected_response'),
    )
    return ' '.join(text for text in candidates if text)

# Instantiate the sentence encoder named in CONFIG.
print("\nLoading text encoder...")
text_encoder = SentenceTransformer(CONFIG['encoder_name'])
print(" Successfully loaded encoder")

# One combined string per test record, then one embedding per string.
# The embeddings keep the order of test_data.
print("Encoding test texts...")
test_texts = list(map(combine_text_fields, test_data))
encode_options = {
    'show_progress_bar': True,
    'batch_size': 64,
    'convert_to_numpy': True,
}
test_embeddings = text_encoder.encode(test_texts, **encode_options)



In [None]:
# ============================================================================
# CREATE MODEL AND LOAD WEIGHTS
# ============================================================================

rule = "=" * 60
print("\n" + rule)
print("Loading trained model...")
print(rule)

# Use CUDA when available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Pick the model class from CONFIG['model_type']. 'standard' and 'default' both
# mean StandardMetricMatchingModel; any unrecognised value also falls back to it,
# but with a warning.
requested_type = CONFIG['model_type']
if requested_type == 'heteroscedastic':
    print("Creating HeteroscedasticMatchingModel (matches training)")
    model_cls = HeteroscedasticMatchingModel
else:
    if requested_type in ('standard', 'default'):
        print("Creating StandardMetricMatchingModel")
    else:
        print(f"Warning: model_type '{requested_type}' not fully supported, using StandardMetricMatchingModel")
    model_cls = StandardMetricMatchingModel

# All variants take the same constructor arguments.
model = model_cls(
    embedding_dim=CONFIG['embedding_dim'],
    hidden_dims=CONFIG['hidden_dims'],
    dropout=CONFIG['dropout'],
).to(device)

# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so only point this at checkpoints you trust.
checkpoint = torch.load(CONFIG['model_path'], map_location=device, weights_only=False)

# Two file layouts are accepted: a checkpoint dict holding the weights under
# 'model_state_dict' (optionally with 'best_rmse' / 'epoch'), or a bare state_dict.
has_metadata = isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint
model.load_state_dict(checkpoint['model_state_dict'] if has_metadata else checkpoint)

if has_metadata:
    if 'best_rmse' in checkpoint:
        print(f"Checkpoint RMSE: {checkpoint['best_rmse']:.4f}")
    if 'epoch' in checkpoint:
        # The stored epoch is printed plus one.
        print(f" Checkpoint epoch: {checkpoint['epoch']+1}")

print(f"Loaded model from: {CONFIG['model_path']}")
if not has_metadata:
    print("  (No RMSE metadata available)")

# ============================================================================
# GENERATE PREDICTIONS
# ============================================================================

print("\n" + "="*60)
print("Generating predictions...")
print("="*60)

test_dataset = MetricMatchingDataset(test_data, metric_embeddings, metric_names_map, test_embeddings)
# shuffle=False keeps batches in test_data order; the dataset index is still
# collected with every prediction so the order can be verified or restored.
test_loader = DataLoader(test_dataset, batch_size=CONFIG['batch_size'], shuffle=False)

model.eval()
all_preds = []
all_indices = []

# Mixed precision is only enabled on CUDA. With enabled=False the autocast
# context does nothing, so a single loop serves both the GPU and the CPU path.
use_amp = device.type == 'cuda'
with torch.inference_mode(), torch.autocast(device_type=device.type, enabled=use_amp):
    for metric_emb, text_emb, idx in tqdm(test_loader, desc="Predicting"):
        scores = model.predict_score(metric_emb.to(device), text_emb.to(device))
        # Under autocast the scores may come back in half precision; widen them
        # to float32 before they leave torch so numpy/pandas never see float16.
        all_preds.extend(scores.float().cpu().tolist())
        all_indices.extend(idx.tolist())

# Ensure predictions are in valid range and rounded
all_preds = np.array(all_preds)
all_preds = np.round(all_preds).clip(0, 10)  # Keep as float to match sample format (7.0 instead of 7)

# ============================================================================
# CREATE SUBMISSION
# ============================================================================

print("\nCreating submission file...")

# The submission has one row per test record. IDs are positional and 1-based
# (1, 2, ..., len(test_data)). Scores are put back into dataset order by
# sorting on the index collected next to each prediction.
expected_rows = len(test_data)
submission_ids = list(range(1, expected_rows + 1))
submission_scores = [score for _, score in sorted(zip(all_indices, all_preds))]

# Reconcile the number of scores with the number of test records.
produced_rows = len(submission_scores)
if produced_rows != expected_rows:
    print(f"Warning: Expected {expected_rows} predictions, got {produced_rows}")
    missing = expected_rows - produced_rows
    if missing > 0:
        # Too few predictions: fill the tail with the rounded mean prediction.
        fill_value = float(np.round(np.mean(all_preds)))
        submission_scores.extend([fill_value] * missing)
    else:
        # Too many predictions: keep only the first expected_rows.
        submission_scores = submission_scores[:expected_rows]

submission = pd.DataFrame({'ID': submission_ids, 'score': submission_scores})

# Write the CSV without the DataFrame index column.
output_path = os.path.join(CONFIG['output_dir'], 'submission.csv')
submission.to_csv(output_path, index=False)

rule = '=' * 60
print(f"\n{rule}")
print(" Submission saved!")
print(rule)
print(f"\nFile: {output_path}")
print(f"Shape: {submission.shape}")
print("\nScore distribution:")
print(submission['score'].value_counts().sort_index())
print("\nFirst 20 predictions:")
print(submission.head(20))

print("\n" + rule)
print("Done! Download submission.csv from /kaggle/working/")
print(rule)



Checking library versions...
Current transformers version: 4.57.1
✓ Transformers version 4.57.1 should be compatible


2025-10-31 11:10:35.987504: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761909036.009548     558 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761909036.016189     558 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


✓ Sentence-transformers version: 5.1.2
Kaggle Inference - Generate Predictions
✓ Applied transformers compatibility patch
Found data at: /kaggle/input/da5401-2025-data-challenge
Found model at: /kaggle/input/da5401-data-challenge-1/best_model.pth
Using model: /kaggle/input/da5401-data-challenge-1/best_model.pth

Loading data...
Loaded 145 metrics
Loading test data from: /kaggle/input/da5401-2025-data-challenge/test_data.json
Loaded 3638 test samples

Loading text encoder...


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ Successfully loaded encoder
Encoding test texts...


Batches:   0%|          | 0/57 [00:00<?, ?it/s]


Loading trained model...
Using device: cuda
✓ Loaded model from: /kaggle/input/da5401-data-challenge-1/best_model.pth
  (No RMSE metadata available)

Generating predictions...


Predicting: 100%|██████████| 29/29 [00:01<00:00, 20.01it/s]


Creating submission file...

✅ Submission saved!

File: /kaggle/working/submission.csv
Shape: (3638, 2)

Score distribution:
score
8.0       29
9.0     3087
10.0     522
Name: count, dtype: int64

First 20 predictions:
    ID  score
0    1    9.0
1    2    9.0
2    3    9.0
3    4    9.0
4    5    9.0
5    6    9.0
6    7    9.0
7    8   10.0
8    9    9.0
9   10    9.0
10  11    9.0
11  12    9.0
12  13    9.0
13  14    9.0
14  15   10.0
15  16    9.0
16  17   10.0
17  18    9.0
18  19    9.0
19  20    9.0

Done! Download submission.csv from /kaggle/working/



