In [9]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()
# "duenchombo","8f66a46e93b156627591fadd91c3b1ba"


VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Kaggle credentials set.
Kaggle credentials successfully validated.


In [10]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

da5401_2025_data_challenge_path = kagglehub.competition_download('da5401-2025-data-challenge')

print('Data source import complete.')


Data source import complete.


In [11]:
da5401_2025_data_challenge_path


'/root/.cache/kagglehub/competitions/da5401-2025-data-challenge'

In [12]:
!pip install -qU sentence-transformers
!pip install -qU transformers huggingface_hub


In [13]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json
import os

In [14]:
numpy_emb_dir='/root/.cache/kagglehub/competitions/da5401-2025-data-challenge/metric_name_embeddings.npy'
metric_dir='/root/.cache/kagglehub/competitions/da5401-2025-data-challenge/metric_names.json'
train_data_dir='/root/.cache/kagglehub/competitions/da5401-2025-data-challenge/train_data.json'
test_data_dir='/root/.cache/kagglehub/competitions/da5401-2025-data-challenge/test_data.json'


In [None]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_NAME = "microsoft/deberta-v3-small"
EPOCHS = 30
BATCH_SIZE = 12
LR = 1e-5
MAX_LENGTH = 160

In [21]:

def load_training_data(train_json_path, metric_json_path, metric_npy_path):
    # load metrics
    with open(metric_json_path, "r") as f:
        metric_names = json.load(f)

    metric_embeddings = np.load(metric_npy_path)
    metric_embeddings = metric_embeddings / np.linalg.norm(metric_embeddings, axis=1, keepdims=True)

    metric_to_embedding = {n: e for n, e in zip(metric_names, metric_embeddings)}

    # load training df
    df = pd.read_json(train_json_path)

    # ---> FIX: ensure score is numeric
    df["score"] = pd.to_numeric(df["score"], errors="coerce")
    df = df.dropna(subset=["score"])

    # combined text
    df["combined_text"] = (
        df["response"].fillna("") + " " +
        df["user_prompt"].fillna("") + " " +
        df["system_prompt"].fillna("")
    )

    return df, metric_to_embedding


# ===============================================================
# SAMPLE WEIGHTS — MORE AGGRESSIVE (1 / freq^1.5)
# ===============================================================
def get_sample_weights(scores):
    scores = np.array(scores)
    unique, counts = np.unique(scores, return_counts=True)
    freq = dict(zip(unique, counts))
    weights = np.array([(1 / (freq[s] ** 1.5)) for s in scores], dtype=float)
    return weights / weights.mean()


# ===============================================================
# DATASET
# ===============================================================
class TextRegressionDataset(Dataset):
    def __init__(self, df, tokenizer, metric_to_embedding):
        self.texts = df["combined_text"].tolist()
        self.scores = df["score_norm"].astype(float).tolist()
        self.metric_names = df["metric_name"].tolist()
        self.metric_to_embedding = metric_to_embedding
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=MAX_LENGTH,
            return_tensors="pt"
        )

        metric_emb = self.metric_to_embedding[self.metric_names[idx]]

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "metric_emb": torch.tensor(metric_emb, dtype=torch.float),
            "score": torch.tensor(self.scores[idx], dtype=torch.float)
        }


# ===============================================================
# MODEL — HEAVY REGULARIZATION
# ===============================================================
class MSRegressor(nn.Module):
    def __init__(self):
        super().__init__()
        self.enc = AutoModel.from_pretrained(MODEL_NAME)
        h = self.enc.config.hidden_size

        # heavy regularization
        self.dropout_layers = nn.ModuleList([nn.Dropout(0.45) for _ in range(5)])
        self.fc = nn.Linear(h + 768, 1)

    def forward(self, ids, mask, metric_emb, mc_dropout=False):
        out = self.enc(ids, mask).last_hidden_state[:, 0, :]
        x = torch.cat([out, metric_emb], dim=1)

        if mc_dropout:
            preds = [self.fc(d(x)) for d in self.dropout_layers]
            return torch.mean(torch.stack(preds), dim=0).squeeze(1)

        return self.fc(x).squeeze(1)


# ===============================================================
# TRAIN LOOP
# ===============================================================
def train(train_json_path, metric_json_path, metric_npy_path):

    # load data
    df, metric_to_embedding = load_training_data(
        train_json_path, metric_json_path, metric_npy_path
    )

    # normalize
    score_mean = df["score"].mean()
    score_std = df["score"].std()

    df["score_norm"] = (df["score"] - score_mean) / score_std

    # save normalization for inference
    np.save("score_mean.npy", np.array([score_mean]))
    np.save("score_std.npy", np.array([score_std]))

    # split
    df_train, df_temp = train_test_split(df, test_size=0.3, random_state=42)
    df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

    tok = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_ds = TextRegressionDataset(df_train, tok, metric_to_embedding)
    val_ds = TextRegressionDataset(df_val, tok, metric_to_embedding)
    test_ds = TextRegressionDataset(df_test, tok, metric_to_embedding)

    sampler = WeightedRandomSampler(
        weights=get_sample_weights(df_train["score"].values),
        num_samples=len(df_train),
        replacement=True
    )

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)

    model = MSRegressor().to(DEVICE)
    opt = optim.AdamW(model.parameters(), lr=LR, weight_decay=0.06)

    criterion = nn.HuberLoss(delta=1.0)

    best_rmse = 999

    # ============================
    # TRAINING LOOP
    # ============================
    for epoch in range(EPOCHS):
        model.train()
        train_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
            ids = batch["input_ids"].to(DEVICE)
            mask = batch["attention_mask"].to(DEVICE)
            metric_emb = batch["metric_emb"].to(DEVICE)
            y = batch["score"].to(DEVICE)

            opt.zero_grad()
            preds = model(ids, mask, metric_emb)
            loss = criterion(preds, y)
            loss.backward()
            opt.step()
            train_loss += loss.item()

        # ============================
        # VALIDATION
        # ============================
        model.eval()
        se = 0
        n = 0
        with torch.no_grad():
            for batch in val_loader:
                ids = batch["input_ids"].to(DEVICE)
                mask = batch["attention_mask"].to(DEVICE)
                metric_emb = batch["metric_emb"].to(DEVICE)
                y = batch["score"].to(DEVICE)

                preds = model(ids, mask, metric_emb, mc_dropout=True)
                se += ((preds - y) ** 2).sum().item()
                n += len(y)

        rmse = np.sqrt(se / n)
        print(f"Epoch {epoch+1} | Val RMSE = {rmse:.4f}")

        if rmse < best_rmse:
            best_rmse = rmse
            torch.save(model.state_dict(), "best_model.pt")
            print("Saved new best model!")

    print("\nTraining complete.")
    print("Best Validation RMSE:", best_rmse)

    return {
        "best_val_rmse": best_rmse,
        "mean": score_mean,
        "std": score_std
    }


# ===============================================================
# RUN
# ===============================================================
if __name__ == "__main__":
    results = train(
        train_json_path=train_data_dir,
        metric_json_path=metric_dir,
        metric_npy_path=numpy_emb_dir
    )
    print("RESULTS:", results)


Epoch 1/30: 100%|██████████| 292/292 [01:17<00:00,  3.76it/s]


Epoch 1 | Val RMSE = 5.6104
Saved new best model!


Epoch 2/30: 100%|██████████| 292/292 [01:17<00:00,  3.77it/s]


Epoch 2 | Val RMSE = 5.0603
Saved new best model!


Epoch 3/30: 100%|██████████| 292/292 [01:17<00:00,  3.76it/s]


Epoch 3 | Val RMSE = 4.3480
Saved new best model!


Epoch 4/30: 100%|██████████| 292/292 [01:17<00:00,  3.75it/s]


Epoch 4 | Val RMSE = 3.9902
Saved new best model!


Epoch 5/30: 100%|██████████| 292/292 [01:17<00:00,  3.75it/s]


Epoch 5 | Val RMSE = 3.6999
Saved new best model!


Epoch 6/30: 100%|██████████| 292/292 [01:17<00:00,  3.75it/s]


Epoch 6 | Val RMSE = 2.8651
Saved new best model!


Epoch 7/30: 100%|██████████| 292/292 [01:17<00:00,  3.75it/s]


Epoch 7 | Val RMSE = 3.6801


Epoch 8/30: 100%|██████████| 292/292 [01:17<00:00,  3.77it/s]


Epoch 8 | Val RMSE = 3.4851


Epoch 9/30: 100%|██████████| 292/292 [01:17<00:00,  3.78it/s]


Epoch 9 | Val RMSE = 3.6500


Epoch 10/30: 100%|██████████| 292/292 [01:17<00:00,  3.78it/s]


Epoch 10 | Val RMSE = 3.4766


Epoch 11/30: 100%|██████████| 292/292 [01:17<00:00,  3.78it/s]


Epoch 11 | Val RMSE = 3.4232


Epoch 12/30: 100%|██████████| 292/292 [01:17<00:00,  3.78it/s]


Epoch 12 | Val RMSE = 2.5513
Saved new best model!


Epoch 13/30: 100%|██████████| 292/292 [01:17<00:00,  3.76it/s]


Epoch 13 | Val RMSE = 1.9714
Saved new best model!


Epoch 14/30: 100%|██████████| 292/292 [01:17<00:00,  3.75it/s]


Epoch 14 | Val RMSE = 1.5782
Saved new best model!


Epoch 15/30: 100%|██████████| 292/292 [01:17<00:00,  3.76it/s]


Epoch 15 | Val RMSE = 2.0680


Epoch 16/30: 100%|██████████| 292/292 [01:17<00:00,  3.78it/s]


Epoch 16 | Val RMSE = 2.0307


Epoch 17/30: 100%|██████████| 292/292 [01:17<00:00,  3.77it/s]


Epoch 17 | Val RMSE = 2.1629


Epoch 18/30: 100%|██████████| 292/292 [01:17<00:00,  3.78it/s]


Epoch 18 | Val RMSE = 1.6423


Epoch 19/30: 100%|██████████| 292/292 [01:17<00:00,  3.78it/s]


Epoch 19 | Val RMSE = 1.8098


Epoch 20/30: 100%|██████████| 292/292 [01:17<00:00,  3.78it/s]


Epoch 20 | Val RMSE = 2.2840


Epoch 21/30: 100%|██████████| 292/292 [01:17<00:00,  3.78it/s]


Epoch 21 | Val RMSE = 2.2552


Epoch 22/30: 100%|██████████| 292/292 [01:17<00:00,  3.78it/s]


Epoch 22 | Val RMSE = 1.9946


Epoch 23/30: 100%|██████████| 292/292 [01:17<00:00,  3.77it/s]


Epoch 23 | Val RMSE = 2.0518


Epoch 24/30: 100%|██████████| 292/292 [01:17<00:00,  3.78it/s]


Epoch 24 | Val RMSE = 1.8892


Epoch 25/30: 100%|██████████| 292/292 [01:17<00:00,  3.78it/s]


Epoch 25 | Val RMSE = 1.6219


Epoch 26/30: 100%|██████████| 292/292 [01:17<00:00,  3.78it/s]


Epoch 26 | Val RMSE = 1.6629


Epoch 27/30: 100%|██████████| 292/292 [01:17<00:00,  3.79it/s]


Epoch 27 | Val RMSE = 1.8686


Epoch 28/30: 100%|██████████| 292/292 [01:17<00:00,  3.78it/s]


Epoch 28 | Val RMSE = 1.2834
Saved new best model!


Epoch 29/30: 100%|██████████| 292/292 [01:17<00:00,  3.76it/s]


Epoch 29 | Val RMSE = 1.5555


Epoch 30/30: 100%|██████████| 292/292 [01:18<00:00,  3.70it/s]


Epoch 30 | Val RMSE = 1.7476

Training complete.
Best Validation RMSE: 1.2833802961863976
RESULTS: {'best_val_rmse': np.float64(1.2833802961863976), 'mean': np.float64(9.1195), 'std': 0.9424157147861463}


In [None]:

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "microsoft/deberta-v3-small"

# ---------------------------
# Paths (adjust if needed)
# ---------------------------
METRIC_JSON = "/root/.cache/kagglehub/competitions/da5401-2025-data-challenge/metric_names.json"
METRIC_NPY  = "/root/.cache/kagglehub/competitions/da5401-2025-data-challenge/metric_name_embeddings.npy"
TEST_JSON   = "/root/.cache/kagglehub/competitions/da5401-2025-data-challenge/test_data.json"
MODEL_PATH  = "best_model.pt"
MAX_LENGTH  = 160
BATCH_SIZE  = 16
MC_DROPOUT  = True

In [22]:



# ----------------------------------------------------
# Load metric embeddings (same code as training)
# ----------------------------------------------------
def load_metric_embeddings(json_path, npy_path):
    with open(json_path, "r") as f:
        metric_names = json.load(f)

    metric_embs = np.load(npy_path)
    metric_embs = metric_embs / np.linalg.norm(metric_embs, axis=1, keepdims=True)

    return {n: e for n, e in zip(metric_names, metric_embs)}


# ----------------------------------------------------
# SAME Dataset used in training (without scores)
# ----------------------------------------------------
class TextRegressionDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, metric_to_embedding):
        self.texts = df["combined_text"].tolist()
        self.metric_names = df["metric_name"].tolist()
        self.metric_to_embedding = metric_to_embedding
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=MAX_LENGTH,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "metric_emb": torch.tensor(
                self.metric_to_embedding[self.metric_names[idx]], dtype=torch.float
            )
        }


# ----------------------------------------------------
# SAME Model class as training
# ----------------------------------------------------
class MSRegressor(torch.nn.Module):
    def __init__(self):
        super().__init__()
        from transformers import AutoModel
        self.enc = AutoModel.from_pretrained(MODEL_NAME)
        h = self.enc.config.hidden_size

        self.dropout_layers = torch.nn.ModuleList([torch.nn.Dropout(0.45) for _ in range(5)])
        self.fc = torch.nn.Linear(h + 768, 1)

    def forward(self, ids, mask, metric_emb, mc_dropout=False):
        out = self.enc(ids, mask).last_hidden_state[:, 0, :]
        x = torch.cat([out, metric_emb], dim=1)

        if mc_dropout:
            preds = [self.fc(d(x)) for d in self.dropout_layers]
            return torch.mean(torch.stack(preds), dim=0).squeeze(1)

        return self.fc(x).squeeze(1)


# ----------------------------------------------------
# PREDICTION FUNCTION
# ----------------------------------------------------
def predict():

    metric_map = load_metric_embeddings(METRIC_JSON, METRIC_NPY)

    # Load test data
    with open(TEST_JSON, "r") as f:
        data = json.load(f)

    df = pd.DataFrame(data)
    df["combined_text"] = (
        df["response"].fillna("") + " " +
        df["user_prompt"].fillna("") + " " +
        df["system_prompt"].fillna("")
    )

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    test_ds = TextRegressionDataset(df, tokenizer, metric_map)
    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

    # -------------------------------
    # Load trained model
    # -------------------------------
    model = MSRegressor().to(DEVICE)
    model.load_state_dict(torch.load("best_model.pt", map_location=DEVICE))
    model.eval()

    # Load normalization stats
    score_mean = np.load("score_mean.npy")[0]
    score_std = np.load("score_std.npy")[0]

    preds = []

    with torch.no_grad():
        for batch in test_loader:
            ids = batch["input_ids"].to(DEVICE)
            mask = batch["attention_mask"].to(DEVICE)
            metric_emb = batch["metric_emb"].to(DEVICE)

            out_norm = model(ids, mask, metric_emb, mc_dropout=MC_DROPOUT)

            # unnormalize
            out = out_norm.cpu().numpy() * score_std + score_mean
            out = np.clip(out, 0, 10)

            preds.extend(out.tolist())

    # Build submission
    sub = pd.DataFrame({
        "ID": range(1, len(preds) + 1),
        "score": preds
    })
    sub.to_csv("submission_fixed_30.csv", index=False)
    print("Saved submission_fixed_30.csv")

    # Rounded version
    sub_r = sub.copy()
    sub_r["score"] = np.round(sub_r["score"]).astype(int).clip(0, 10)
    sub_r.to_csv("submission_fixed_rounded.csv", index=False)
    print("Saved submission_fixed_rounded.csv")


if __name__ == "__main__":
    predict()




Saved submission_fixed_30.csv
Saved submission_fixed_rounded.csv
