In [None]:
# EDA Cell  Clean schema, fix anomalies, quantify leakage, prep grouped CV keys

import pandas as pd
import numpy as np
import json, re, hashlib
from collections import Counter

# load files
def load_jsonl_or_json(path):
    with open(path, "r", encoding="utf-8") as f:
        txt = f.read().strip()
    if not txt:
        return []
    if txt[0] == "[":
        return json.loads(txt)
    return [json.loads(line) for line in txt.splitlines() if line.strip()]

train_path = "train_data.json"
test_path = "test_data.json"
metric_names_path = "metric_names.json"
metric_emb_path = "metric_name_embeddings.npy"

train = load_jsonl_or_json(train_path)
test = load_jsonl_or_json(test_path)

df_tr = pd.DataFrame([
    {
        "metric_name": r.get("metric_name", ""),
        "prompt": r.get("prompt", ""),
        "system_prompt": r.get("system_prompt", ""),
        "response": r.get("expected_response", ""),
        "score": r.get("score", None),
    } for r in train
])

df_te = pd.DataFrame([
    {
        "metric_name": r.get("metric_name", ""),
        "prompt": r.get("prompt", ""),
        "system_prompt": r.get("system_prompt", ""),
        "response": r.get("expected_response", ""),
    } for r in test
])

# 1) Normalize score column to numeric integer in [0,10]
df_tr["score_raw"] = df_tr["score"]
df_tr["score"] = pd.to_numeric(df_tr["score"], errors="coerce")
n_nan_scores = int(df_tr["score"].isna().sum())
print("Scores parsed to numeric; NaNs:", n_nan_scores)

if (df_tr["score"] % 1 != 0).any():
    print("Non-integer labels found; value counts of non-integers:")
    print(df_tr.loc[(df_tr["score"] % 1 != 0), "score"].value_counts())
    df_tr["score"] = df_tr["score"].round().clip(0, 10)

df_tr["score"] = df_tr["score"].astype("Int64")

print("\nScore integer value counts after rounding/clipping:")
print(df_tr["score"].value_counts().sort_index())

# 2) Fill missing text fields with empty strings; compute lengths
for col in ["system_prompt", "prompt", "response"]:
    df_tr[col] = df_tr[col].fillna("")
    df_te[col] = df_te[col].fillna("")
    df_tr[f"{col}_len"] = df_tr[col].map(len)
    df_te[f"{col}_len"] = df_te[col].map(len)

print("\nShare of empty fields (train):")
for col in ["system_prompt", "prompt", "response"]:
    print(f"{col}: {float((df_tr[col].str.len()==0).mean()):.3f}")

print("\nShare of empty fields (test):")
for col in ["system_prompt", "prompt", "response"]:
    print(f"{col}: {float((df_te[col].str.len()==0).mean()):.3f}")

# 3) Construct robust combo hashes; add simpler prompt+response hash too
def normalize_text(s: str):
    s = s or ""
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    return s

def md5(s: str):
    return hashlib.md5(s.encode("utf-8")).hexdigest()

def combo_hash_row(r):
    key = "|||".join([
        normalize_text(r.get("system_prompt","")),
        normalize_text(r.get("prompt","")),
        normalize_text(r.get("response","")),
        normalize_text(r.get("metric_name","")),
    ])
    return md5(key)

def pr_hash_row(r):
    key = "|||".join([
        normalize_text(r.get("prompt","")),
        normalize_text(r.get("response","")),
    ])
    return md5(key)

df_tr["combo_hash"] = df_tr.apply(combo_hash_row, axis=1)
df_te["combo_hash"] = df_te.apply(combo_hash_row, axis=1)
df_tr["pr_hash"] = df_tr.apply(pr_hash_row, axis=1)
df_te["pr_hash"] = df_te.apply(pr_hash_row, axis=1)

# 4) Duplicate/leakage quantification
dup_counts = df_tr["combo_hash"].value_counts()
dup_groups = (dup_counts > 1).sum()
max_dup = int(dup_counts.max())
print(f"\nTrain duplicate combo_hash groups: {dup_groups}, top group size: {max_dup}")

overlap_combo = set(df_tr["combo_hash"]).intersection(set(df_te["combo_hash"]))
overlap_pr = set(df_tr["pr_hash"]).intersection(set(df_te["pr_hash"]))
print("Train–Test overlap by combo_hash:", len(overlap_combo))
print("Train–Test overlap by prompt+response hash:", len(overlap_pr))

# 5) Propose leakage-aware CV keys:
#    Group by combo_hash for strict de-duplication AND by metric_name cluster key (proxy: metric_name itself).
#    We'll create a composite group key to be used with GroupKFold later.
df_tr["group_key"] = df_tr["metric_name"].astype(str) + "##" + df_tr["combo_hash"]

# 6) Quick per-metric score distribution summary for the top metrics
metric_freq = df_tr["metric_name"].value_counts()
top_metrics = metric_freq.head(10).index.tolist()
print("\nTop 10 metrics by frequency and score skew:")
for m in top_metrics:
    vc = df_tr.loc[df_tr["metric_name"] == m, "score"].value_counts().sort_index()
    print(f"- {m} | n={int(metric_freq[m])} | mean={float(df_tr.loc[df_tr['metric_name']==m,'score'].mean()):.2f}")
    print(vc.to_string())

# 7) Save slim EDA cache safely 
for name, df, has_score in [
    ("eda_train_slim.csv", df_tr, True),
    ("eda_test_slim.csv", df_te, False),
]:
    cols = ["metric_name", "system_prompt_len", "prompt_len", "response_len", "combo_hash", "pr_hash"]
    if has_score and "score" in df.columns:
        cols = ["metric_name", "system_prompt_len", "prompt_len", "response_len", "score", "combo_hash", "pr_hash"]
    df[cols].to_csv(name, index=False)

print("\nPrepared grouped CV key: group_key (metric_name##combo_hash).")
print("Saved: eda_train_slim.csv, eda_test_slim.csv")


Scores parsed to numeric; NaNs: 0
Non-integer labels found; value counts of non-integers:
score
9.5    1
Name: count, dtype: int64

Score integer value counts after rounding/clipping:
score
0       13
1        6
2        5
3        7
4        3
5        1
6       45
7       95
8      259
9     3123
10    1443
Name: count, dtype: Int64

Share of empty fields (train):
system_prompt: 0.310
prompt: 1.000
response: 1.000

Share of empty fields (test):
system_prompt: 0.304
prompt: 1.000
response: 1.000

Train duplicate combo_hash groups: 594, top group size: 31
Train–Test overlap by combo_hash: 551
Train–Test overlap by prompt+response hash: 1

Top 10 metrics by frequency and score skew:
- response_out_of_scope/functional_scope_boundaries | n=56 | mean=9.66
score
9     19
10    37
- rejection_rate/under_rejection | n=54 | mean=9.76
score
9     13
10    41
- robustness_against_adversarial_attacks/jailbreak_prompts | n=52 | mean=9.56
score
7      1
9     20
10    31
- misuse/instruction_misuse

In [None]:
# EDA Cell 3: Leakage-safe CV + metric-only baseline 

import numpy as np
import pandas as pd
import json, re, hashlib
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from collections import defaultdict

# Load slim caches from previous cell
df_tr = pd.read_csv("eda_train_slim.csv")
df_te = pd.read_csv("eda_test_slim.csv")
metric_names = json.load(open("metric_names.json","r", encoding="utf-8"))
metric_emb = np.load("metric_name_embeddings.npy")

# Map metric_name -> embedding row
name_to_idx = {name: i for i, name in enumerate(metric_names)}
def get_metric_vec(name):
    idx = name_to_idx.get(name, None)
    if idx is None:
        return np.zeros(metric_emb.shape[1], dtype=np.float32)
    return metric_emb[idx]

# Reattach columns needed for grouping and targets by reloading full train
def load_jsonl_or_json(path):
    with open(path, "r", encoding="utf-8") as f:
        txt = f.read().strip()
    if not txt:
        return []
    if txt[0] == "[":
        return json.loads(txt)
    return [json.loads(line) for line in txt.splitlines() if line.strip()]

train = load_jsonl_or_json("train_data.json")
full_tr = pd.DataFrame([{
    "metric_name": r.get("metric_name",""),
    "prompt": r.get("prompt",""),
    "system_prompt": r.get("system_prompt","") or "",
    "response": r.get("expected_response",""),
    "score": r.get("score", None),
} for r in train])

# Normalize scores 
full_tr["score"] = pd.to_numeric(full_tr["score"], errors="coerce").round().clip(0,10).astype("Int64")

# Recompute combo_hash for grouping 
import re, hashlib
def normalize_text(s):
    s = s or ""
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    return s

def md5(s):
    return hashlib.md5(s.encode("utf-8")).hexdigest()

def combo_hash_row(r):
    key = "|||".join([
        normalize_text(r.get("system_prompt","")),
        normalize_text(r.get("prompt","")),
        normalize_text(r.get("response","")),
        normalize_text(r.get("metric_name","")),
    ])
    return md5(key)

full_tr["combo_hash"] = full_tr.apply(combo_hash_row, axis=1)

# Build metric-only features:
# - 768-d Gemma embedding
# - One-hot of top-N metric names (N=40) to capture head metrics
# - Basic length of system_prompt (since prompt/response are empty)
N_TOP = 40
top_metrics = full_tr["metric_name"].value_counts().head(N_TOP).index.tolist()
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
ohe.fit(pd.DataFrame({"metric_name": top_metrics + ["__OTHER__"]}))

def row_features(row):
    name = row["metric_name"]
    vec = get_metric_vec(name)
    # one-hot
    ohe_input = pd.DataFrame({"metric_name": [name if name in top_metrics else "__OTHER__"]})
    oh = ohe.transform(ohe_input)[0]
    sys_len = len(row["system_prompt"] or "")
    return np.concatenate([vec, oh, np.array([sys_len], dtype=np.float32)], axis=0)

X = np.vstack([row_features(r) for _, r in full_tr.iterrows()])
y = full_tr["score"].astype(int).values
groups = full_tr["combo_hash"].values  # leakage-safe grouping

print("Feature matrix shape:", X.shape, " Target shape:", y.shape)

.
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y)
cw_map = {c: w for c, w in zip(classes, class_weights)}
sample_weight = np.array([cw_map[v] for v in y], dtype=np.float32)

gkf = GroupKFold(n_splits=5)
oof_pred = np.zeros((len(y), len(classes)), dtype=np.float32)

for fold, (tr_idx, va_idx) in enumerate(gkf.split(X, y, groups=groups), 1):
    X_tr, y_tr, w_tr = X[tr_idx], y[tr_idx], sample_weight[tr_idx]
    X_va, y_va = X[va_idx], y[va_idx]

    clf = LogisticRegression(
        multi_class="multinomial",
        max_iter=2000,
        C=1.0,
        n_jobs=4,
        class_weight=None,  # using sample_weight instead
        solver="lbfgs",
    )
    clf.fit(X_tr, y_tr, sample_weight=w_tr)
    proba = clf.predict_proba(X_va)
    # Align columns to classes
    # sklearn orders by classes_ attribute
    col_order = list(clf.classes_)
    col_map = {c:i for i,c in enumerate(col_order)}
    # Fill into oof_pred at indices corresponding to va_idx and proper columns
    for i_c, c in enumerate(classes):
        if c in col_map:
            oof_pred[va_idx, i_c] = proba[:, col_map[c]]
        else:
            # unseen class in this fold
            oof_pred[va_idx, i_c] = 0.0

    y_pred_int = classes[np.argmax(oof_pred[va_idx], axis=1)]
    mae = mean_absolute_error(y_va, y_pred_int)
    print(f"Fold {fold}: MAE={mae:.4f}  (n_va={len(va_idx)})")

# Report overall OOF
y_pred_int_all = classes[np.argmax(oof_pred, axis=1)]
mae_all = mean_absolute_error(y, y_pred_int_all)
print(f"\nOOF MAE (metric-only, leakage-safe groups): {mae_all:.4f}")

#  quadratic weighted kappa
def quadratic_weighted_kappa(a, b, min_rating=None, max_rating=None):
    # a,b are integer arrays
    if min_rating is None: min_rating = int(min(a.min(), b.min()))
    if max_rating is None: max_rating = int(max(a.max(), b.max()))
    n_ratings = max_rating - min_rating + 1
    conf_mat = np.zeros((n_ratings, n_ratings), dtype=np.float64)
    for i in range(len(a)):
        conf_mat[a[i]-min_rating, b[i]-min_rating] += 1
    hist_a = conf_mat.sum(axis=1)
    hist_b = conf_mat.sum(axis=0)
    expected = np.outer(hist_a, hist_b) / conf_mat.sum()
    w = np.zeros((n_ratings, n_ratings), dtype=np.float64)
    for i in range(n_ratings):
        for j in range(n_ratings):
            w[i,j] = ((i-j)**2) / ((n_ratings-1)**2)
    kappa = 1.0 - (w * conf_mat).sum() / (w * expected).sum()
    return float(kappa)

qwk_all = quadratic_weighted_kappa(y, y_pred_int_all, min_rating=0, max_rating=10)
print(f"OOF QWK: {qwk_all:.4f}")

# Persist OOF for later calibration
oof_df = pd.DataFrame({
    "score_true": y,
    "score_pred": y_pred_int_all
})
oof_df.to_csv("oof_metric_only.csv", index=False)
print("Saved oof_metric_only.csv")


Feature matrix shape: (5000, 810)  Target shape: (5000,)




Fold 1: MAE=2.3960  (n_va=1000)




Fold 2: MAE=2.3950  (n_va=1000)




Fold 3: MAE=2.2310  (n_va=1000)




Fold 4: MAE=2.0960  (n_va=1000)




Fold 5: MAE=2.1940  (n_va=1000)

OOF MAE (metric-only, leakage-safe groups): 2.2624
OOF QWK: 0.0556
Saved oof_metric_only.csv


In [3]:
# Model Cell: Ordinal cumulative baseline with fold-safe weights + metric priors (leakage-safe OOF)

import numpy as np
import pandas as pd
import json, re, hashlib, warnings
from sklearn.model_selection import GroupKFold
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn
import torch.optim as optim

warnings.filterwarnings("ignore")
torch.manual_seed(42)
np.random.seed(42)

# -------------------
# Load data
# -------------------
def load_jsonl_or_json(path):
    with open(path, "r", encoding="utf-8") as f:
        txt = f.read().strip()
    if not txt:
        return []
    if txt[0] == "[":
        return json.loads(txt)
    return [json.loads(line) for line in txt.splitlines() if line.strip()]

train = load_jsonl_or_json("train_data.json")
metric_names = json.load(open("metric_names.json","r", encoding="utf-8"))
metric_emb = np.load("metric_name_embeddings.npy")

name_to_idx = {name: i for i, name in enumerate(metric_names)}

df = pd.DataFrame([{
    "metric_name": r.get("metric_name",""),
    "prompt": r.get("prompt",""),
    "system_prompt": (r.get("system_prompt","") or ""),
    "response": r.get("expected_response",""),
    "score": r.get("score", None),
} for r in train])

# Normalize scores to integer 0..10 (handles stray 9.5)
df["score"] = pd.to_numeric(df["score"], errors="coerce").round().clip(0,10).astype(int)
y_all = df["score"].values
K = 11
classes = np.arange(K)

# -------------------
# Combo-hash groups (leakage-safe)
# -------------------
def normalize_text(s):
    s = s or ""
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    return s

def md5(s):
    return hashlib.md5(s.encode("utf-8")).hexdigest()

def combo_hash_row(r):
    key = "|||".join([
        normalize_text(r.get("system_prompt","")),
        normalize_text(r.get("prompt","")),
        normalize_text(r.get("response","")),
        normalize_text(r.get("metric_name","")),
    ])
    return md5(key)

df["combo_hash"] = df.apply(combo_hash_row, axis=1)
groups = df["combo_hash"].values

# -------------------
# Features: Gemma vector + per-metric prior (fold-train only) + system_prompt length
# -------------------
def get_metric_vec(name):
    idx = name_to_idx.get(name, None)
    if idx is None:
        return np.zeros(metric_emb.shape[1], dtype=np.float32)
    return metric_emb[idx]

def build_fold_features(idx_train):
    df_tr = df.iloc[idx_train]
    priors = df_tr.groupby("metric_name")["score"].mean().to_dict()
    gmean = float(df_tr["score"].mean())
    feats = []
    for _, r in df.iterrows():
        name = r["metric_name"]
        vec = get_metric_vec(name)
        prior = priors.get(name, gmean)
        sys_len = len(r["system_prompt"])
        feats.append(np.concatenate([vec, np.array([prior, sys_len], dtype=np.float32)], axis=0))
    X_all = np.vstack(feats).astype(np.float32)
    return X_all

# -------------------
# Ordinal cumulative targets and utilities
# -------------------
def cumulative_targets(y, K=11):
    # For thresholds t=0..K-2, target is 1 if y <= t else 0
    T = np.zeros((len(y), K-1), dtype=np.float32)
    for i, yi in enumerate(y):
        # 0..yi-1 -> 0, yi..K-2 -> 1
        if yi < (K-1):
            T[i, :yi] = 0.0
            T[i, yi:] = 1.0
        else:
            # yi == 10, all thresholds predict 1
            T[i, :] = 1.0
    return T

def probs_from_cum(sig):
    # sig shape [B, K-1] in (0,1); return class probs [B, K]
    B, Km1 = sig.shape
    P_le = torch.cat([torch.zeros(B,1,device=sig.device), sig], dim=1)
    P_gt = torch.cat([sig, torch.ones(B,1,device=sig.device)], dim=1)
    return (P_gt - P_le).clamp(1e-7, 1.0)

def quadratic_weighted_kappa(a, b, min_rating=0, max_rating=10):
    n_ratings = max_rating - min_rating + 1
    conf_mat = np.zeros((n_ratings, n_ratings), dtype=np.float64)
    for i in range(len(a)):
        conf_mat[a[i]-min_rating, b[i]-min_rating] += 1
    hist_a = conf_mat.sum(axis=1)
    hist_b = conf_mat.sum(axis=0)
    expected = np.outer(hist_a, hist_b) / conf_mat.sum()
    w = np.zeros((n_ratings, n_ratings), dtype=np.float64)
    for i in range(n_ratings):
        for j in range(n_ratings):
            w[i,j] = ((i-j)**2) / ((n_ratings-1)**2)
    kappa = 1.0 - (w * conf_mat).sum() / (w * expected).sum()
    return float(kappa)

# -------------------
# Model
# -------------------
class OrdinalCumulativeNN(nn.Module):
    def __init__(self, in_dim, hidden=256, K=11):
        super().__init__()
        self.backbone = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.LayerNorm(hidden),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Dropout(0.2),
        )
        self.head = nn.Linear(hidden, K-1)  # thresholds
    def forward(self, x):
        h = self.backbone(x)
        logits = self.head(h)
        # enforce monotonicity by cumulative sum across thresholds
        logits = torch.cumsum(logits, dim=1)
        return logits

# -------------------
# Training with fold-safe threshold weights
# -------------------
gkf = GroupKFold(n_splits=5)
oof_proba = np.zeros((len(df), K), dtype=np.float32)

for fold, (tr_idx, va_idx) in enumerate(gkf.split(df, y_all, groups), 1):
    print(f"Fold {fold}: train={len(tr_idx)} valid={len(va_idx)}")
    X_all = build_fold_features(tr_idx)
    X_tr, X_va = X_all[tr_idx], X_all[va_idx]
    y_tr, y_va = y_all[tr_idx], y_all[va_idx]

    # Fold-safe inverse-frequency weights on present classes
    labels_present, counts = np.unique(y_tr, return_counts=True)
    freq = counts / counts.sum()
    inv = {int(lbl): float(1.0 / f) for lbl, f in zip(labels_present, freq)}
    mean_inv = np.mean(list(inv.values()))
    inv = {k: v / (mean_inv + 1e-12) for k, v in inv.items()}

    # Build threshold weights from class weights (emphasize rare lower labels)
    th_w = np.zeros(K-1, dtype=np.float32)
    for t in range(K-1):
        w_left = inv.get(t, 1.0)
        w_right = inv.get(t+1, 1.0)
        th_w[t] = 0.5 * (w_left + w_right)
    th_w = torch.tensor(th_w, dtype=torch.float32)

    # Targets with light label smoothing for stability
    eps = 0.02
    Yc = cumulative_targets(y_tr, K)
    Yc = Yc * (1 - eps) + 0.5 * eps
    y_tr_T = torch.tensor(Yc, dtype=torch.float32)

    # Tensors
    X_tr_t = torch.tensor(X_tr, dtype=torch.float32)
    X_va_t = torch.tensor(X_va, dtype=torch.float32)

    model = OrdinalCumulativeNN(in_dim=X_tr.shape[1], hidden=256, K=K)
    opt = optim.AdamW(model.parameters(), lr=3e-3, weight_decay=1e-4)
    bce = nn.BCEWithLogitsLoss(reduction="none")

    # Train
    EPOCHS = 80
    B = 512
    n = len(X_tr_t)
    for epoch in range(EPOCHS):
        model.train()
        perm = torch.randperm(n)
        for i in range(0, n, B):
            idx = perm[i:i+B]
            xb = X_tr_t[idx]
            yb = y_tr_T[idx]
            opt.zero_grad()
            logits = model(xb)
            loss_raw = bce(logits, yb)  # [B, K-1]
            loss = (loss_raw * th_w).mean()
            loss.backward()
            opt.step()

    # Validate
    model.eval()
    with torch.no_grad():
        sig = torch.sigmoid(model(X_va_t))
        P = probs_from_cum(sig)
        oof_proba[va_idx] = P.cpu().numpy()

    y_pred = oof_proba[va_idx].argmax(axis=1)
    mae = np.mean(np.abs(y_va - y_pred))
    print(f"  Fold {fold} MAE: {mae:.4f}")

# -------------------
# OOF metrics
# -------------------
y_pred_all = oof_proba.argmax(axis=1)
mae_all = np.mean(np.abs(y_all - y_pred_all))
qwk_all = quadratic_weighted_kappa(y_all, y_pred_all, 0, 10)

print(f"\nOrdinal OOF MAE: {mae_all:.4f}")
print(f"Ordinal OOF QWK: {qwk_all:.4f}")

# Save for calibration next
cols = {f"p{k}": oof_proba[:,k] for k in range(K)}
oof_ord = pd.DataFrame({"score_true": y_all, **cols})
oof_ord.to_csv("oof_ordinal_metric_only.csv", index=False)
print("Saved oof_ordinal_metric_only.csv")


Fold 1: train=4000 valid=1000
  Fold 1 MAE: 2.0640
Fold 2: train=4000 valid=1000
  Fold 2 MAE: 2.1110
Fold 3: train=4000 valid=1000
  Fold 3 MAE: 3.0330
Fold 4: train=4000 valid=1000
  Fold 4 MAE: 1.2080
Fold 5: train=4000 valid=1000
  Fold 5 MAE: 2.1800

Ordinal OOF MAE: 2.1192
Ordinal OOF QWK: -0.0739
Saved oof_ordinal_metric_only.csv


In [None]:
# Model Cell: Smoothed per-metric priors + stabilized ordinal training and prior baseline

import numpy as np
import pandas as pd
import json, re, hashlib, warnings
from sklearn.model_selection import GroupKFold
import torch
import torch.nn as nn
import torch.optim as optim

warnings.filterwarnings("ignore")
torch.manual_seed(123)
np.random.seed(123)

# Load
def load_jsonl_or_json(path):
    with open(path, "r", encoding="utf-8") as f:
        txt = f.read().strip()
    if not txt:
        return []
    if txt[0] == "[":
        return json.loads(txt)
    return [json.loads(line) for line in txt.splitlines() if line.strip()]

train = load_jsonl_or_json("train_data.json")
metric_names = json.load(open("metric_names.json","r", encoding="utf-8"))
metric_emb = np.load("metric_name_embeddings.npy")
name_to_idx = {name: i for i, name in enumerate(metric_names)}

df = pd.DataFrame([{
    "metric_name": r.get("metric_name",""),
    "prompt": r.get("prompt",""),
    "system_prompt": (r.get("system_prompt","") or ""),
    "response": r.get("expected_response",""),
    "score": r.get("score", None),
} for r in train])
df["score"] = pd.to_numeric(df["score"], errors="coerce").round().clip(0,10).astype(int)
y_all = df["score"].values
K = 11

# Group key
def normalize_text(s):
    s = s or ""
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    return s
def md5(s):
    return hashlib.md5(s.encode("utf-8")).hexdigest()
def combo_hash_row(r):
    key = "|||".join([
        normalize_text(r.get("system_prompt","")),
        normalize_text(r.get("prompt","")),
        normalize_text(r.get("response","")),
        normalize_text(r.get("metric_name","")),
    ])
    return md5(key)
df["combo_hash"] = df.apply(combo_hash_row, axis=1)
groups = df["combo_hash"].values

def get_metric_vec(name):
    idx = name_to_idx.get(name, None)
    if idx is None:
        return np.zeros(metric_emb.shape[1], dtype=np.float32)
    return metric_emb[idx]

# Compute fold-safe priors p(y|metric) with Laplace smoothing
def compute_priors(df_fold):
    counts = df_fold.groupby(["metric_name","score"]).size().unstack(fill_value=0)
    # ensure all classes present
    for c in range(K):
        if c not in counts.columns:
            counts[c] = 0
    counts = counts[sorted(counts.columns)]
    # Laplace smoothing
    alpha = 1.0
    counts_sm = counts + alpha
    probs = counts_sm.div(counts_sm.sum(axis=1), axis=0)
    means = (probs * np.arange(K)).sum(axis=1)
    return probs, means  # DataFrame (metric x K), Series (metric)

# Ordinal model
class OrdinalCumulativeNN(nn.Module):
    def __init__(self, in_dim, hidden=128, K=11, l2_thresh=1e-3):
        super().__init__()
        self.backbone = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.LayerNorm(hidden),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Dropout(0.1),
        )
        self.head = nn.Linear(hidden, K-1)
        self.l2_thresh = l2_thresh
    def forward(self, x):
        h = self.backbone(x)
        logits = self.head(h)
        logits = torch.cumsum(logits, dim=1)
        return logits
    def reg(self):
        return sum((p**2).sum() for p in self.head.parameters()) * self.l2_thresh

def cumulative_targets(y, K=11):
    T = np.zeros((len(y), K-1), dtype=np.float32)
    for i, yi in enumerate(y):
        if yi < (K-1):
            T[i, :yi] = 0.0
            T[i, yi:] = 1.0
        else:
            T[i, :] = 1.0
    return T

def probs_from_cum(sig):
    B, Km1 = sig.shape
    P_le = torch.cat([torch.zeros(B,1,device=sig.device), sig], dim=1)
    P_gt = torch.cat([sig, torch.ones(B,1,device=sig.device)], dim=1)
    return (P_gt - P_le).clamp(1e-7, 1.0)

def quadratic_weighted_kappa(a, b, min_rating=0, max_rating=10):
    n_ratings = max_rating - min_rating + 1
    conf_mat = np.zeros((n_ratings, n_ratings), dtype=np.float64)
    for i in range(len(a)):
        conf_mat[a[i]-min_rating, b[i]-min_rating] += 1
    hist_a = conf_mat.sum(axis=1)
    hist_b = conf_mat.sum(axis=0)
    expected = np.outer(hist_a, hist_b) / conf_mat.sum()
    w = np.zeros((n_ratings, n_ratings), dtype=np.float64)
    for i in range(n_ratings):
        for j in range(n_ratings):
            w[i,j] = ((i-j)**2) / ((n_ratings-1)**2)
    kappa = 1.0 - (w * conf_mat).sum() / (w * expected).sum()
    return float(kappa)

gkf = GroupKFold(n_splits=5)
oof_proba = np.zeros((len(df), K), dtype=np.float32)
prior_blend_oof = np.zeros((len(df), K), dtype=np.float32)

for fold, (tr_idx, va_idx) in enumerate(gkf.split(df, y_all, groups), 1):
    print(f"Fold {fold}: train={len(tr_idx)} valid={len(va_idx)}")
    df_tr = df.iloc[tr_idx].copy()
    df_va = df.iloc[va_idx].copy()

    probs_m, means_m = compute_priors(df_tr)
    global_prior = np.ones(K, dtype=np.float32) / K

    # Build features for all rows with fold-train priors (no leakage)
    feats = []
    priors_for_all = []
    for _, r in df.iterrows():
        name = r["metric_name"]
        vec = get_metric_vec(name)
        if name in probs_m.index:
            pvec = probs_m.loc[name].values.astype(np.float32)
            pmean = float(means_m.loc[name])
        else:
            pvec = global_prior.copy()
            pmean = float((np.arange(K) * global_prior).sum())
        sys_len = len(r["system_prompt"])
        feats.append(np.concatenate([vec, np.array([pmean, sys_len], dtype=np.float32), pvec], axis=0))
        priors_for_all.append(pvec)
    X_all = np.vstack(feats).astype(np.float32)
    priors_for_all = np.vstack(priors_for_all).astype(np.float32)

    X_tr, X_va = X_all[tr_idx], X_all[va_idx]
    y_tr, y_va = y_all[tr_idx], y_all[va_idx]
    pri_va = priors_for_all[va_idx]

    # Prior-only baseline on validation (as a floor)
    prior_pred = pri_va  # already smoothed
    prior_blend_oof[va_idx] = prior_pred

    # Ordinal training
    model = OrdinalCumulativeNN(in_dim=X_tr.shape[1], hidden=128, K=K, l2_thresh=1e-4)
    opt = optim.AdamW(model.parameters(), lr=1.5e-3, weight_decay=5e-4)
    bce = nn.BCEWithLogitsLoss(reduction="mean")

    # Label smoothing
    eps = 0.02
    Yc = cumulative_targets(y_tr, K)
    Yc = Yc * (1 - eps) + 0.5 * eps
    y_tr_T = torch.tensor(Yc, dtype=torch.float32)

    X_tr_t = torch.tensor(X_tr, dtype=torch.float32)
    X_va_t = torch.tensor(X_va, dtype=torch.float32)

    EPOCHS = 40
    B = 512
    n = len(X_tr_t)
    for epoch in range(EPOCHS):
        model.train()
        perm = torch.randperm(n)
        for i in range(0, n, B):
            idx = perm[i:i+B]
            xb = X_tr_t[idx]
            yb = y_tr_T[idx]
            opt.zero_grad()
            logits = model(xb)
            loss = bce(logits, yb) + model.reg()
            loss.backward()
            opt.step()

    # Inference
    model.eval()
    with torch.no_grad():
        sig = torch.sigmoid(model(X_va_t))
        P = probs_from_cum(sig).cpu().numpy()

    #  convex blend with prior to stabilize
    alpha = 0.3
    P_blend = (1 - alpha) * P + alpha * pri_va
    oof_proba[va_idx] = P_blend

    y_pred = oof_proba[va_idx].argmax(axis=1)
    mae = np.mean(np.abs(y_va - y_pred))
    print(f"  Fold {fold} MAE (blended): {mae:.4f}")

# OOF metrics
y_pred_all = oof_proba.argmax(axis=1)
mae_all = np.mean(np.abs(y_all - y_pred_all))

def qwk(a,b):
    return quadratic_weighted_kappa(a,b,0,10)

qwk_all = qwk(y_all, y_pred_all)
print(f"\nOrdinal+Prior OOF MAE: {mae_all:.4f}")
print(f"Ordinal+Prior OOF QWK: {qwk_all:.4f}")

# Prior-only floor
y_prior = prior_blend_oof.argmax(axis=1)
print(f"Prior-only OOF MAE (floor): {np.mean(np.abs(y_all - y_prior)):.4f}")
print(f"Prior-only OOF QWK (floor): {qwk(y_all, y_prior):.4f}")

# Save
cols = {f"p{k}": oof_proba[:,k] for k in range(K)}
pd.DataFrame({"score_true": y_all, **cols}).to_csv("oof_ordinal_prior_blend.csv", index=False)
print("Saved oof_ordinal_prior_blend.csv")


Fold 1: train=4000 valid=1000
  Fold 1 MAE (blended): 0.7250
Fold 2: train=4000 valid=1000
  Fold 2 MAE (blended): 1.1140
Fold 3: train=4000 valid=1000
  Fold 3 MAE (blended): 1.6910
Fold 4: train=4000 valid=1000
  Fold 4 MAE (blended): 1.1180
Fold 5: train=4000 valid=1000
  Fold 5 MAE (blended): 1.3990

Ordinal+Prior OOF MAE: 1.2094
Ordinal+Prior OOF QWK: -0.0864
Prior-only OOF MAE (floor): 0.5106
Prior-only OOF QWK (floor): 0.1090
Saved oof_ordinal_prior_blend.csv


Wrote submission_prior_calibrated_with_cache.csv


In [None]:
# Patched calibration + inference with robust NaN handling and safe fallbacks

import numpy as np
import pandas as pd
import json, re, hashlib
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import RidgeCV

K = 11
idxs = np.arange(K)

def normalize_text(s):
    s = s or ""
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    return s

def md5(s):
    return hashlib.md5(s.encode("utf-8")).hexdigest()

def combo_hash_row(r):
    key = "|||".join([
        normalize_text(r.get("system_prompt","")),
        normalize_text(r.get("prompt","")),
        normalize_text(r.get("expected_response","")),
        normalize_text(r.get("metric_name","")),
    ])
    return md5(key)

def qwk(a,b, min_rating=0, max_rating=10):
    n_ratings = max_rating - min_rating + 1
    conf = np.zeros((n_ratings,n_ratings))
    for i in range(len(a)):
        conf[a[i]-min_rating, b[i]-min_rating] += 1
    hist_a = conf.sum(axis=1); hist_b = conf.sum(axis=0)
    expected = np.outer(hist_a, hist_b) / conf.sum()
    w = np.zeros_like(conf)
    for i in range(n_ratings):
        for j in range(n_ratings):
            w[i,j] = ((i-j)**2)/((n_ratings-1)**2)
    return 1.0 - (w*conf).sum() / (w*expected).sum()

def safe_softmax(logits):
    logits = np.nan_to_num(logits, nan=-1e9, posinf=1e6, neginf=-1e6)
    logits = logits - logits.max(axis=1, keepdims=True)
    P = np.exp(logits)
    P_sum = P.sum(axis=1, keepdims=True)
    P = P / np.clip(P_sum, 1e-12, None)
    return P

def apply_temperatures(P, tau_lo=1.0, tau_mid=1.0, tau_hi=1.0):
    P = np.nan_to_num(P, nan=1.0/K)
    P = P / np.clip(P.sum(axis=1, keepdims=True), 1e-12, None)
    logits = np.log(np.clip(P, 1e-12, 1.0))
    scale = np.ones(K, dtype=np.float32)
    scale[idxs <= 5] = 1.0 / tau_lo
    scale[(idxs >= 6) & (idxs <= 8)] = 1.0 / tau_mid
    scale[idxs >= 9] = 1.0 / tau_hi
    L = logits * scale[None, :]
    return safe_softmax(L)

def expected_value(P):
    return (P * idxs[None,:]).sum(axis=1)

def entropy(P):
    P = np.nan_to_num(P, nan=1.0/K)
    P = P / np.clip(P.sum(axis=1, keepdims=True), 1e-12, None)
    return -(P * np.log(np.clip(P, 1e-12, 1.0))).sum(axis=1)

# Load
train = json.load(open("train_data.json","r",encoding="utf-8"))
test = json.load(open("test_data.json","r",encoding="utf-8"))
df_tr = pd.DataFrame(train)
df_tr["score"] = pd.to_numeric(df_tr["score"], errors="coerce").round().clip(0,10).astype(int)
df_tr["combo_hash"] = df_tr.apply(combo_hash_row, axis=1)
df_te = pd.DataFrame(test)
df_te["combo_hash"] = df_te.apply(combo_hash_row, axis=1)

dup_cache = dict(zip(df_tr["combo_hash"], df_tr["score"]))
y_true = df_tr["score"].values
groups = df_tr["combo_hash"].values


alpha = 1.0
lambda_shrink = 20  

def build_oof_priors(lambda_shrink):
    oof_P = np.zeros((len(df_tr), K), dtype=np.float32)
    gkf = GroupKFold(n_splits=5)
    for tr_idx, va_idx in gkf.split(df_tr, y_true, groups):
        fold = df_tr.iloc[tr_idx]
        counts = fold.groupby(["metric_name","score"]).size().unstack(fill_value=0)
        for c in range(K):
            if c not in counts.columns:
                counts[c] = 0
        counts = counts[sorted(counts.columns)]
        n_m = counts.sum(axis=1).astype(float)
        global_counts = counts.sum(axis=0).astype(float)
        global_prior = (global_counts + alpha) / (global_counts.sum() + alpha*K)

        p_emp = (counts + alpha).div((counts + alpha).sum(axis=1), axis=0)
        w_m = (n_m / (n_m + lambda_shrink)).clip(0.0, 1.0)
        p_m = p_emp.mul(w_m, axis=0).add(global_prior, axis=1).sub(global_prior.mul(w_m, axis=0), axis=0)

        for i in va_idx:
            m = df_tr.iloc[i]["metric_name"]
            if m in p_m.index:
                oof_P[i] = p_m.loc[m].values.astype(np.float32)
            else:
                oof_P[i] = global_prior.values.astype(np.float32)
    oof_P = np.nan_to_num(oof_P, nan=1.0/K)
    oof_P /= np.clip(oof_P.sum(axis=1, keepdims=True), 1e-12, None)
    return oof_P

oof_P = build_oof_priors(lambda_shrink)


best_tau = (1.0, 1.0, 1.0)
best_score = None
for tau_hi in [1.0, 1.2, 1.5, 2.0, 2.5, 3.0, 4.0]:
    for tau_mid in [1.0, 1.1, 1.3, 1.6, 2.0]:
        for tau_lo in [1.0, 0.9, 0.8]:
            P_t = apply_temperatures(oof_P, tau_lo, tau_mid, tau_hi)
            y_pred = P_t.argmax(axis=1)
            mae = np.mean(np.abs(y_true - y_pred))
            k = qwk(y_true, y_pred)
            score = (0.8 * (-mae)) + (0.2 * k)
            if (best_score is None) or (score > best_score):
                best_score = score
                best_tau = (tau_lo, tau_mid, tau_hi)

print("Chosen temperatures:", best_tau, "objective:", best_score)


def apply_tilt(P, eps=0.02):
    P = P.copy()
    take = np.minimum(P[:,10], eps)
    P[:,10] -= take
    P[:,9] += take * 0.6
    P[:,8] += take * 0.4
    P /= np.clip(P.sum(axis=1, keepdims=True), 1e-12, None)
    return P

P_cal = apply_temperatures(oof_P, *best_tau)
if best_tau == (1.0, 1.0, 1.0):
    P_cal = apply_tilt(P_cal, eps=0.02)

ev_cal = expected_value(P_cal)
ent_cal = entropy(P_cal)

X_res = np.column_stack([ev_cal, ent_cal, P_cal])
X_res = np.nan_to_num(X_res, nan=0.0, posinf=1e6, neginf=-1e6)
y_res = y_true - np.round(ev_cal)
y_res = np.nan_to_num(y_res, nan=0.0)

res_model = RidgeCV(alphas=[0.1, 1.0, 5.0, 10.0], fit_intercept=True)
res_model.fit(X_res, y_res)

# Final priors on full train with same lambda
counts_full = df_tr.groupby(["metric_name","score"]).size().unstack(fill_value=0)
for c in range(K):
    if c not in counts_full.columns:
        counts_full[c] = 0
counts_full = counts_full[sorted(counts_full.columns)]
n_m_full = counts_full.sum(axis=1).astype(float)
global_counts_full = counts_full.sum(axis=0).astype(float)
global_prior_full = (global_counts_full + alpha) / (global_counts_full.sum() + alpha*K)
p_emp_full = (counts_full + alpha).div((counts_full + alpha).sum(axis=1), axis=0)
w_m_full = (n_m_full / (n_m_full + lambda_shrink)).clip(0.0, 1.0)
p_m_full = p_emp_full.mul(w_m_full, axis=0).add(global_prior_full, axis=1).sub(global_prior_full.mul(w_m_full, axis=0), axis=0)

metric_prior = {m: p_m_full.loc[m].values.astype(np.float32) for m in p_m_full.index}
global_prior_vec = global_prior_full.values.astype(np.float32)

def predict_row(r):
    # Duplicate fill
    h = r["combo_hash"]
    if h in dup_cache:
        return int(dup_cache[h])
    # Metric prior
    m = r["metric_name"]
    P = metric_prior.get(m, global_prior_vec)
    P = P[None, :]
    P = apply_temperatures(P, *best_tau)
    if best_tau == (1.0, 1.0, 1.0):
        P = apply_tilt(P, eps=0.02)
    P = np.nan_to_num(P, nan=1.0/K)
    P = P / np.clip(P.sum(axis=1, keepdims=True), 1e-12, None)
    ev = float((P * idxs[None,:]).sum(axis=1)[0])
    ent = float(entropy(P)[0])
    x = np.concatenate([[ev, ent], P[0]])
    x = np.nan_to_num(x, nan=0.0, posinf=1e6, neginf=-1e6)
    delta = float(res_model.predict(x[None, :])[0])
    delta = float(np.clip(delta, -1.0, 0.5))
    pred = np.round(ev + delta)
    if not np.isfinite(pred):
        pred = np.round(ev)  # fallback
    pred = int(np.clip(pred, 0, 10))
    return pred

df_te = df_te.copy()
preds = [predict_row(r) for _, r in df_te.iterrows()]
submission = pd.DataFrame({"row_id": np.arange(len(df_te)), "score": preds})
print("Pred distribution:", submission["score"].value_counts().sort_index().to_dict())
submission.to_csv("submission_calibrated_shrunk_residual_safe.csv", index=False)
print("Wrote submission_calibrated_shrunk_residual_safe.csv")


Chosen temperatures: (1.0, 1.0, 1.2) objective: -0.36608
Pred distribution: {0: 2, 1: 7, 6: 1482, 7: 70, 8: 76, 9: 1336, 10: 665}
Wrote submission_calibrated_shrunk_residual_safe.csv


In [None]:
import pandas as pd

# Load  last predictions
preds_df = pd.read_csv("submission_calibrated_shrunk_residual_safe.csv")

# Read sample to get exact column names and order
sample = pd.read_csv("sample_submission.csv")
sample_cols = list(sample.columns)
print("Sample columns:", sample_cols)

# Create a new DataFrame with the same columns
out = pd.DataFrame(columns=sample_cols)
out[sample_cols[0]] = range(len(preds_df))  # ID column

score_col_in_sample = sample_cols[1]
if "score" in preds_df.columns:
    out[score_col_in_sample] = preds_df["score"].astype(int)
else:
  
    pred_col = [c for c in preds_df.columns if c.lower() != sample_cols[0].lower()][0]
    out[score_col_in_sample] = preds_df[pred_col].astype(int)

# Save with a new name
out.to_csv("submission_final.csv", index=False)
print("Wrote submission_final.csv with columns:", list(out.columns))
print(out.head())


Sample columns: ['ID', 'score']
Wrote submission_final.csv with columns: ['ID', 'score']
   ID  score
0   0      9
1   1      9
2   2      9
3   3      9
4   4      6


In [None]:
import pandas as pd

# Load  latest predictions 
preds_df = pd.read_csv("submission_calibrated_shrunk_residual_safe.csv")

# Load sample to enforce exact header and order
sample = pd.read_csv("sample_submission.csv")
sample_cols = list(sample.columns)
print("Sample columns:", sample_cols)

# Build output with correct columns
out = pd.DataFrame(columns=sample_cols)

# Set ID from 1..N h)
N = len(preds_df)
out[sample_cols[0]] = range(1, N + 1)

# Map scores to the sample's score column name
score_col_in_sample = sample_cols[1]
if "score" in preds_df.columns:
    out[score_col_in_sample] = preds_df["score"].astype(int)
else:
    
    pred_col = [c for c in preds_df.columns if c.lower() != "id" and c.lower() != sample_cols[0].lower()][0]
    out[score_col_in_sample] = preds_df[pred_col].astype(int)

#  sanity checks
assert out[sample_cols[0]].iloc[0] == 1, "ID should start at 1"
assert out[sample_cols[0]].iloc[-1] == N, f"ID should end at {N}"
print("ID range:", out[sample_cols[0]].min(), "to", out[sample_cols[0]].max())

# Save final submission
out.to_csv("submission_final.csv", index=False)
print("Wrote submission_final.csv with columns:", list(out.columns), "and", len(out), "rows")


Sample columns: ['ID', 'score']
ID range: 1 to 3638
Wrote submission_final.csv with columns: ['ID', 'score'] and 3638 rows
