## Setup, paths, helpers

In [1]:
# ============================================================
# 0) SETUP: imports, config, paths
#    Ensembles XLM-R (native) + DeBERTa (MT→EN)
#    using calibrated probabilities from CV notebooks
# ============================================================

import os, json, warnings
from pathlib import Path

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

# ------------------------------------------------------------
# Language + data root
# ------------------------------------------------------------
LANG = "eng"  # language code for this run, e.g. "eng", "ben", "hin"
BASE = "../dev_phase_aug"  # organizer data root

# Per-subtask train/dev CSVs; read later for the id order and the gold labels.
T1_TRAIN, T1_DEV = (f"{BASE}/subtask1/{split}/{LANG}.csv" for split in ("train", "dev"))
T2_TRAIN, T2_DEV = (f"{BASE}/subtask2/{split}/{LANG}.csv" for split in ("train", "dev"))
T3_TRAIN, T3_DEV = (f"{BASE}/subtask3/{split}/{LANG}.csv" for split in ("train", "dev"))

# ------------------------------------------------------------
# Locations of the per-model ("method2") CV runs, one folder per language:
#   cache/...     -> probability CSVs read by the ensemble cells
#   artifacts/... -> calibration JSONs (per-model thresholds)
# ------------------------------------------------------------
XLMR_CACHE_ROOT = Path("cache", "xlmr_cv", LANG)
DEBERTA_CACHE_ROOT = Path("cache", "deberta_cv", LANG)

XLMR_ART_ROOT = Path("artifacts", "xlmr_cv", LANG)
DEBERTA_ART_ROOT = Path("artifacts", "deberta_cv", LANG)

# Where this notebook writes: ensemble calibration/debug files and submissions.
OUT_ROOT = Path("outputs", "ensemble_cv", LANG)
SUB_ROOT = Path("submissions", "ensemble_cv")

# Create every output folder up front, including one submission subfolder
# per subtask; existing folders are left as they are.
for out_dir in (
    OUT_ROOT,
    SUB_ROOT,
    SUB_ROOT / "subtask_1",
    SUB_ROOT / "subtask_2",
    SUB_ROOT / "subtask_3",
):
    out_dir.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Label orders (same as training notebooks); the column order of every
# probability/prediction matrix built below follows these lists.
# ------------------------------------------------------------
T2_LABELS = [
    "gender/sexual",
    "political",
    "religious",
    "racial/ethnic",
    "other",
]
T3_LABELS = [
    "vilification",
    "extreme_language",
    "stereotype",
    "invalidation",
    "lack_of_empathy",
    "dehumanization",
]

# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
def macro_f1(y_true, y_pred):
    """Return the macro-averaged F1 of ``y_pred`` against ``y_true``.

    ``zero_division=0`` is passed to ``f1_score`` so that an undefined F1
    is reported as 0 instead of triggering a warning.
    """
    score = f1_score(y_true, y_pred, zero_division=0, average="macro")
    return score

def grid_search_thresholds(y_true, y_prob, label_names=None):
    """
    For multi-label: per-label threshold search.

    Each label column is tuned independently: every threshold on a fixed grid
    (0.05 .. 0.95, 19 points) is tried and the one with the highest binary F1
    for that column is kept. On ties the lowest threshold wins, because the
    grid is scanned in ascending order and only a strictly better F1 replaces
    the current best.

    y_true: [N, C] 0/1 labels.
    y_prob: [N, C] scores, same column order as y_true.
    label_names: optional sequence of names (list, tuple, numpy array,
        pandas Index, ...) used as the keys of the result. When it is None
        or empty, the column index as a string is used instead.

    Returns a dict mapping label name -> best threshold (float).
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    C = y_true.shape[1]
    grid = np.linspace(0.05, 0.95, 19)

    # Explicit None/length test instead of truthiness: `if label_names` raises
    # ValueError for numpy arrays and pandas Index objects.
    has_names = label_names is not None and len(label_names) > 0

    thrs = {}
    for c in range(C):
        # best_f starts below any reachable F1 so the first grid point is
        # always accepted; 0.5 only survives if the grid were empty.
        best_t, best_f = 0.5, -1.0
        for t in grid:
            preds = (y_prob[:, c] >= t).astype(int)
            f = f1_score(y_true[:, c], preds, average="binary", zero_division=0)
            if f > best_f:
                best_f, best_t = f, t
        name = label_names[c] if has_names else str(c)
        thrs[name] = float(best_t)
    return thrs

def load_json(path):
    """Read the JSON file at ``path`` and return the decoded Python object.

    The file is opened explicitly as UTF-8 rather than with the platform's
    locale encoding, so a file containing non-ASCII text (e.g. label names)
    decodes the same way on every machine.

    path: str or path-like location of the JSON file.
    Raises whatever ``open`` / ``json.load`` raise for a missing file or
    malformed JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# Echo the active configuration so the cell output records which language
# and which folders this run used.
print(f"LANG={LANG}")
for caption, root in (
    ("XLM-R cache root   :", XLMR_CACHE_ROOT),
    ("DeBERTa cache root :", DEBERTA_CACHE_ROOT),
    ("Outputs root       :", OUT_ROOT),
    ("Submissions root   :", SUB_ROOT),
):
    print(caption, root)


LANG=eng
XLM-R cache root   : cache/xlmr_cv/eng
DeBERTa cache root : cache/deberta_cv/eng
Outputs root       : outputs/ensemble_cv/eng
Submissions root   : submissions/ensemble_cv


## Subtask 1 ensemble (binary)

In [2]:
# ============================================================
# 1) SUBTASK 1 — Polarization (binary) ENSEMBLE
#    Uses calibrated probs from:
#      - cache/xlmr_cv/{LANG}/t1_train_probs.csv
#      - cache/deberta_cv/{LANG}/t1_train_probs.csv
# ============================================================

# 1.1 Load train/dev CSVs: they supply the gold labels and the row order
#     that every array below is aligned to.
t1_train_df = pd.read_csv(T1_TRAIN)
t1_dev_df   = pd.read_csv(T1_DEV)

required_train_cols_t1 = {"id", "text", "polarization"}
required_dev_cols_t1   = {"id", "text"}
assert required_train_cols_t1.issubset(t1_train_df.columns), \
    f"T1 TRAIN missing: {required_train_cols_t1 - set(t1_train_df.columns)}"
assert required_dev_cols_t1.issubset(t1_dev_df.columns), \
    f"T1 DEV missing: {required_dev_cols_t1 - set(t1_dev_df.columns)}"

# ids are normalised to str on both sides of every merge so the join does not
# depend on how read_csv inferred the column dtype.
t1_train_df["id"] = t1_train_df["id"].astype(str)
t1_dev_df["id"]   = t1_dev_df["id"].astype(str)
t1_train_df["polarization"] = t1_train_df["polarization"].astype(int)
y_true_t1 = t1_train_df["polarization"].values

print("[T1] TRAIN size:", len(t1_train_df))
print("[T1] DEV size  :", len(t1_dev_df))

# 1.2 Load prob caches (one CSV per model)
# NOTE(review): presumably each cache has an `id` column and a `prob_pos`
# column — confirm against the CV notebooks that write them.
t1_train_x = pd.read_csv(XLMR_CACHE_ROOT   / "t1_train_probs.csv")
t1_train_d = pd.read_csv(DEBERTA_CACHE_ROOT / "t1_train_probs.csv")

t1_train_x["id"] = t1_train_x["id"].astype(str)
t1_train_d["id"] = t1_train_d["id"].astype(str)

# Left joins keep the train CSV's rows and order. Suffixes are only applied
# to column names present on both sides, so the prob columns become
# `*_xlmr` / `*_deberta` at the second merge.
t1_train_merged = (
    t1_train_df[["id", "polarization"]]
    .merge(t1_train_x, on="id", how="left", suffixes=("", "_xlmr"))
    .merge(t1_train_d, on="id", how="left", suffixes=("_xlmr", "_deberta"))
)

# y_true_t1 comes from the unmerged frame, so the merged frame must have
# exactly the same rows: a duplicated id in a cache would add rows here and
# shift every probability relative to its label.
assert len(t1_train_merged) == len(t1_train_df), \
    "T1 TRAIN: merge changed the row count (duplicate ids in a prob cache?)"

assert t1_train_merged["prob_pos_xlmr"].notna().all(), "Missing XLM-R probs for some train rows"
assert t1_train_merged["prob_pos_deberta"].notna().all(), "Missing DeBERTa probs for some train rows"

# 1.3 Load calibration thresholds for individual models (optional, for metrics)
# NOTE(review): the notebook header describes DeBERTa as MT→EN, yet its
# calibration file is also named "native" — confirm this is the intended file.
cal_t1_x = load_json(XLMR_ART_ROOT    / "calib_t1_native.json")
cal_t1_d = load_json(DEBERTA_ART_ROOT / "calib_t1_native.json")

thr_t1_x = float(cal_t1_x["threshold"])
thr_t1_d = float(cal_t1_d["threshold"])

p_x = t1_train_merged["prob_pos_xlmr"].values
p_d = t1_train_merged["prob_pos_deberta"].values
p_ens = 0.5 * (p_x + p_d)  # unweighted mean of the two models

# 1.4 Metrics: XLM-R vs DeBERTa vs Ensemble
pred_x = (p_x >= thr_t1_x).astype(int)
pred_d = (p_d >= thr_t1_d).astype(int)

# Ensemble threshold: best macro-F1 on a 0.05..0.95 grid (19 points). Only a
# strictly better score replaces the current best, so ties keep the lowest
# threshold.
# NOTE(review): the threshold is tuned and scored on the same train rows;
# the reported ensemble F1 is only a fair estimate if the cached train probs
# are out-of-fold — confirm in the CV notebooks.
best_thr_ens, best_f1_ens = 0.5, -1.0
for t in np.linspace(0.05, 0.95, 19):
    pred_e = (p_ens >= t).astype(int)
    f = macro_f1(y_true_t1, pred_e)
    if f > best_f1_ens:
        best_f1_ens, best_thr_ens = f, t

print("\n[T1] TRAIN macro-F1 comparison:")
print("  XLM-R    @thr_native :", macro_f1(y_true_t1, pred_x))
print("  DeBERTa  @thr_native :", macro_f1(y_true_t1, pred_d))
print(f"  Ensemble @thr_ens={best_thr_ens:.2f} :", best_f1_ens)

# 1.5 Save ensemble calibration info
calib_t1_ens = {
    "threshold": float(best_thr_ens),
    "note": "Threshold chosen on train using averaged calibrated probs (XLM-R + DeBERTa).",
}
with open(OUT_ROOT / "calib_t1_ensemble.json", "w") as f:
    json.dump(calib_t1_ens, f, indent=2)

# 1.6 Debug dump (optional): per-row probabilities of both models and the ensemble
t1_debug = t1_train_merged[["id", "polarization", "prob_pos_xlmr", "prob_pos_deberta"]].copy()
t1_debug["prob_pos_ens"] = p_ens
t1_debug.to_excel(OUT_ROOT / "t1_train_ensemble_debug.xlsx", index=False)
print("Saved T1 debug ensemble file:", OUT_ROOT / "t1_train_ensemble_debug.xlsx")

# 1.7 Ensemble predictions for DEV (same merge scheme as for train)
t1_dev_x = pd.read_csv(XLMR_CACHE_ROOT / "t1_dev_probs.csv")
t1_dev_d = pd.read_csv(DEBERTA_CACHE_ROOT / "t1_dev_probs.csv")

t1_dev_x["id"] = t1_dev_x["id"].astype(str)
t1_dev_d["id"] = t1_dev_d["id"].astype(str)

t1_dev_merged = (
    t1_dev_df[["id"]]
    .merge(t1_dev_x, on="id", how="left", suffixes=("", "_xlmr"))
    .merge(t1_dev_d, on="id", how="left", suffixes=("_xlmr", "_deberta"))
)

# The submission must contain each dev row exactly once; a duplicated id in
# a cache would otherwise silently duplicate rows in the CSV.
assert len(t1_dev_merged) == len(t1_dev_df), \
    "T1 DEV: merge changed the row count (duplicate ids in a prob cache?)"

assert t1_dev_merged["prob_pos_xlmr"].notna().all(), "Missing XLM-R probs for some dev rows"
assert t1_dev_merged["prob_pos_deberta"].notna().all(), "Missing DeBERTa probs for some dev rows"

p_x_dev = t1_dev_merged["prob_pos_xlmr"].values
p_d_dev = t1_dev_merged["prob_pos_deberta"].values
p_ens_dev = 0.5 * (p_x_dev + p_d_dev)
pred_ens_dev = (p_ens_dev >= best_thr_ens).astype(int)  # threshold tuned on train above

# 1.8 Codabench submission CSV
sub1 = pd.DataFrame({
    "id": t1_dev_merged["id"].astype(str),
    "polarization": pred_ens_dev.astype(int),
})
sub1_path = SUB_ROOT / "subtask_1" / f"pred_{LANG}.csv"
sub1.to_csv(sub1_path, index=False)
print("Saved Subtask 1 ensemble submission:", sub1_path)


[T1] TRAIN size: 5572
[T1] DEV size  : 160

[T1] TRAIN macro-F1 comparison:
  XLM-R    @thr_native : 0.9321058071750241
  DeBERTa  @thr_native : 0.9833770214149999
  Ensemble @thr_ens=0.50 : 0.975692726750826
Saved T1 debug ensemble file: outputs/ensemble_cv/eng/t1_train_ensemble_debug.xlsx
Saved Subtask 1 ensemble submission: submissions/ensemble_cv/subtask_1/pred_eng.csv


## Subtask 2 ensemble (multi-label, 5 labels)

In [3]:
# ============================================================
# 2) SUBTASK 2 — Hate type (5 labels) ENSEMBLE
# ============================================================

# 2.1 Load train + dev CSVs: they supply the gold labels and the row order
#     that every matrix below is aligned to.
t2_train_df = pd.read_csv(T2_TRAIN)
t2_dev_df   = pd.read_csv(T2_DEV)

required_train_cols_t2 = {"id", "text", *T2_LABELS}
required_dev_cols_t2   = {"id", "text"}
assert required_train_cols_t2.issubset(t2_train_df.columns), \
    f"T2 TRAIN missing: {required_train_cols_t2 - set(t2_train_df.columns)}"
assert required_dev_cols_t2.issubset(t2_dev_df.columns), \
    f"T2 DEV missing: {required_dev_cols_t2 - set(t2_dev_df.columns)}"

# ids are normalised to str on both sides of every merge so the join does not
# depend on how read_csv inferred the column dtype.
t2_train_df["id"] = t2_train_df["id"].astype(str)
t2_dev_df["id"]   = t2_dev_df["id"].astype(str)

# Gold label matrix [N, C]; columns follow T2_LABELS.
Y2_true = t2_train_df[T2_LABELS].values.astype(int)

print("[T2] TRAIN size:", len(t2_train_df))
print("[T2] DEV size  :", len(t2_dev_df))

# 2.2 Load prob caches (one CSV per model)
# NOTE(review): presumably each cache has an `id` column plus one
# `prob_<label>` column per T2 label — confirm against the CV notebooks.
t2_train_x = pd.read_csv(XLMR_CACHE_ROOT   / "t2_train_probs.csv")
t2_train_d = pd.read_csv(DEBERTA_CACHE_ROOT / "t2_train_probs.csv")

t2_train_x["id"] = t2_train_x["id"].astype(str)
t2_train_d["id"] = t2_train_d["id"].astype(str)

# Left joins keep the train CSV's rows and order. Suffixes are only applied
# to column names present on both sides, so the prob columns become
# `*_xlmr` / `*_deberta` at the second merge.
t2_train_merged = (
    t2_train_df[["id"]]
    .merge(t2_train_x, on="id", how="left", suffixes=("", "_xlmr"))
    .merge(t2_train_d, on="id", how="left", suffixes=("_xlmr", "_deberta"))
)

# Y2_true comes from the unmerged frame, so the merged frame must have
# exactly the same rows: a duplicated id in a cache would add rows here and
# shift every probability relative to its labels.
assert len(t2_train_merged) == len(t2_train_df), \
    "T2 TRAIN: merge changed the row count (duplicate ids in a prob cache?)"

# sanity check presence
for lab in T2_LABELS:
    assert t2_train_merged[f"prob_{lab}_xlmr"].notna().all(), f"Missing XLM-R prob_{lab}"
    assert t2_train_merged[f"prob_{lab}_deberta"].notna().all(), f"Missing DeBERTa prob_{lab}"

# 2.3 Load individual model calibration thresholds (label name -> threshold)
# NOTE(review): the notebook header describes DeBERTa as MT→EN, yet its
# calibration file is also named "native" — confirm this is the intended file.
cal_t2_x = load_json(XLMR_ART_ROOT    / "calib_t2_native.json")
cal_t2_d = load_json(DEBERTA_ART_ROOT / "calib_t2_native.json")
thr_map_t2_x = cal_t2_x["thresholds"]
thr_map_t2_d = cal_t2_d["thresholds"]

# 2.4 Build prob matrices [N, C] (float32), columns in T2_LABELS order
N = len(t2_train_merged)
C = len(T2_LABELS)
P_x_train   = np.zeros((N, C), dtype=np.float32)
P_d_train   = np.zeros((N, C), dtype=np.float32)
P_ens_train = np.zeros((N, C), dtype=np.float32)

for j, lab in enumerate(T2_LABELS):
    P_x_train[:, j]   = t2_train_merged[f"prob_{lab}_xlmr"].values
    P_d_train[:, j]   = t2_train_merged[f"prob_{lab}_deberta"].values
    P_ens_train[:, j] = 0.5 * (P_x_train[:, j] + P_d_train[:, j])  # unweighted mean

# 2.5 XLM-R, DeBERTa, Ensemble F1
# Each single model is binarised with its own per-label thresholds.
P2_x = np.zeros_like(P_x_train, dtype=int)
P2_d = np.zeros_like(P_d_train, dtype=int)

for j, lab in enumerate(T2_LABELS):
    thr_x = float(thr_map_t2_x[lab])
    thr_d = float(thr_map_t2_d[lab])
    P2_x[:, j] = (P_x_train[:, j] >= thr_x).astype(int)
    P2_d[:, j] = (P_d_train[:, j] >= thr_d).astype(int)

f1_x = f1_score(Y2_true, P2_x, average="macro", zero_division=0)
f1_d = f1_score(Y2_true, P2_d, average="macro", zero_division=0)

# ensemble thresholds via grid search on P_ens_train
# NOTE(review): thresholds are tuned and scored on the same train rows; the
# ensemble F1 printed below is only a fair estimate if the cached train probs
# are out-of-fold — confirm in the CV notebooks.
thr_map_t2_ens = grid_search_thresholds(Y2_true, P_ens_train, T2_LABELS)

P2_ens = np.zeros_like(P_ens_train, dtype=int)
for j, lab in enumerate(T2_LABELS):
    thr_e = float(thr_map_t2_ens[lab])
    P2_ens[:, j] = (P_ens_train[:, j] >= thr_e).astype(int)

f1_ens = f1_score(Y2_true, P2_ens, average="macro", zero_division=0)

print("\n[T2] TRAIN macro-F1 comparison:")
print("  XLM-R    (calibrated) :", f1_x)
print("  DeBERTa  (calibrated) :", f1_d)
print("  Ensemble (calibrated) :", f1_ens)
print("  Ensemble thresholds   :", thr_map_t2_ens)

# 2.6 Save ensemble calibration info
with open(OUT_ROOT / "calib_t2_ensemble.json", "w") as f:
    json.dump({"thresholds": thr_map_t2_ens}, f, indent=2)

# 2.7 Debug Excel: per-row probabilities of both models and the ensemble
t2_debug_cols = {"id": t2_train_merged["id"].astype(str).values}
for j, lab in enumerate(T2_LABELS):
    t2_debug_cols[f"prob_{lab}_xlmr"]   = P_x_train[:, j]
    t2_debug_cols[f"prob_{lab}_deberta"] = P_d_train[:, j]
    t2_debug_cols[f"prob_{lab}_ens"]    = P_ens_train[:, j]
t2_debug = pd.DataFrame(t2_debug_cols)
t2_debug.to_excel(OUT_ROOT / "t2_train_ensemble_debug.xlsx", index=False)
print("Saved T2 debug ensemble file:", OUT_ROOT / "t2_train_ensemble_debug.xlsx")

# 2.8 Ensemble predictions for DEV (same merge scheme as for train)
t2_dev_x = pd.read_csv(XLMR_CACHE_ROOT   / "t2_dev_probs.csv")
t2_dev_d = pd.read_csv(DEBERTA_CACHE_ROOT / "t2_dev_probs.csv")

t2_dev_x["id"] = t2_dev_x["id"].astype(str)
t2_dev_d["id"] = t2_dev_d["id"].astype(str)

t2_dev_merged = (
    t2_dev_df[["id"]]
    .merge(t2_dev_x, on="id", how="left", suffixes=("", "_xlmr"))
    .merge(t2_dev_d, on="id", how="left", suffixes=("_xlmr", "_deberta"))
)

# The submission must contain each dev row exactly once; a duplicated id in
# a cache would otherwise silently duplicate rows in the CSV.
assert len(t2_dev_merged) == len(t2_dev_df), \
    "T2 DEV: merge changed the row count (duplicate ids in a prob cache?)"

N_dev = len(t2_dev_merged)
P_ens_dev = np.zeros((N_dev, C), dtype=np.float32)

for j, lab in enumerate(T2_LABELS):
    px = t2_dev_merged[f"prob_{lab}_xlmr"].values
    pd_ = t2_dev_merged[f"prob_{lab}_deberta"].values  # trailing underscore: `pd` is pandas
    # A dev id absent from a cache shows up as NaN after the left join.
    assert np.isfinite(px).all(), f"Missing XLM-R dev probs for label {lab}"
    assert np.isfinite(pd_).all(), f"Missing DeBERTa dev probs for label {lab}"
    P_ens_dev[:, j] = 0.5 * (px + pd_)

# Binarise dev with the ensemble thresholds tuned on train above.
P2_dev = np.zeros_like(P_ens_dev, dtype=int)
for j, lab in enumerate(T2_LABELS):
    thr_e = float(thr_map_t2_ens[lab])
    P2_dev[:, j] = (P_ens_dev[:, j] >= thr_e).astype(int)

# 2.9 Codabench submission CSV (required header order)
# The submission column order differs from T2_LABELS, so each column is
# looked up by label name rather than by position.
idx_gender    = T2_LABELS.index("gender/sexual")
idx_political = T2_LABELS.index("political")
idx_religious = T2_LABELS.index("religious")
idx_racial    = T2_LABELS.index("racial/ethnic")
idx_other     = T2_LABELS.index("other")

sub2 = pd.DataFrame({
    "id":            t2_dev_merged["id"].astype(str).values,
    "political":     P2_dev[:, idx_political],
    "racial/ethnic": P2_dev[:, idx_racial],
    "religious":     P2_dev[:, idx_religious],
    "gender/sexual": P2_dev[:, idx_gender],
    "other":         P2_dev[:, idx_other],
})
sub2_path = SUB_ROOT / "subtask_2" / f"pred_{LANG}.csv"
sub2.to_csv(sub2_path, index=False)
print("Saved Subtask 2 ensemble submission:", sub2_path)


[T2] TRAIN size: 5572
[T2] DEV size  : 160

[T2] TRAIN macro-F1 comparison:
  XLM-R    (calibrated) : 0.7591862707286214
  DeBERTa  (calibrated) : 0.7879986535787913
  Ensemble (calibrated) : 0.8157219982846303
  Ensemble thresholds   : {'gender/sexual': 0.75, 'political': 0.44999999999999996, 'religious': 0.75, 'racial/ethnic': 0.7, 'other': 0.75}
Saved T2 debug ensemble file: outputs/ensemble_cv/eng/t2_train_ensemble_debug.xlsx
Saved Subtask 2 ensemble submission: submissions/ensemble_cv/subtask_2/pred_eng.csv


## Subtask 3 ensemble (multi-label, 6 labels)

In [4]:
# ============================================================
# 3) SUBTASK 3 — Manifestation (6 labels) ENSEMBLE
# ============================================================

# 3.1 Load train + dev CSVs: they supply the gold labels and the row order
#     that every matrix below is aligned to.
t3_train_df = pd.read_csv(T3_TRAIN)
t3_dev_df   = pd.read_csv(T3_DEV)

required_train_cols_t3 = {"id", "text", *T3_LABELS}
required_dev_cols_t3   = {"id", "text"}
assert required_train_cols_t3.issubset(t3_train_df.columns), \
    f"T3 TRAIN missing: {required_train_cols_t3 - set(t3_train_df.columns)}"
assert required_dev_cols_t3.issubset(t3_dev_df.columns), \
    f"T3 DEV missing: {required_dev_cols_t3 - set(t3_dev_df.columns)}"

# ids are normalised to str on both sides of every merge so the join does not
# depend on how read_csv inferred the column dtype.
t3_train_df["id"] = t3_train_df["id"].astype(str)
t3_dev_df["id"]   = t3_dev_df["id"].astype(str)

# Gold label matrix [N, C]; columns follow T3_LABELS.
Y3_true = t3_train_df[T3_LABELS].values.astype(int)

print("[T3] TRAIN size:", len(t3_train_df))
print("[T3] DEV size  :", len(t3_dev_df))

# 3.2 Load prob caches (one CSV per model)
# NOTE(review): presumably each cache has an `id` column plus one
# `prob_<label>` column per T3 label — confirm against the CV notebooks.
t3_train_x = pd.read_csv(XLMR_CACHE_ROOT   / "t3_train_probs.csv")
t3_train_d = pd.read_csv(DEBERTA_CACHE_ROOT / "t3_train_probs.csv")

t3_train_x["id"] = t3_train_x["id"].astype(str)
t3_train_d["id"] = t3_train_d["id"].astype(str)

# Left joins keep the train CSV's rows and order. Suffixes are only applied
# to column names present on both sides, so the prob columns become
# `*_xlmr` / `*_deberta` at the second merge.
t3_train_merged = (
    t3_train_df[["id"]]
    .merge(t3_train_x, on="id", how="left", suffixes=("", "_xlmr"))
    .merge(t3_train_d, on="id", how="left", suffixes=("_xlmr", "_deberta"))
)

# Y3_true comes from the unmerged frame, so the merged frame must have
# exactly the same rows: a duplicated id in a cache would add rows here and
# shift every probability relative to its labels.
assert len(t3_train_merged) == len(t3_train_df), \
    "T3 TRAIN: merge changed the row count (duplicate ids in a prob cache?)"

for lab in T3_LABELS:
    assert t3_train_merged[f"prob_{lab}_xlmr"].notna().all(), f"Missing XLM-R prob_{lab}"
    assert t3_train_merged[f"prob_{lab}_deberta"].notna().all(), f"Missing DeBERTa prob_{lab}"

# 3.3 Load individual model calibration thresholds (label name -> threshold)
# NOTE(review): the notebook header describes DeBERTa as MT→EN, yet its
# calibration file is also named "native" — confirm this is the intended file.
cal_t3_x = load_json(XLMR_ART_ROOT    / "calib_t3_native.json")
cal_t3_d = load_json(DEBERTA_ART_ROOT / "calib_t3_native.json")
thr_map_t3_x = cal_t3_x["thresholds"]
thr_map_t3_d = cal_t3_d["thresholds"]

# 3.4 Build prob matrices [N, C] (float32), columns in T3_LABELS order
N = len(t3_train_merged)
C = len(T3_LABELS)
P_x_train   = np.zeros((N, C), dtype=np.float32)
P_d_train   = np.zeros((N, C), dtype=np.float32)
P_ens_train = np.zeros((N, C), dtype=np.float32)

for j, lab in enumerate(T3_LABELS):
    P_x_train[:, j]   = t3_train_merged[f"prob_{lab}_xlmr"].values
    P_d_train[:, j]   = t3_train_merged[f"prob_{lab}_deberta"].values
    P_ens_train[:, j] = 0.5 * (P_x_train[:, j] + P_d_train[:, j])  # unweighted mean

# 3.5 XLM-R, DeBERTa, Ensemble F1
# Each single model is binarised with its own per-label thresholds.
P3_x = np.zeros_like(P_x_train, dtype=int)
P3_d = np.zeros_like(P_d_train, dtype=int)

for j, lab in enumerate(T3_LABELS):
    thr_x = float(thr_map_t3_x[lab])
    thr_d = float(thr_map_t3_d[lab])
    P3_x[:, j] = (P_x_train[:, j] >= thr_x).astype(int)
    P3_d[:, j] = (P_d_train[:, j] >= thr_d).astype(int)

f1_x = f1_score(Y3_true, P3_x, average="macro", zero_division=0)
f1_d = f1_score(Y3_true, P3_d, average="macro", zero_division=0)

# ensemble thresholds via grid search
# NOTE(review): thresholds are tuned and scored on the same train rows; the
# ensemble F1 printed below is only a fair estimate if the cached train probs
# are out-of-fold — confirm in the CV notebooks.
thr_map_t3_ens = grid_search_thresholds(Y3_true, P_ens_train, T3_LABELS)

P3_ens = np.zeros_like(P_ens_train, dtype=int)
for j, lab in enumerate(T3_LABELS):
    thr_e = float(thr_map_t3_ens[lab])
    P3_ens[:, j] = (P_ens_train[:, j] >= thr_e).astype(int)

f1_ens = f1_score(Y3_true, P3_ens, average="macro", zero_division=0)

print("\n[T3] TRAIN macro-F1 comparison:")
print("  XLM-R    (calibrated) :", f1_x)
print("  DeBERTa  (calibrated) :", f1_d)
print("  Ensemble (calibrated) :", f1_ens)
print("  Ensemble thresholds   :", thr_map_t3_ens)

# 3.6 Save ensemble calibration info
with open(OUT_ROOT / "calib_t3_ensemble.json", "w") as f:
    json.dump({"thresholds": thr_map_t3_ens}, f, indent=2)

# 3.7 Debug Excel: per-row probabilities of both models and the ensemble
t3_debug_cols = {"id": t3_train_merged["id"].astype(str).values}
for j, lab in enumerate(T3_LABELS):
    t3_debug_cols[f"prob_{lab}_xlmr"]    = P_x_train[:, j]
    t3_debug_cols[f"prob_{lab}_deberta"] = P_d_train[:, j]
    t3_debug_cols[f"prob_{lab}_ens"]     = P_ens_train[:, j]
t3_debug = pd.DataFrame(t3_debug_cols)
t3_debug.to_excel(OUT_ROOT / "t3_train_ensemble_debug.xlsx", index=False)
print("Saved T3 debug ensemble file:", OUT_ROOT / "t3_train_ensemble_debug.xlsx")

# 3.8 Ensemble predictions for DEV (same merge scheme as for train)
t3_dev_x = pd.read_csv(XLMR_CACHE_ROOT   / "t3_dev_probs.csv")
t3_dev_d = pd.read_csv(DEBERTA_CACHE_ROOT / "t3_dev_probs.csv")

t3_dev_x["id"] = t3_dev_x["id"].astype(str)
t3_dev_d["id"] = t3_dev_d["id"].astype(str)

t3_dev_merged = (
    t3_dev_df[["id"]]
    .merge(t3_dev_x, on="id", how="left", suffixes=("", "_xlmr"))
    .merge(t3_dev_d, on="id", how="left", suffixes=("_xlmr", "_deberta"))
)

# The submission must contain each dev row exactly once; a duplicated id in
# a cache would otherwise silently duplicate rows in the CSV.
assert len(t3_dev_merged) == len(t3_dev_df), \
    "T3 DEV: merge changed the row count (duplicate ids in a prob cache?)"

N_dev = len(t3_dev_merged)
P_ens_dev = np.zeros((N_dev, C), dtype=np.float32)

for j, lab in enumerate(T3_LABELS):
    px = t3_dev_merged[f"prob_{lab}_xlmr"].values
    pd_ = t3_dev_merged[f"prob_{lab}_deberta"].values  # trailing underscore: `pd` is pandas
    # A dev id absent from a cache shows up as NaN after the left join.
    assert np.isfinite(px).all(), f"Missing XLM-R dev probs for label {lab}"
    assert np.isfinite(pd_).all(), f"Missing DeBERTa dev probs for label {lab}"
    P_ens_dev[:, j] = 0.5 * (px + pd_)

# Binarise dev with the ensemble thresholds tuned on train above.
P3_dev = np.zeros_like(P_ens_dev, dtype=int)
for j, lab in enumerate(T3_LABELS):
    thr_e = float(thr_map_t3_ens[lab])
    P3_dev[:, j] = (P_ens_dev[:, j] >= thr_e).astype(int)

# 3.9 Codabench submission CSV (required header)
#   id,stereotype,vilification,dehumanization,
#   extreme_language,lack_of_empathy,invalidation
# The submission column order differs from T3_LABELS, so each column is
# looked up by label name rather than by position.

idx_vil      = T3_LABELS.index("vilification")
idx_extreme  = T3_LABELS.index("extreme_language")
idx_stereo   = T3_LABELS.index("stereotype")
idx_invalid  = T3_LABELS.index("invalidation")
idx_lackemp  = T3_LABELS.index("lack_of_empathy")
idx_dehum    = T3_LABELS.index("dehumanization")

sub3 = pd.DataFrame({
    "id":               t3_dev_merged["id"].astype(str).values,
    "stereotype":       P3_dev[:, idx_stereo],
    "vilification":     P3_dev[:, idx_vil],
    "dehumanization":   P3_dev[:, idx_dehum],
    "extreme_language": P3_dev[:, idx_extreme],
    "lack_of_empathy":  P3_dev[:, idx_lackemp],
    "invalidation":     P3_dev[:, idx_invalid],
})
sub3_path = SUB_ROOT / "subtask_3" / f"pred_{LANG}.csv"
sub3.to_csv(sub3_path, index=False)
print("Saved Subtask 3 ensemble submission:", sub3_path)


[T3] TRAIN size: 5572
[T3] DEV size  : 160

[T3] TRAIN macro-F1 comparison:
  XLM-R    (calibrated) : 0.6740981764510829
  DeBERTa  (calibrated) : 0.7587176504651337
  Ensemble (calibrated) : 0.7640762401668121
  Ensemble thresholds   : {'vilification': 0.49999999999999994, 'extreme_language': 0.5499999999999999, 'stereotype': 0.6, 'invalidation': 0.5499999999999999, 'lack_of_empathy': 0.65, 'dehumanization': 0.65}
Saved T3 debug ensemble file: outputs/ensemble_cv/eng/t3_train_ensemble_debug.xlsx
Saved Subtask 3 ensemble submission: submissions/ensemble_cv/subtask_3/pred_eng.csv
