## Setup: env, device, config

In [None]:
# ============================================================
# 0) SETUP: environment, device toggle, imports, config
# ============================================================

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["WANDB_DISABLED"] = "true"

import random
import json
import warnings
from pathlib import Path
from typing import List, Dict, Optional

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    pipeline,
)
import transformers
import inspect

print("PyTorch:", torch.__version__)
print("Transformers:", transformers.__version__)

# ------------------------------------------------------------
# Device selection
# ------------------------------------------------------------
RUN_DEVICE = "gpu"   # "gpu" or "cpu"

if RUN_DEVICE.lower() == "gpu" and torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    torch.backends.cudnn.benchmark = True
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    DEVICE = torch.device("cpu")
    torch.backends.cudnn.enabled = False
    torch.set_num_threads(max(1, os.cpu_count() // 2))
    print("Using CPU")

# ------------------------------------------------------------
# Reproducibility
# ------------------------------------------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

# ------------------------------------------------------------
# High-level config
# ------------------------------------------------------------
LANG = "eng"                          # e.g. "eng", "ben", "hin"
BASE = "../dev_phase"                 # root of organizer data
EN_MODEL = "microsoft/deberta-v3-base"

MAX_LEN = 192
EPOCHS = 3
LR = 2e-5

BATCH_TRAIN_GPU = 8
BATCH_TRAIN_CPU = 4
BATCH_EVAL = 8
BATCH_TRAIN = BATCH_TRAIN_GPU if DEVICE.type == "cuda" else BATCH_TRAIN_CPU

WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1
GRAD_ACCUM = 1
N_FOLDS = 3  

print(f"LANG={LANG}, EN_MODEL={EN_MODEL}, EPOCHS={EPOCHS}, LR={LR}, "
      f"BATCH_TRAIN={BATCH_TRAIN}, FOLDS={N_FOLDS}")

# ------------------------------------------------------------
# Paths / dirs
# ------------------------------------------------------------
lang_fname = LANG  # adjust if filenames differ from LANG

# TRAIN + DEV (DEV is UNLABELED)
T1_TRAIN = f"{BASE}/subtask1/train/{lang_fname}.csv"
T1_DEV   = f"{BASE}/subtask1/dev/{lang_fname}.csv"

T2_TRAIN = f"{BASE}/subtask2/train/{lang_fname}.csv"
T2_DEV   = f"{BASE}/subtask2/dev/{lang_fname}.csv"

T3_TRAIN = f"{BASE}/subtask3/train/{lang_fname}.csv"
T3_DEV   = f"{BASE}/subtask3/dev/{lang_fname}.csv"

# roots for this method-2 DeBERTa pipeline
ART_ROOT   = Path("artifacts") / "deberta_cv" / LANG
CACHE_ROOT = Path("cache")     / "deberta_cv" / LANG   # both translation & probs
OUT_ROOT   = Path("outputs")   / "deberta_cv" / LANG
SUB_ROOT   = Path("submissions") / "deberta"

for d in [ART_ROOT, CACHE_ROOT, OUT_ROOT, SUB_ROOT]:
    d.mkdir(parents=True, exist_ok=True)

# Submission subfolders
(SUB_ROOT / "subtask_1").mkdir(parents=True, exist_ok=True)
(SUB_ROOT / "subtask_2").mkdir(parents=True, exist_ok=True)
(SUB_ROOT / "subtask_3").mkdir(parents=True, exist_ok=True)

# Label orders (training view)
T2_LABELS = ["gender/sexual", "political", "religious", "racial/ethnic", "other"]
T3_LABELS = ["vilification", "extreme_language", "stereotype",
             "invalidation", "lack_of_empathy", "dehumanization"]

# ------------------------------------------------------------
# TrainingArguments capability detection
# ------------------------------------------------------------
_TA_PARAMS = inspect.signature(TrainingArguments.__init__).parameters
TRAINER_CAPS = {
    "evaluation_strategy": "evaluation_strategy" in _TA_PARAMS,
    "save_strategy":       "save_strategy" in _TA_PARAMS,
    "warmup_ratio":        "warmup_ratio" in _TA_PARAMS,
    "fp16":                "fp16" in _TA_PARAMS,
    "no_cuda":             "no_cuda" in _TA_PARAMS,
    "use_mps_device":      "use_mps_device" in _TA_PARAMS,
    "report_to":           "report_to" in _TA_PARAMS,
    "grad_accum":          "gradient_accumulation_steps" in _TA_PARAMS,
    "eval_accum":          "eval_accumulation_steps" in _TA_PARAMS,
}

def build_training_args(
    output_dir,
    per_device_train_batch_size,
    per_device_eval_batch_size,
    num_train_epochs,
    learning_rate,
    weight_decay,
    logging_steps=50,
    evaluation="epoch",
    save="no",
    warmup_ratio=WARMUP_RATIO,
    warmup_steps=0,
):
    use_cuda_flag = (DEVICE.type == "cuda")
    kwargs = dict(
        output_dir=str(output_dir),
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        logging_steps=logging_steps,
        dataloader_pin_memory=use_cuda_flag,
        dataloader_num_workers=0,
    )
    if TRAINER_CAPS["evaluation_strategy"]:
        kwargs["evaluation_strategy"] = evaluation
    if TRAINER_CAPS["save_strategy"]:
        kwargs["save_strategy"] = save
    if TRAINER_CAPS["warmup_ratio"]:
        kwargs["warmup_ratio"] = warmup_ratio
    else:
        kwargs["warmup_steps"] = warmup_steps
    if TRAINER_CAPS["fp16"]:
        kwargs["fp16"] = False
    if TRAINER_CAPS["no_cuda"]:
        kwargs["no_cuda"] = not use_cuda_flag
    if TRAINER_CAPS["use_mps_device"]:
        kwargs["use_mps_device"] = False
    if TRAINER_CAPS["report_to"]:
        kwargs["report_to"] = "none"
    if TRAINER_CAPS["grad_accum"]:
        kwargs["gradient_accumulation_steps"] = GRAD_ACCUM
    if TRAINER_CAPS["eval_accum"]:
        kwargs["eval_accumulation_steps"] = 4
    return TrainingArguments(**kwargs)


2025-12-07 16:57:41.613995: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-07 16:57:41.627130: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765155461.640305 3894870 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765155461.644280 3894870 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765155461.656625 3894870 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

PyTorch: 2.9.0
Transformers: 4.57.1
Using GPU: NVIDIA H100 80GB HBM3 MIG 2g.20gb
LANG=eng, EN_MODEL=microsoft/deberta-v3-base, EPOCHS=3, LR=2e-05, BATCH_TRAIN=8, FOLDS=3


## Dataset, metrics, calibration, focal loss, CV helper

In [None]:
# ============================================================
# 1) DATASET + METRICS + CALIBRATION HELPERS
# ============================================================

from sklearn.model_selection import StratifiedKFold

class TextClsDataset(Dataset):
    def __init__(
        self,
        texts: List[str],
        labels: Optional[List] = None,
        tokenizer=None,
        max_len: int = 256,
        is_multilabel: bool = False,
    ):
        self.texts = list(texts)
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_multilabel = is_multilabel

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx: int):
        text = str(self.texts[idx])
        enc = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_len,
            padding=False,
            return_tensors="pt",
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        if self.labels is not None:
            y = self.labels[idx]
            item["labels"] = torch.tensor(
                y,
                dtype=torch.float if self.is_multilabel else torch.long,
            )
        return item


def macro_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average="macro", zero_division=0)


def grid_search_thresholds(y_true, y_prob, label_names=None):
    """
    Per-label threshold search for multi-label tasks.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    C = y_true.shape[1]
    grid = np.linspace(0.05, 0.95, 19)
    thrs = {}
    for c in range(C):
        best_t, best_f = 0.5, -1.0
        for t in grid:
            preds = (y_prob[:, c] >= t).astype(int)
            f = f1_score(y_true[:, c], preds, average="binary", zero_division=0)
            if f > best_f:
                best_f, best_t = f, t
        name = label_names[c] if label_names else str(c)
        thrs[name] = float(best_t)
    return thrs


# ============================================================
# Fixed temperature scaling (safe, F1-aware)
# ============================================================

class TempScaler(nn.Module):
    """
    Temperature scaler with log-parameterization:
      T = exp(log_T)  -> T > 0
    and clamping to [min_T, max_T] for stability.
    """
    def __init__(self, init_T: float = 1.0, min_T: float = 0.05, max_T: float = 10.0):
        super().__init__()
        self.log_T = nn.Parameter(torch.log(torch.tensor([init_T], dtype=torch.float32)))
        self.min_T = min_T
        self.max_T = max_T

    def get_T(self) -> torch.Tensor:
        T = torch.exp(self.log_T)
        if self.min_T is not None or self.max_T is not None:
            T = torch.clamp(T, self.min_T, self.max_T)
        return T

    def forward(self, logits: torch.Tensor) -> torch.Tensor:
        T = self.get_T()
        return logits / T


def learn_temperature(
    dev_logits: torch.Tensor,
    dev_labels: torch.Tensor,
    is_multilabel: bool,
    f1_tolerance: float = 0.01,
) -> float:
    """
    Learn a temperature T using LBFGS on CPU, with safety checks:
      - T is forced positive via log_T and clamped to [0.05, 10.0].
      - Compute macro-F1 before and after scaling (thr=0.5 for multi-label).
      - If calibrated F1 << base F1 (by > f1_tolerance), fall back to T=1.0.

    Returns:
        scalar float T
    """
    device_cpu = torch.device("cpu")

    dev_logits = dev_logits.detach().to(device_cpu)
    dev_labels = dev_labels.detach().to(device_cpu)

    # ---- 1) Baseline F1 at T = 1.0 ----
    with torch.no_grad():
        if is_multilabel:
            probs_base = torch.sigmoid(dev_logits)
            preds_base = (probs_base >= 0.5).long().cpu().numpy()
            y_true = dev_labels.cpu().numpy()
            base_f1 = f1_score(y_true, preds_base, average="macro", zero_division=0)
        else:
            probs_base = torch.softmax(dev_logits, dim=1)
            preds_base = probs_base.argmax(dim=1).cpu().numpy()
            y_true = dev_labels.cpu().numpy()
            base_f1 = f1_score(y_true, preds_base, average="macro", zero_division=0)

    # ---- 2) Optimize log_T ----
    scaler = TempScaler(init_T=1.0, min_T=0.05, max_T=10.0).to(device_cpu)
    opt = torch.optim.LBFGS([scaler.log_T], lr=0.01, max_iter=50)
    criterion = nn.BCEWithLogitsLoss() if is_multilabel else nn.CrossEntropyLoss()

    def closure():
        opt.zero_grad()
        z = scaler(dev_logits)
        if is_multilabel:
            loss = criterion(z, dev_labels.float())
        else:
            loss = criterion(z, dev_labels.long())
        loss.backward()
        return loss

    try:
        opt.step(closure)
    except Exception as e:
        print(f"[TempScale] LBFGS failed: {e}. Using T=1.0.")
        return 1.0

    # ---- 3) Evaluate calibrated F1 ----
    with torch.no_grad():
        T_tensor = scaler.get_T()
        T_value = float(T_tensor.item())
        z_cal = dev_logits / T_tensor

        if is_multilabel:
            probs_cal = torch.sigmoid(z_cal)
            preds_cal = (probs_cal >= 0.5).long().cpu().numpy()
            f1_cal = f1_score(y_true, preds_cal, average="macro", zero_division=0)
        else:
            probs_cal = torch.softmax(z_cal, dim=1)
            preds_cal = probs_cal.argmax(dim=1).cpu().numpy()
            f1_cal = f1_score(y_true, preds_cal, average="macro", zero_division=0)

    print(f"[TempScale] base_F1={base_f1:.4f}, calibrated_F1={f1_cal:.4f}, T={T_value:.4f}")

    # ---- 4) Safety fallback ----
    if f1_cal + 1e-4 < base_f1 - f1_tolerance:
        print("[TempScale] calibrated F1 is worse than base F1; using T=1.0 instead.")
        return 1.0

    return float(T_value)


def collect_logits(trainer: Trainer, dataset: Dataset, is_multilabel: bool):
    """
    Use Trainer.predict to collect logits + labels for given dataset.
    """
    preds = trainer.predict(dataset)
    raw = preds.predictions
    if isinstance(raw, (list, tuple)):
        raw = raw[0]
    logits = torch.tensor(raw)
    labels = torch.tensor(preds.label_ids)
    if not is_multilabel and logits.ndim == 1:
        logits = logits.unsqueeze(1)
    return logits, labels


# ============================================================
# 1.1 Focal BCE loss (multi-label)
# ============================================================

class FocalBCEWithLogitsLoss(nn.Module):
    """
    Focal BCE with class-wise alpha (pos_weight) and gamma.
    alpha: per-class weights (like pos_weight), shape [C] or None.
    """
    def __init__(self, alpha: Optional[torch.Tensor] = None, gamma: float = 2.0, reduction: str = "mean"):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        # logits, targets: [B, C]
        bce = nn.functional.binary_cross_entropy_with_logits(
            logits, targets, reduction="none", pos_weight=self.alpha
        )  # [B,C]
        p = torch.sigmoid(logits)
        pt = p * targets + (1 - p) * (1 - targets)
        focal = (1 - pt) ** self.gamma * bce
        if self.reduction == "mean":
            return focal.mean()
        elif self.reduction == "sum":
            return focal.sum()
        return focal


# ============================================================
# 1.2 Simple K-fold helper
# ============================================================

def make_stratified_folds(y_for_strat: np.ndarray, n_splits: int, seed: int = 42):
    """
    y_for_strat: 1D stratification labels (e.g., polarization or has_any_label).
    Returns list of (train_idx, val_idx).
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    idx = np.arange(len(y_for_strat))
    return list(skf.split(idx, y_for_strat))


## Translation helpers + ensure_text_en

In [None]:
# ============================================================
# 2) TRANSLATION HELPERS (OPUS-MT) + ensure_text_en
# ============================================================

def _opus_model_for_lang(lang: str) -> Optional[str]:
    """
    Map language code -> OPUS-MT model name (to EN).
    Extend as needed.
    """
    lang = lang.lower()
    if lang in {"bn", "ben"}:
        return "Helsinki-NLP/opus-mt-bn-en"
    if lang in {"pa", "pan"}:
        return "Helsinki-NLP/opus-mt-pa-en"
    if lang in {"hi", "hin"}:
        return "Helsinki-NLP/opus-mt-hi-en"
    if lang in {"ur"}:
        return "Helsinki-NLP/opus-mt-ur-en"
    # default: no MT model
    return None

def translate_series_to_en(texts, model_name: Optional[str], batch_size: int = 16, max_len: int = 256):
    """
    Translate a list/Series of sentences to English using OPUS-MT.
    Runs on GPU if DEVICE is cuda, else CPU.
    """
    if model_name is None:
        return [str(t) for t in texts]

    device_index = 0 if DEVICE.type == "cuda" else -1

    tok = AutoTokenizer.from_pretrained(model_name)
    mt  = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    mt.to(DEVICE if DEVICE.type == "cuda" else torch.device("cpu"))

    pipe_trans = pipeline(
        "translation",
        model=mt,
        tokenizer=tok,
        device=device_index,
    )

    out = []
    batch = []
    for t in texts:
        batch.append("" if not isinstance(t, str) else t)
        if len(batch) == batch_size:
            res = pipe_trans(batch, max_length=max_len)
            out.extend([r["translation_text"] for r in res])
            batch = []
    if batch:
        res = pipe_trans(batch, max_length=max_len)
        out.extend([r["translation_text"] for r in res])

    # cleanup
    del pipe_trans, mt, tok
    if DEVICE.type == "cuda":
        torch.cuda.empty_cache()
    return out

def ensure_text_en(df: pd.DataFrame, subtask_tag: str, lang: str) -> pd.DataFrame:
    """
    Adds `text_en` to df.
    - If lang == "eng": copy text -> text_en
    - Else: use MT (OPUS) with caching:
        cache file: CACHE_ROOT / f"t{subtask_tag}__{lang}__to_en.csv"
        columns: ['id','text_en'] if 'id' exists, else just 'text_en'
    """
    df = df.copy()
    lang = lang.lower()

    if lang == "eng":
        df["text_en"] = df["text"].astype(str)
        return df

    cache_path = CACHE_ROOT / f"t{subtask_tag}__{lang}__to_en.csv"
    if cache_path.exists():
        cache = pd.read_csv(cache_path)
        if "id" in df.columns and "id" in cache.columns:
            df = df.merge(cache[["id", "text_en"]], on="id", how="left")
        else:
            df["text_en"] = cache["text_en"]
        need = df["text_en"].isna() | (df["text_en"].astype(str).str.len() == 0)
        if need.any():
            model_name = _opus_model_for_lang(lang)
            df.loc[need, "text_en"] = translate_series_to_en(
                df.loc[need, "text"].tolist(),
                model_name,
            )
            # refresh cache
            if "id" in df.columns:
                to_save = df[["id", "text_en"]]
            else:
                to_save = pd.DataFrame({"text_en": df["text_en"]})
            to_save.to_csv(cache_path, index=False)
        return df

    # No cache yet → translate all
    model_name = _opus_model_for_lang(lang)
    df["text_en"] = translate_series_to_en(df["text"], model_name)
    if "id" in df.columns:
        to_save = df[["id", "text_en"]]
    else:
        to_save = pd.DataFrame({"text_en": df["text_en"]})
    to_save.to_csv(cache_path, index=False)
    return df


## Subtask 1 (binary) DeBERTa+MT + K-fold calibration

In [4]:
# ============================================================
# 3) SUBTASK 1 — Polarization (binary, translate→EN + DeBERTa)
# ============================================================

# 3.1 Load TRAIN + DEV, build text_en
t1_train_df = pd.read_csv(T1_TRAIN)
t1_dev_df   = pd.read_csv(T1_DEV)

required_train_cols_t1 = {"id", "text", "polarization"}
required_dev_cols_t1   = {"id", "text"}
assert required_train_cols_t1.issubset(t1_train_df.columns), \
    f"T1 TRAIN missing: {required_train_cols_t1 - set(t1_train_df.columns)}"
assert required_dev_cols_t1.issubset(t1_dev_df.columns), \
    f"T1 DEV missing: {required_dev_cols_t1 - set(t1_dev_df.columns)}"

t1_train_df["polarization"] = t1_train_df["polarization"].astype(int)

t1_train_df = ensure_text_en(t1_train_df, subtask_tag="1", lang=LANG)
t1_dev_df   = ensure_text_en(t1_dev_df,   subtask_tag="1", lang=LANG)

print(f"[T1] TRAIN size: {len(t1_train_df)}")
print(f"[T1] DEV size (unlabeled): {len(t1_dev_df)}")

# 3.2 K-fold OOF logits for calibration
y_t1 = t1_train_df["polarization"].to_numpy()
folds_t1 = make_stratified_folds(y_t1, n_splits=N_FOLDS, seed=SEED)

oof_logits_t1 = np.zeros((len(t1_train_df), 2), dtype=np.float32)
oof_labels_t1 = y_t1.copy()

tok_t1 = AutoTokenizer.from_pretrained(EN_MODEL, use_fast=True)

for fold, (tr_idx, val_idx) in enumerate(folds_t1):
    print(f"\n[T1] Fold {fold+1}/{N_FOLDS} — train={len(tr_idx)}, val={len(val_idx)}")

    ds_tr = TextClsDataset(
        texts=t1_train_df["text_en"].iloc[tr_idx].tolist(),
        labels=t1_train_df["polarization"].iloc[tr_idx].tolist(),
        tokenizer=tok_t1,
        max_len=MAX_LEN,
        is_multilabel=False,
    )
    ds_val = TextClsDataset(
        texts=t1_train_df["text_en"].iloc[val_idx].tolist(),
        labels=t1_train_df["polarization"].iloc[val_idx].tolist(),
        tokenizer=tok_t1,
        max_len=MAX_LEN,
        is_multilabel=False,
    )

    cfg_t1_fold = AutoConfig.from_pretrained(EN_MODEL, num_labels=2)
    mdl_t1_fold = AutoModelForSequenceClassification.from_pretrained(EN_MODEL, config=cfg_t1_fold)
    mdl_t1_fold.config.use_cache = False
    if hasattr(mdl_t1_fold, "gradient_checkpointing_disable"):
        mdl_t1_fold.gradient_checkpointing_disable()
    mdl_t1_fold.to(DEVICE)

    args_t1_fold = build_training_args(
        output_dir=ART_ROOT / f"t1_cv_fold{fold+1}",
        per_device_train_batch_size=BATCH_TRAIN,
        per_device_eval_batch_size=BATCH_EVAL,
        num_train_epochs=EPOCHS,
        learning_rate=LR,
        weight_decay=WEIGHT_DECAY,
        logging_steps=50,
        evaluation="epoch",
        save="no",
        warmup_ratio=WARMUP_RATIO,
    )

    def compute_metrics_t1_fold(eval_pred):
        logits = eval_pred.predictions[0] if isinstance(eval_pred.predictions, (list, tuple)) else eval_pred.predictions
        labels = eval_pred.label_ids
        preds = np.argmax(logits, axis=1)
        return {"f1_macro": macro_f1(labels, preds)}

    trainer_t1_fold = Trainer(
        model=mdl_t1_fold,
        args=args_t1_fold,
        train_dataset=ds_tr,
        eval_dataset=ds_val,
        tokenizer=tok_t1,
        data_collator=DataCollatorWithPadding(tok_t1),
        compute_metrics=compute_metrics_t1_fold,
    )
    print("  Trainer device:", trainer_t1_fold.args.device)

    trainer_t1_fold.train()
    eval_fold = trainer_t1_fold.evaluate()
    print("  Fold Macro-F1 (argmax):", eval_fold.get("eval_f1_macro"))

    logits_val, _ = collect_logits(trainer_t1_fold, ds_val, is_multilabel=False)
    oof_logits_t1[val_idx] = logits_val.numpy()

    del trainer_t1_fold, mdl_t1_fold
    if DEVICE.type == "cuda":
        torch.cuda.empty_cache()

# 3.3 Calibrate on OOF logits
logits_oof_t1 = torch.from_numpy(oof_logits_t1)
labels_oof_t1 = torch.from_numpy(oof_labels_t1)

T_t1 = learn_temperature(logits_oof_t1, labels_oof_t1, is_multilabel=False)
probs_oof_t1 = torch.softmax(logits_oof_t1 / T_t1, dim=1)[:, 1].cpu().numpy()

best_thr_t1, best_f1_t1 = 0.5, -1.0
for t in np.linspace(0.05, 0.95, 19):
    pred = (probs_oof_t1 >= t).astype(int)
    f = macro_f1(labels_oof_t1.numpy(), pred)
    if f > best_f1_t1:
        best_f1_t1, best_thr_t1 = f, t

print("\n[T1] Calibration (OOF):")
print(f"  Temperature T={T_t1:.4f}")
print(f"  Best threshold={best_thr_t1:.2f}")
print(f"  Macro-F1 (OOF, calibrated)={best_f1_t1:.4f}")

# 3.4 Train FINAL model on full TRAIN (text_en)
cfg_t1_final = AutoConfig.from_pretrained(EN_MODEL, num_labels=2)
mdl_t1_final = AutoModelForSequenceClassification.from_pretrained(EN_MODEL, config=cfg_t1_final)
mdl_t1_final.config.use_cache = False
if hasattr(mdl_t1_final, "gradient_checkpointing_disable"):
    mdl_t1_final.gradient_checkpointing_disable()
mdl_t1_final.to(DEVICE)

ds_t1_train_full = TextClsDataset(
    texts=t1_train_df["text_en"].tolist(),
    labels=t1_train_df["polarization"].tolist(),
    tokenizer=tok_t1,
    max_len=MAX_LEN,
    is_multilabel=False,
)
ds_t1_dev_full = TextClsDataset(
    texts=t1_dev_df["text_en"].tolist(),
    labels=[0] * len(t1_dev_df),
    tokenizer=tok_t1,
    max_len=MAX_LEN,
    is_multilabel=False,
)

args_t1_final = build_training_args(
    output_dir=ART_ROOT / "t1_final",
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    logging_steps=50,
    evaluation="epoch",
    save="no",
    warmup_ratio=WARMUP_RATIO,
)

def compute_metrics_t1_final(eval_pred):
    logits = eval_pred.predictions[0] if isinstance(eval_pred.predictions, (list, tuple)) else eval_pred.predictions
    labels = eval_pred.label_ids
    preds = np.argmax(logits, axis=1)
    return {"f1_macro": macro_f1(labels, preds)}

trainer_t1_final = Trainer(
    model=mdl_t1_final,
    args=args_t1_final,
    train_dataset=ds_t1_train_full,
    eval_dataset=ds_t1_train_full,
    tokenizer=tok_t1,
    data_collator=DataCollatorWithPadding(tok_t1),
    compute_metrics=compute_metrics_t1_final,
)

print("\n[T1] Training FINAL DeBERTa model on full train...")
trainer_t1_final.train()
eval_t1_train_full = trainer_t1_final.evaluate()
print("[T1] Macro-F1 (TRAIN, argmax, final model):", eval_t1_train_full.get("eval_f1_macro"))

# Calibrated train F1 for final model
logits_t1_train_full, labels_t1_train_full = collect_logits(trainer_t1_final, ds_t1_train_full, is_multilabel=False)
probs_t1_train_full = torch.softmax(logits_t1_train_full / T_t1, dim=1)[:, 1].cpu().numpy()
pred_t1_train_full  = (probs_t1_train_full >= best_thr_t1).astype(int)
print("[T1] Macro-F1 (TRAIN, calibrated T+thr, final model):",
      macro_f1(labels_t1_train_full.numpy(), pred_t1_train_full))

# 3.5 Inference on DEV
preds_dev_t1 = trainer_t1_final.predict(ds_t1_dev_full)
logits_t1_dev = torch.tensor(
    preds_dev_t1.predictions
    if not isinstance(preds_dev_t1.predictions, (list, tuple))
    else preds_dev_t1.predictions[0]
)
probs_t1_dev = torch.softmax(logits_t1_dev / T_t1, dim=1)[:, 1].cpu().numpy()
pred_t1_dev = (probs_t1_dev >= best_thr_t1).astype(int)

# 3.6 Cache train/dev probs for ensembling
cache_t1_train = pd.DataFrame({
    "id":   t1_train_df["id"].astype(str),
    "prob_pos": probs_t1_train_full,
    "label":   t1_train_df["polarization"].astype(int),
})
cache_t1_train.to_csv(CACHE_ROOT / "t1_train_probs.csv", index=False)

cache_t1_dev = pd.DataFrame({
    "id":      t1_dev_df["id"].astype(str),
    "prob_pos": probs_t1_dev,
})
cache_t1_dev.to_csv(CACHE_ROOT / "t1_dev_probs.csv", index=False)

print("Saved T1 train/dev probabilities (DeBERTa) for ensembling in:", CACHE_ROOT)

# 3.7 Save model + calibration
mdl_t1_final.save_pretrained(ART_ROOT / "native_t1")
tok_t1.save_pretrained(ART_ROOT / "native_t1")
with open(ART_ROOT / "calib_t1_native.json", "w") as f:
    json.dump({"temperature": float(T_t1), "threshold": float(best_thr_t1)}, f, indent=2)

# 3.8 Codabench submission CSV
sub1 = pd.DataFrame({
    "id": t1_dev_df["id"].astype(str),
    "polarization": pred_t1_dev.astype(int),
})
sub1_path = SUB_ROOT / "subtask_1" / f"pred_{lang_fname}.csv"
sub1.to_csv(sub1_path, index=False)
print("Wrote Subtask 1 submission CSV (DeBERTa):", sub1_path)


[T1] TRAIN size: 3222
[T1] DEV size (unlabeled): 160

[T1] Fold 1/3 — train=2148, val=1074


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


  Trainer device: cuda:0


Step,Training Loss
50,0.698
100,0.6147
150,0.472
200,0.5253
250,0.497
300,0.4234
350,0.4334
400,0.4061
450,0.3518
500,0.3183


  Fold Macro-F1 (argmax): 0.7891223247594739

[T1] Fold 2/3 — train=2148, val=1074


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


  Trainer device: cuda:0


Step,Training Loss
50,0.6717
100,0.5131
150,0.5375
200,0.4736
250,0.4798
300,0.3549
350,0.3792
400,0.4088
450,0.3805
500,0.371


  Fold Macro-F1 (argmax): 0.7737514214872951

[T1] Fold 3/3 — train=2148, val=1074


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


  Trainer device: cuda:0


Step,Training Loss
50,0.6754
100,0.5857
150,0.5362
200,0.4423
250,0.4338
300,0.4302
350,0.3731
400,0.3211
450,0.3665
500,0.2518


  Fold Macro-F1 (argmax): 0.8004830020434701
[TempScale] base_F1=0.7878, calibrated_F1=0.7878, T=1.3773

[T1] Calibration (OOF):
  Temperature T=1.3773
  Best threshold=0.15
  Macro-F1 (OOF, calibrated)=0.7925


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



[T1] Training FINAL DeBERTa model on full train...


Step,Training Loss
50,0.6474
100,0.603
150,0.5808
200,0.488
250,0.4967
300,0.4382
350,0.5203
400,0.4394
450,0.3364
500,0.3404


[T1] Macro-F1 (TRAIN, argmax, final model): 0.9613044457770253
[T1] Macro-F1 (TRAIN, calibrated T+thr, final model): 0.9576491464436522


Saved T1 train/dev probabilities (DeBERTa) for ensembling in: cache/deberta_cv/eng
Wrote Subtask 1 submission CSV (DeBERTa): submissions/deberta/subtask_1/pred_eng.csv


## Subtask 2 (multi-label 5) DeBERTa+MT + focal + K-fold calibration

In [5]:
# ============================================================
# 4) SUBTASK 2 — Hate type (5 labels, translate→EN + DeBERTa)
# ============================================================

# 4.1 Load TRAIN + DEV, build text_en
t2_train_df = pd.read_csv(T2_TRAIN)
t2_dev_df   = pd.read_csv(T2_DEV)

required_train_cols_t2 = {"id", "text", *T2_LABELS}
required_dev_cols_t2   = {"id", "text"}
assert required_train_cols_t2.issubset(t2_train_df.columns), \
    f"T2 TRAIN missing: {required_train_cols_t2 - set(t2_train_df.columns)}"
assert required_dev_cols_t2.issubset(t2_dev_df.columns), \
    f"T2 DEV missing: {required_dev_cols_t2 - set(t2_dev_df.columns)}"

t2_train_df = ensure_text_en(t2_train_df, subtask_tag="2", lang=LANG)
t2_dev_df   = ensure_text_en(t2_dev_df,   subtask_tag="2", lang=LANG)

Y2_train = t2_train_df[T2_LABELS].values.astype(int)

print(f"[T2] TRAIN size: {len(t2_train_df)}")
print(f"[T2] DEV size (unlabeled): {len(t2_dev_df)}")

# For stratification: any positive label vs none
y2_strat = (Y2_train.sum(axis=1) > 0).astype(int)
folds_t2 = make_stratified_folds(y2_strat, n_splits=N_FOLDS, seed=SEED)

tok_t2 = AutoTokenizer.from_pretrained(EN_MODEL, use_fast=True)

# pos_weight on full train
pos_count_2 = Y2_train.sum(axis=0) + 1e-6
neg_count_2 = Y2_train.shape[0] - pos_count_2
pos_weight_2 = torch.tensor(neg_count_2 / pos_count_2, dtype=torch.float)

oof_logits_t2 = np.zeros((len(t2_train_df), len(T2_LABELS)), dtype=np.float32)
oof_labels_t2 = Y2_train.copy()

class FocalTrainerT2(Trainer):
    def __init__(self, *args, pos_weight=None, gamma=1.5, **kwargs):
        super().__init__(*args, **kwargs)
        self.pos_weight = pos_weight
        self.gamma = gamma

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = FocalBCEWithLogitsLoss(
            alpha=self.pos_weight.to(logits.device),
            gamma=self.gamma,
            reduction="mean",
        )
        loss = loss_fct(logits, labels.to(logits.device).float())
        return (loss, outputs) if return_outputs else loss

for fold, (tr_idx, val_idx) in enumerate(folds_t2):
    print(f"\n[T2] Fold {fold+1}/{N_FOLDS} — train={len(tr_idx)}, val={len(val_idx)}")

    ds_tr = TextClsDataset(
        texts=t2_train_df["text_en"].iloc[tr_idx].tolist(),
        labels=Y2_train[tr_idx].tolist(),
        tokenizer=tok_t2,
        max_len=MAX_LEN,
        is_multilabel=True,
    )
    ds_val = TextClsDataset(
        texts=t2_train_df["text_en"].iloc[val_idx].tolist(),
        labels=Y2_train[val_idx].tolist(),
        tokenizer=tok_t2,
        max_len=MAX_LEN,
        is_multilabel=True,
    )

    cfg_t2_fold = AutoConfig.from_pretrained(
        EN_MODEL,
        num_labels=len(T2_LABELS),
        problem_type="multi_label_classification",
    )
    mdl_t2_fold = AutoModelForSequenceClassification.from_pretrained(EN_MODEL, config=cfg_t2_fold)
    mdl_t2_fold.config.use_cache = False
    if hasattr(mdl_t2_fold, "gradient_checkpointing_disable"):
        mdl_t2_fold.gradient_checkpointing_disable()
    mdl_t2_fold.to(DEVICE)

    args_t2_fold = build_training_args(
        output_dir=ART_ROOT / f"t2_cv_fold{fold+1}",
        per_device_train_batch_size=BATCH_TRAIN,
        per_device_eval_batch_size=BATCH_EVAL,
        num_train_epochs=EPOCHS,
        learning_rate=LR,
        weight_decay=WEIGHT_DECAY,
        logging_steps=50,
        evaluation="epoch",
        save="no",
        warmup_ratio=WARMUP_RATIO,
    )

    def compute_metrics_t2_fold(eval_pred):
        logits = eval_pred.predictions[0] if isinstance(eval_pred.predictions, (list, tuple)) else eval_pred.predictions
        labels = eval_pred.label_ids
        probs  = 1.0 / (1.0 + np.exp(-logits))
        preds  = (probs >= 0.5).astype(int)
        return {"f1_macro": f1_score(labels, preds, average="macro", zero_division=0)}

    trainer_t2_fold = FocalTrainerT2(
        model=mdl_t2_fold,
        args=args_t2_fold,
        train_dataset=ds_tr,
        eval_dataset=ds_val,
        tokenizer=tok_t2,
        data_collator=DataCollatorWithPadding(tok_t2),
        compute_metrics=compute_metrics_t2_fold,
        pos_weight=pos_weight_2,
        gamma=1.5,
    )
    print("  Trainer device:", trainer_t2_fold.args.device)

    trainer_t2_fold.train()
    eval_fold = trainer_t2_fold.evaluate()
    print("  Fold Macro-F1 (thr=0.5):", eval_fold.get("eval_f1_macro"))

    logits_val, _ = collect_logits(trainer_t2_fold, ds_val, is_multilabel=True)
    oof_logits_t2[val_idx] = logits_val.numpy()

    del trainer_t2_fold, mdl_t2_fold
    if DEVICE.type == "cuda":
        torch.cuda.empty_cache()

# 4.2 Calibrate on OOF logits
logits_oof_t2 = torch.from_numpy(oof_logits_t2)
labels_oof_t2 = torch.from_numpy(oof_labels_t2)

T_t2 = learn_temperature(logits_oof_t2, labels_oof_t2, is_multilabel=True)
probs_oof_t2 = torch.sigmoid(logits_oof_t2 / T_t2).cpu().numpy()
thr_map_t2 = grid_search_thresholds(labels_oof_t2.numpy(), probs_oof_t2, T2_LABELS)

P2_oof = np.zeros_like(probs_oof_t2, dtype=int)
for j, lab in enumerate(T2_LABELS):
    thr = float(thr_map_t2[lab])
    P2_oof[:, j] = (probs_oof_t2[:, j] >= thr).astype(int)
f1_oof_t2 = f1_score(labels_oof_t2.numpy(), P2_oof, average="macro", zero_division=0)

print("\n[T2] Calibration (OOF):")
print("  Temperature:", T_t2)
print("  Thresholds:", thr_map_t2)
print("  Macro-F1 (OOF, calibrated):", f1_oof_t2)

# 4.3 Train FINAL model on full TRAIN (text_en)
cfg_t2_final = AutoConfig.from_pretrained(
    EN_MODEL,
    num_labels=len(T2_LABELS),
    problem_type="multi_label_classification",
)
mdl_t2_final = AutoModelForSequenceClassification.from_pretrained(EN_MODEL, config=cfg_t2_final)
mdl_t2_final.config.use_cache = False
if hasattr(mdl_t2_final, "gradient_checkpointing_disable"):
    mdl_t2_final.gradient_checkpointing_disable()
mdl_t2_final.to(DEVICE)

ds_t2_train_full = TextClsDataset(
    texts=t2_train_df["text_en"].tolist(),
    labels=Y2_train.tolist(),
    tokenizer=tok_t2,
    max_len=MAX_LEN,
    is_multilabel=True,
)
dummy_labels_t2_dev = np.zeros((len(t2_dev_df), len(T2_LABELS)), dtype=int)
ds_t2_dev_full = TextClsDataset(
    texts=t2_dev_df["text_en"].tolist(),
    labels=dummy_labels_t2_dev.tolist(),
    tokenizer=tok_t2,
    max_len=MAX_LEN,
    is_multilabel=True,
)

args_t2_final = build_training_args(
    output_dir=ART_ROOT / "t2_final",
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    logging_steps=50,
    evaluation="epoch",
    save="no",
    warmup_ratio=WARMUP_RATIO,
)

def compute_metrics_t2_final(eval_pred):
    logits = eval_pred.predictions[0] if isinstance(eval_pred.predictions, (list, tuple)) else eval_pred.predictions
    labels = eval_pred.label_ids
    probs  = 1.0 / (1.0 + np.exp(-logits))
    preds  = (probs >= 0.5).astype(int)
    return {"f1_macro": f1_score(labels, preds, average="macro", zero_division=0)}

trainer_t2_final = FocalTrainerT2(
    model=mdl_t2_final,
    args=args_t2_final,
    train_dataset=ds_t2_train_full,
    eval_dataset=ds_t2_train_full,
    tokenizer=tok_t2,
    data_collator=DataCollatorWithPadding(tok_t2),
    compute_metrics=compute_metrics_t2_final,
    pos_weight=pos_weight_2,
    gamma=1.5,
)

print("\n[T2] Training FINAL DeBERTa model on full train...")
trainer_t2_final.train()
eval_t2_train_full = trainer_t2_final.evaluate()
print("[T2] Macro-F1 (TRAIN, thr=0.5, final model):", eval_t2_train_full.get("eval_f1_macro"))

# calibrated train F1
logits_t2_train_full, labels_t2_train_full = collect_logits(trainer_t2_final, ds_t2_train_full, is_multilabel=True)
probs_t2_train_full = torch.sigmoid(logits_t2_train_full / T_t2).cpu().numpy()
P2_train_full = np.zeros_like(probs_t2_train_full, dtype=int)
for j, lab in enumerate(T2_LABELS):
    thr = float(thr_map_t2[lab])
    P2_train_full[:, j] = (probs_t2_train_full[:, j] >= thr).astype(int)
train_f1_calib_t2 = f1_score(labels_t2_train_full.numpy(), P2_train_full, average="macro", zero_division=0)
print("[T2] Macro-F1 (TRAIN, calibrated T+thr, final model):", train_f1_calib_t2)

# 4.4 Inference on DEV
preds_dev_t2 = trainer_t2_final.predict(ds_t2_dev_full)
logits_t2_dev = torch.tensor(
    preds_dev_t2.predictions
    if not isinstance(preds_dev_t2.predictions, (list, tuple))
    else preds_dev_t2.predictions[0]
)
probs_t2_dev = torch.sigmoid(logits_t2_dev / T_t2).cpu().numpy()

P2_dev = np.zeros_like(probs_t2_dev, dtype=int)
for j, lab in enumerate(T2_LABELS):
    thr = float(thr_map_t2[lab])
    P2_dev[:, j] = (probs_t2_dev[:, j] >= thr).astype(int)

# 4.5 Cache train/dev probs for ensembling
cache_cols_train_t2 = {"id": t2_train_df["id"].astype(str).values}
for j, lab in enumerate(T2_LABELS):
    cache_cols_train_t2[f"prob_{lab}"]  = probs_t2_train_full[:, j]
    cache_cols_train_t2[f"label_{lab}"] = labels_t2_train_full.numpy()[:, j]

t2_train_cache = pd.DataFrame(cache_cols_train_t2)
t2_train_cache.to_csv(CACHE_ROOT / "t2_train_probs.csv", index=False)

cache_cols_dev_t2 = {"id": t2_dev_df["id"].astype(str).values}
for j, lab in enumerate(T2_LABELS):
    cache_cols_dev_t2[f"prob_{lab}"] = probs_t2_dev[:, j]
t2_dev_cache = pd.DataFrame(cache_cols_dev_t2)
t2_dev_cache.to_csv(CACHE_ROOT / "t2_dev_probs.csv", index=False)

print("Saved T2 train/dev probabilities (DeBERTa) for ensembling in:", CACHE_ROOT)

# 4.6 Save model + calibration
mdl_t2_final.save_pretrained(ART_ROOT / "native_t2")
tok_t2.save_pretrained(ART_ROOT / "native_t2")
with open(ART_ROOT / "calib_t2_native.json", "w") as f:
    json.dump({"temperature": float(T_t2), "thresholds": thr_map_t2}, f, indent=2)

# 4.7 Codabench submission CSV (required header order)
idx_gender    = T2_LABELS.index("gender/sexual")
idx_political = T2_LABELS.index("political")
idx_religious = T2_LABELS.index("religious")
idx_racial    = T2_LABELS.index("racial/ethnic")
idx_other     = T2_LABELS.index("other")

sub2 = pd.DataFrame({
    "id":            t2_dev_df["id"].astype(str).values,
    "political":     P2_dev[:, idx_political],
    "racial/ethnic": P2_dev[:, idx_racial],
    "religious":     P2_dev[:, idx_religious],
    "gender/sexual": P2_dev[:, idx_gender],
    "other":         P2_dev[:, idx_other],
})
sub2_path = SUB_ROOT / "subtask_2" / f"pred_{lang_fname}.csv"
sub2.to_csv(sub2_path, index=False)
print("Wrote Subtask 2 submission CSV (DeBERTa):", sub2_path)


[T2] TRAIN size: 3222
[T2] DEV size (unlabeled): 160

[T2] Fold 1/3 — train=2148, val=1074


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


  Trainer device: cuda:0


Step,Training Loss
50,0.4654
100,0.4403
150,0.5028
200,0.5312
250,0.5651
300,0.4057
350,0.4725
400,0.3612
450,0.4787
500,0.4357


  Fold Macro-F1 (thr=0.5): 0.34754764217307843

[T2] Fold 2/3 — train=2148, val=1074


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


  Trainer device: cuda:0


Step,Training Loss
50,0.4945
100,0.4631
150,0.5089
200,0.6085
250,0.4669
300,0.3952
350,0.4592
400,0.4374
450,0.3734
500,0.3287


  Fold Macro-F1 (thr=0.5): 0.3395533929990189

[T2] Fold 3/3 — train=2148, val=1074


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


  Trainer device: cuda:0


Step,Training Loss
50,0.4677
100,0.4345
150,0.4973
200,0.5792
250,0.4617
300,0.4711
350,0.5408
400,0.4169
450,0.5416
500,0.5235


  Fold Macro-F1 (thr=0.5): 0.3010290604212571
[TempScale] base_F1=0.3336, calibrated_F1=0.3336, T=0.7884

[T2] Calibration (OOF):
  Temperature: 0.7884389758110046
  Thresholds: {'gender/sexual': 0.25, 'political': 0.44999999999999996, 'religious': 0.75, 'racial/ethnic': 0.35, 'other': 0.5499999999999999}
  Macro-F1 (OOF, calibrated): 0.3626535698904044


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



[T2] Training FINAL DeBERTa model on full train...


Step,Training Loss
50,0.4366
100,0.4559
150,0.4107
200,0.5166
250,0.5165
300,0.4769
350,0.5455
400,0.4698
450,0.4715
500,0.4204


[T2] Macro-F1 (TRAIN, thr=0.5, final model): 0.48405134377736986
[T2] Macro-F1 (TRAIN, calibrated T+thr, final model): 0.4876222168787071


Saved T2 train/dev probabilities (DeBERTa) for ensembling in: cache/deberta_cv/eng
Wrote Subtask 2 submission CSV (DeBERTa): submissions/deberta/subtask_2/pred_eng.csv


## Subtask 3 (multi-label 6) DeBERTa+MT + focal + K-fold calibration

In [6]:
# ============================================================
# 5) SUBTASK 3 — Manifestation (6 labels, translate→EN + DeBERTa)
# ============================================================

# 5.1 Load TRAIN + DEV, build text_en
t3_train_df = pd.read_csv(T3_TRAIN)
t3_dev_df   = pd.read_csv(T3_DEV)

required_train_cols_t3 = {"id", "text", *T3_LABELS}
required_dev_cols_t3   = {"id", "text"}
assert required_train_cols_t3.issubset(t3_train_df.columns), \
    f"T3 TRAIN missing: {required_train_cols_t3 - set(t3_train_df.columns)}"
assert required_dev_cols_t3.issubset(t3_dev_df.columns), \
    f"T3 DEV missing: {required_dev_cols_t3 - set(t3_dev_df.columns)}"

t3_train_df = ensure_text_en(t3_train_df, subtask_tag="3", lang=LANG)
t3_dev_df   = ensure_text_en(t3_dev_df,   subtask_tag="3", lang=LANG)

Y3_train = t3_train_df[T3_LABELS].values.astype(int)

print(f"[T3] TRAIN size: {len(t3_train_df)}")
print(f"[T3] DEV size (unlabeled): {len(t3_dev_df)}")

# stratification: any manifestation vs none
y3_strat = (Y3_train.sum(axis=1) > 0).astype(int)
folds_t3 = make_stratified_folds(y3_strat, n_splits=N_FOLDS, seed=SEED)

tok_t3 = AutoTokenizer.from_pretrained(EN_MODEL, use_fast=True)

pos_count_3 = Y3_train.sum(axis=0) + 1e-6
neg_count_3 = Y3_train.shape[0] - pos_count_3
pos_weight_3 = torch.tensor(neg_count_3 / pos_count_3, dtype=torch.float)

oof_logits_t3 = np.zeros((len(t3_train_df), len(T3_LABELS)), dtype=np.float32)
oof_labels_t3 = Y3_train.copy()

class FocalTrainerT3(Trainer):
    def __init__(self, *args, pos_weight=None, gamma=1.5, **kwargs):
        super().__init__(*args, **kwargs)
        self.pos_weight = pos_weight
        self.gamma = gamma

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = FocalBCEWithLogitsLoss(
            alpha=self.pos_weight.to(logits.device),
            gamma=self.gamma,
            reduction="mean",
        )
        loss = loss_fct(logits, labels.to(logits.device).float())
        return (loss, outputs) if return_outputs else loss

for fold, (tr_idx, val_idx) in enumerate(folds_t3):
    print(f"\n[T3] Fold {fold+1}/{N_FOLDS} — train={len(tr_idx)}, val={len(val_idx)}")

    ds_tr = TextClsDataset(
        texts=t3_train_df["text_en"].iloc[tr_idx].tolist(),
        labels=Y3_train[tr_idx].tolist(),
        tokenizer=tok_t3,
        max_len=MAX_LEN,
        is_multilabel=True,
    )
    ds_val = TextClsDataset(
        texts=t3_train_df["text_en"].iloc[val_idx].tolist(),
        labels=Y3_train[val_idx].tolist(),
        tokenizer=tok_t3,
        max_len=MAX_LEN,
        is_multilabel=True,
    )

    cfg_t3_fold = AutoConfig.from_pretrained(
        EN_MODEL,
        num_labels=len(T3_LABELS),
        problem_type="multi_label_classification",
    )
    mdl_t3_fold = AutoModelForSequenceClassification.from_pretrained(EN_MODEL, config=cfg_t3_fold)
    mdl_t3_fold.config.use_cache = False
    if hasattr(mdl_t3_fold, "gradient_checkpointing_disable"):
        mdl_t3_fold.gradient_checkpointing_disable()
    mdl_t3_fold.to(DEVICE)

    args_t3_fold = build_training_args(
        output_dir=ART_ROOT / f"t3_cv_fold{fold+1}",
        per_device_train_batch_size=BATCH_TRAIN,
        per_device_eval_batch_size=BATCH_EVAL,
        num_train_epochs=EPOCHS,
        learning_rate=LR,
        weight_decay=WEIGHT_DECAY,
        logging_steps=50,
        evaluation="epoch",
        save="no",
        warmup_ratio=WARMUP_RATIO,
    )

    def compute_metrics_t3_fold(eval_pred):
        logits = eval_pred.predictions[0] if isinstance(eval_pred.predictions, (list, tuple)) else eval_pred.predictions
        labels = eval_pred.label_ids
        probs  = 1.0 / (1.0 + np.exp(-logits))
        preds  = (probs >= 0.5).astype(int)
        return {"f1_macro": f1_score(labels, preds, average="macro", zero_division=0)}

    trainer_t3_fold = FocalTrainerT3(
        model=mdl_t3_fold,
        args=args_t3_fold,
        train_dataset=ds_tr,
        eval_dataset=ds_val,
        tokenizer=tok_t3,
        data_collator=DataCollatorWithPadding(tok_t3),
        compute_metrics=compute_metrics_t3_fold,
        pos_weight=pos_weight_3,
        gamma=1.5,
    )
    print("  Trainer device:", trainer_t3_fold.args.device)

    trainer_t3_fold.train()
    eval_fold = trainer_t3_fold.evaluate()
    print("  Fold Macro-F1 (thr=0.5):", eval_fold.get("eval_f1_macro"))

    logits_val, _ = collect_logits(trainer_t3_fold, ds_val, is_multilabel=True)
    oof_logits_t3[val_idx] = logits_val.numpy()

    del trainer_t3_fold, mdl_t3_fold
    if DEVICE.type == "cuda":
        torch.cuda.empty_cache()

# 5.2 Calibrate on OOF logits
logits_oof_t3 = torch.from_numpy(oof_logits_t3)
labels_oof_t3 = torch.from_numpy(oof_labels_t3)

T_t3 = learn_temperature(logits_oof_t3, labels_oof_t3, is_multilabel=True)
probs_oof_t3 = torch.sigmoid(logits_oof_t3 / T_t3).cpu().numpy()
thr_map_t3 = grid_search_thresholds(labels_oof_t3.numpy(), probs_oof_t3, T3_LABELS)

P3_oof = np.zeros_like(probs_oof_t3, dtype=int)
for j, lab in enumerate(T3_LABELS):
    thr = float(thr_map_t3[lab])
    P3_oof[:, j] = (probs_oof_t3[:, j] >= thr).astype(int)
f1_oof_t3 = f1_score(labels_oof_t3.numpy(), P3_oof, average="macro", zero_division=0)

print("\n[T3] Calibration (OOF):")
print("  Temperature:", T_t3)
print("  Thresholds:", thr_map_t3)
print("  Macro-F1 (OOF, calibrated):", f1_oof_t3)

# 5.3 Train FINAL model on full TRAIN (text_en)
cfg_t3_final = AutoConfig.from_pretrained(
    EN_MODEL,
    num_labels=len(T3_LABELS),
    problem_type="multi_label_classification",
)
mdl_t3_final = AutoModelForSequenceClassification.from_pretrained(EN_MODEL, config=cfg_t3_final)
mdl_t3_final.config.use_cache = False
if hasattr(mdl_t3_final, "gradient_checkpointing_disable"):
    mdl_t3_final.gradient_checkpointing_disable()
mdl_t3_final.to(DEVICE)

ds_t3_train_full = TextClsDataset(
    texts=t3_train_df["text_en"].tolist(),
    labels=Y3_train.tolist(),
    tokenizer=tok_t3,
    max_len=MAX_LEN,
    is_multilabel=True,
)
dummy_labels_t3_dev = np.zeros((len(t3_dev_df), len(T3_LABELS)), dtype=int)
ds_t3_dev_full = TextClsDataset(
    texts=t3_dev_df["text_en"].tolist(),
    labels=dummy_labels_t3_dev.tolist(),
    tokenizer=tok_t3,
    max_len=MAX_LEN,
    is_multilabel=True,
)

args_t3_final = build_training_args(
    output_dir=ART_ROOT / "t3_final",
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    logging_steps=50,
    evaluation="epoch",
    save="no",
    warmup_ratio=WARMUP_RATIO,
)

def compute_metrics_t3_final(eval_pred):
    logits = eval_pred.predictions[0] if isinstance(eval_pred.predictions, (list, tuple)) else eval_pred.predictions
    labels = eval_pred.label_ids
    probs  = 1.0 / (1.0 + np.exp(-logits))
    preds  = (probs >= 0.5).astype(int)
    return {"f1_macro": f1_score(labels, preds, average="macro", zero_division=0)}

trainer_t3_final = FocalTrainerT3(
    model=mdl_t3_final,
    args=args_t3_final,
    train_dataset=ds_t3_train_full,
    eval_dataset=ds_t3_train_full,
    tokenizer=tok_t3,
    data_collator=DataCollatorWithPadding(tok_t3),
    compute_metrics=compute_metrics_t3_final,
    pos_weight=pos_weight_3,
    gamma=1.5,
)

print("\n[T3] Training FINAL DeBERTa model on full train...")
trainer_t3_final.train()
eval_t3_train_full = trainer_t3_final.evaluate()
print("[T3] Macro-F1 (TRAIN, thr=0.5, final model):", eval_t3_train_full.get("eval_f1_macro"))

# calibrated train F1
logits_t3_train_full, labels_t3_train_full = collect_logits(trainer_t3_final, ds_t3_train_full, is_multilabel=True)
probs_t3_train_full = torch.sigmoid(logits_t3_train_full / T_t3).cpu().numpy()
P3_train_full = np.zeros_like(probs_t3_train_full, dtype=int)
for j, lab in enumerate(T3_LABELS):
    thr = float(thr_map_t3[lab])
    P3_train_full[:, j] = (probs_t3_train_full[:, j] >= thr).astype(int)
train_f1_calib_t3 = f1_score(labels_t3_train_full.numpy(), P3_train_full, average="macro", zero_division=0)
print("[T3] Macro-F1 (TRAIN, calibrated T+thr, final model):", train_f1_calib_t3)

# 5.4 Inference on DEV
preds_dev_t3 = trainer_t3_final.predict(ds_t3_dev_full)
logits_t3_dev = torch.tensor(
    preds_dev_t3.predictions
    if not isinstance(preds_dev_t3.predictions, (list, tuple))
    else preds_dev_t3.predictions[0]
)
probs_t3_dev = torch.sigmoid(logits_t3_dev / T_t3).cpu().numpy()

P3_dev = np.zeros_like(probs_t3_dev, dtype=int)
for j, lab in enumerate(T3_LABELS):
    thr = float(thr_map_t3[lab])
    P3_dev[:, j] = (probs_t3_dev[:, j] >= thr).astype(int)

# 5.5 Cache train/dev probs for ensembling
cache_cols_train_t3 = {"id": t3_train_df["id"].astype(str).values}
for j, lab in enumerate(T3_LABELS):
    cache_cols_train_t3[f"prob_{lab}"]  = probs_t3_train_full[:, j]
    cache_cols_train_t3[f"label_{lab}"] = labels_t3_train_full.numpy()[:, j]

t3_train_cache = pd.DataFrame(cache_cols_train_t3)
t3_train_cache.to_csv(CACHE_ROOT / "t3_train_probs.csv", index=False)

cache_cols_dev_t3 = {"id": t3_dev_df["id"].astype(str).values}
for j, lab in enumerate(T3_LABELS):
    cache_cols_dev_t3[f"prob_{lab}"] = probs_t3_dev[:, j]
t3_dev_cache = pd.DataFrame(cache_cols_dev_t3)
t3_dev_cache.to_csv(CACHE_ROOT / "t3_dev_probs.csv", index=False)

print("Saved T3 train/dev probabilities (DeBERTa) for ensembling in:", CACHE_ROOT)

# 5.6 Save model + calibration
mdl_t3_final.save_pretrained(ART_ROOT / "native_t3")
tok_t3.save_pretrained(ART_ROOT / "native_t3")
with open(ART_ROOT / "calib_t3_native.json", "w") as f:
    json.dump({"temperature": float(T_t3), "thresholds": thr_map_t3}, f, indent=2)

# 5.7 Codabench submission CSV (required header order)
idx_vil      = T3_LABELS.index("vilification")
idx_extreme  = T3_LABELS.index("extreme_language")
idx_stereo   = T3_LABELS.index("stereotype")
idx_invalid  = T3_LABELS.index("invalidation")
idx_lackemp  = T3_LABELS.index("lack_of_empathy")
idx_dehum    = T3_LABELS.index("dehumanization")

sub3 = pd.DataFrame({
    "id":               t3_dev_df["id"].astype(str).values,
    "stereotype":       P3_dev[:, idx_stereo],
    "vilification":     P3_dev[:, idx_vil],
    "dehumanization":   P3_dev[:, idx_dehum],
    "extreme_language": P3_dev[:, idx_extreme],
    "lack_of_empathy":  P3_dev[:, idx_lackemp],
    "invalidation":     P3_dev[:, idx_invalid],
})
sub3_path = SUB_ROOT / "subtask_3" / f"pred_{lang_fname}.csv"
sub3.to_csv(sub3_path, index=False)
print("Wrote Subtask 3 submission CSV (DeBERTa):", sub3_path)


[T3] TRAIN size: 3222
[T3] DEV size (unlabeled): 160

[T3] Fold 1/3 — train=2148, val=1074


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


  Trainer device: cuda:0


Step,Training Loss
50,0.412
100,0.403
150,0.3536
200,0.3662
250,0.3536
300,0.3374
350,0.3137
400,0.3244
450,0.3231
500,0.2853


  Fold Macro-F1 (thr=0.5): 0.494572838367564

[T3] Fold 2/3 — train=2148, val=1074


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


  Trainer device: cuda:0


Step,Training Loss
50,0.4275
100,0.4231
150,0.3629
200,0.3559
250,0.3194
300,0.3091
350,0.3272
400,0.3036
450,0.3173
500,0.2786


  Fold Macro-F1 (thr=0.5): 0.48446004150646144

[T3] Fold 3/3 — train=2148, val=1074


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


  Trainer device: cuda:0


Step,Training Loss
50,0.4138
100,0.412
150,0.3758
200,0.3301
250,0.3154
300,0.3232
350,0.3096
400,0.2655
450,0.326
500,0.2541


  Fold Macro-F1 (thr=0.5): 0.4882677408629481
[TempScale] base_F1=0.4894, calibrated_F1=0.4894, T=0.8889

[T3] Calibration (OOF):
  Temperature: 0.8888956904411316
  Thresholds: {'vilification': 0.44999999999999996, 'extreme_language': 0.44999999999999996, 'stereotype': 0.5499999999999999, 'invalidation': 0.39999999999999997, 'lack_of_empathy': 0.49999999999999994, 'dehumanization': 0.65}
  Macro-F1 (OOF, calibrated): 0.49501524716667394


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.



[T3] Training FINAL DeBERTa model on full train...


Step,Training Loss
50,0.3937
100,0.4101
150,0.3702
200,0.3407
250,0.367
300,0.3363
350,0.3626
400,0.3642
450,0.2926
500,0.2791


[T3] Macro-F1 (TRAIN, thr=0.5, final model): 0.6164551110137816
[T3] Macro-F1 (TRAIN, calibrated T+thr, final model): 0.6261228098252071


Saved T3 train/dev probabilities (DeBERTa) for ensembling in: cache/deberta_cv/eng
Wrote Subtask 3 submission CSV (DeBERTa): submissions/deberta/subtask_3/pred_eng.csv
