## SETUP: environment, device toggle, imports, config

In [None]:
# ============================================================
# 0) SETUP: environment, device toggle, imports, config
# ============================================================

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TRANSFORMERS_NO_TF"] = "1"   # avoid TF / tf_keras imports
os.environ["WANDB_DISABLED"] = "true"

import random
import json
import warnings
from pathlib import Path
from typing import List, Dict, Optional

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    pipeline,
)
import transformers

print("PyTorch:", torch.__version__)
print("Transformers:", transformers.__version__)

# ------------------------------------------------------------
# Device selection: switch between GPU / CPU here
# ------------------------------------------------------------
RUN_DEVICE = "gpu"  # "gpu" or "cpu"

if RUN_DEVICE.lower() == "gpu" and torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    torch.backends.cudnn.benchmark = True
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    DEVICE = torch.device("cpu")
    torch.backends.cudnn.enabled = False
    torch.set_num_threads(max(1, os.cpu_count() // 2))
    print("Using CPU")

# ------------------------------------------------------------
# Reproducibility
# ------------------------------------------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

# ------------------------------------------------------------
# High-level config
# ------------------------------------------------------------
LANG = "eng"                          # e.g. "eng", "ben", "hin", etc.
BASE = "../dev_phase"                 # root of organizer data
EN_MODEL = "microsoft/deberta-v3-base"

MAX_LEN = 192
EPOCHS = 3
LR = 2e-5
BATCH_TRAIN_GPU = 8
BATCH_TRAIN_CPU = 4
BATCH_EVAL = 8
BATCH_TRAIN = BATCH_TRAIN_GPU if DEVICE.type == "cuda" else BATCH_TRAIN_CPU
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1
GRAD_ACCUM = 1  

print(f"LANG={LANG}, MODEL={EN_MODEL}, EPOCHS={EPOCHS}, LR={LR}, BATCH_TRAIN={BATCH_TRAIN}")

# ------------------------------------------------------------
# Paths / dirs
# ------------------------------------------------------------
lang_fname = LANG  # if your filenames differ, adjust here

# TRAIN + DEV (DEV is UNLABELED in this setup)
T1_TRAIN = f"{BASE}/subtask1/train/{lang_fname}.csv"
T1_DEV   = f"{BASE}/subtask1/dev/{lang_fname}.csv"

T2_TRAIN = f"{BASE}/subtask2/train/{lang_fname}.csv"
T2_DEV   = f"{BASE}/subtask2/dev/{lang_fname}.csv"

T3_TRAIN = f"{BASE}/subtask3/train/{lang_fname}.csv"
T3_DEV   = f"{BASE}/subtask3/dev/{lang_fname}.csv"

# method-specific roots
ART_ROOT   = Path("artifacts") / "deberta" / LANG
CACHE_ROOT = Path("cache") / "deberta" / LANG
OUT_ROOT   = Path("outputs") / "deberta" / LANG
SUB_ROOT   = Path("submissions") / "deberta"

for d in [ART_ROOT, CACHE_ROOT, OUT_ROOT, SUB_ROOT]:
    d.mkdir(parents=True, exist_ok=True)

# Submission subfolders
(SUB_ROOT / "subtask_1").mkdir(parents=True, exist_ok=True)
(SUB_ROOT / "subtask_2").mkdir(parents=True, exist_ok=True)
(SUB_ROOT / "subtask_3").mkdir(parents=True, exist_ok=True)

T2_LABELS = ["gender/sexual", "political", "religious", "racial/ethnic", "other"]
T3_LABELS = ["vilification", "extreme_language", "stereotype",
             "invalidation", "lack_of_empathy", "dehumanization"]

# ------------------------------------------------------------
# TrainingArguments capability detection
# ------------------------------------------------------------
import inspect
_TA_PARAMS = inspect.signature(TrainingArguments.__init__).parameters
TRAINER_CAPS = {
    "evaluation_strategy": "evaluation_strategy" in _TA_PARAMS,
    "save_strategy":       "save_strategy" in _TA_PARAMS,
    "warmup_ratio":        "warmup_ratio" in _TA_PARAMS,
    "fp16":                "fp16" in _TA_PARAMS,
    "no_cuda":             "no_cuda" in _TA_PARAMS,
    "use_mps_device":      "use_mps_device" in _TA_PARAMS,
    "report_to":           "report_to" in _TA_PARAMS,
    "grad_accum":          "gradient_accumulation_steps" in _TA_PARAMS,
    "eval_accum":          "eval_accumulation_steps" in _TA_PARAMS,
}

def build_training_args(
    output_dir,
    per_device_train_batch_size,
    per_device_eval_batch_size,
    num_train_epochs,
    learning_rate,
    weight_decay,
    logging_steps=50,
    evaluation="epoch",
    save="no",
    warmup_ratio=WARMUP_RATIO,
    warmup_steps=0,
):
    use_cuda_flag = (DEVICE.type == "cuda")
    kwargs = dict(
        output_dir=str(output_dir),
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        logging_steps=logging_steps,
        dataloader_pin_memory=use_cuda_flag,
        dataloader_num_workers=0,
    )
    if TRAINER_CAPS["evaluation_strategy"]:
        kwargs["evaluation_strategy"] = evaluation
    if TRAINER_CAPS["save_strategy"]:
        kwargs["save_strategy"] = save
    if TRAINER_CAPS["warmup_ratio"]:
        kwargs["warmup_ratio"] = warmup_ratio
    else:
        kwargs["warmup_steps"] = warmup_steps
    if TRAINER_CAPS["fp16"]:
        kwargs["fp16"] = False
    if TRAINER_CAPS["no_cuda"]:
        kwargs["no_cuda"] = not use_cuda_flag
    if TRAINER_CAPS["use_mps_device"]:
        kwargs["use_mps_device"] = False
    if TRAINER_CAPS["report_to"]:
        kwargs["report_to"] = "none"
    if TRAINER_CAPS["grad_accum"]:
        kwargs["gradient_accumulation_steps"] = GRAD_ACCUM
    if TRAINER_CAPS["eval_accum"]:
        kwargs["eval_accumulation_steps"] = 4
    return TrainingArguments(**kwargs)

2025-12-07 18:20:08.368782: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-07 18:20:08.382058: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765160408.395546 3937706 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765160408.399602 3937706 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765160408.412173 3937706 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

PyTorch: 2.9.0
Transformers: 4.57.1
Using GPU: NVIDIA H100 80GB HBM3 MIG 2g.20gb
LANG=eng, MODEL=microsoft/deberta-v3-base, EPOCHS=3, LR=2e-05, BATCH_TRAIN=8


## DATASET + METRICS + CALIBRATION HELPERS

In [2]:
# ============================================================
# 1) DATASET + METRICS + CALIBRATION HELPERS
# ============================================================

class TextClsDataset(Dataset):
    def __init__(
        self,
        texts: List[str],
        labels: Optional[List] = None,
        tokenizer=None,
        max_len: int = 256,
        is_multilabel: bool = False,
    ):
        self.texts = list(texts)
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_multilabel = is_multilabel

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx: int):
        text = str(self.texts[idx])
        enc = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_len,
            padding=False,
            return_tensors="pt",
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        if self.labels is not None:
            y = self.labels[idx]
            item["labels"] = torch.tensor(
                y,
                dtype=torch.float if self.is_multilabel else torch.long,
            )
        return item

def macro_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average="macro", zero_division=0)

def grid_search_thresholds(y_true, y_prob, label_names=None):
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    C = y_true.shape[1]
    grid = np.linspace(0.05, 0.95, 19)
    thrs = {}
    for c in range(C):
        best_t, best_f = 0.5, -1.0
        for t in grid:
            preds = (y_prob[:, c] >= t).astype(int)
            f = f1_score(y_true[:, c], preds, average="binary", zero_division=0)
            if f > best_f:
                best_f, best_t = f, t
        name = label_names[c] if label_names else str(c)
        thrs[name] = float(best_t)
    return thrs

class TempScaler(nn.Module):
    def __init__(self):
        super().__init__()
        self.T = nn.Parameter(torch.ones(1))

    def forward(self, logits):
        return logits / self.T

def learn_temperature(dev_logits, dev_labels, is_multilabel: bool):
    """
    Simple temperature scaling on CPU.
    """
    device = torch.device("cpu")
    scaler = TempScaler().to(device)
    dev_logits = dev_logits.to(device)
    dev_labels = dev_labels.to(device)

    opt = torch.optim.LBFGS([scaler.T], max_iter=50)
    criterion = nn.BCEWithLogitsLoss() if is_multilabel else nn.CrossEntropyLoss()

    def closure():
        opt.zero_grad()
        z = scaler(dev_logits)
        loss = criterion(
            z,
            dev_labels.float() if is_multilabel else dev_labels.long(),
        )
        loss.backward()
        return loss

    opt.step(closure)
    return float(scaler.T.detach().cpu().item())

def collect_logits(trainer: Trainer, dataset: Dataset, is_multilabel: bool):
    preds = trainer.predict(dataset)
    raw = preds.predictions
    if isinstance(raw, (list, tuple)):
        raw = raw[0]
    logits = torch.tensor(raw)
    labels = torch.tensor(preds.label_ids)
    if not is_multilabel and logits.ndim == 1:
        logits = logits.unsqueeze(1)
    return logits, labels

## TRANSLATION HELPERS (for non-ENG), text_en caching

In [3]:
# ============================================================
# 2) TRANSLATION HELPERS (for non-ENG), text_en caching
# ============================================================

def _opus_model_for_lang(lang: str) -> Optional[str]:
    """
    Map language code -> OPUS-MT model name (to EN).
    Extend as needed for your languages.
    """
    lang = lang.lower()
    if lang in {"bn", "ben"}:
        return "Helsinki-NLP/opus-mt-bn-en"
    if lang in {"pa", "pan"}:
        return "Helsinki-NLP/opus-mt-pa-en"
    if lang in {"hi", "hin"}:
        return "Helsinki-NLP/opus-mt-hi-en"
    return None

def translate_series_to_en(texts, model_name: str, batch_size: int = 16, max_len: int = 256):
    if model_name is None:
        return [str(t) for t in texts]

    tok = AutoTokenizer.from_pretrained(model_name)
    mt  = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    if DEVICE.type == "cuda":
        mt.to(DEVICE)
        pipe = pipeline("translation", model=mt, tokenizer=tok, device=0)  # GPU 0
    else:
        mt.to(torch.device("cpu"))
        pipe = pipeline("translation", model=mt, tokenizer=tok, device=-1)  # CPU

    out = []
    batch = []
    for t in texts:
        batch.append("" if not isinstance(t, str) else t)
        if len(batch) == batch_size:
            res = pipe(batch, max_length=max_len)
            out.extend([r["translation_text"] for r in res])
            batch = []
    if batch:
        res = pipe(batch, max_length=max_len)
        out.extend([r["translation_text"] for r in res])
    return out

def ensure_text_en(df: pd.DataFrame, subtask_tag: str, lang: str) -> pd.DataFrame:
    """
    Adds `text_en` to df.
    - If LANG == "eng": just copy `text` -> `text_en`.
    - Else: translate (and cache) in CACHE_ROOT / f"t{subtask_tag}__{lang}__to_en.csv".
    Cache format: at least columns ['id','text_en'] if 'id' exists, else just 'text_en'.
    """
    df = df.copy()
    lang = lang.lower()

    if lang == "eng":
        df["text_en"] = df["text"].astype(str)
        return df

    cache_path = CACHE_ROOT / f"t{subtask_tag}__{lang}__to_en.csv"
    if cache_path.exists():
        cache = pd.read_csv(cache_path)
        if "id" in df.columns and "id" in cache.columns:
            df = df.merge(cache[["id", "text_en"]], on="id", how="left")
        else:
            df["text_en"] = cache["text_en"]
        need = df["text_en"].isna() | (df["text_en"].astype(str).str.len() == 0)
        if need.any():
            model_name = _opus_model_for_lang(lang)
            df.loc[need, "text_en"] = translate_series_to_en(
                df.loc[need, "text"], model_name
            )
            # refresh cache
            if "id" in df.columns:
                to_save = df[["id", "text_en"]]
            else:
                to_save = pd.DataFrame({"text_en": df["text_en"]})
            to_save.to_csv(cache_path, index=False)
        return df

    # No cache yet → translate all
    model_name = _opus_model_for_lang(lang)
    df["text_en"] = translate_series_to_en(df["text"], model_name)
    if "id" in df.columns:
        to_save = df[["id", "text_en"]]
    else:
        to_save = pd.DataFrame({"text_en": df["text_en"]})
    to_save.to_csv(cache_path, index=False)
    return df

## SUBTASK 1 — Polarization detection (binary)

In [4]:
# ============================================================
# 3) SUBTASK 1 — Polarization detection (binary)
#    Train+F1+calibrate on train, infer on dev, save cache + submission
# ============================================================

# 3.1 Load TRAIN + DEV
t1_train_df = pd.read_csv(T1_TRAIN)
t1_dev_df   = pd.read_csv(T1_DEV)

required_train_cols_t1 = {"id", "text", "polarization"}
required_dev_cols_t1   = {"id", "text"}
assert required_train_cols_t1.issubset(t1_train_df.columns), f"T1 TRAIN missing: {required_train_cols_t1 - set(t1_train_df.columns)}"
assert required_dev_cols_t1.issubset(t1_dev_df.columns),     f"T1 DEV missing: {required_dev_cols_t1 - set(t1_dev_df.columns)}"

t1_train_df["polarization"] = t1_train_df["polarization"].astype(int)

# 3.2 Build text_en for DeBERTa
t1_train_df = ensure_text_en(t1_train_df, subtask_tag="1", lang=LANG)
t1_dev_df   = ensure_text_en(t1_dev_df,   subtask_tag="1", lang=LANG)

# 3.3 Model + tokenizer
tok_t1 = AutoTokenizer.from_pretrained(EN_MODEL, use_fast=True)
cfg_t1 = AutoConfig.from_pretrained(EN_MODEL, num_labels=2)
mdl_t1 = AutoModelForSequenceClassification.from_pretrained(EN_MODEL, config=cfg_t1)
mdl_t1.config.use_cache = False
if hasattr(mdl_t1, "gradient_checkpointing_disable"):
    mdl_t1.gradient_checkpointing_disable()
mdl_t1.to(DEVICE)

# 3.4 Datasets
ds_t1_train = TextClsDataset(
    texts=t1_train_df["text_en"].tolist(),
    labels=t1_train_df["polarization"].tolist(),
    tokenizer=tok_t1,
    max_len=MAX_LEN,
    is_multilabel=False,
)
# dev is unlabeled -> dummy zeros (not used for loss/metrics)
ds_t1_dev = TextClsDataset(
    texts=t1_dev_df["text_en"].tolist(),
    labels=[0] * len(t1_dev_df),
    tokenizer=tok_t1,
    max_len=MAX_LEN,
    is_multilabel=False,
)

# 3.5 Trainer
args_t1 = build_training_args(
    output_dir=ART_ROOT / "t1_tmp",
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    logging_steps=50,
    evaluation="epoch",
    save="no",
    warmup_ratio=WARMUP_RATIO,
)

def compute_metrics_t1(eval_pred):
    logits = eval_pred.predictions[0] if isinstance(eval_pred.predictions, (list, tuple)) else eval_pred.predictions
    labels = eval_pred.label_ids
    preds = np.argmax(logits, axis=1)
    return {"f1_macro": macro_f1(labels, preds)}

trainer_t1 = Trainer(
    model=mdl_t1,
    args=args_t1,
    train_dataset=ds_t1_train,
    eval_dataset=ds_t1_train,  # evaluate on TRAIN
    tokenizer=tok_t1,
    data_collator=DataCollatorWithPadding(tok_t1),
    compute_metrics=compute_metrics_t1,
)

print("Subtask 1 (DeBERTa) trainer device:", trainer_t1.args.device)

# 3.6 Train + F1 on TRAIN (with 0.5 threshold implicit)
trainer_t1.train()
eval_t1_train_raw = trainer_t1.evaluate()
print("T1 Macro-F1 (TRAIN, 0.5 thr):", eval_t1_train_raw.get("eval_f1_macro"))

# 3.7 Calibrate (temperature + threshold) on TRAIN
logits_t1_train, labels_t1_train = collect_logits(trainer_t1, ds_t1_train, is_multilabel=False)
T_t1 = learn_temperature(logits_t1_train, labels_t1_train, is_multilabel=False)
probs_t1_train = torch.softmax(logits_t1_train / T_t1, dim=1)[:, 1].cpu().numpy()

best_thr_t1, best_f1_train = 0.5, -1.0
for t in np.linspace(0.05, 0.95, 19):
    pred = (probs_t1_train >= t).astype(int)
    f = macro_f1(labels_t1_train.numpy(), pred)
    if f > best_f1_train:
        best_f1_train, best_thr_t1 = f, t

print(f"T1 calibration (TRAIN): T={T_t1:.4f}, best_thr={best_thr_t1:.2f}, macroF1={best_f1_train:.4f}")

# 3.8 Save model + calibration
mdl_t1.save_pretrained(ART_ROOT / "native_t1")
tok_t1.save_pretrained(ART_ROOT / "native_t1")
with open(ART_ROOT / "calib_t1_native.json", "w") as f:
    json.dump({"temperature": float(T_t1), "threshold": float(best_thr_t1)}, f, indent=2)

# 3.9 Infer on DEV (unlabeled), cache probs, build submission CSV
preds_dev_t1 = trainer_t1.predict(ds_t1_dev)
logits_t1_dev = torch.tensor(preds_dev_t1.predictions if not isinstance(preds_dev_t1.predictions,(list,tuple)) else preds_dev_t1.predictions[0])
probs_t1_dev = torch.softmax(logits_t1_dev / T_t1, dim=1)[:, 1].cpu().numpy()
pred_t1_dev = (probs_t1_dev >= best_thr_t1).astype(int)

# Cache for ensemble
cache_t1_train = pd.DataFrame({
    "id": t1_train_df["id"].astype(str),
    "prob_pos": probs_t1_train,
    "label": t1_train_df["polarization"].astype(int),
})
cache_t1_train.to_csv(CACHE_ROOT / "t1_train_probs.csv", index=False)

cache_t1_dev = pd.DataFrame({
    "id": t1_dev_df["id"].astype(str),
    "prob_pos": probs_t1_dev,
})
cache_t1_dev.to_csv(CACHE_ROOT / "t1_dev_probs.csv", index=False)

# Submission CSV for subtask 1
sub1 = pd.DataFrame({
    "id": t1_dev_df["id"].astype(str),
    "polarization": pred_t1_dev.astype(int),
})
sub1_path = SUB_ROOT / "subtask_1" / f"pred_{lang_fname}.csv"
sub1.to_csv(sub1_path, index=False)
print("Wrote Subtask 1 submission CSV:", sub1_path)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Subtask 1 (DeBERTa) trainer device: cuda:0


Step,Training Loss
50,0.7025
100,0.6169
150,0.5404
200,0.5104
250,0.5405
300,0.4455
350,0.4832
400,0.4482
450,0.3466
500,0.3807


T1 Macro-F1 (TRAIN, 0.5 thr): 0.9540936933186643
T1 calibration (TRAIN): T=1.4648, best_thr=0.35, macroF1=0.9553


Wrote Subtask 1 submission CSV: submissions/deberta/subtask_1/pred_eng.csv


## SUBTASK 2 — Type classification (multi-label: 5)

In [5]:
# ============================================================
# 4) SUBTASK 2 — Type classification (multi-label: 5)
# ============================================================

# 4.1 Load TRAIN + DEV
t2_train_df = pd.read_csv(T2_TRAIN)
t2_dev_df   = pd.read_csv(T2_DEV)

required_train_cols_t2 = {"id", "text", *T2_LABELS}
required_dev_cols_t2   = {"id", "text"}
assert required_train_cols_t2.issubset(t2_train_df.columns), f"T2 TRAIN missing: {required_train_cols_t2 - set(t2_train_df.columns)}"
assert required_dev_cols_t2.issubset(t2_dev_df.columns),     f"T2 DEV missing: {required_dev_cols_t2 - set(t2_dev_df.columns)}"

Y2_train = t2_train_df[T2_LABELS].values.astype(int)

# 4.2 text_en
t2_train_df = ensure_text_en(t2_train_df, subtask_tag="2", lang=LANG)
t2_dev_df   = ensure_text_en(t2_dev_df,   subtask_tag="2", lang=LANG)

# 4.3 Model + tokenizer
tok_t2 = AutoTokenizer.from_pretrained(EN_MODEL, use_fast=True)
cfg_t2 = AutoConfig.from_pretrained(
    EN_MODEL,
    num_labels=len(T2_LABELS),
    problem_type="multi_label_classification",
)
mdl_t2 = AutoModelForSequenceClassification.from_pretrained(EN_MODEL, config=cfg_t2)
mdl_t2.config.use_cache = False
if hasattr(mdl_t2, "gradient_checkpointing_disable"):
    mdl_t2.gradient_checkpointing_disable()
mdl_t2.to(DEVICE)

# 4.4 Datasets
ds_t2_train = TextClsDataset(
    texts=t2_train_df["text_en"].tolist(),
    labels=Y2_train.tolist(),
    tokenizer=tok_t2,
    max_len=MAX_LEN,
    is_multilabel=True,
)
ds_t2_dev = TextClsDataset(
    texts=t2_dev_df["text_en"].tolist(),
    labels=[[0]*len(T2_LABELS)] * len(t2_dev_df),
    tokenizer=tok_t2,
    max_len=MAX_LEN,
    is_multilabel=True,
)

# 4.5 Class imbalance (pos_weight)
pos_count2 = Y2_train.sum(axis=0) + 1e-6
neg_count2 = Y2_train.shape[0] - pos_count2
pos_weight_2 = torch.tensor(neg_count2 / pos_count2, dtype=torch.float)

class WeightedTrainerT2(Trainer):
    def __init__(self, *args, pos_weight=None, **kwargs):
        super().__init__(*args, **kwargs)
        self._pos_weight = pos_weight

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = nn.BCEWithLogitsLoss(pos_weight=self._pos_weight.to(logits.device))
        loss = loss_fn(logits, labels.to(logits.device).float())
        return (loss, outputs) if return_outputs else loss

args_t2 = build_training_args(
    output_dir=ART_ROOT / "t2_tmp",
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    logging_steps=50,
    evaluation="epoch",
    save="no",
    warmup_ratio=WARMUP_RATIO,
)

def compute_metrics_t2(eval_pred):
    logits = eval_pred.predictions[0] if isinstance(eval_pred.predictions, (list, tuple)) else eval_pred.predictions
    labels = eval_pred.label_ids
    probs = 1.0 / (1.0 + np.exp(-logits))
    preds = (probs >= 0.5).astype(int)
    return {"f1_macro": f1_score(labels, preds, average="macro", zero_division=0)}

trainer_t2 = WeightedTrainerT2(
    model=mdl_t2,
    args=args_t2,
    train_dataset=ds_t2_train,
    eval_dataset=ds_t2_train,  # evaluate on TRAIN
    tokenizer=tok_t2,
    data_collator=DataCollatorWithPadding(tok_t2),
    compute_metrics=compute_metrics_t2,
    pos_weight=pos_weight_2,
)

print("Subtask 2 (DeBERTa) trainer device:", trainer_t2.args.device)

# 4.6 Train + raw F1 (0.5 thr) on TRAIN
trainer_t2.train()
eval_t2_train_raw = trainer_t2.evaluate()
print("T2 Macro-F1 (TRAIN, thr=0.5):", eval_t2_train_raw.get("eval_f1_macro"))

# 4.7 Calibrate: temperature + label-wise thresholds on TRAIN
logits_t2_train, labels_t2_train = collect_logits(trainer_t2, ds_t2_train, is_multilabel=True)
T_t2 = learn_temperature(logits_t2_train, labels_t2_train, is_multilabel=True)
probs_t2_train = torch.sigmoid(logits_t2_train / T_t2).cpu().numpy()
thr_map_t2 = grid_search_thresholds(labels_t2_train.cpu().numpy(), probs_t2_train, T2_LABELS)

# compute calibrated macro-F1 on TRAIN using those thresholds
P2_train = np.zeros_like(probs_t2_train, dtype=int)
for j, lab in enumerate(T2_LABELS):
    P2_train[:, j] = (probs_t2_train[:, j] >= thr_map_t2[lab]).astype(int)
f1_t2_cal = f1_score(labels_t2_train.cpu().numpy(), P2_train, average="macro", zero_division=0)
print("T2 temperature:", T_t2)
print("T2 thresholds:", thr_map_t2)
print("T2 Macro-F1 (TRAIN, calibrated):", f1_t2_cal)

# 4.8 Save model + calibration
mdl_t2.save_pretrained(ART_ROOT / "native_t2")
tok_t2.save_pretrained(ART_ROOT / "native_t2")
with open(ART_ROOT / "calib_t2_native.json", "w") as f:
    json.dump({"temperature": float(T_t2), "thresholds": thr_map_t2}, f, indent=2)

# 4.9 Infer on DEV, cache probs, build submission CSV
preds_dev_t2 = trainer_t2.predict(ds_t2_dev)
logits_t2_dev = torch.tensor(preds_dev_t2.predictions if not isinstance(preds_dev_t2.predictions,(list,tuple)) else preds_dev_t2.predictions[0])
probs_t2_dev = torch.sigmoid(logits_t2_dev / T_t2).cpu().numpy()

P2_dev = np.zeros_like(probs_t2_dev, dtype=int)
for j, lab in enumerate(T2_LABELS):
    P2_dev[:, j] = (probs_t2_dev[:, j] >= thr_map_t2[lab]).astype(int)

# Cache for ensemble
train_cols = {"id": t2_train_df["id"].astype(str)}
for j, lab in enumerate(T2_LABELS):
    train_cols[f"prob_{lab}"] = probs_t2_train[:, j]
    train_cols[f"label_{lab}"] = labels_t2_train.cpu().numpy()[:, j]
cache_t2_train = pd.DataFrame(train_cols)
cache_t2_train.to_csv(CACHE_ROOT / "t2_train_probs.csv", index=False)

dev_cols = {"id": t2_dev_df["id"].astype(str)}
for j, lab in enumerate(T2_LABELS):
    dev_cols[f"prob_{lab}"] = probs_t2_dev[:, j]
cache_t2_dev = pd.DataFrame(dev_cols)
cache_t2_dev.to_csv(CACHE_ROOT / "t2_dev_probs.csv", index=False)

# Submission CSV for subtask 2
# Codabench header: id,political,racial/ethnic,religious,gender/sexual,other
idx2 = {lab: i for i, lab in enumerate(T2_LABELS)}
sub2 = pd.DataFrame({
    "id": t2_dev_df["id"].astype(str),
    "political":      P2_dev[:, idx2["political"]],
    "racial/ethnic":  P2_dev[:, idx2["racial/ethnic"]],
    "religious":      P2_dev[:, idx2["religious"]],
    "gender/sexual":  P2_dev[:, idx2["gender/sexual"]],
    "other":          P2_dev[:, idx2["other"]],
})
sub2_path = SUB_ROOT / "subtask_2" / f"pred_{lang_fname}.csv"
sub2.to_csv(sub2_path, index=False)
print("Wrote Subtask 2 submission CSV:", sub2_path)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Subtask 2 (DeBERTa) trainer device: cuda:0


Step,Training Loss
50,1.2181
100,1.2793
150,1.1692
200,1.4141
250,1.3803
300,1.3766
350,1.4593
400,1.4166
450,1.1185
500,1.0317


T2 Macro-F1 (TRAIN, thr=0.5): 0.48166939693340993
T2 temperature: 1.1098158359527588
T2 thresholds: {'gender/sexual': 0.7999999999999999, 'political': 0.49999999999999994, 'religious': 0.9, 'racial/ethnic': 0.65, 'other': 0.75}
T2 Macro-F1 (TRAIN, calibrated): 0.5422365856535791


Wrote Subtask 2 submission CSV: submissions/deberta/subtask_2/pred_eng.csv


## SUBTASK 3 — Manifestation (multi-label: 6)

In [6]:
# ============================================================
# 5) SUBTASK 3 — Manifestation (multi-label: 6)
# ============================================================

# 5.1 Load TRAIN + DEV
t3_train_df = pd.read_csv(T3_TRAIN)
t3_dev_df   = pd.read_csv(T3_DEV)

required_train_cols_t3 = {"id", "text", *T3_LABELS}
required_dev_cols_t3   = {"id", "text"}
assert required_train_cols_t3.issubset(t3_train_df.columns), f"T3 TRAIN missing: {required_train_cols_t3 - set(t3_train_df.columns)}"
assert required_dev_cols_t3.issubset(t3_dev_df.columns),     f"T3 DEV missing: {required_dev_cols_t3 - set(t3_dev_df.columns)}"

Y3_train = t3_train_df[T3_LABELS].values.astype(int)

# 5.2 text_en
t3_train_df = ensure_text_en(t3_train_df, subtask_tag="3", lang=LANG)
t3_dev_df   = ensure_text_en(t3_dev_df,   subtask_tag="3", lang=LANG)

# 5.3 Model + tokenizer
tok_t3 = AutoTokenizer.from_pretrained(EN_MODEL, use_fast=True)
cfg_t3 = AutoConfig.from_pretrained(
    EN_MODEL,
    num_labels=len(T3_LABELS),
    problem_type="multi_label_classification",
)
mdl_t3 = AutoModelForSequenceClassification.from_pretrained(EN_MODEL, config=cfg_t3)
mdl_t3.config.use_cache = False
if hasattr(mdl_t3, "gradient_checkpointing_disable"):
    mdl_t3.gradient_checkpointing_disable()
mdl_t3.to(DEVICE)

# 5.4 Datasets
ds_t3_train = TextClsDataset(
    texts=t3_train_df["text_en"].tolist(),
    labels=Y3_train.tolist(),
    tokenizer=tok_t3,
    max_len=MAX_LEN,
    is_multilabel=True,
)
ds_t3_dev = TextClsDataset(
    texts=t3_dev_df["text_en"].tolist(),
    labels=[[0]*len(T3_LABELS)] * len(t3_dev_df),
    tokenizer=tok_t3,
    max_len=MAX_LEN,
    is_multilabel=True,
)

# 5.5 Class imbalance (pos_weight)
pos_count3 = Y3_train.sum(axis=0) + 1e-6
neg_count3 = Y3_train.shape[0] - pos_count3
pos_weight_3 = torch.tensor(neg_count3 / pos_count3, dtype=torch.float)

class WeightedTrainerT3(Trainer):
    def __init__(self, *args, pos_weight=None, **kwargs):
        super().__init__(*args, **kwargs)
        self._pos_weight = pos_weight

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = nn.BCEWithLogitsLoss(pos_weight=self._pos_weight.to(logits.device))
        loss = loss_fn(logits, labels.to(logits.device).float())
        return (loss, outputs) if return_outputs else loss

args_t3 = build_training_args(
    output_dir=ART_ROOT / "t3_tmp",
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    logging_steps=50,
    evaluation="epoch",
    save="no",
    warmup_ratio=WARMUP_RATIO,
)

def compute_metrics_t3(eval_pred):
    logits = eval_pred.predictions[0] if isinstance(eval_pred.predictions, (list, tuple)) else eval_pred.predictions
    labels = eval_pred.label_ids
    probs = 1.0 / (1.0 + np.exp(-logits))
    preds = (probs >= 0.5).astype(int)
    return {"f1_macro": f1_score(labels, preds, average="macro", zero_division=0)}

trainer_t3 = WeightedTrainerT3(
    model=mdl_t3,
    args=args_t3,
    train_dataset=ds_t3_train,
    eval_dataset=ds_t3_train,  # on TRAIN
    tokenizer=tok_t3,
    data_collator=DataCollatorWithPadding(tok_t3),
    compute_metrics=compute_metrics_t3,
    pos_weight=pos_weight_3,
)

print("Subtask 3 (DeBERTa) trainer device:", trainer_t3.args.device)

# 5.6 Train + raw F1 (0.5 thr) on TRAIN
trainer_t3.train()
eval_t3_train_raw = trainer_t3.evaluate()
print("T3 Macro-F1 (TRAIN, thr=0.5):", eval_t3_train_raw.get("eval_f1_macro"))

# 5.7 Calibrate temperature + thresholds on TRAIN
logits_t3_train, labels_t3_train = collect_logits(trainer_t3, ds_t3_train, is_multilabel=True)
T_t3 = learn_temperature(logits_t3_train, labels_t3_train, is_multilabel=True)
probs_t3_train = torch.sigmoid(logits_t3_train / T_t3).cpu().numpy()
thr_map_t3 = grid_search_thresholds(labels_t3_train.cpu().numpy(), probs_t3_train, T3_LABELS)

P3_train = np.zeros_like(probs_t3_train, dtype=int)
for j, lab in enumerate(T3_LABELS):
    P3_train[:, j] = (probs_t3_train[:, j] >= thr_map_t3[lab]).astype(int)
f1_t3_cal = f1_score(labels_t3_train.cpu().numpy(), P3_train, average="macro", zero_division=0)
print("T3 temperature:", T_t3)
print("T3 thresholds:", thr_map_t3)
print("T3 Macro-F1 (TRAIN, calibrated):", f1_t3_cal)

# 5.8 Save model + calibration
mdl_t3.save_pretrained(ART_ROOT / "native_t3")
tok_t3.save_pretrained(ART_ROOT / "native_t3")
with open(ART_ROOT / "calib_t3_native.json", "w") as f:
    json.dump({"temperature": float(T_t3), "thresholds": thr_map_t3}, f, indent=2)

# 5.9 Infer on DEV, cache probs, build submission CSV
preds_dev_t3 = trainer_t3.predict(ds_t3_dev)
logits_t3_dev = torch.tensor(preds_dev_t3.predictions if not isinstance(preds_dev_t3.predictions,(list,tuple)) else preds_dev_t3.predictions[0])
probs_t3_dev = torch.sigmoid(logits_t3_dev / T_t3).cpu().numpy()

P3_dev = np.zeros_like(probs_t3_dev, dtype=int)
for j, lab in enumerate(T3_LABELS):
    P3_dev[:, j] = (probs_t3_dev[:, j] >= thr_map_t3[lab]).astype(int)

# Cache for ensemble
train_cols3 = {"id": t3_train_df["id"].astype(str)}
for j, lab in enumerate(T3_LABELS):
    train_cols3[f"prob_{lab}"] = probs_t3_train[:, j]
    train_cols3[f"label_{lab}"] = labels_t3_train.cpu().numpy()[:, j]
cache_t3_train = pd.DataFrame(train_cols3)
cache_t3_train.to_csv(CACHE_ROOT / "t3_train_probs.csv", index=False)

dev_cols3 = {"id": t3_dev_df["id"].astype(str)}
for j, lab in enumerate(T3_LABELS):
    dev_cols3[f"prob_{lab}"] = probs_t3_dev[:, j]
cache_t3_dev = pd.DataFrame(dev_cols3)
cache_t3_dev.to_csv(CACHE_ROOT / "t3_dev_probs.csv", index=False)

# Submission CSV for subtask 3
# Codabench header: id,stereotype,vilification,dehumanization,extreme_language,lack_of_empathy,invalidation
idx3 = {lab: i for i, lab in enumerate(T3_LABELS)}
sub3 = pd.DataFrame({
    "id": t3_dev_df["id"].astype(str),
    "stereotype":       P3_dev[:, idx3["stereotype"]],
    "vilification":     P3_dev[:, idx3["vilification"]],
    "dehumanization":   P3_dev[:, idx3["dehumanization"]],
    "extreme_language": P3_dev[:, idx3["extreme_language"]],
    "lack_of_empathy":  P3_dev[:, idx3["lack_of_empathy"]],
    "invalidation":     P3_dev[:, idx3["invalidation"]],
})
sub3_path = SUB_ROOT / "subtask_3" / f"pred_{lang_fname}.csv"
sub3.to_csv(sub3_path, index=False)
print("Wrote Subtask 3 submission CSV:", sub3_path)

print("\nAll DeBERTa training + calibration + dev submissions done.")
print("Submission roots (zip one subtask folder at a time for Codabench):")
print("  ", SUB_ROOT / "subtask_1")
print("  ", SUB_ROOT / "subtask_2")
print("  ", SUB_ROOT / "subtask_3")
print("Caches for ensembling live under:", CACHE_ROOT)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Subtask 3 (DeBERTa) trainer device: cuda:0


Step,Training Loss
50,1.1011
100,1.1465
150,1.0287
200,0.9123
250,0.9733
300,0.961
350,1.054
400,0.9361
450,0.8199
500,0.8219


T3 Macro-F1 (TRAIN, thr=0.5): 0.6129851481851645
T3 temperature: 1.3111610412597656
T3 thresholds: {'vilification': 0.44999999999999996, 'extreme_language': 0.65, 'stereotype': 0.75, 'invalidation': 0.49999999999999994, 'lack_of_empathy': 0.65, 'dehumanization': 0.7}
T3 Macro-F1 (TRAIN, calibrated): 0.6336362476490647


Wrote Subtask 3 submission CSV: submissions/deberta/subtask_3/pred_eng.csv

All DeBERTa training + calibration + dev submissions done.
Submission roots (zip one subtask folder at a time for Codabench):
   submissions/deberta/subtask_1
   submissions/deberta/subtask_2
   submissions/deberta/subtask_3
Caches for ensembling live under: cache/deberta/eng
