## Setup, device, config, directories

In [None]:
# ============================
# 0) Setup: env, device, config
# ============================

import os

# Prevent TensorFlow / Flax imports (avoids tf-keras noise & slow init)
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TRANSFORMERS_NO_FLAX"] = "1"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

import json, random, warnings, inspect
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
import transformers

print("PyTorch:", torch.__version__)
print("Transformers:", transformers.__version__)

# ---- Device selection (GPU / CPU toggle) ----
RUN_DEVICE = "gpu"  # "gpu" or "cpu"

# CUDA is used only when it is both requested and available; anything else
# falls back to the CPU branch.
if RUN_DEVICE.lower() == "gpu" and torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    torch.backends.cudnn.benchmark = True
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    DEVICE = torch.device("cpu")
    torch.backends.cudnn.enabled = False
    # os.cpu_count() returns None when the count cannot be determined; treat
    # that as a single core instead of failing on `None // 2`.
    torch.set_num_threads(max(1, (os.cpu_count() or 1) // 2))
    print("Using CPU")

# ---- Seeds ----
# Seed every RNG in use (Python, NumPy, PyTorch) with the same value.
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)
# The CUDA generators are seeded only when the run targets a GPU.
if DEVICE.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

# ============================
# Config
# ============================

# Run identity: METHOD and LANG become folder names, MODEL_NAME is the checkpoint.
METHOD = "xlmr"                   # method name (for folder structure)
LANG = "eng"                      # e.g. "eng" | "ben" | "hin"
MODEL_NAME = "xlm-roberta-base"   # multilingual encoder

# Optimisation hyper-parameters used by every subtask below.
MAX_LEN, EPOCHS, GRAD_ACCUM = 192, 3, 4
LR, WEIGHT_DECAY, WARMUP_RATIO = 2e-5, 0.01, 0.1

# Per-device batch sizes for each backend.
BATCH_TRAIN_GPU, BATCH_TRAIN_CPU = 8, 4
BATCH_EVAL_GPU, BATCH_EVAL_CPU = 16, 8

# Pick the pair that matches the device selected during setup.
if DEVICE.type == "cuda":
    BATCH_TRAIN, BATCH_EVAL = BATCH_TRAIN_GPU, BATCH_EVAL_GPU
else:
    BATCH_TRAIN, BATCH_EVAL = BATCH_TRAIN_CPU, BATCH_EVAL_CPU

BASE = "../dev_phase"  # organizer data root

# File stem of the per-language CSVs; it is the language code itself.
lang_fname = LANG

# ============================
# Data paths: TRAIN (labeled) / DEV (unlabeled)
# ============================
# Every subtask follows the layout <BASE>/<subtask>/<split>/<lang>.csv.

# Subtask 1 (binary)
T1_TRAIN, T1_DEV = (
    f"{BASE}/subtask1/{split}/{lang_fname}.csv" for split in ("train", "dev")
)

# Subtask 2 (multi-label 5)
T2_TRAIN, T2_DEV = (
    f"{BASE}/subtask2/{split}/{lang_fname}.csv" for split in ("train", "dev")
)

# Subtask 3 (multi-label 6)
T3_TRAIN, T3_DEV = (
    f"{BASE}/subtask3/{split}/{lang_fname}.csv" for split in ("train", "dev")
)

# ============================
# Directories (method-aware)
# ============================

# Root folders, relative to the working directory.
ART_DIR_ROOT, OUT_DIR_ROOT, SUBMIT_ROOT, CACHE_ROOT = (
    Path(root) for root in ("artifacts", "outputs", "submissions", "cache")
)

# Artifacts, outputs and cache are split by method and language; submissions
# are split by method only.
ART_DIR = ART_DIR_ROOT.joinpath(METHOD, LANG)
OUT_DIR = OUT_DIR_ROOT.joinpath(METHOD, LANG)
METHOD_SUBMIT_DIR = SUBMIT_ROOT.joinpath(METHOD)
CACHE_DIR = CACHE_ROOT.joinpath(METHOD, LANG)

# Subtask-specific submission dirs:
SUB1_DIR, SUB2_DIR, SUB3_DIR = (
    METHOD_SUBMIT_DIR / f"subtask_{task_no}" for task_no in (1, 2, 3)
)

# Create everything up front; existing folders are left untouched.
for d in (ART_DIR, OUT_DIR, METHOD_SUBMIT_DIR, CACHE_DIR, SUB1_DIR, SUB2_DIR, SUB3_DIR):
    d.mkdir(parents=True, exist_ok=True)

print("Artifacts dir:", ART_DIR)
print("Outputs dir:", OUT_DIR)
print("Submission dirs:", SUB1_DIR, SUB2_DIR, SUB3_DIR)
print("Cache dir (for ensembling):", CACHE_DIR)

# Label orders in TRAIN (used internally)
# The position in each list is the column index of that label in the
# label/probability matrices built for the subtask.
T2_LABELS = "gender/sexual political religious racial/ethnic other".split()
T3_LABELS = (
    "vilification extreme_language stereotype "
    "invalidation lack_of_empathy dehumanization"
).split()

# ============================
# TrainingArguments capability detection
# ============================

# Parameters accepted by the installed TrainingArguments constructor.
_TA_PARAMS = inspect.signature(TrainingArguments.__init__).parameters

# (capability name, TrainingArguments keyword whose presence is probed)
_CAP_KEYWORDS = (
    ("evaluation_strategy", "evaluation_strategy"),
    ("save_strategy", "save_strategy"),
    ("warmup_ratio", "warmup_ratio"),
    ("fp16", "fp16"),
    ("no_cuda", "no_cuda"),
    ("use_mps_device", "use_mps_device"),
    ("report_to", "report_to"),
    ("grad_accum", "gradient_accumulation_steps"),
    ("eval_accum", "eval_accumulation_steps"),
)

# True for every capability whose keyword the constructor accepts.
TRAINER_CAPS = {cap: keyword in _TA_PARAMS for cap, keyword in _CAP_KEYWORDS}

def build_training_args(output_dir, per_device_train_batch_size, per_device_eval_batch_size,
                        num_train_epochs, learning_rate, weight_decay, logging_steps=50,
                        evaluation="epoch", save="no", warmup_ratio=WARMUP_RATIO, warmup_steps=0):
    """Build a TrainingArguments object using only keywords the installed version accepts.

    Args:
        output_dir: Folder handed to TrainingArguments (converted to ``str``).
        per_device_train_batch_size: Training batch size per device.
        per_device_eval_batch_size: Evaluation batch size per device.
        num_train_epochs: Number of training epochs.
        learning_rate: Learning rate.
        weight_decay: Weight decay.
        logging_steps: Logging interval in steps.
        evaluation: Evaluation schedule (e.g. ``"epoch"``), passed under
            whichever keyword name the installed version exposes.
        save: Checkpoint-saving schedule.
        warmup_ratio: Warm-up ratio, used when ``warmup_ratio`` is supported.
        warmup_steps: Fallback warm-up step count when ``warmup_ratio`` is not
            supported.

    Returns:
        The configured ``TrainingArguments`` instance.
    """
    use_cuda_flag = (DEVICE.type == "cuda")

    kwargs = dict(
        output_dir=str(output_dir),
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        logging_steps=logging_steps,
        dataloader_pin_memory=use_cuda_flag,
        dataloader_num_workers=0,
    )
    # The evaluation schedule keyword was renamed from `evaluation_strategy` to
    # `eval_strategy` in newer transformers releases. Probing only the old name
    # silently drops the requested schedule there, so check the new name first
    # and fall back to the old one.
    if "eval_strategy" in _TA_PARAMS:
        kwargs["eval_strategy"] = evaluation
    elif TRAINER_CAPS["evaluation_strategy"]:
        kwargs["evaluation_strategy"] = evaluation
    if TRAINER_CAPS["save_strategy"]:
        kwargs["save_strategy"] = save
    if TRAINER_CAPS["warmup_ratio"]:
        kwargs["warmup_ratio"] = warmup_ratio
    else:
        kwargs["warmup_steps"] = warmup_steps
    if TRAINER_CAPS["fp16"]:
        kwargs["fp16"] = False   # keep simple/stable
    if TRAINER_CAPS["no_cuda"]:
        kwargs["no_cuda"] = not use_cuda_flag
    if TRAINER_CAPS["use_mps_device"]:
        kwargs["use_mps_device"] = False
    if TRAINER_CAPS["report_to"]:
        kwargs["report_to"] = "none"
    if TRAINER_CAPS["grad_accum"]:
        kwargs["gradient_accumulation_steps"] = GRAD_ACCUM
    if TRAINER_CAPS["eval_accum"]:
        kwargs["eval_accumulation_steps"] = 8

    return TrainingArguments(**kwargs)


2025-12-07 18:15:20.166559: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765160120.180386 3936315 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765160120.184496 3936315 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765160120.197176 3936315 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1765160120.197192 3936315 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1765160120.197194 3936315 computation_placer.cc:177] computation placer alr

PyTorch: 2.9.0
Transformers: 4.57.1
Using GPU: NVIDIA H100 80GB HBM3 MIG 2g.20gb
Artifacts dir: artifacts/xlmr/eng
Outputs dir: outputs/xlmr/eng
Submission dirs: submissions/xlmr/subtask_1 submissions/xlmr/subtask_2 submissions/xlmr/subtask_3
Cache dir (for ensembling): cache/xlmr/eng


## Dataset class, metrics, calibration helpers

In [2]:
# ============================
# 1) Dataset + helpers
# ============================

class TextClsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and attaches its label.

    Args:
        texts: Iterable of texts; materialised into a list.
        labels: Iterable of labels aligned with ``texts`` (scalars for
            single-label, sequences for multi-label), or ``None`` for an
            unlabeled dataset, in which case items carry no ``"labels"`` key.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding=False, return_tensors="pt")``.
        max_len: Truncation length passed to the tokenizer.
        is_multilabel: If true, labels become float tensors, otherwise long.

    Raises:
        ValueError: If ``labels`` is given and its length differs from ``texts``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256, is_multilabel=False):
        self.texts = list(texts)
        # Materialise labels positionally, exactly like texts. Integer indexing
        # on a pandas Series is label-based, so a Series with a non-default
        # index would otherwise return the wrong row or raise KeyError.
        self.labels = None if labels is None else list(labels)
        if self.labels is not None and len(self.labels) != len(self.texts):
            raise ValueError(
                f"texts and labels differ in length: "
                f"{len(self.texts)} vs {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_multilabel = is_multilabel

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        # No padding here: each item keeps its own length after truncation.
        enc = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_len,
            padding=False,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch dimension of size 1; drop it.
        item = {k: v.squeeze(0) for k, v in enc.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(
                self.labels[idx],
                dtype=torch.float if self.is_multilabel else torch.long,
            )
        return item


def macro_f1(y_true, y_pred):
    """Return the macro-averaged F1 of ``y_pred`` against ``y_true``.

    Undefined per-class scores count as 0 (``zero_division=0``).
    """
    score = f1_score(y_true, y_pred, zero_division=0, average="macro")
    return score


def grid_search_thresholds(y_true, y_prob, label_names=None):
    """
    Per-label threshold search (multi-label).

    For each label column, tries 19 evenly spaced cut-offs from 0.05 to 0.95
    and keeps the one with the highest binary F1; on ties the lowest cut-off
    wins. Returns a dict mapping the label name (or the column index as a
    string when ``label_names`` is empty/None) to the chosen cut-off.
    """
    targets = np.asarray(y_true)
    scores = np.asarray(y_prob)
    candidates = np.linspace(0.05, 0.95, 19)

    chosen = {}
    for col in range(targets.shape[1]):
        col_true = targets[:, col]
        col_prob = scores[:, col]
        f1_per_cut = [
            f1_score(col_true, (col_prob >= cut).astype(int),
                     average="binary", zero_division=0)
            for cut in candidates
        ]
        # argmax returns the first maximum, i.e. the lowest winning cut-off.
        winner = candidates[int(np.argmax(f1_per_cut))]
        key = label_names[col] if label_names else str(col)
        chosen[key] = float(winner)
    return chosen


class TempScaler(nn.Module):
    """Temperature scaling: divides logits by one learnable scalar ``T``."""

    def __init__(self):
        super().__init__()
        # Start at T = 1, which leaves the logits unchanged.
        self.T = nn.Parameter(torch.full((1,), 1.0))

    def forward(self, logits):
        scaled = torch.div(logits, self.T)
        return scaled


def learn_temperature(train_logits, train_labels, is_multilabel):
    """
    Learn a single temperature scalar on TRAIN logits.

    Fits the TempScaler parameter with one LBFGS step (max_iter=50), minimising
    BCE-with-logits for multi-label input or cross-entropy otherwise, and
    returns the fitted temperature as a Python float.
    """
    scaler = TempScaler().to(DEVICE)
    logits_on_device = train_logits.to(DEVICE)
    labels_on_device = train_labels.to(DEVICE)

    # Loss and target dtype depend on the task type.
    if is_multilabel:
        loss_fn = nn.BCEWithLogitsLoss()
        targets = labels_on_device.float()
    else:
        loss_fn = nn.CrossEntropyLoss()
        targets = labels_on_device.long()

    optimizer = torch.optim.LBFGS([scaler.T], max_iter=50)

    def _evaluate_loss():
        # LBFGS re-evaluates the objective through this closure.
        optimizer.zero_grad()
        nll = loss_fn(scaler(logits_on_device), targets)
        nll.backward()
        return nll

    optimizer.step(_evaluate_loss)
    return float(scaler.T.detach().cpu().item())


def collect_logits(trainer, dataset, is_multilabel):
    """
    Collect logits + labels from trainer.predict(dataset).

    Args:
        trainer: Object whose ``predict(dataset)`` result exposes
            ``predictions`` and ``label_ids``.
        dataset: Dataset forwarded to ``trainer.predict``.
        is_multilabel: When false, 1-D logits are reshaped to a column.

    Returns:
        ``(logits, labels)`` as tensors. ``labels`` is ``None`` when the
        prediction output carries no ``label_ids`` (unlabeled dataset).
    """
    output = trainer.predict(dataset)

    # When predictions come back as a tuple/list, the logits are the first entry.
    raw_logits = output.predictions
    if isinstance(raw_logits, (list, tuple)):
        raw_logits = raw_logits[0]
    logits = torch.tensor(raw_logits)

    # label_ids is None for datasets without labels; torch.tensor(None) would raise.
    labels = None if output.label_ids is None else torch.tensor(output.label_ids)

    if not is_multilabel and logits.ndim == 1:
        logits = logits.unsqueeze(1)
    return logits, labels


## Subtask 1: train, calibrate on train, infer dev, cache, submission

In [3]:
# ============================
# 2) Subtask 1 — Polarization (binary)
# ============================

# 2.1 Load TRAIN (labeled) + DEV (unlabeled)
t1_train_df = pd.read_csv(T1_TRAIN)
t1_dev_df = pd.read_csv(T1_DEV)

# Fail fast when an expected column is absent (DEV carries no label column).
required_cols_t1_train = {"id", "text", "polarization"}
required_cols_t1_dev = {"id", "text"}
assert required_cols_t1_train <= set(t1_train_df.columns), (
    f"T1 TRAIN missing: {required_cols_t1_train - set(t1_train_df.columns)}"
)
assert required_cols_t1_dev <= set(t1_dev_df.columns), (
    f"T1 DEV missing: {required_cols_t1_dev - set(t1_dev_df.columns)}"
)

t1_train_df["polarization"] = t1_train_df["polarization"].astype(int)

print(f"[T1] TRAIN size: {len(t1_train_df)}")
print(f"[T1] DEV size (unlabeled): {len(t1_dev_df)}")

# 2.2 Tokenizer & model (two-class classification head)
tok_t1 = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
cfg_t1 = AutoConfig.from_pretrained(MODEL_NAME, num_labels=2)
mdl_t1 = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=cfg_t1)
mdl_t1.config.use_cache = False
# Turn on gradient checkpointing when the model class offers it.
_enable_ckpt_t1 = getattr(mdl_t1, "gradient_checkpointing_enable", None)
if _enable_ckpt_t1 is not None:
    _enable_ckpt_t1()
mdl_t1.to(DEVICE)

# 2.3 Datasets
ds_t1_train = TextClsDataset(
    texts=t1_train_df["text"],
    labels=t1_train_df["polarization"],
    tokenizer=tok_t1,
    max_len=MAX_LEN,
    is_multilabel=False,
)

# DEV has no labels, so all-zero placeholders are supplied for inference.
dummy_labels_t1_dev = np.zeros(len(t1_dev_df), dtype=int)
ds_t1_dev_infer = TextClsDataset(
    texts=t1_dev_df["text"],
    labels=dummy_labels_t1_dev,
    tokenizer=tok_t1,
    max_len=MAX_LEN,
    is_multilabel=False,
)

# 2.4 Training arguments for the Trainer (eval on TRAIN)
_t1_schedule = dict(
    logging_steps=50,
    evaluation="epoch",
    save="no",
    warmup_ratio=WARMUP_RATIO,
)
args_t1 = build_training_args(
    output_dir=ART_DIR / "t1_tmp",
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    **_t1_schedule,
)

def compute_metrics_t1(eval_pred):
    """Return ``{"f1_macro": ...}`` for argmax predictions over the class logits."""
    class_logits = eval_pred.predictions
    gold = eval_pred.label_ids
    predicted = np.argmax(class_logits, axis=1)
    score = macro_f1(gold, predicted)
    return {"f1_macro": score}

# Recent transformers releases deprecate Trainer's `tokenizer` keyword in
# favour of `processing_class`. Pick whichever name the installed Trainer
# accepts so the cell runs on both old and new versions.
_tok_kwarg_t1 = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer_t1 = Trainer(
    model=mdl_t1,
    args=args_t1,
    train_dataset=ds_t1_train,
    eval_dataset=ds_t1_train,  # eval on TRAIN
    data_collator=DataCollatorWithPadding(tok_t1),
    compute_metrics=compute_metrics_t1,
    **{_tok_kwarg_t1: tok_t1},
)
print("Trainer T1 device:", trainer_t1.args.device)

# 2.5 Train + F1 on TRAIN (argmax)
# evaluate() scores the trainer's eval_dataset, which is the TRAIN set here.
trainer_t1.train()
eval_t1_train = trainer_t1.evaluate()
print("[T1] Macro-F1 (TRAIN, argmax):", eval_t1_train.get("eval_f1_macro"))

# 2.6 Calibration on TRAIN (temperature + global threshold)
# NOTE(review): temperature and threshold are fitted on the same examples the
# model was trained on, so the scores printed below are likely optimistic.
logits_t1_train, labels_t1_train = collect_logits(
    trainer_t1,
    ds_t1_train,
    is_multilabel=False,
)
T_t1 = learn_temperature(logits_t1_train, labels_t1_train, is_multilabel=False)
# Temperature-scaled softmax; column 1 is kept as the positive-class probability.
probs_t1_train = torch.softmax(logits_t1_train / T_t1, dim=1)[:, 1].cpu().numpy()

# Try 19 evenly spaced thresholds in [0.05, 0.95]; the strict ">" keeps the
# lowest threshold when several reach the same macro-F1.
best_thr_t1, best_f1_train = 0.5, -1.0
for t in np.linspace(0.05, 0.95, 19):
    pred = (probs_t1_train >= t).astype(int)
    f = macro_f1(labels_t1_train.numpy(), pred)
    if f > best_f1_train:
        best_f1_train, best_thr_t1 = f, t

print(
    f"[T1] calibration (TRAIN): T={T_t1:.4f}, "
    f"best_thr={best_thr_t1:.2f}, train_macroF1@thr={best_f1_train:.4f}"
)

# 2.7 Cache TRAIN probabilities for ensembling
# One row per TRAIN example: id, calibrated positive probability, gold label.
t1_train_cache = pd.DataFrame({
    "id":   t1_train_df["id"].astype(str).values,
    "prob_pos": probs_t1_train,
    "label":   labels_t1_train.numpy(),
})
t1_train_cache_path = CACHE_DIR / f"t1_train_probs.csv"
t1_train_cache.to_csv(t1_train_cache_path, index=False)
print("Saved T1 TRAIN probs to cache:", t1_train_cache_path)

# 2.8 Inference on DEV (unlabeled) + cache DEV probabilities
# The labels returned for DEV are the dummy zeros and are discarded.
logits_t1_dev, _ = collect_logits(
    trainer_t1,
    ds_t1_dev_infer,
    is_multilabel=False,
)
# Reuse the temperature and threshold fitted on TRAIN.
probs_t1_dev = torch.softmax(logits_t1_dev / T_t1, dim=1)[:, 1].cpu().numpy()
pred1_dev = (probs_t1_dev >= best_thr_t1).astype(int)

t1_dev_cache = pd.DataFrame({
    "id":      t1_dev_df["id"].astype(str).values,
    "prob_pos": probs_t1_dev,
})
t1_dev_cache_path = CACHE_DIR / f"t1_dev_probs.csv"
t1_dev_cache.to_csv(t1_dev_cache_path, index=False)
print("Saved T1 DEV probs to cache:", t1_dev_cache_path)

# 2.9 Save model + calibration
# The calibration JSON stores the temperature and the decision threshold.
trainer_t1.save_model(ART_DIR / f"native_t1")
with open(ART_DIR / f"calib_t1_native.json","w") as f:
    json.dump(
        {"temperature": float(T_t1), "threshold": float(best_thr_t1)},
        f,
        indent=2,
    )

# 2.10 Write Codabench submission CSV for Subtask 1
# Columns written: id, polarization (thresholded 0/1 prediction).
sub1 = pd.DataFrame({
    "id":           t1_dev_df["id"].astype(str).values,
    "polarization": pred1_dev.astype(int),
})
sub1_path = SUB1_DIR / f"pred_{LANG}.csv"
sub1.to_csv(sub1_path, index=False)
print("Saved Codabench file (Subtask 1, XLM-R):", sub1_path)


[T1] TRAIN size: 3222
[T1] DEV size (unlabeled): 160


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainer T1 device: cuda:0


Step,Training Loss
50,0.6477
100,0.5924
150,0.5304
200,0.4966
250,0.4115
300,0.4414


[T1] Macro-F1 (TRAIN, argmax): 0.835714586127454
[T1] calibration (TRAIN): T=0.9626, best_thr=0.45, train_macroF1@thr=0.8371
Saved T1 TRAIN probs to cache: cache/xlmr/eng/t1_train_probs.csv


Saved T1 DEV probs to cache: cache/xlmr/eng/t1_dev_probs.csv
Saved Codabench file (Subtask 1, XLM-R): submissions/xlmr/subtask_1/pred_eng.csv


## Subtask 2: train, calibrate on train, infer dev, cache, submission

In [4]:
# ============================
# 3) Subtask 2 — Hate type (multi-label 5)
# ============================

# 3.1 Load TRAIN (labeled) + DEV (unlabeled)
t2_train_df = pd.read_csv(T2_TRAIN)
t2_dev_df = pd.read_csv(T2_DEV)

# Fail fast when an expected column is absent (DEV carries no label columns).
required_cols_t2_train = {"id", "text", *T2_LABELS}
required_cols_t2_dev = {"id", "text"}
assert required_cols_t2_train <= set(t2_train_df.columns), (
    f"T2 TRAIN missing: {required_cols_t2_train - set(t2_train_df.columns)}"
)
assert required_cols_t2_dev <= set(t2_dev_df.columns), (
    f"T2 DEV missing: {required_cols_t2_dev - set(t2_dev_df.columns)}"
)

# Label matrix with one column per entry of T2_LABELS, in that order.
Y2_train = t2_train_df[T2_LABELS].values.astype(int)

print(f"[T2] TRAIN size: {len(t2_train_df)}")
print(f"[T2] DEV size (unlabeled): {len(t2_dev_df)}")

# 3.2 Tokenizer & model (multi-label head, one output per label)
tok_t2 = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
cfg_t2 = AutoConfig.from_pretrained(
    MODEL_NAME,
    problem_type="multi_label_classification",
    num_labels=len(T2_LABELS),
)
mdl_t2 = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=cfg_t2)
mdl_t2.config.use_cache = False
# Turn on gradient checkpointing when the model class offers it.
_enable_ckpt_t2 = getattr(mdl_t2, "gradient_checkpointing_enable", None)
if _enable_ckpt_t2 is not None:
    _enable_ckpt_t2()
mdl_t2.to(DEVICE)

# 3.3 Datasets
ds_t2_train = TextClsDataset(
    texts=t2_train_df["text"],
    labels=Y2_train.tolist(),
    tokenizer=tok_t2,
    max_len=MAX_LEN,
    is_multilabel=True,
)

# DEV has no labels, so all-zero placeholders are supplied for inference.
dummy_labels_t2_dev = np.zeros((len(t2_dev_df), len(T2_LABELS)), dtype=int)
ds_t2_dev_infer = TextClsDataset(
    texts=t2_dev_df["text"],
    labels=dummy_labels_t2_dev.tolist(),
    tokenizer=tok_t2,
    max_len=MAX_LEN,
    is_multilabel=True,
)

# 3.4 pos_weight for imbalance (on TRAIN)
# Per-label negatives/positives ratio; the 1e-6 keeps the division finite
# when a label has no positive example.
pos_count_2 = Y2_train.sum(axis=0) + 1e-6
neg_count_2 = len(Y2_train) - pos_count_2
pos_weight_2 = torch.tensor(neg_count_2 / pos_count_2, dtype=torch.float)

class WeightedTrainerT2(Trainer):
    """Trainer whose loss is BCE-with-logits weighted per label by ``pos_weight_2``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Labels are removed from the inputs so the model only produces logits;
        # the loss is computed here instead.
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        device = outputs.logits.device
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_2.to(device))
        loss = criterion(outputs.logits, targets.to(device).float())
        if return_outputs:
            return loss, outputs
        return loss

# Logging / evaluation / checkpoint / warm-up settings for Subtask 2.
_t2_schedule = dict(
    logging_steps=50,
    evaluation="epoch",
    save="no",
    warmup_ratio=WARMUP_RATIO,
)
args_t2 = build_training_args(
    output_dir=ART_DIR / "t2_tmp",
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    **_t2_schedule,
)

def compute_metrics_t2(eval_pred):
    """Return ``{"f1_macro": ...}`` for sigmoid probabilities cut at 0.5."""
    raw = eval_pred.predictions
    # When predictions come back as a tuple/list, the logits are the first entry.
    if isinstance(raw, (tuple, list)):
        logits = eval_pred.predictions[0]
    else:
        logits = eval_pred.predictions
    gold = eval_pred.label_ids
    sigmoid = 1.0 / (1.0 + np.exp(-logits))
    hard = (sigmoid >= 0.5).astype(int)
    score = f1_score(gold, hard, average="macro", zero_division=0)
    return {"f1_macro": score}

# Recent transformers releases deprecate Trainer's `tokenizer` keyword in
# favour of `processing_class`. Pick whichever name the installed Trainer
# accepts so the cell runs on both old and new versions.
_tok_kwarg_t2 = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer_t2 = WeightedTrainerT2(
    model=mdl_t2,
    args=args_t2,
    train_dataset=ds_t2_train,
    eval_dataset=ds_t2_train,  # eval on TRAIN
    data_collator=DataCollatorWithPadding(tok_t2),
    compute_metrics=compute_metrics_t2,
    **{_tok_kwarg_t2: tok_t2},
)
print("Trainer T2 device:", trainer_t2.args.device)

# 3.5 Train + F1 on TRAIN @0.5
# evaluate() scores the trainer's eval_dataset, which is the TRAIN set here.
trainer_t2.train()
eval_t2_train = trainer_t2.evaluate()
print("[T2] Macro-F1 (TRAIN, thr=0.5):", eval_t2_train.get("eval_f1_macro"))

# 3.6 Calibration on TRAIN (temperature + per-label thresholds)
# NOTE(review): temperature and thresholds are fitted on the same examples the
# model was trained on, so the scores printed below are likely optimistic.
logits_t2_train, labels_t2_train = collect_logits(
    trainer_t2,
    ds_t2_train,
    is_multilabel=True,
)
T_t2 = learn_temperature(logits_t2_train, labels_t2_train, is_multilabel=True)
# Temperature-scaled sigmoid gives one independent probability per label.
probs_t2_train = torch.sigmoid(logits_t2_train / T_t2).cpu().numpy()

# Maps each label name in T2_LABELS to its chosen threshold.
thr_map_t2 = grid_search_thresholds(labels_t2_train.numpy(), probs_t2_train, T2_LABELS)

# Apply each label's own threshold to obtain 0/1 predictions on TRAIN.
P2_train = np.zeros_like(probs_t2_train, dtype=int)
for j, lab in enumerate(T2_LABELS):
    thr = float(thr_map_t2[lab])
    P2_train[:, j] = (probs_t2_train[:, j] >= thr).astype(int)

train_f1_calib_t2 = f1_score(labels_t2_train.numpy(), P2_train, average="macro", zero_division=0)
print("[T2] calibration (TRAIN):")
print("  temperature:", T_t2)
print("  thresholds:", thr_map_t2)
print("  Macro-F1 (TRAIN, calibrated):", train_f1_calib_t2)

# 3.7 Cache TRAIN probabilities for ensembling
# Columns: id, then prob_<label> and label_<label> for every label.
cache_cols_train_t2 = {"id": t2_train_df["id"].astype(str).values}
for j, lab in enumerate(T2_LABELS):
    cache_cols_train_t2[f"prob_{lab}"] = probs_t2_train[:, j]
    cache_cols_train_t2[f"label_{lab}"] = labels_t2_train.numpy()[:, j]

t2_train_cache = pd.DataFrame(cache_cols_train_t2)
t2_train_cache_path = CACHE_DIR / f"t2_train_probs.csv"
t2_train_cache.to_csv(t2_train_cache_path, index=False)
print("Saved T2 TRAIN probs to cache:", t2_train_cache_path)

# 3.8 Inference on DEV (unlabeled) + cache DEV probs
# The labels returned for DEV are the dummy zeros and are discarded.
logits_t2_dev, _ = collect_logits(
    trainer_t2,
    ds_t2_dev_infer,
    is_multilabel=True,
)
# Reuse the temperature and thresholds fitted on TRAIN.
probs_t2_dev = torch.sigmoid(logits_t2_dev / T_t2).cpu().numpy()

P2_dev = np.zeros_like(probs_t2_dev, dtype=int)
for j, lab in enumerate(T2_LABELS):
    thr = float(thr_map_t2[lab])
    P2_dev[:, j] = (probs_t2_dev[:, j] >= thr).astype(int)

# DEV cache holds probabilities only (no label columns).
cache_cols_dev_t2 = {"id": t2_dev_df["id"].astype(str).values}
for j, lab in enumerate(T2_LABELS):
    cache_cols_dev_t2[f"prob_{lab}"] = probs_t2_dev[:, j]
t2_dev_cache = pd.DataFrame(cache_cols_dev_t2)
t2_dev_cache_path = CACHE_DIR / f"t2_dev_probs.csv"
t2_dev_cache.to_csv(t2_dev_cache_path, index=False)
print("Saved T2 DEV probs to cache:", t2_dev_cache_path)

# 3.9 Save model + calibration
# The calibration JSON stores the temperature and the per-label thresholds.
trainer_t2.save_model(ART_DIR / f"native_t2")
with open(ART_DIR / f"calib_t2_native.json","w") as f:
    json.dump(
        {"temperature": float(T_t2), "thresholds": thr_map_t2},
        f,
        indent=2,
    )

# 3.10 Write Codabench submission CSV for Subtask 2
# Required header: id,political,racial/ethnic,religious,gender/sexual,other

# The submission column order differs from T2_LABELS, so each column is
# looked up by label name rather than by position.
idx_gender    = T2_LABELS.index("gender/sexual")
idx_political = T2_LABELS.index("political")
idx_religious = T2_LABELS.index("religious")
idx_racial    = T2_LABELS.index("racial/ethnic")
idx_other     = T2_LABELS.index("other")

sub2 = pd.DataFrame({
    "id":            t2_dev_df["id"].astype(str).values,
    "political":     P2_dev[:, idx_political],
    "racial/ethnic": P2_dev[:, idx_racial],
    "religious":     P2_dev[:, idx_religious],
    "gender/sexual": P2_dev[:, idx_gender],
    "other":         P2_dev[:, idx_other],
})
sub2_path = SUB2_DIR / f"pred_{LANG}.csv"
sub2.to_csv(sub2_path, index=False)
print("Saved Codabench file (Subtask 2, XLM-R):", sub2_path)


[T2] TRAIN size: 3222
[T2] DEV size (unlabeled): 160


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainer T2 device: cuda:0


Step,Training Loss
50,1.2274
100,1.2533
150,1.1294
200,1.0635
250,0.9985
300,0.9202


[T2] Macro-F1 (TRAIN, thr=0.5): 0.33029032232449357
[T2] calibration (TRAIN):
  temperature: 0.7747967839241028
  thresholds: {'gender/sexual': 0.85, 'political': 0.35, 'religious': 0.95, 'racial/ethnic': 0.9, 'other': 0.7}
  Macro-F1 (TRAIN, calibrated): 0.4713820066883467
Saved T2 TRAIN probs to cache: cache/xlmr/eng/t2_train_probs.csv


Saved T2 DEV probs to cache: cache/xlmr/eng/t2_dev_probs.csv
Saved Codabench file (Subtask 2, XLM-R): submissions/xlmr/subtask_2/pred_eng.csv


## Subtask 3: train, calibrate on train, infer dev, cache, submission

In [None]:
# ============================
# 4) Subtask 3 — Manifestation (multi-label 6)
# ============================

# 4.1 Load TRAIN (labeled) + DEV (unlabeled)
t3_train_df = pd.read_csv(T3_TRAIN)
t3_dev_df = pd.read_csv(T3_DEV)

# Fail fast when an expected column is absent (DEV carries no label columns).
required_cols_t3_train = {"id", "text", *T3_LABELS}
required_cols_t3_dev = {"id", "text"}
assert required_cols_t3_train <= set(t3_train_df.columns), (
    f"T3 TRAIN missing: {required_cols_t3_train - set(t3_train_df.columns)}"
)
assert required_cols_t3_dev <= set(t3_dev_df.columns), (
    f"T3 DEV missing: {required_cols_t3_dev - set(t3_dev_df.columns)}"
)

# Label matrix with one column per entry of T3_LABELS, in that order.
Y3_train = t3_train_df[T3_LABELS].values.astype(int)

print(f"[T3] TRAIN size: {len(t3_train_df)}")
print(f"[T3] DEV size (unlabeled): {len(t3_dev_df)}")

# 4.2 Tokenizer & model (multi-label head, one output per label)
tok_t3 = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
cfg_t3 = AutoConfig.from_pretrained(
    MODEL_NAME,
    problem_type="multi_label_classification",
    num_labels=len(T3_LABELS),
)
mdl_t3 = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=cfg_t3)
mdl_t3.config.use_cache = False
# Turn on gradient checkpointing when the model class offers it.
_enable_ckpt_t3 = getattr(mdl_t3, "gradient_checkpointing_enable", None)
if _enable_ckpt_t3 is not None:
    _enable_ckpt_t3()
mdl_t3.to(DEVICE)

# 4.3 Datasets
ds_t3_train = TextClsDataset(
    texts=t3_train_df["text"],
    labels=Y3_train.tolist(),
    tokenizer=tok_t3,
    max_len=MAX_LEN,
    is_multilabel=True,
)

# DEV has no labels, so all-zero placeholders are supplied for inference.
dummy_labels_t3_dev = np.zeros((len(t3_dev_df), len(T3_LABELS)), dtype=int)
ds_t3_dev_infer = TextClsDataset(
    texts=t3_dev_df["text"],
    labels=dummy_labels_t3_dev.tolist(),
    tokenizer=tok_t3,
    max_len=MAX_LEN,
    is_multilabel=True,
)

# 4.4 pos_weight for imbalance (TRAIN)
# Per-label negatives/positives ratio; the 1e-6 keeps the division finite
# when a label has no positive example.
pos_count_3 = Y3_train.sum(axis=0) + 1e-6
neg_count_3 = len(Y3_train) - pos_count_3
pos_weight_3 = torch.tensor(neg_count_3 / pos_count_3, dtype=torch.float)

class WeightedTrainerT3(Trainer):
    """Trainer whose loss is BCE-with-logits weighted per label by ``pos_weight_3``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Labels are removed from the inputs so the model only produces logits;
        # the loss is computed here instead.
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        device = outputs.logits.device
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_3.to(device))
        loss = criterion(outputs.logits, targets.to(device).float())
        if return_outputs:
            return loss, outputs
        return loss

# Logging / evaluation / checkpoint / warm-up settings for Subtask 3.
_t3_schedule = dict(
    logging_steps=50,
    evaluation="epoch",
    save="no",
    warmup_ratio=WARMUP_RATIO,
)
args_t3 = build_training_args(
    output_dir=ART_DIR / "t3_tmp",
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    **_t3_schedule,
)

def compute_metrics_t3(eval_pred):
    """Return ``{"f1_macro": ...}`` for sigmoid probabilities cut at 0.5.

    Predictions may arrive as a bare logits array or as a tuple/list whose
    first entry is the logits; both are accepted, matching the Subtask 2
    metric function.
    """
    logits = eval_pred.predictions
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    labels = eval_pred.label_ids
    probs = 1.0 / (1.0 + np.exp(-logits))
    preds = (probs >= 0.5).astype(int)
    return {"f1_macro": f1_score(labels, preds, average="macro", zero_division=0)}

# Recent transformers releases deprecate Trainer's `tokenizer` keyword in
# favour of `processing_class`. Pick whichever name the installed Trainer
# accepts so the cell runs on both old and new versions.
_tok_kwarg_t3 = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer_t3 = WeightedTrainerT3(
    model=mdl_t3,
    args=args_t3,
    train_dataset=ds_t3_train,
    eval_dataset=ds_t3_train,  # eval on TRAIN
    data_collator=DataCollatorWithPadding(tok_t3),
    compute_metrics=compute_metrics_t3,
    **{_tok_kwarg_t3: tok_t3},
)
print("Trainer T3 device:", trainer_t3.args.device)

# 4.5 Train + F1 on TRAIN @0.5
# evaluate() scores the trainer's eval_dataset, which is the TRAIN set here.
trainer_t3.train()
eval_t3_train = trainer_t3.evaluate()
print("[T3] Macro-F1 (TRAIN, thr=0.5):", eval_t3_train.get("eval_f1_macro"))

# 4.6 Calibration on TRAIN (temperature + per-label thresholds)
# NOTE(review): temperature and thresholds are fitted on the same examples the
# model was trained on, so the scores printed below are likely optimistic.
logits_t3_train, labels_t3_train = collect_logits(
    trainer_t3,
    ds_t3_train,
    is_multilabel=True,
)
T_t3 = learn_temperature(logits_t3_train, labels_t3_train, is_multilabel=True)
# Temperature-scaled sigmoid gives one independent probability per label.
probs_t3_train = torch.sigmoid(logits_t3_train / T_t3).cpu().numpy()

# Maps each label name in T3_LABELS to its chosen threshold.
thr_map_t3 = grid_search_thresholds(labels_t3_train.numpy(), probs_t3_train, T3_LABELS)

# Apply each label's own threshold to obtain 0/1 predictions on TRAIN.
P3_train = np.zeros_like(probs_t3_train, dtype=int)
for j, lab in enumerate(T3_LABELS):
    thr = float(thr_map_t3[lab])
    P3_train[:, j] = (probs_t3_train[:, j] >= thr).astype(int)

train_f1_calib_t3 = f1_score(labels_t3_train.numpy(), P3_train, average="macro", zero_division=0)
print("[T3] calibration (TRAIN):")
print("  temperature:", T_t3)
print("  thresholds:", thr_map_t3)
print("  Macro-F1 (TRAIN, calibrated):", train_f1_calib_t3)

# 4.7 Cache TRAIN probabilities for ensembling
# Columns: id, then prob_<label> and label_<label> for every label.
cache_cols_train_t3 = {"id": t3_train_df["id"].astype(str).values}
for j, lab in enumerate(T3_LABELS):
    cache_cols_train_t3[f"prob_{lab}"] = probs_t3_train[:, j]
    cache_cols_train_t3[f"label_{lab}"] = labels_t3_train.numpy()[:, j]

t3_train_cache = pd.DataFrame(cache_cols_train_t3)
t3_train_cache_path = CACHE_DIR / f"t3_train_probs.csv"
t3_train_cache.to_csv(t3_train_cache_path, index=False)
print("Saved T3 TRAIN probs to cache:", t3_train_cache_path)

# 4.8 Inference on DEV (unlabeled) + cache DEV probs
# The labels returned for DEV are the dummy zeros and are discarded.
logits_t3_dev, _ = collect_logits(
    trainer_t3,
    ds_t3_dev_infer,
    is_multilabel=True,
)
# Reuse the temperature and thresholds fitted on TRAIN.
probs_t3_dev = torch.sigmoid(logits_t3_dev / T_t3).cpu().numpy()

P3_dev = np.zeros_like(probs_t3_dev, dtype=int)
for j, lab in enumerate(T3_LABELS):
    thr = float(thr_map_t3[lab])
    P3_dev[:, j] = (probs_t3_dev[:, j] >= thr).astype(int)

# DEV cache holds probabilities only (no label columns).
cache_cols_dev_t3 = {"id": t3_dev_df["id"].astype(str).values}
for j, lab in enumerate(T3_LABELS):
    cache_cols_dev_t3[f"prob_{lab}"] = probs_t3_dev[:, j]
t3_dev_cache = pd.DataFrame(cache_cols_dev_t3)
t3_dev_cache_path = CACHE_DIR / f"t3_dev_probs.csv"
t3_dev_cache.to_csv(t3_dev_cache_path, index=False)
print("Saved T3 DEV probs to cache:", t3_dev_cache_path)

# 4.9 Save model + calibration
# The calibration JSON stores the temperature and the per-label thresholds.
trainer_t3.save_model(ART_DIR / f"native_t3")
with open(ART_DIR / f"calib_t3_native.json","w") as f:
    json.dump(
        {"temperature": float(T_t3), "thresholds": thr_map_t3},
        f,
        indent=2,
    )

# 4.10 Write Codabench submission CSV for Subtask 3
# Required header:
#   id,stereotype,vilification,dehumanization,
#   extreme_language,lack_of_empathy,invalidation

# The submission column order differs from T3_LABELS, so each column is
# looked up by label name rather than by position.
idx_vil      = T3_LABELS.index("vilification")
idx_extreme  = T3_LABELS.index("extreme_language")
idx_stereo   = T3_LABELS.index("stereotype")
idx_invalid  = T3_LABELS.index("invalidation")
idx_lackemp  = T3_LABELS.index("lack_of_empathy")
idx_dehum    = T3_LABELS.index("dehumanization")

sub3 = pd.DataFrame({
    "id":               t3_dev_df["id"].astype(str).values,
    "stereotype":       P3_dev[:, idx_stereo],
    "vilification":     P3_dev[:, idx_vil],
    "dehumanization":   P3_dev[:, idx_dehum],
    "extreme_language": P3_dev[:, idx_extreme],
    "lack_of_empathy":  P3_dev[:, idx_lackemp],
    "invalidation":     P3_dev[:, idx_invalid],
})
sub3_path = SUB3_DIR / f"pred_{LANG}.csv"
sub3.to_csv(sub3_path, index=False)
print("Saved Codabench file (Subtask 3, XLM-R):", sub3_path)


[T3] TRAIN size: 3222
[T3] DEV size (unlabeled): 160


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainer T3 device: cuda:0


Step,Training Loss
50,1.1329
100,1.1426
150,1.0411
200,0.9887
250,0.9422
300,0.9074


[T3] Macro-F1 (TRAIN, thr=0.5): 0.4867891043907717
[T3] calibration (TRAIN):
  temperature: 1.0684672594070435
  thresholds: {'vilification': 0.65, 'extreme_language': 0.6, 'stereotype': 0.7, 'invalidation': 0.65, 'lack_of_empathy': 0.65, 'dehumanization': 0.7}
  Macro-F1 (TRAIN, calibrated): 0.5175818625510322
Saved T3 TRAIN probs to cache: cache/xlmr/eng/t3_train_probs.csv


Saved T3 DEV probs to cache: cache/xlmr/eng/t3_dev_probs.csv
Saved Codabench file (Subtask 3, XLM-R): submissions/xlmr/subtask_3/pred_eng.csv


: 