## Setup, unzip, installs, imports

In [None]:
# - Trains the same BERT baselines as the organizer's starter notebook for all 3 subtasks.
# - Uses TRAIN as both train and "validation", so the reported validation F1 is measured on data the model was trained on and is optimistic.
# - Runs inference on DEV and writes **submission-ready CSVs**:
#   - submissions/starter/subtask_1/pred_eng.csv
#   - submissions/starter/subtask_2/pred_eng.csv
#   - submissions/starter/subtask_3/pred_eng.csv

# If needed (e.g., on Codabench), unzip the data once:
# !unzip -o dev_phase.zip

!pip install -q pandas

import os
from pathlib import Path

import pandas as pd
import numpy as np
import torch

from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)

print("PyTorch:", torch.__version__)

# Submission files go to submissions/starter/<subtask>/; create the three
# per-subtask folders up front (a no-op when they already exist).
SUB_ROOT = Path("submissions", "starter")
for _subtask_dir in ("subtask_1", "subtask_2", "subtask_3"):
    SUB_ROOT.joinpath(_subtask_dir).mkdir(parents=True, exist_ok=True)

# Data root and the language code used in the CSV file names.
BASE, LANG = "../dev_phase", "eng"
print("BASE:", BASE, "LANG:", LANG)


  from .autonotebook import tqdm as notebook_tqdm
2025-12-08 13:17:17.499909: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-08 13:17:17.513199: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765228637.527034 4098653 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765228637.531142 4098653 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765228637.543800 4098653 computation_placer.cc:177] computation placer already r

PyTorch: 2.9.0
BASE: ../dev_phase LANG: eng


## Dataset classes (binary & multi-label)

In [None]:
# ## Dataset helpers
#
# We define:
# - `PolarizationDatasetBinary` for Subtask 1 (single label).
# - `PolarizationDatasetMultilabel` for Subtasks 2 and 3 (multi-hot labels).

from torch.utils.data import Dataset

class PolarizationDatasetBinary(Dataset):
    """Map-style dataset pairing each text with a single integer class label.

    Texts are tokenized lazily, one at a time, without padding; padding is
    left to whatever collator batches the items.

    Args:
        texts: indexable sequence of raw strings.
        labels: indexable sequence of class ids aligned with ``texts``; each
            entry must be convertible with ``int()``.
        tokenizer: callable invoked as ``tokenizer(text, truncation=True,
            padding=False, max_length=..., return_tensors="pt")`` and expected
            to return a mapping of tensors with a leading batch axis.
        max_length: truncation limit forwarded to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus a ``labels`` entry (long)."""
        text = self.texts[idx]
        label = int(self.labels[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Drop only the batch axis. A bare squeeze() would also collapse a
        # length-1 sequence into a 0-d tensor, which can no longer be padded.
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item


class PolarizationDatasetMultilabel(Dataset):
    """Map-style dataset pairing each text with a multi-hot label vector.

    Texts are tokenized lazily, one at a time, without padding; padding is
    left to whatever collator batches the items.

    Args:
        texts: indexable sequence of raw strings.
        labels: list of list/array of 0/1 (multi-hot), aligned with ``texts``.
        tokenizer: callable invoked as ``tokenizer(text, truncation=True,
            padding=False, max_length=..., return_tensors="pt")`` and expected
            to return a mapping of tensors with a leading batch axis.
        max_length: truncation limit forwarded to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus a float ``labels`` vector."""
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Drop only the batch axis. A bare squeeze() would also collapse a
        # length-1 sequence into a 0-d tensor, which can no longer be padded.
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        # multi-label requires float labels
        item["labels"] = torch.tensor(label, dtype=torch.float)
        return item


## Subtask 1: train on TRAIN, predict on DEV, save submission

In [None]:
# ## Subtask 1 — Polarization detection (binary)
#
# - Train on `../dev_phase/subtask1/train/eng.csv`
# - Use TRAIN also as "validation" (as in starter).
# - Run inference on `../dev_phase/subtask1/dev/eng.csv`
# - Save submission to `submissions/starter/subtask_1/pred_eng.csv`
#   with columns: `id,polarization`

# Load train and dev
t1_train_path = f"{BASE}/subtask1/train/{LANG}.csv"
t1_dev_path   = f"{BASE}/subtask1/dev/{LANG}.csv"

train_t1 = pd.read_csv(t1_train_path)
dev_t1   = pd.read_csv(t1_dev_path)

# Starter uses TRAIN as both train and val, so the "validation" score is
# computed on examples the model has already been trained on.
val_t1 = train_t1.copy()

print("Subtask 1:")
print("  TRAIN size:", len(train_t1))
print("  DEV size  :", len(dev_t1))

# Tokenizer and datasets
tokenizer_t1 = AutoTokenizer.from_pretrained("bert-base-uncased")

train_dataset_t1 = PolarizationDatasetBinary(
    texts=train_t1["text"].tolist(),
    labels=train_t1["polarization"].tolist(),
    tokenizer=tokenizer_t1,
)
val_dataset_t1 = PolarizationDatasetBinary(
    texts=val_t1["text"].tolist(),
    labels=val_t1["polarization"].tolist(),
    tokenizer=tokenizer_t1,
)

# Model
# Seed torch before building the model: the classification head is freshly
# initialised here, and the Trainer only applies its seed when it is
# constructed later, so without this each run starts from different weights.
torch.manual_seed(42)  # same value as the TrainingArguments default seed
model_t1 = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# Metrics
def compute_metrics_t1(p):
    """Return macro-F1 for the binary head, taking the arg-max logit as the predicted class."""
    predicted_classes = p.predictions.argmax(axis=1)
    macro_f1 = f1_score(p.label_ids, predicted_classes, average="macro")
    return {"f1_macro": macro_f1}

# Training arguments
training_args_t1 = TrainingArguments(
    output_dir="./t1_output",
    # optimisation
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=8,
    # evaluate after every epoch, never write checkpoints
    eval_strategy="epoch",
    save_strategy="no",
    # console reporting only
    logging_steps=100,
    disable_tqdm=False,
    report_to="none",
)

# Trainer: items are tokenized without padding, so the collator pads each
# batch to its own longest sequence.
collator_t1 = DataCollatorWithPadding(tokenizer_t1)
trainer_t1 = Trainer(
    model=model_t1,
    args=training_args_t1,
    data_collator=collator_t1,
    train_dataset=train_dataset_t1,
    eval_dataset=val_dataset_t1,
    compute_metrics=compute_metrics_t1,
)

# Fine-tune, then score once more on the TRAIN-as-val split.
trainer_t1.train()

eval_results_t1 = trainer_t1.evaluate()
print(f"[T1] Macro F1 on validation (train-as-val): {eval_results_t1['eval_f1_macro']}")

# ---------- Inference on DEV & save submission ----------
dev_texts_t1 = dev_t1["text"].tolist()

dev_encodings_t1 = tokenizer_t1(
    dev_texts_t1,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)

# Run the forward pass in mini-batches instead of pushing the whole split
# through the model at once, so memory use stays bounded when DEV is large.
DEV_BATCH_SIZE_T1 = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_t1.to(device)
model_t1.eval()

logit_chunks_t1 = []
with torch.no_grad():
    for start in range(0, len(dev_texts_t1), DEV_BATCH_SIZE_T1):
        stop = start + DEV_BATCH_SIZE_T1
        # The encodings are already padded to one common length, so slicing
        # rows feeds the model the same inputs a single-shot call would.
        dev_inputs = {k: v[start:stop].to(device) for k, v in dev_encodings_t1.items()}
        logit_chunks_t1.append(model_t1(**dev_inputs).logits.cpu())

logits = torch.cat(logit_chunks_t1)
preds_dev_t1 = torch.argmax(logits, dim=1).numpy()
assert len(preds_dev_t1) == len(dev_t1), "one prediction per DEV row expected"

# Submission columns: id,polarization
sub1 = pd.DataFrame({
    "id": dev_t1["id"].astype(str).values,
    "polarization": preds_dev_t1.astype(int),
})

sub1_path = SUB_ROOT / "subtask_1" / f"pred_{LANG}.csv"
sub1.to_csv(sub1_path, index=False)
print("Saved Subtask 1 submission to:", sub1_path)


Subtask 1:
  TRAIN size: 3222
  DEV size  : 160


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.432906,0.783742
2,0.496400,0.337824,0.844401
3,0.496400,0.298928,0.870926


[T1] Macro F1 on validation (train-as-val): 0.8709263895843766
Saved Subtask 1 submission to: submissions/starter/subtask_1/pred_eng.csv


## Subtask 2: train on TRAIN, predict on DEV, save submission

In [None]:
# ## Subtask 2 — Polarization Type Classification (multi-label 5)
#
# - Train on `../dev_phase/subtask2/train/eng.csv`
# - Use TRAIN also as "validation".
# - Run inference on `../dev_phase/subtask2/dev/eng.csv`
# - Save submission to `submissions/starter/subtask_2/pred_eng.csv`
#   with columns (required order):
#   `id,political,racial/ethnic,religious,gender/sexual,other`

# Load train and dev
t2_train_path = f"{BASE}/subtask2/train/{LANG}.csv"
t2_dev_path   = f"{BASE}/subtask2/dev/{LANG}.csv"

train_t2 = pd.read_csv(t2_train_path)
dev_t2   = pd.read_csv(t2_dev_path)
# train-as-val: the "validation" score is computed on training examples.
val_t2   = train_t2.copy()

print("Subtask 2:")
print("  TRAIN size:", len(train_t2))
print("  DEV size  :", len(dev_t2))

# Label columns. This order fixes the order of the model's output units;
# the submission columns are re-ordered by name when the CSV is written.
T2_LABELS = ["gender/sexual", "political", "religious", "racial/ethnic", "other"]

tokenizer_t2 = AutoTokenizer.from_pretrained("bert-base-uncased")

y_train_t2 = train_t2[T2_LABELS].values.tolist()
y_val_t2   = val_t2[T2_LABELS].values.tolist()

train_dataset_t2 = PolarizationDatasetMultilabel(
    texts=train_t2["text"].tolist(),
    labels=y_train_t2,
    tokenizer=tokenizer_t2,
)
val_dataset_t2 = PolarizationDatasetMultilabel(
    texts=val_t2["text"].tolist(),
    labels=y_val_t2,
    tokenizer=tokenizer_t2,
)
dev_texts_t2 = dev_t2["text"].tolist()

# Seed torch before building the model: the classification head is freshly
# initialised here, and the Trainer only applies its seed when it is
# constructed later, so without this each run starts from different weights.
torch.manual_seed(42)  # same value as the TrainingArguments default seed
model_t2 = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(T2_LABELS),
    problem_type="multi_label_classification",
)

def compute_metrics_t2(p):
    """Return macro-F1 for the multi-label head: sigmoid the logits, then threshold at 0.5."""
    logits = torch.from_numpy(p.predictions)
    hard_preds = torch.sigmoid(logits).gt(0.5).int().numpy()
    macro_f1 = f1_score(p.label_ids, hard_preds, average="macro")
    return {"f1_macro": macro_f1}

training_args_t2 = TrainingArguments(
    output_dir="./t2_output",
    # optimisation
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluate after every epoch, never write checkpoints
    eval_strategy="epoch",
    save_strategy="no",
    # console reporting only
    logging_steps=100,
    disable_tqdm=False,
    report_to="none",
)

# Items are tokenized without padding, so the collator pads each batch to
# its own longest sequence.
collator_t2 = DataCollatorWithPadding(tokenizer_t2)
trainer_t2 = Trainer(
    model=model_t2,
    args=training_args_t2,
    data_collator=collator_t2,
    train_dataset=train_dataset_t2,
    eval_dataset=val_dataset_t2,
    compute_metrics=compute_metrics_t2,
)

# Fine-tune, then score once more on the TRAIN-as-val split.
trainer_t2.train()

eval_results_t2 = trainer_t2.evaluate()
print(f"[T2] Macro F1 on validation (train-as-val): {eval_results_t2['eval_f1_macro']}")

# ---------- Inference on DEV & save submission ----------
dev_encodings_t2 = tokenizer_t2(
    dev_texts_t2,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)

# Run the forward pass in mini-batches instead of pushing the whole split
# through the model at once, so memory use stays bounded when DEV is large.
DEV_BATCH_SIZE_T2 = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_t2.to(device)
model_t2.eval()

logit_chunks_t2 = []
with torch.no_grad():
    for start in range(0, len(dev_texts_t2), DEV_BATCH_SIZE_T2):
        stop = start + DEV_BATCH_SIZE_T2
        # The encodings are already padded to one common length, so slicing
        # rows feeds the model the same inputs a single-shot call would.
        dev_inputs = {k: v[start:stop].to(device) for k, v in dev_encodings_t2.items()}
        logit_chunks_t2.append(model_t2(**dev_inputs).logits.cpu())

logits = torch.cat(logit_chunks_t2)
probs_dev_t2 = torch.sigmoid(logits).numpy()
preds_dev_t2 = (probs_dev_t2 > 0.5).astype(int)
assert len(preds_dev_t2) == len(dev_t2), "one prediction row per DEV row expected"

# Required header: id,political,racial/ethnic,religious,gender/sexual,other
# Columns are looked up by name because T2_LABELS (the model's output order)
# differs from the order the submission format requires.
idx_gender    = T2_LABELS.index("gender/sexual")
idx_political = T2_LABELS.index("political")
idx_religious = T2_LABELS.index("religious")
idx_racial    = T2_LABELS.index("racial/ethnic")
idx_other     = T2_LABELS.index("other")

sub2 = pd.DataFrame({
    "id":            dev_t2["id"].astype(str).values,
    "political":     preds_dev_t2[:, idx_political],
    "racial/ethnic": preds_dev_t2[:, idx_racial],
    "religious":     preds_dev_t2[:, idx_religious],
    "gender/sexual": preds_dev_t2[:, idx_gender],
    "other":         preds_dev_t2[:, idx_other],
})

sub2_path = SUB_ROOT / "subtask_2" / f"pred_{LANG}.csv"
sub2.to_csv(sub2_path, index=False)
print("Saved Subtask 2 submission to:", sub2_path)


Subtask 2:
  TRAIN size: 3222
  DEV size  : 160


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.2313,0.186983,0.147213
2,0.1749,0.138985,0.294734
3,0.1294,0.110939,0.438217


[T2] Macro F1 on validation (train-as-val): 0.4382169277825471
Saved Subtask 2 submission to: submissions/starter/subtask_2/pred_eng.csv


## Subtask 3: train on TRAIN, predict on DEV, save submission

In [None]:
# ## Subtask 3 — Manifestation Identification (multi-label 6)
#
# - Train on `../dev_phase/subtask3/train/eng.csv`
# - Use TRAIN also as "validation".
# - Run inference on `../dev_phase/subtask3/dev/eng.csv`
# - Save submission to `submissions/starter/subtask_3/pred_eng.csv`
#   with columns (required order):
#   `id,stereotype,vilification,dehumanization,extreme_language,lack_of_empathy,invalidation`

# Load train and dev
t3_train_path = f"{BASE}/subtask3/train/{LANG}.csv"
t3_dev_path   = f"{BASE}/subtask3/dev/{LANG}.csv"

train_t3 = pd.read_csv(t3_train_path)
dev_t3   = pd.read_csv(t3_dev_path)
# train-as-val: the "validation" score is computed on training examples.
val_t3   = train_t3.copy()

print("Subtask 3:")
print("  TRAIN size:", len(train_t3))
print("  DEV size  :", len(dev_t3))

# Label columns. This order fixes the order of the model's output units;
# the submission columns are re-ordered by name when the CSV is written.
T3_LABELS = [
    "vilification",
    "extreme_language",
    "stereotype",
    "invalidation",
    "lack_of_empathy",
    "dehumanization",
]

tokenizer_t3 = AutoTokenizer.from_pretrained("bert-base-uncased")

y_train_t3 = train_t3[T3_LABELS].values.tolist()
y_val_t3   = val_t3[T3_LABELS].values.tolist()

train_dataset_t3 = PolarizationDatasetMultilabel(
    texts=train_t3["text"].tolist(),
    labels=y_train_t3,
    tokenizer=tokenizer_t3,
)
val_dataset_t3 = PolarizationDatasetMultilabel(
    texts=val_t3["text"].tolist(),
    labels=y_val_t3,
    tokenizer=tokenizer_t3,
)
dev_texts_t3 = dev_t3["text"].tolist()

# Seed torch before building the model: the classification head is freshly
# initialised here, and the Trainer only applies its seed when it is
# constructed later, so without this each run starts from different weights.
torch.manual_seed(42)  # same value as the TrainingArguments default seed
model_t3 = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(T3_LABELS),
    problem_type="multi_label_classification",
)

def compute_metrics_t3(p):
    """Return macro-F1 for the multi-label head: sigmoid the logits, then threshold at 0.5."""
    logits = torch.from_numpy(p.predictions)
    hard_preds = torch.sigmoid(logits).gt(0.5).int().numpy()
    macro_f1 = f1_score(p.label_ids, hard_preds, average="macro")
    return {"f1_macro": macro_f1}

training_args_t3 = TrainingArguments(
    output_dir="./t3_output",
    # optimisation
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluate after every epoch, never write checkpoints
    eval_strategy="epoch",
    save_strategy="no",
    # console reporting only
    logging_steps=100,
    disable_tqdm=False,
    report_to="none",
)

# Items are tokenized without padding, so the collator pads each batch to
# its own longest sequence.
collator_t3 = DataCollatorWithPadding(tokenizer_t3)
trainer_t3 = Trainer(
    model=model_t3,
    args=training_args_t3,
    data_collator=collator_t3,
    train_dataset=train_dataset_t3,
    eval_dataset=val_dataset_t3,
    compute_metrics=compute_metrics_t3,
)

# Fine-tune, then score once more on the TRAIN-as-val split.
trainer_t3.train()

eval_results_t3 = trainer_t3.evaluate()
print(f"[T3] Macro F1 on validation (train-as-val): {eval_results_t3['eval_f1_macro']}")

# ---------- Inference on DEV & save submission ----------
dev_encodings_t3 = tokenizer_t3(
    dev_texts_t3,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)

# Run the forward pass in mini-batches instead of pushing the whole split
# through the model at once, so memory use stays bounded when DEV is large.
DEV_BATCH_SIZE_T3 = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_t3.to(device)
model_t3.eval()

logit_chunks_t3 = []
with torch.no_grad():
    for start in range(0, len(dev_texts_t3), DEV_BATCH_SIZE_T3):
        stop = start + DEV_BATCH_SIZE_T3
        # The encodings are already padded to one common length, so slicing
        # rows feeds the model the same inputs a single-shot call would.
        dev_inputs = {k: v[start:stop].to(device) for k, v in dev_encodings_t3.items()}
        logit_chunks_t3.append(model_t3(**dev_inputs).logits.cpu())

logits = torch.cat(logit_chunks_t3)
probs_dev_t3 = torch.sigmoid(logits).numpy()
preds_dev_t3 = (probs_dev_t3 > 0.5).astype(int)
assert len(preds_dev_t3) == len(dev_t3), "one prediction row per DEV row expected"

# Required header:
# id,stereotype,vilification,dehumanization,
# extreme_language,lack_of_empathy,invalidation
# Columns are looked up by name because T3_LABELS (the model's output order)
# differs from the order the submission format requires.

idx_vil      = T3_LABELS.index("vilification")
idx_extreme  = T3_LABELS.index("extreme_language")
idx_stereo   = T3_LABELS.index("stereotype")
idx_invalid  = T3_LABELS.index("invalidation")
idx_lackemp  = T3_LABELS.index("lack_of_empathy")
idx_dehum    = T3_LABELS.index("dehumanization")

sub3 = pd.DataFrame({
    "id":               dev_t3["id"].astype(str).values,
    "stereotype":       preds_dev_t3[:, idx_stereo],
    "vilification":     preds_dev_t3[:, idx_vil],
    "dehumanization":   preds_dev_t3[:, idx_dehum],
    "extreme_language": preds_dev_t3[:, idx_extreme],
    "lack_of_empathy":  preds_dev_t3[:, idx_lackemp],
    "invalidation":     preds_dev_t3[:, idx_invalid],
})

sub3_path = SUB_ROOT / "subtask_3" / f"pred_{LANG}.csv"
sub3.to_csv(sub3_path, index=False)
print("Saved Subtask 3 submission to:", sub3_path)


Subtask 3:
  TRAIN size: 3222
  DEV size  : 160


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.3961,0.36572,0.099442
2,0.342,0.288569,0.260225
3,0.2941,0.263036,0.489388


[T3] Macro F1 on validation (train-as-val): 0.48938831627151586
Saved Subtask 3 submission to: submissions/starter/subtask_3/pred_eng.csv
