In [None]:
# Colab-only setup: mount Google Drive so the dataset archive stored there is reachable.
from google.colab import drive
drive.mount('/content/drive')
# Location of the dataset archive inside the mounted Drive.
zip_path = "/content/drive/My Drive/dev_phase.zip"
# IPython shell escape: extract the archive into /content/ ("$zip_path" is
# interpolated from the Python variable defined above).
!unzip "$zip_path" -d /content/

Mounted at /content/drive
Archive:  /content/drive/My Drive/dev_phase.zip
   creating: /content/subtask1/
   creating: /content/subtask1/dev/
  inflating: /content/subtask1/dev/nep.csv  
  inflating: /content/subtask1/dev/ita.csv  
  inflating: /content/subtask1/dev/pol.csv  
  inflating: /content/subtask1/dev/rus.csv  
  inflating: /content/subtask1/dev/tel.csv  
  inflating: /content/subtask1/dev/hin.csv  
  inflating: /content/subtask1/dev/hau.csv  
  inflating: /content/subtask1/dev/pan.csv  
  inflating: /content/subtask1/dev/ori.csv  
  inflating: /content/subtask1/dev/spa.csv  
  inflating: /content/subtask1/dev/deu.csv  
  inflating: /content/subtask1/dev/fas.csv  
  inflating: /content/subtask1/dev/arb.csv  
  inflating: /content/subtask1/dev/ben.csv  
  inflating: /content/subtask1/dev/amh.csv  
  inflating: /content/subtask1/dev/khm.csv  
  inflating: /content/subtask1/dev/tur.csv  
  inflating: /content/subtask1/dev/zho.csv  
  inflating: /content/subtask1/dev/eng.csv  
  i

In [None]:
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import torch
import torch.nn as nn
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset
import wandb
import os
from datetime import datetime

# Disable Weights & Biases logging: start a no-op run and set the environment
# flag as well, so nothing is reported during training.
wandb.init(mode="disabled")
os.environ["WANDB_DISABLED"] = "true"

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

  | |_| | '_ \/ _` / _` |  _/ -_)


Using device: cuda


In [None]:
LANGUAGE = 'swa'  # Language code to run; must be a key of MODELS_CONFIG ('eng', 'swa' or 'amh')

# Label columns read from the CSVs; this order also fixes the order of the
# model outputs and of the columns in the prediction files.
LABEL_ORDER = ['gender/sexual', 'political', 'religious', 'racial/ethnic', 'other']

# Checkpoints to fine-tune, keyed by language code. main() trains every entry
# listed for LANGUAGE, one after another.
MODELS_CONFIG = {
    'eng': [
        'cardiffnlp/twitter-roberta-base-hate-latest',
        'microsoft/deberta-v3-base',
        'FacebookAI/xlm-roberta-base'
    ],
    'swa': [
        'cardiffnlp/twitter-roberta-base-hate-latest',
        'Davlan/afro-xlmr-base'
    ],
    'amh': [
        'cardiffnlp/twitter-roberta-base-hate-latest',
        'Davlan/afro-xlmr-base'
    ]
}

# Training hyperparameters shared by every model.
NUM_EPOCHS = 8          # passed to TrainingArguments.num_train_epochs
BATCH_SIZE = 16         # per-device batch size for both training and evaluation
LEARNING_RATE = 3e-5
MAX_LENGTH = 256        # tokenizer truncation length (includes the instruction prefix)
DROPOUT_RATE = 0.2      # dropout applied before the classification head

# Output locations: one summary row per trained model, and one prediction CSV per model.
RESULTS_CSV = 'results_subtask2_simple.csv'
PREDICTIONS_DIR = 'predictions_subtask2_simple'

In [None]:
class LabelAwareDeberta(nn.Module):
    """
    Transformer encoder with a linear head for multi-label classification.

    The backbone is whatever checkpoint ``model_name`` resolves to through
    ``AutoModel.from_pretrained`` (the attribute is called ``deberta`` only for
    historical reasons). The hidden state of the first token is passed through
    dropout and a single linear layer producing one logit per label.

    Args:
        model_name: Checkpoint identifier handed to ``AutoModel.from_pretrained``.
        num_labels: Number of independent binary labels (size of the output layer).
        dropout_rate: Dropout probability applied to the pooled representation.
        pos_weights: Optional per-label positive-class weights for the BCE loss.
            May also be assigned after construction via ``model.pos_weights = ...``.
    """
    def __init__(self, model_name, num_labels=5, dropout_rate=0.2, pos_weights=None):
        super(LabelAwareDeberta, self).__init__()
        self.num_labels = num_labels

        self.deberta = AutoModel.from_pretrained(model_name)
        hidden_size = self.deberta.config.hidden_size

        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(hidden_size, num_labels)

        nn.init.xavier_uniform_(self.classifier.weight)
        nn.init.zeros_(self.classifier.bias)

        # Always define the attribute so forward() never depends on a caller
        # having injected it; None means "unweighted loss".
        self.pos_weights = pos_weights

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Run the encoder and classification head.

        Args:
            input_ids: Token id tensor forwarded to the backbone.
            attention_mask: Attention mask tensor forwarded to the backbone.
            labels: Optional multi-hot targets with the same shape as the logits.
                When given, a BCE-with-logits loss is computed.

        Returns:
            dict with ``'loss'`` (a scalar tensor, or ``None`` when ``labels`` is
            ``None``) and ``'logits'`` (one raw score per label).
        """
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        # Use the first token's hidden state as the sequence representation.
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # getattr keeps instances created without __init__ (or older pickles) working.
            pos_weight = getattr(self, 'pos_weights', None)
            if pos_weight is not None:
                # Match the logits' device/dtype so externally supplied weights
                # (CPU tensors, numpy arrays, lists) cannot cause a mismatch error.
                pos_weight = torch.as_tensor(pos_weight, dtype=logits.dtype, device=logits.device)
            # BCE requires floating-point targets; integer multi-hot labels are cast here.
            targets = labels.to(device=logits.device, dtype=logits.dtype)
            loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
            loss = loss_fct(logits, targets)

        return {'loss': loss, 'logits': logits}

class PolarizationDatasetWithHints(Dataset):
    """
    Torch dataset that prepends a fixed instruction to every text before tokenizing.

    The instruction lists the label names from ``LABEL_ORDER`` so the encoder sees
    the target categories as part of its input.

    Args:
        texts: Sequence of raw input strings.
        labels: Sequence of multi-hot label vectors, parallel to ``texts``.
        tokenizer: Callable tokenizer; invoked with ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors='pt'``.
        max_length: Truncation length in tokens (the instruction prefix counts
            towards this budget).
    """
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.instruction = "Classify text for polarizing content related to: " + ", ".join(LABEL_ORDER) + ". Text: "

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Tokenize example ``idx``.

        Returns:
            dict holding every tokenizer output as an unpadded 1-D tensor, plus
            ``'labels'`` as a float tensor. Padding is left to the data collator.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        full_text = self.instruction + text

        encoding = self.tokenizer(
            full_text,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Drop only the leading batch axis added by return_tensors='pt'. A bare
        # squeeze() would also collapse a length-1 sequence into a 0-d tensor,
        # which the padding collator cannot handle.
        item = {key: value.squeeze(0) for key, value in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.float)
        return item

def compute_metrics_multilabel(p):
    """
    Metric callback for the Trainer: macro F1 over all labels.

    Logits in ``p.predictions`` are turned into probabilities with a sigmoid and
    thresholded at 0.5; the per-label F1 scores are printed, and the macro
    average is returned under the key ``'f1_macro'``.
    """
    logits = torch.from_numpy(p.predictions)
    hard_preds = torch.sigmoid(logits).gt(0.5).int().numpy()
    y_true = p.label_ids

    class_scores = f1_score(y_true, hard_preds, average=None, zero_division=0)
    macro_score = f1_score(y_true, hard_preds, average='macro', zero_division=0)

    # Pair each label name with its score formatted to three decimals.
    formatted = {name: f'{score:.3f}' for name, score in zip(LABEL_ORDER, class_scores)}
    print(f"\n  Per-class F1: {formatted}")

    return {'f1_macro': macro_score}


def train_and_evaluate_model(model_name, language, X_train, y_train, X_val, y_val, X_test, test_df, pos_weights):
    """
    Fine-tune one checkpoint, score it on the validation split and write test predictions.

    Args:
        model_name: Checkpoint identifier for both the tokenizer and the backbone.
        language: Language code; used in log output and in output file/directory names.
        X_train, y_train: Training texts and multi-hot labels (objects exposing ``.tolist()``).
        X_val, y_val: Validation texts and labels (objects exposing ``.tolist()``).
        X_test: Test texts; only its length is recorded in the returned summary.
        test_df: DataFrame with a ``'text'`` column and, optionally, ``'id'``/``'ID'``
            and the label columns. It is not modified.
        pos_weights: Per-label positive-class weights attached to the model as
            ``pos_weights`` for its BCE loss.

    Returns:
        dict summarising the run: timestamp, model names, language, split sizes,
        validation macro F1, elapsed minutes, and the number of positive test
        predictions per label (``pred_<label>``).

    Side effects:
        Writes checkpoints under ``./output_<language>_<model>_subtask2`` and a
        prediction CSV under ``PREDICTIONS_DIR``.
    """
    print(f"\n{'='*80}")
    print(f"Training: {model_name} on {language.upper()}")
    print(f"{'='*80}")

    start_time = datetime.now()

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_dataset = PolarizationDatasetWithHints(X_train.tolist(), y_train.tolist(), tokenizer, MAX_LENGTH)
    val_dataset = PolarizationDatasetWithHints(X_val.tolist(), y_val.tolist(), tokenizer, MAX_LENGTH)

    model = LabelAwareDeberta(
        model_name=model_name,
        num_labels=len(LABEL_ORDER),
        dropout_rate=DROPOUT_RATE
    ).to(device)

    # The model's forward() picks these up for the weighted BCE loss.
    model.pos_weights = pos_weights

    # Training arguments
    # Approximate optimizer step count, used only to size a 10% warmup.
    total_steps = (len(train_dataset) // BATCH_SIZE) * NUM_EPOCHS
    warmup_steps = int(0.1 * total_steps)

    model_short_name = model_name.split('/')[-1]
    output_dir = f"./output_{language}_{model_short_name}_subtask2"

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=NUM_EPOCHS,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        weight_decay=0.01,
        warmup_steps=warmup_steps,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        logging_steps=50,
        logging_first_step=True,
        report_to="none",
        seed=42
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_multilabel,
        data_collator=DataCollatorWithPadding(tokenizer)
    )

    print(f"\nStarting training...")
    trainer.train()

    print(f"\nEvaluating on validation set...")
    val_results = trainer.evaluate(val_dataset)

    print(f"\nValidation Results:")
    print(f"   Macro F1: {val_results['eval_f1_macro']:.4f}")

    print(f"\nGenerating predictions on test set...")

    if set(LABEL_ORDER).issubset(test_df.columns):
        # Build the labels from a filled copy; the caller's frame is shared
        # across models and must not be modified here.
        test_labels = test_df[LABEL_ORDER].fillna(0).values.tolist()
    else:
        print("Label columns not found in test set. Using dummy labels.")
        test_labels = np.zeros((len(test_df), len(LABEL_ORDER))).tolist()

    test_dataset = PolarizationDatasetWithHints(test_df['text'].tolist(), test_labels, tokenizer, MAX_LENGTH)

    # Skip metric computation for the test pass: the labels may be dummy zeros.
    trainer.compute_metrics = None
    prediction_output = trainer.predict(test_dataset)

    logits = torch.tensor(prediction_output.predictions)
    probs = torch.sigmoid(logits)
    predictions = (probs > 0.5).int().numpy()

    print(f"\nPrediction Distribution:")
    for i, label in enumerate(LABEL_ORDER):
        count = predictions[:, i].sum()
        print(f"   {label:20s}: {count} ({count/len(predictions)*100:.1f}%)")

    training_time = (datetime.now() - start_time).total_seconds() / 60
    print(f"\nTraining time: {training_time:.2f} minutes")

    os.makedirs(PREDICTIONS_DIR, exist_ok=True)

    submission_df = pd.DataFrame(predictions, columns=LABEL_ORDER)

    # Insert ids positionally (.values): inserting the Series itself would align
    # on index and produce NaN ids whenever test_df does not carry a 0..n-1 index.
    if 'id' in test_df.columns:
        submission_df.insert(0, 'id', test_df['id'].values)
    elif 'ID' in test_df.columns:
        submission_df.insert(0, 'id', test_df['ID'].values)

    pred_filename = f"{PREDICTIONS_DIR}/predictions_{language}_{model_short_name}.csv"
    submission_df.to_csv(pred_filename, index=False)
    print(f"\nPredictions saved to {pred_filename}")

    # Compile results
    results = {
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'model_name': model_name,
        'model_short_name': model_short_name,
        'language': language,
        'train_samples': len(X_train),
        'val_samples': len(X_val),
        'test_samples': len(X_test),
        'val_f1_macro': val_results['eval_f1_macro'],
        'training_time_minutes': training_time
    }

    # Add per-class predictions
    for i, label in enumerate(LABEL_ORDER):
        results[f'pred_{label}'] = int(predictions[:, i].sum())

    return results

In [None]:
def main():
    """
    Run the full experiment for ``LANGUAGE``.

    Steps: load the train/dev CSVs, derive per-label positive-class weights,
    split the training data 80/20 (stratified on the most frequent label when
    possible), then train every checkpoint configured for the language. After
    each successful model one summary row is appended to ``RESULTS_CSV``; a
    failing model is reported with its traceback and skipped.

    Returns:
        None. Returns early if a data file is missing or no models are configured.
    """
    import traceback

    print(f"\n{'='*80}")
    print(f"MULTI-LABEL POLARIZATION CLASSIFICATION - SUBTASK 2")
    print(f"Language: {LANGUAGE.upper()}")
    print(f"{'='*80}")

    # Load data
    print(f"\nLoading data for {LANGUAGE}...")
    train_path = f'./subtask2/train/{LANGUAGE}.csv'
    dev_path = f'./subtask2/dev/{LANGUAGE}.csv'

    try:
        train_full = pd.read_csv(train_path)
        test_df = pd.read_csv(dev_path)
    except FileNotFoundError as e:
        print(f"Error: {e}")
        return

    X = train_full['text'].values
    y = train_full[LABEL_ORDER].values

    print(f"\nCalculating class weights...")
    pos_counts = train_full[LABEL_ORDER].sum().values
    num_samples = len(train_full)
    neg_counts = num_samples - pos_counts

    # weight = negatives / positives; labels with no positives keep weight 1.0.
    # Dividing only where positives exist avoids a divide-by-zero warning.
    has_positives = pos_counts > 0
    pos_weights = np.ones(len(LABEL_ORDER), dtype=float)
    pos_weights[has_positives] = neg_counts[has_positives] / pos_counts[has_positives]
    pos_weights = torch.tensor(pos_weights, dtype=torch.float).to(device)

    print(f"   Label distribution:")
    for i, col in enumerate(LABEL_ORDER):
        print(f"      {col:20s}: {int(pos_counts[i])} positive ({pos_counts[i]/num_samples*100:.1f}%) - weight: {pos_weights[i]:.2f}")
    print(f"\nCreating train/val split...")
    # Multi-label data cannot be stratified directly, so stratify on the single
    # most frequent label as a proxy.
    label_sums = y.sum(axis=0)
    primary_label_idx = np.argmax(label_sums)
    primary_labels = y[:, primary_label_idx]

    _, counts = np.unique(primary_labels, return_counts=True)
    if np.min(counts) >= 2:
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=primary_labels
        )
        print(f"   Stratified split on '{LABEL_ORDER[primary_label_idx]}'")
    else:
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        print(f"   Random split (stratification not possible)")

    print(f"   Train: {len(X_train)}, Val: {len(X_val)}")

    X_test = test_df['text'].values

    models_to_test = MODELS_CONFIG.get(LANGUAGE, [])
    if not models_to_test:
        print(f"No models configured for language: {LANGUAGE}")
        return

    print(f"\nModels to test: {len(models_to_test)}")
    for i, model in enumerate(models_to_test, 1):
        print(f"   {i}. {model}")

    all_results = []
    for model_name in models_to_test:
        try:
            results = train_and_evaluate_model(
                model_name, LANGUAGE,
                X_train, y_train,
                X_val, y_val,
                X_test,
                test_df,
                pos_weights
            )
            all_results.append(results)

            # Append only this model's row. Writing the whole all_results list
            # on every iteration would re-append rows that an earlier iteration
            # of this same run already saved, duplicating them in the CSV.
            new_row_df = pd.DataFrame([results])
            if os.path.exists(RESULTS_CSV):
                existing_df = pd.read_csv(RESULTS_CSV)
                results_df = pd.concat([existing_df, new_row_df], ignore_index=True)
            else:
                results_df = new_row_df

            results_df.to_csv(RESULTS_CSV, index=False)
            print(f"\nResults updated in {RESULTS_CSV}")

        except Exception as e:
            # Best effort: report the failure and move on to the next checkpoint.
            print(f"\nError training {model_name}: {e}")
            traceback.print_exc()
            continue

    print(f"\n{'='*80}")
    print(f"TRAINING COMPLETE")
    print(f"{'='*80}")
    print(f"\nSuccessfully trained {len(all_results)} out of {len(models_to_test)} models")
    print(f"Results saved to: {RESULTS_CSV}")
    print(f"Predictions saved to: {PREDICTIONS_DIR}/")

    if all_results:
        print(f"\nBest Model (by Validation Macro F1):")
        best_model = max(all_results, key=lambda x: x['val_f1_macro'])
        print(f"   Model: {best_model['model_short_name']}")
        print(f"   Val Macro F1: {best_model['val_f1_macro']:.4f}")
        print(f"   Training Time: {best_model['training_time_minutes']:.1f} min")


In [None]:
# Run the experiment when this cell (or the exported script) is executed directly.
if __name__ == "__main__":
    main()


MULTI-LABEL POLARIZATION CLASSIFICATION - SUBTASK 2
Language: SWA

Loading data for swa...

Calculating class weights...
   Label distribution:
      gender/sexual       : 156 positive (2.2%) - weight: 43.81
      political           : 186 positive (2.7%) - weight: 36.59
      religious           : 247 positive (3.5%) - weight: 27.30
      racial/ethnic       : 2483 positive (35.5%) - weight: 1.82
      other               : 555 positive (7.9%) - weight: 11.60

Creating train/val split...
   Stratified split on 'racial/ethnic'
   Train: 5592, Val: 1399

Models to test: 2
   1. cardiffnlp/twitter-roberta-base-hate-latest
   2. Davlan/afro-xlmr-base

Training: cardiffnlp/twitter-roberta-base-hate-latest on SWA


Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-hate-latest and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training...


Epoch,Training Loss,Validation Loss,F1 Macro
1,1.2919,1.230787,0.318155
2,1.0875,1.1645,0.403319
3,0.9535,1.039917,0.449533
4,0.8495,0.974735,0.406313
5,0.8938,1.184267,0.441543
6,0.5423,1.77317,0.518381
7,0.5222,1.772285,0.470428
8,0.484,1.910242,0.491322



  Per-class F1: {'gender/sexual': '0.000', 'political': '0.229', 'religious': '0.698', 'racial/ethnic': '0.664', 'other': '0.000'}

  Per-class F1: {'gender/sexual': '0.000', 'political': '0.353', 'religious': '0.865', 'racial/ethnic': '0.739', 'other': '0.060'}

  Per-class F1: {'gender/sexual': '0.097', 'political': '0.333', 'religious': '0.841', 'racial/ethnic': '0.728', 'other': '0.248'}

  Per-class F1: {'gender/sexual': '0.156', 'political': '0.163', 'religious': '0.644', 'racial/ethnic': '0.755', 'other': '0.313'}

  Per-class F1: {'gender/sexual': '0.124', 'political': '0.216', 'religious': '0.807', 'racial/ethnic': '0.756', 'other': '0.305'}

  Per-class F1: {'gender/sexual': '0.248', 'political': '0.416', 'religious': '0.876', 'racial/ethnic': '0.744', 'other': '0.309'}

  Per-class F1: {'gender/sexual': '0.238', 'political': '0.319', 'religious': '0.748', 'racial/ethnic': '0.733', 'other': '0.315'}

  Per-class F1: {'gender/sexual': '0.273', 'political': '0.333', 'religious


  Per-class F1: {'gender/sexual': '0.248', 'political': '0.416', 'religious': '0.876', 'racial/ethnic': '0.744', 'other': '0.309'}

Validation Results:
   Macro F1: 0.5184

Generating predictions on test set...

Prediction Distribution:
   gender/sexual       : 18 (5.2%)
   political           : 11 (3.2%)
   religious           : 13 (3.7%)
   racial/ethnic       : 143 (41.0%)
   other               : 42 (12.0%)

Training time: 16.33 minutes

Predictions saved to predictions_subtask2_simple/predictions_swa_twitter-roberta-base-hate-latest.csv

Results updated in results_subtask2_simple.csv

Training: Davlan/afro-xlmr-base on SWA


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training...


Epoch,Training Loss,Validation Loss,F1 Macro
1,1.402,1.117366,0.40328
2,0.9671,1.118938,0.445639
3,0.7887,1.059054,0.486392
4,0.6551,1.012074,0.455211
5,0.7101,1.269283,0.482165
6,0.4019,1.79432,0.532885
7,0.3274,1.780744,0.518326
8,0.2393,1.887171,0.522191



  Per-class F1: {'gender/sexual': '0.296', 'political': '0.151', 'religious': '0.812', 'racial/ethnic': '0.757', 'other': '0.000'}

  Per-class F1: {'gender/sexual': '0.240', 'political': '0.224', 'religious': '0.832', 'racial/ethnic': '0.747', 'other': '0.186'}

  Per-class F1: {'gender/sexual': '0.145', 'political': '0.404', 'religious': '0.847', 'racial/ethnic': '0.774', 'other': '0.261'}

  Per-class F1: {'gender/sexual': '0.190', 'political': '0.250', 'religious': '0.774', 'racial/ethnic': '0.770', 'other': '0.292'}

  Per-class F1: {'gender/sexual': '0.229', 'political': '0.329', 'religious': '0.786', 'racial/ethnic': '0.750', 'other': '0.317'}

  Per-class F1: {'gender/sexual': '0.275', 'political': '0.429', 'religious': '0.844', 'racial/ethnic': '0.773', 'other': '0.344'}

  Per-class F1: {'gender/sexual': '0.250', 'political': '0.457', 'religious': '0.786', 'racial/ethnic': '0.758', 'other': '0.341'}

  Per-class F1: {'gender/sexual': '0.253', 'political': '0.404', 'religious


  Per-class F1: {'gender/sexual': '0.275', 'political': '0.429', 'religious': '0.844', 'racial/ethnic': '0.773', 'other': '0.344'}

Validation Results:
   Macro F1: 0.5329

Generating predictions on test set...

Prediction Distribution:
   gender/sexual       : 7 (2.0%)
   political           : 16 (4.6%)
   religious           : 13 (3.7%)
   racial/ethnic       : 134 (38.4%)
   other               : 37 (10.6%)

Training time: 22.37 minutes

Predictions saved to predictions_subtask2_simple/predictions_swa_afro-xlmr-base.csv

Results updated in results_subtask2_simple.csv

TRAINING COMPLETE

Successfully trained 2 out of 2 models
Results saved to: results_subtask2_simple.csv
Predictions saved to: predictions_subtask2_simple/

Best Model (by Validation Macro F1):
   Model: afro-xlmr-base
   Val Macro F1: 0.5329
   Training Time: 22.4 min


In [None]:
# Scratch cell: prints a fixed string; nothing else in the notebook depends on it.
print("hello")

hello
