In [None]:
# Only if using colab
# Mounts Google Drive at /content/drive and extracts the dataset archive
# stored there into /content/. Skip this cell when running outside Colab.
from google.colab import drive
drive.mount('/content/drive')
# Location of the dataset archive inside the mounted Drive.
zip_path = "/content/drive/My Drive/dev_phase.zip"
# IPython shell escape: "$zip_path" is interpolated from the Python variable above.
!unzip "$zip_path" -d /content/


Mounted at /content/drive
Archive:  /content/drive/My Drive/dev_phase.zip
   creating: /content/subtask1/
   creating: /content/subtask1/dev/
  inflating: /content/subtask1/dev/nep.csv  
  inflating: /content/subtask1/dev/ita.csv  
  inflating: /content/subtask1/dev/pol.csv  
  inflating: /content/subtask1/dev/rus.csv  
  inflating: /content/subtask1/dev/tel.csv  
  inflating: /content/subtask1/dev/hin.csv  
  inflating: /content/subtask1/dev/hau.csv  
  inflating: /content/subtask1/dev/pan.csv  
  inflating: /content/subtask1/dev/ori.csv  
  inflating: /content/subtask1/dev/spa.csv  
  inflating: /content/subtask1/dev/deu.csv  
  inflating: /content/subtask1/dev/fas.csv  
  inflating: /content/subtask1/dev/arb.csv  
  inflating: /content/subtask1/dev/ben.csv  
  inflating: /content/subtask1/dev/amh.csv  
  inflating: /content/subtask1/dev/khm.csv  
  inflating: /content/subtask1/dev/tur.csv  
  inflating: /content/subtask1/dev/zho.csv  
  inflating: /content/subtask1/dev/eng.csv  
  i

In [None]:
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import torch
import torch.nn as nn
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset
import wandb
import os
from datetime import datetime

# Disable wandb
# Start a run in "disabled" mode and also set the WANDB_DISABLED environment
# variable (presumably read by downstream integrations - verify).
wandb.init(mode="disabled")
os.environ["WANDB_DISABLED"] = "true"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is read as a module-level global by the training code below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

  | |_| | '_ \/ _` / _` |  _/ -_)


Using device: cuda


In [None]:
# Language code of the dataset to train on; also the key used to pick the
# model list from MODELS_CONFIG and the CSV file name under ./subtask3/.
LANGUAGE = 'swa'  # Change to 'eng', 'swa' or 'amh'

# Label columns read from the CSV files. The order here fixes the column order
# of the label vectors, the model outputs and the saved prediction files.
LABEL_ORDER = ['stereotype', 'vilification', 'dehumanization', 'extreme_language', 'lack_of_empathy', 'invalidation']

# Language code -> list of pretrained checkpoint names to fine-tune in turn.
MODELS_CONFIG = {
    'eng': [
        'cardiffnlp/twitter-roberta-base-hate-latest',
        'microsoft/deberta-v3-base',
        'FacebookAI/xlm-roberta-base'
    ],
    'swa': [
        'cardiffnlp/twitter-roberta-base-hate-latest',
        'Davlan/afro-xlmr-base'
    ],
    'amh': [
        'cardiffnlp/twitter-roberta-base-hate-latest',
        'Davlan/afro-xlmr-base'
    ]
}

# Training hyperparameters (passed to TrainingArguments / the datasets / the model).
NUM_EPOCHS = 8          # num_train_epochs
BATCH_SIZE = 16         # per-device batch size for both training and evaluation
LEARNING_RATE = 3e-5    # learning_rate
MAX_LENGTH = 256        # tokenizer truncation length (instruction prefix included)
DROPOUT_RATE = 0.2      # dropout applied before the classification head

# Output locations: one CSV of run summaries and one directory of per-model predictions.
RESULTS_CSV = 'results_subtask3_simple.csv'
PREDICTIONS_DIR = 'predictions_subtask3_simple'

In [None]:
class ManifestationClassifier(nn.Module):
    """
    Multi-label classifier for manifestation identification.

    Architecture: pretrained encoder -> first-token representation -> dropout
    -> single linear layer producing one logit per label.

    Args:
        model_name: Checkpoint name/path handed to ``AutoModel.from_pretrained``.
        num_labels: Number of independent binary labels (output logits).
        dropout_rate: Dropout probability applied before the linear head.
        pos_weights: Optional 1-D tensor with one positive-class weight per
            label, forwarded to ``BCEWithLogitsLoss(pos_weight=...)``. It may
            also be assigned after construction via ``model.pos_weights = ...``.
    """
    def __init__(self, model_name, num_labels=6, dropout_rate=0.2, pos_weights=None):
        super(ManifestationClassifier, self).__init__()
        self.num_labels = num_labels

        # Load base model.
        # The attribute is named `deberta` but holds whatever encoder AutoModel
        # resolves for `model_name`; the name is kept so state_dict keys of
        # existing checkpoints stay loadable.
        self.deberta = AutoModel.from_pretrained(model_name)
        hidden_size = self.deberta.config.hidden_size

        # Simple classifier
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(hidden_size, num_labels)

        # Initialize weights
        nn.init.xavier_uniform_(self.classifier.weight)
        nn.init.zeros_(self.classifier.bias)

        # Declared explicitly (instead of being discovered with hasattr) so the
        # attribute always exists. It is a plain attribute, not a buffer, so it
        # does not enter the state_dict and is NOT moved by `.to(device)`;
        # forward() moves it to the logits' device when computing the loss.
        self.pos_weights = pos_weights

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Run the encoder and the classification head.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.
            labels: Optional multi-hot targets with the same shape as the
                logits. Any numeric dtype is accepted.

        Returns:
            dict with ``'loss'`` (``None`` when ``labels`` is ``None``,
            otherwise the BCE-with-logits loss) and ``'logits'``.
        """
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state of the first token as the sequence representation.
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # getattr keeps instances created without __init__ working.
            pos_weight = getattr(self, 'pos_weights', None)
            if pos_weight is not None:
                # Plain tensor attributes are not relocated with the module,
                # so align the device here to avoid a device-mismatch error.
                pos_weight = pos_weight.to(logits.device)
            loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
            # BCEWithLogitsLoss rejects integer targets; 0/1 labels are exact
            # in any floating dtype, so casting is lossless.
            loss = loss_fct(logits, labels.to(dtype=logits.dtype))

        return {'loss': loss, 'logits': logits}

# ==================== DATASET ====================
class ManifestationDataset(Dataset):
    """
    Dataset that prefixes every text with a task instruction and tokenizes it.

    Args:
        texts: Indexable sequence of raw texts.
        labels: Indexable sequence of per-sample label vectors.
        tokenizer: Callable tokenizer; invoked with ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors='pt'`` and expected to return a
            mapping of tensors with a leading batch dimension of 1.
        max_length: Truncation length. The instruction prefix counts towards it.
        instruction: Optional prefix prepended to every text. When ``None`` the
            default instruction listing the names in ``LABEL_ORDER`` is used.
    """
    def __init__(self, texts, labels, tokenizer, max_length=256, instruction=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        if instruction is None:
            instruction = "Identify polarization manifestations: " + ", ".join(LABEL_ORDER) + ". Text: "
        self.instruction = instruction

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Tokenize sample ``idx``.

        Returns:
            dict of unpadded 1-D tensors for every tokenizer output key, plus
            ``'labels'`` as a float tensor. Padding is left to the collator.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Empty CSV cells arrive as None or NaN (a float); concatenating those
        # with the instruction string would raise TypeError. `text != text` is
        # the NaN check.
        if text is None or (isinstance(text, float) and text != text):
            text = ""
        elif not isinstance(text, str):
            text = str(text)

        full_text = self.instruction + text

        encoding = self.tokenizer(
            full_text,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Drop only the batch dimension. A bare squeeze() would also collapse a
        # length-1 sequence into a 0-dim tensor and break padding downstream.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        item['labels'] = torch.tensor(label, dtype=torch.float)
        return item

# ==================== METRICS ====================
def compute_metrics_multilabel(p):
    """
    Score multi-label predictions with F1.

    ``p.predictions`` holds raw logits and ``p.label_ids`` the reference
    labels. Logits go through a sigmoid and are binarised at 0.5. The
    per-class F1 values are printed; only the macro average is returned,
    under the key ``'f1_macro'``.
    """
    logits = torch.from_numpy(p.predictions)
    binary_preds = (torch.sigmoid(logits) > 0.5).int().numpy()
    references = p.label_ids

    # zero_division=0: a class with no positives scores 0 instead of warning.
    macro_score = f1_score(references, binary_preds, average='macro', zero_division=0)
    class_scores = f1_score(references, binary_preds, average=None, zero_division=0)

    formatted = {name: f'{score:.3f}' for name, score in zip(LABEL_ORDER, class_scores)}
    print(f"\n  Per-class F1: {formatted}")

    return {'f1_macro': macro_score}

# ==================== TRAIN AND EVALUATE MODEL ====================
def train_and_evaluate_model(model_name, language, X_train, y_train, X_val, y_val, X_test, test_df, pos_weights):
    """
    Fine-tune one checkpoint, evaluate it on the validation split and write
    test-set predictions to ``PREDICTIONS_DIR``.

    Args:
        model_name: Checkpoint name passed to the tokenizer and the classifier.
        language: Language code; used only in log lines and output names.
        X_train, y_train: Training texts and label matrix (array-likes that
            support ``.tolist()``).
        X_val, y_val: Validation texts and label matrix (same requirements).
        X_test: Test texts; only its length is recorded in the results.
        test_df: Test DataFrame with a ``'text'`` column and optionally the
            ``LABEL_ORDER`` columns and an ``'id'``/``'ID'`` column. It is not
            modified.
        pos_weights: Per-label positive weights attached to the model for the
            loss.

    Returns:
        dict summarising the run: timestamp, model names, language, sample
        counts, validation macro F1, elapsed minutes and the number of
        positive test predictions per label (``pred_<label>``).
    """
    print(f"\n{'='*80}")
    print(f"Training: {model_name} on {language.upper()}")
    print(f"{'='*80}")

    start_time = datetime.now()
    seed = 42

    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Create datasets
    train_dataset = ManifestationDataset(X_train.tolist(), y_train.tolist(), tokenizer, MAX_LENGTH)
    val_dataset = ManifestationDataset(X_val.tolist(), y_val.tolist(), tokenizer, MAX_LENGTH)

    # Seed BEFORE building the model: Trainer only applies `seed` inside its
    # own constructor, which runs after the randomly initialised layers
    # (classification head, fresh pooler weights) have already been drawn.
    torch.manual_seed(seed)

    # Initialize model
    model = ManifestationClassifier(
        model_name=model_name,
        num_labels=len(LABEL_ORDER),
        dropout_rate=DROPOUT_RATE
    ).to(device)

    model.pos_weights = pos_weights

    # Training arguments
    # Ceil division: the last, partial batch of each epoch is still a step.
    steps_per_epoch = (len(train_dataset) + BATCH_SIZE - 1) // BATCH_SIZE
    total_steps = steps_per_epoch * NUM_EPOCHS
    warmup_steps = int(0.1 * total_steps)

    model_short_name = model_name.split('/')[-1]
    output_dir = f"./output_{language}_{model_short_name}_subtask3"

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=NUM_EPOCHS,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        weight_decay=0.01,
        warmup_steps=warmup_steps,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        logging_steps=50,
        logging_first_step=True,
        report_to="none",
        seed=seed
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_multilabel,
        data_collator=DataCollatorWithPadding(tokenizer)
    )

    # Train
    print(f"\nStarting training...")
    trainer.train()

    # Evaluate on validation set (best checkpoint is loaded at end of training)
    print(f"\nEvaluating on validation set...")
    val_results = trainer.evaluate(val_dataset)

    print(f"\nValidation Results:")
    print(f"   Macro F1: {val_results['eval_f1_macro']:.4f}")

    # Predict on test set
    print(f"\nGenerating predictions on test set...")

    # Handle test labels. The dataset needs a label per row, so missing values
    # become 0. Work on a copy: the caller reuses `test_df` for every model.
    if set(LABEL_ORDER).issubset(test_df.columns):
        test_labels = test_df[LABEL_ORDER].fillna(0).values.tolist()
    else:
        print("Label columns not found in test set. Using dummy labels.")
        test_labels = np.zeros((len(test_df), len(LABEL_ORDER))).tolist()

    test_dataset = ManifestationDataset(test_df['text'].tolist(), test_labels, tokenizer, MAX_LENGTH)

    # Disable metrics for prediction (test labels may be dummies)
    trainer.compute_metrics = None
    prediction_output = trainer.predict(test_dataset)

    # Convert to binary predictions
    logits = torch.tensor(prediction_output.predictions)
    probs = torch.sigmoid(logits)
    predictions = (probs > 0.5).int().numpy()

    # Print distribution
    print(f"\nPrediction Distribution:")
    for i, label in enumerate(LABEL_ORDER):
        count = predictions[:, i].sum()
        print(f"   {label:20s}: {count} ({count/len(predictions)*100:.1f}%)")

    training_time = (datetime.now() - start_time).total_seconds() / 60
    print(f"\nTraining time: {training_time:.2f} minutes")

    # Save predictions
    os.makedirs(PREDICTIONS_DIR, exist_ok=True)

    submission_df = pd.DataFrame(predictions, columns=LABEL_ORDER)

    # Insert ids positionally (.values): inserting the Series would align on
    # index and yield NaN ids whenever test_df has a non-default index.
    if 'id' in test_df.columns:
        submission_df.insert(0, 'id', test_df['id'].values)
    elif 'ID' in test_df.columns:
        submission_df.insert(0, 'id', test_df['ID'].values)

    pred_filename = f"{PREDICTIONS_DIR}/predictions_{language}_{model_short_name}.csv"
    submission_df.to_csv(pred_filename, index=False)
    print(f"\nPredictions saved to {pred_filename}")

    # Compile results
    results = {
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'model_name': model_name,
        'model_short_name': model_short_name,
        'language': language,
        'train_samples': len(X_train),
        'val_samples': len(X_val),
        'test_samples': len(X_test),
        'val_f1_macro': val_results['eval_f1_macro'],
        'training_time_minutes': training_time
    }

    # Add per-class predictions
    for i, label in enumerate(LABEL_ORDER):
        results[f'pred_{label}'] = int(predictions[:, i].sum())

    return results

In [None]:
def main():
    """
    End-to-end driver for the language selected in ``LANGUAGE``.

    Steps: load the train/dev CSVs, derive per-label positive weights, make an
    80/20 train/validation split, fine-tune every checkpoint configured for the
    language, append one summary row per successful run to ``RESULTS_CSV`` and
    print the best run by validation macro F1.

    Returns ``None``. Returns early (after printing a message) when a data file
    is missing or no models are configured for the language.
    """
    print(f"\n{'='*80}")
    print(f"MANIFESTATION CLASSIFICATION - SUBTASK 3")
    print(f"Language: {LANGUAGE.upper()}")
    print(f"{'='*80}")

    # Load data
    print(f"\nLoading data for {LANGUAGE}...")
    train_path = f'./subtask3/train/{LANGUAGE}.csv'
    dev_path = f'./subtask3/dev/{LANGUAGE}.csv'

    try:
        train_full = pd.read_csv(train_path)
        test_df = pd.read_csv(dev_path)
    except FileNotFoundError as e:
        print(f"Error: {e}")
        return

    # Extract features and labels
    X = train_full['text'].values
    y = train_full[LABEL_ORDER].values

    # Calculate pos_weights = negatives / positives per label
    print(f"\nCalculating class weights...")
    pos_counts = train_full[LABEL_ORDER].sum().values
    num_samples = len(train_full)
    neg_counts = num_samples - pos_counts

    # Divide only where a label has positives; the other entries keep the
    # neutral weight 1.0. (np.where would evaluate the division for every
    # entry first and emit divide-by-zero warnings.)
    pos_weights = np.divide(
        neg_counts,
        pos_counts,
        out=np.ones(len(LABEL_ORDER), dtype=float),
        where=pos_counts > 0
    )
    pos_weights = torch.tensor(pos_weights, dtype=torch.float).to(device)

    print(f"   Label distribution:")
    for i, col in enumerate(LABEL_ORDER):
        print(f"      {col:20s}: {int(pos_counts[i])} positive ({pos_counts[i]/num_samples*100:.1f}%) - weight: {pos_weights[i]:.2f}")

    # Train/val split, stratified on the most frequent label when possible
    print(f"\nCreating train/val split...")
    label_sums = y.sum(axis=0)
    primary_label_idx = np.argmax(label_sums)
    primary_labels = y[:, primary_label_idx]

    # Stratification needs at least two samples in every class.
    _, counts = np.unique(primary_labels, return_counts=True)
    if np.min(counts) >= 2:
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=primary_labels
        )
        print(f"   Stratified split on '{LABEL_ORDER[primary_label_idx]}'")
    else:
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        print(f"   Random split (stratification not possible)")

    print(f"   Train: {len(X_train)}, Val: {len(X_val)}")

    X_test = test_df['text'].values

    # Get models for this language
    models_to_test = MODELS_CONFIG.get(LANGUAGE, [])
    if not models_to_test:
        print(f"No models configured for language: {LANGUAGE}")
        return

    print(f"\nModels to test: {len(models_to_test)}")
    for i, model in enumerate(models_to_test, 1):
        print(f"   {i}. {model}")

    # Train all models
    all_results = []
    for model_name in models_to_test:
        try:
            results = train_and_evaluate_model(
                model_name, LANGUAGE,
                X_train, y_train,
                X_val, y_val,
                X_test,
                test_df,
                pos_weights
            )
            all_results.append(results)

            # Save results after each model. Append ONLY this run's row:
            # the file already holds the rows written by earlier iterations,
            # so re-appending `all_results` would duplicate them.
            results_df = pd.DataFrame([results])

            if os.path.exists(RESULTS_CSV):
                existing_df = pd.read_csv(RESULTS_CSV)
                results_df = pd.concat([existing_df, results_df], ignore_index=True)

            results_df.to_csv(RESULTS_CSV, index=False)
            print(f"\nResults updated in {RESULTS_CSV}")

        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the
            # next model instead of aborting the whole sweep.
            print(f"\nError training {model_name}: {e}")
            import traceback
            traceback.print_exc()
            continue

    # Print summary
    print(f"\n{'='*80}")
    print(f"TRAINING COMPLETE")
    print(f"{'='*80}")
    print(f"\nSuccessfully trained {len(all_results)} out of {len(models_to_test)} models")
    print(f"Results saved to: {RESULTS_CSV}")
    print(f"Predictions saved to: {PREDICTIONS_DIR}/")

    if all_results:
        print(f"\nBest Model (by Validation Macro F1):")
        best_model = max(all_results, key=lambda x: x['val_f1_macro'])
        print(f"   Model: {best_model['model_short_name']}")
        print(f"   Val Macro F1: {best_model['val_f1_macro']:.4f}")
        print(f"   Training Time: {best_model['training_time_minutes']:.1f} min")

In [None]:
# Run the full pipeline when this cell is executed. The guard keeps main()
# from running if this code is imported as a module instead.
if __name__ == "__main__":
    main()


MANIFESTATION CLASSIFICATION - SUBTASK 3
Language: SWA

Loading data for swa...

Calculating class weights...
   Label distribution:
      stereotype          : 2775 positive (39.7%) - weight: 1.52
      vilification        : 2883 positive (41.2%) - weight: 1.42
      dehumanization      : 893 positive (12.8%) - weight: 6.83
      extreme_language    : 1673 positive (23.9%) - weight: 3.18
      lack_of_empathy     : 2080 positive (29.8%) - weight: 2.36
      invalidation        : 1637 positive (23.4%) - weight: 3.27

Creating train/val split...
   Stratified split on 'vilification'
   Train: 5592, Val: 1399

Models to test: 2
   1. cardiffnlp/twitter-roberta-base-hate-latest
   2. Davlan/afro-xlmr-base

Training: cardiffnlp/twitter-roberta-base-hate-latest on SWA


Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-hate-latest and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training...


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.9308,0.88615,0.510872
2,0.8651,0.831958,0.554195
3,0.8188,0.821776,0.558768
4,0.7883,0.852088,0.561213
5,0.7094,0.926818,0.549465
6,0.6474,1.003363,0.554712
7,0.5483,1.059301,0.563871
8,0.5567,1.10634,0.564276



  Per-class F1: {'stereotype': '0.675', 'vilification': '0.659', 'dehumanization': '0.253', 'extreme_language': '0.429', 'lack_of_empathy': '0.544', 'invalidation': '0.504'}

  Per-class F1: {'stereotype': '0.695', 'vilification': '0.694', 'dehumanization': '0.307', 'extreme_language': '0.492', 'lack_of_empathy': '0.596', 'invalidation': '0.541'}

  Per-class F1: {'stereotype': '0.727', 'vilification': '0.705', 'dehumanization': '0.301', 'extreme_language': '0.489', 'lack_of_empathy': '0.608', 'invalidation': '0.523'}

  Per-class F1: {'stereotype': '0.722', 'vilification': '0.693', 'dehumanization': '0.303', 'extreme_language': '0.505', 'lack_of_empathy': '0.602', 'invalidation': '0.543'}

  Per-class F1: {'stereotype': '0.720', 'vilification': '0.681', 'dehumanization': '0.309', 'extreme_language': '0.479', 'lack_of_empathy': '0.581', 'invalidation': '0.526'}

  Per-class F1: {'stereotype': '0.717', 'vilification': '0.696', 'dehumanization': '0.266', 'extreme_language': '0.500', 'la


  Per-class F1: {'stereotype': '0.727', 'vilification': '0.701', 'dehumanization': '0.302', 'extreme_language': '0.492', 'lack_of_empathy': '0.613', 'invalidation': '0.550'}

Validation Results:
   Macro F1: 0.5643

Generating predictions on test set...

Prediction Distribution:
   stereotype          : 187 (53.6%)
   vilification        : 179 (51.3%)
   dehumanization      : 121 (34.7%)
   extreme_language    : 153 (43.8%)
   lack_of_empathy     : 169 (48.4%)
   invalidation        : 160 (45.8%)

Training time: 17.56 minutes

Predictions saved to predictions_subtask3_simple/predictions_swa_twitter-roberta-base-hate-latest.csv

Results updated in results_subtask3_simple.csv

Training: Davlan/afro-xlmr-base on SWA


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training...


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.955,0.890266,0.505058
2,0.8131,0.82005,0.540572
3,0.756,0.824564,0.564171
4,0.7324,0.827669,0.584409
5,0.6003,0.963477,0.573645
6,0.5485,0.965549,0.578483
7,0.4749,0.997974,0.58208
8,0.5021,1.06757,0.578747



  Per-class F1: {'stereotype': '0.707', 'vilification': '0.640', 'dehumanization': '0.237', 'extreme_language': '0.419', 'lack_of_empathy': '0.522', 'invalidation': '0.506'}

  Per-class F1: {'stereotype': '0.708', 'vilification': '0.699', 'dehumanization': '0.281', 'extreme_language': '0.460', 'lack_of_empathy': '0.593', 'invalidation': '0.503'}

  Per-class F1: {'stereotype': '0.738', 'vilification': '0.714', 'dehumanization': '0.307', 'extreme_language': '0.490', 'lack_of_empathy': '0.609', 'invalidation': '0.527'}

  Per-class F1: {'stereotype': '0.746', 'vilification': '0.715', 'dehumanization': '0.346', 'extreme_language': '0.520', 'lack_of_empathy': '0.620', 'invalidation': '0.559'}

  Per-class F1: {'stereotype': '0.719', 'vilification': '0.711', 'dehumanization': '0.324', 'extreme_language': '0.515', 'lack_of_empathy': '0.627', 'invalidation': '0.546'}

  Per-class F1: {'stereotype': '0.729', 'vilification': '0.713', 'dehumanization': '0.337', 'extreme_language': '0.515', 'la


  Per-class F1: {'stereotype': '0.746', 'vilification': '0.715', 'dehumanization': '0.346', 'extreme_language': '0.520', 'lack_of_empathy': '0.620', 'invalidation': '0.559'}

Validation Results:
   Macro F1: 0.5844

Generating predictions on test set...

Prediction Distribution:
   stereotype          : 153 (43.8%)
   vilification        : 164 (47.0%)
   dehumanization      : 150 (43.0%)
   extreme_language    : 166 (47.6%)
   lack_of_empathy     : 163 (46.7%)
   invalidation        : 167 (47.9%)

Training time: 26.01 minutes

Predictions saved to predictions_subtask3_simple/predictions_swa_afro-xlmr-base.csv

Results updated in results_subtask3_simple.csv

TRAINING COMPLETE

Successfully trained 2 out of 2 models
Results saved to: results_subtask3_simple.csv
Predictions saved to: predictions_subtask3_simple/

Best Model (by Validation Macro F1):
   Model: afro-xlmr-base
   Val Macro F1: 0.5844
   Training Time: 26.0 min
