In [None]:
# Colab-only setup cell: skip it when running outside Google Colab
# (the `google.colab` import is only available there).
# Mounts Google Drive, then extracts the dataset archive into /content/.
from google.colab import drive
drive.mount('/content/drive')

# Path of the dataset archive on the mounted Drive; adjust if it is stored elsewhere.
zip_path = "/content/drive/My Drive/dev_phase.zip"
# IPython shell escape: "$zip_path" is expanded from the Python variable above.
!unzip "$zip_path" -d /content/


Mounted at /content/drive
Archive:  /content/drive/My Drive/dev_phase.zip
   creating: /content/subtask1/
   creating: /content/subtask1/dev/
  inflating: /content/subtask1/dev/nep.csv  
  inflating: /content/subtask1/dev/ita.csv  
  inflating: /content/subtask1/dev/pol.csv  
  inflating: /content/subtask1/dev/rus.csv  
  inflating: /content/subtask1/dev/tel.csv  
  inflating: /content/subtask1/dev/hin.csv  
  inflating: /content/subtask1/dev/hau.csv  
  inflating: /content/subtask1/dev/pan.csv  
  inflating: /content/subtask1/dev/ori.csv  
  inflating: /content/subtask1/dev/spa.csv  
  inflating: /content/subtask1/dev/deu.csv  
  inflating: /content/subtask1/dev/fas.csv  
  inflating: /content/subtask1/dev/arb.csv  
  inflating: /content/subtask1/dev/ben.csv  
  inflating: /content/subtask1/dev/amh.csv  
  inflating: /content/subtask1/dev/khm.csv  
  inflating: /content/subtask1/dev/tur.csv  
  inflating: /content/subtask1/dev/zho.csv  
  inflating: /content/subtask1/dev/eng.csv  
  i

In [None]:
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import torch
import torch.nn as nn
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset
import wandb
import os
from datetime import datetime

# Disable Weights & Biases logging: start a run in disabled mode and also set
# the WANDB_DISABLED environment variable.
wandb.init(mode="disabled")
os.environ["WANDB_DISABLED"] = "true"

# Use the GPU when one is visible, otherwise fall back to CPU. The model and
# the class-weight tensor are moved to this device further down the notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


  | |_| | '_ \/ _` / _` |  _/ -_)


Using device: cuda


In [None]:
# Language code of the run. It selects both the CSV files that main() reads
# (./subtask1/train/<LANGUAGE>.csv and ./subtask1/dev/<LANGUAGE>.csv) and the
# list of checkpoints taken from MODELS_CONFIG.
LANGUAGE = 'swa'  # Change to 'eng', 'swa' or 'amh'

# Hugging Face checkpoint names to fine-tune, keyed by language code.
# Every listed model is trained in turn; a language without an entry is skipped.
MODELS_CONFIG = {
    'eng': [
        'cardiffnlp/twitter-roberta-base-hate-latest',
        'microsoft/deberta-v3-base',
        'FacebookAI/xlm-roberta-base'
    ],
    'swa': [
        'cardiffnlp/twitter-roberta-base-hate-latest',
        'Davlan/afro-xlmr-base'
    ],
    'amh': [
        'cardiffnlp/twitter-roberta-base-hate-latest',
        'Davlan/afro-xlmr-base'
    ]
}

# Training hyperparameters shared by every model in the sweep.
NUM_EPOCHS = 5          # passed to TrainingArguments.num_train_epochs
BATCH_SIZE = 16         # per-device batch size for both training and evaluation
LEARNING_RATE = 2e-5    # passed to TrainingArguments.learning_rate
MAX_LENGTH = 256        # tokenizer truncation length (instruction prefix included)
DROPOUT_RATE = 0.2      # dropout applied before the classification head

# Output locations (relative to the working directory).
RESULTS_CSV = 'results_subtask1_simple.csv'       # one metrics row per trained model
PREDICTIONS_DIR = 'predictions_subtask1_simple'   # one predictions CSV per model

In [None]:
class LabelAwareDebertaBinary(nn.Module):
    """
    Pretrained encoder with a linear head for binary classification
    (0: Polarized, 1: Not Polarized).

    The hidden state of the first token is passed through dropout and a
    single linear layer. If a ``class_weights`` tensor has been attached to
    the instance, it is used as the per-class weight of the cross-entropy
    loss; otherwise the loss is unweighted.
    """
    def __init__(self, model_name, num_labels=2, dropout_rate=0.2):
        super().__init__()

        # Backbone loaded from the checkpoint; its hidden width sizes the head
        self.deberta = AutoModel.from_pretrained(model_name)
        encoder_width = self.deberta.config.hidden_size

        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(encoder_width, num_labels)

        # Head initialisation: Xavier-uniform weights, zero bias
        nn.init.xavier_uniform_(self.classifier.weight)
        nn.init.zeros_(self.classifier.bias)

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Return a dict with 'loss' (None when no labels are given) and 'logits'.
        """
        encoder_out = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        # Representation of the first token of every sequence
        first_token = encoder_out.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))

        if labels is None:
            return {'loss': None, 'logits': logits}

        # Optional per-class weights attached to the instance after construction
        weights = getattr(self, 'class_weights', None)
        criterion = nn.CrossEntropyLoss(weight=weights)
        return {'loss': criterion(logits, labels.view(-1)), 'logits': logits}


class PolarizationDatasetBinary(Dataset):
    """
    Map-style dataset that tokenizes one text per item for binary classification.

    Each text is prefixed with a fixed instruction string before tokenization.
    Items are returned unpadded (padding is left to the data collator) as a
    dict of 1-D tensors plus a scalar ``labels`` tensor.

    Args:
        texts: Sequence of raw texts (each is converted with ``str``).
        labels: Sequence of labels, index-aligned with ``texts``.
        tokenizer: Callable tokenizer; called with ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors='pt'``.
        max_length: Truncation length in tokens, instruction prefix included.
    """
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.instruction = "Classify if this text is polarizing (0) or not polarized (1). Text: "

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])

        # Labels that cannot be parsed as an int (e.g. NaN or None on an
        # unlabeled split) deliberately fall back to 0 so the item stays usable.
        try:
            label = int(self.labels[idx])
        except (ValueError, TypeError):
            label = 0

        encoding = self.tokenizer(
            self.instruction + text,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The tokenizer returns tensors with a leading batch dimension of 1.
        # Drop only that dimension: a bare squeeze() would also collapse a
        # length-1 sequence into a 0-d tensor, which the collator cannot pad.
        item = {key: value.squeeze(0) for key, value in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item


def compute_metrics_binary(p):
    """
    Turn a prediction object into macro F1, weighted F1 and accuracy.

    ``p.predictions`` holds the logits (argmax over axis 1 gives the predicted
    class) and ``p.label_ids`` holds the reference labels.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)

    metrics = {
        'f1_macro': f1_score(y_true, y_pred, average='macro'),
        'f1_weighted': f1_score(y_true, y_pred, average='weighted'),
        'accuracy': accuracy_score(y_true, y_pred),
    }
    return metrics

def train_and_evaluate_model(model_name, language, X_train, y_train, X_val, y_val, X_test, test_df, class_weights, seed=42):
    """
    Fine-tune one checkpoint, score it on the validation split and save test predictions.

    Args:
        model_name: Hugging Face checkpoint name to load the tokenizer and encoder from.
        language: Language code; used in log lines, the output directory and file names.
        X_train, y_train: Training texts and labels (objects exposing ``.tolist()``).
        X_val, y_val: Validation texts and labels (objects exposing ``.tolist()``).
        X_test: Test texts; only their count is recorded in the result row.
        test_df: DataFrame with a 'text' column, optionally 'polarization' and 'id'/'ID'.
        class_weights: Tensor of per-class loss weights attached to the model.
        seed: Seed applied before the model is built and passed on to the Trainer.

    Returns:
        dict with run metadata, validation metrics, predicted class counts and
        the elapsed time in minutes.

    Side effects:
        Writes checkpoints under ``./output_<language>_<model>_subtask1`` and a
        predictions CSV under ``PREDICTIONS_DIR``.
    """
    import gc
    import math

    print(f"\n{'='*80}")
    print(f"Training: {model_name} on {language.upper()}")
    print(f"{'='*80}")

    start_time = datetime.now()

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_dataset = PolarizationDatasetBinary(X_train.tolist(), y_train.tolist(), tokenizer, MAX_LENGTH)
    val_dataset = PolarizationDatasetBinary(X_val.tolist(), y_val.tolist(), tokenizer, MAX_LENGTH)

    # The classification head is randomly initialised when the model is
    # constructed, which happens before the Trainer (and its seed) exists.
    # Seed here so repeated runs start from the same head weights.
    torch.manual_seed(seed)

    model = LabelAwareDebertaBinary(
        model_name=model_name,
        num_labels=2,
        dropout_rate=DROPOUT_RATE
    ).to(device)

    model.class_weights = class_weights

    # Training arguments
    # The last, partial batch of an epoch is still an optimizer step, so round up.
    # Assumes a single device and no gradient accumulation, as configured below.
    steps_per_epoch = math.ceil(len(train_dataset) / BATCH_SIZE)
    total_steps = steps_per_epoch * NUM_EPOCHS
    warmup_steps = int(0.1 * total_steps)

    model_short_name = model_name.split('/')[-1]
    output_dir = f"./output_{language}_{model_short_name}_subtask1"

    trainer = None
    try:
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=NUM_EPOCHS,
            learning_rate=LEARNING_RATE,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            weight_decay=0.01,
            warmup_steps=warmup_steps,
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="f1_macro",
            greater_is_better=True,
            logging_steps=50,
            report_to="none",
            seed=seed
        )

        # Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics_binary,
            data_collator=DataCollatorWithPadding(tokenizer)
        )

        print(f"\nStarting training...")
        trainer.train()

        print(f"\nEvaluating on validation set...")
        val_results = trainer.evaluate(val_dataset)

        print(f"\nValidation Results:")
        print(f"   Macro F1:    {val_results['eval_f1_macro']:.4f}")
        print(f"   Weighted F1: {val_results['eval_f1_weighted']:.4f}")
        print(f"   Accuracy:    {val_results['eval_accuracy']:.4f}")

        print(f"\nGenerating predictions on test set...")

        # The dataset class always needs labels; use the real ones when the
        # column exists and dummy zeros otherwise (metrics are disabled below).
        if 'polarization' in test_df.columns:
            test_labels = test_df['polarization'].fillna(0).astype(int).tolist()
        else:
            test_labels = [0] * len(test_df)

        test_dataset = PolarizationDatasetBinary(test_df['text'].tolist(), test_labels, tokenizer, MAX_LENGTH)

        # Test labels may be placeholders, so do not compute metrics on them.
        trainer.compute_metrics = None
        prediction_output = trainer.predict(test_dataset)

        logits = prediction_output.predictions
        predictions = np.argmax(logits, axis=1)

        n_predictions = len(predictions)
        count_0 = (predictions == 0).sum()
        count_1 = (predictions == 1).sum()
        # Guard the percentages against an empty test set.
        pct_0 = count_0 / n_predictions * 100 if n_predictions else 0.0
        pct_1 = count_1 / n_predictions * 100 if n_predictions else 0.0
        print(f"\nPrediction Distribution:")
        print(f"   Polarized (0):     {count_0} ({pct_0:.1f}%)")
        print(f"   Not Polarized (1): {count_1} ({pct_1:.1f}%)")

        # Elapsed wall-clock time for the whole call (training, evaluation, prediction).
        training_time = (datetime.now() - start_time).total_seconds() / 60
        print(f"\nTraining time: {training_time:.2f} minutes")

        # Save predictions
        os.makedirs(PREDICTIONS_DIR, exist_ok=True)

        submission_df = pd.DataFrame()
        if 'id' in test_df.columns:
            submission_df['id'] = test_df['id']
        elif 'ID' in test_df.columns:
            submission_df['id'] = test_df['ID']

        submission_df['polarization'] = predictions

        pred_filename = f"{PREDICTIONS_DIR}/predictions_{language}_{model_short_name}.csv"
        submission_df.to_csv(pred_filename, index=False)
        print(f"\nPredictions saved to {pred_filename}")

        results = {
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'model_name': model_name,
            'model_short_name': model_short_name,
            'language': language,
            'train_samples': len(X_train),
            'val_samples': len(X_val),
            'test_samples': len(X_test),
            'val_f1_macro': val_results['eval_f1_macro'],
            'val_f1_weighted': val_results['eval_f1_weighted'],
            'val_accuracy': val_results['eval_accuracy'],
            'pred_polarized': int(count_0),
            'pred_not_polarized': int(count_1),
            'training_time_minutes': training_time
        }

        return results
    finally:
        # Several models are trained back to back; drop this model and its
        # trainer (on success or failure) so the next one starts with free memory.
        trainer = None
        model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


In [None]:
def main():
    """
    Run the full sweep for ``LANGUAGE``: load data, train every configured model, log results.

    Reads ``./subtask1/train/<LANGUAGE>.csv`` and ``./subtask1/dev/<LANGUAGE>.csv``,
    derives balanced class weights, makes a stratified 80/20 train/validation
    split and calls ``train_and_evaluate_model`` for each checkpoint listed in
    ``MODELS_CONFIG``. After each successful model, one row is appended to
    ``RESULTS_CSV``. A failure in one model is printed and the sweep continues.

    Raises:
        ValueError: if the training CSV has neither a 'polarization' column nor
            the multilabel columns, or if either class is absent from the labels.
    """
    import traceback

    print(f"\n{'='*80}")
    print(f"BINARY POLARIZATION CLASSIFICATION - SUBTASK 1")
    print(f"Language: {LANGUAGE.upper()}")
    print(f"{'='*80}")

    # Load data
    print(f"\nLoading data for {LANGUAGE}...")
    train_path = f'./subtask1/train/{LANGUAGE}.csv'
    dev_path = f'./subtask1/dev/{LANGUAGE}.csv'

    try:
        train_full = pd.read_csv(train_path)
        test_df = pd.read_csv(dev_path)
    except FileNotFoundError as e:
        print(f"Error: {e}")
        return

    if 'polarization' not in train_full.columns:
        print("Warning: 'polarization' column not found. Deriving from multilabel columns...")
        old_labels = ['gender/sexual', 'political', 'religious', 'racial/ethnic', 'other']
        if set(old_labels).issubset(train_full.columns):
            # A row with at least one multilabel flag set becomes class 0, all others class 1.
            # NOTE(review): confirm this 0/1 orientation matches the convention of a
            # native 'polarization' column, which is used as-is when present.
            is_polarized = train_full[old_labels].sum(axis=1) > 0
            train_full['polarization'] = np.where(is_polarized, 0, 1)
        else:
            raise ValueError("Input CSV must have 'polarization' column or multilabel columns.")

    X = train_full['text'].values
    y = train_full['polarization'].values

    print(f"\nCalculating class weights...")
    labels_unique, counts = np.unique(y, return_counts=True)
    class_counts = dict(zip(labels_unique, counts))
    print(f"   Class distribution: {class_counts}")

    count_0 = class_counts.get(0, 0)
    count_1 = class_counts.get(1, 0)
    if count_0 == 0 or count_1 == 0:
        # Without this guard the weight formula below divides by zero.
        raise ValueError(
            f"Training data must contain both classes 0 and 1; found {class_counts}"
        )
    total = count_0 + count_1

    # Balanced weights: total / (n_classes * class_count)
    weight_0 = (1 / count_0) * (total / 2.0)
    weight_1 = (1 / count_1) * (total / 2.0)

    class_weights = torch.tensor([weight_0, weight_1], dtype=torch.float).to(device)
    print(f"   Class weights: 0={weight_0:.2f}, 1={weight_1:.2f}")

    print(f"\nCreating train/val split...")
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"   Train: {len(X_train)}, Val: {len(X_val)}")

    X_test = test_df['text'].values

    models_to_test = MODELS_CONFIG.get(LANGUAGE, [])
    if not models_to_test:
        print(f"No models configured for language: {LANGUAGE}")
        return

    print(f"\nModels to test: {len(models_to_test)}")
    for i, model in enumerate(models_to_test, 1):
        print(f"   {i}. {model}")

    all_results = []
    for model_name in models_to_test:
        try:
            results = train_and_evaluate_model(
                model_name, LANGUAGE,
                X_train, y_train,
                X_val, y_val,
                X_test,
                test_df,
                class_weights
            )
            all_results.append(results)

            # Append only the row for the model that just finished. Writing the
            # whole of all_results on top of the existing file would duplicate
            # the rows of every earlier model from this same run.
            results_df = pd.DataFrame([results])

            if os.path.exists(RESULTS_CSV):
                existing_df = pd.read_csv(RESULTS_CSV)
                results_df = pd.concat([existing_df, results_df], ignore_index=True)

            results_df.to_csv(RESULTS_CSV, index=False)
            print(f"\nResults updated in {RESULTS_CSV}")

        except Exception as e:
            # Best effort: report the failure and move on to the next model.
            print(f"\nError training {model_name}: {e}")
            traceback.print_exc()
            continue

    print(f"\n{'='*80}")
    print(f"TRAINING COMPLETE")
    print(f"{'='*80}")
    print(f"\nSuccessfully trained {len(all_results)} out of {len(models_to_test)} models")
    print(f"Results saved to: {RESULTS_CSV}")
    print(f"Predictions saved to: {PREDICTIONS_DIR}/")

    if all_results:
        print(f"\nBest Model (by Validation Macro F1):")
        best_model = max(all_results, key=lambda x: x['val_f1_macro'])
        print(f"   Model: {best_model['model_short_name']}")
        print(f"   Val Macro F1: {best_model['val_f1_macro']:.4f}")
        print(f"   Training Time: {best_model['training_time_minutes']:.1f} min")

In [None]:
# Run the full sweep for the configured LANGUAGE when this cell is executed.
if __name__ == "__main__":
    main()


BINARY POLARIZATION CLASSIFICATION - SUBTASK 1
Language: SWA

Loading data for swa...

Calculating class weights...
   Class distribution: {np.int64(0): np.int64(3487), np.int64(1): np.int64(3504)}
   Class weights: 0=1.00, 1=1.00

Creating train/val split...
   Train: 5592, Val: 1399

Models to test: 2
   1. cardiffnlp/twitter-roberta-base-hate-latest
   2. Davlan/afro-xlmr-base

Training: cardiffnlp/twitter-roberta-base-hate-latest on SWA


Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-hate-latest and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training...


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted,Accuracy
1,0.5878,0.508916,0.75942,0.759353,0.763402
2,0.5067,0.488754,0.763168,0.763184,0.763402
3,0.4337,0.48223,0.76983,0.769827,0.769836
4,0.3194,0.539,0.772633,0.772641,0.772695
5,0.3264,0.55288,0.780482,0.78049,0.780558



Evaluating on validation set...



Validation Results:
   Macro F1:    0.7805
   Weighted F1: 0.7805
   Accuracy:    0.7806

Generating predictions on test set...

Prediction Distribution:
   Polarized (0):     157 (45.0%)
   Not Polarized (1): 192 (55.0%)

Training time: 12.08 minutes

Predictions saved to predictions_subtask1_simple/predictions_swa_twitter-roberta-base-hate-latest.csv

Results updated in results_subtask1_simple.csv

Training: Davlan/afro-xlmr-base on SWA


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting training...


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted,Accuracy
1,0.4984,0.491822,0.770936,0.770918,0.771265
2,0.4549,0.460894,0.785411,0.785423,0.785561
3,0.3804,0.463838,0.788093,0.788111,0.78842
4,0.3111,0.512312,0.793271,0.793283,0.793424
5,0.2703,0.601291,0.793219,0.793233,0.793424



Evaluating on validation set...



Validation Results:
   Macro F1:    0.7933
   Weighted F1: 0.7933
   Accuracy:    0.7934

Generating predictions on test set...

Prediction Distribution:
   Polarized (0):     161 (46.1%)
   Not Polarized (1): 188 (53.9%)

Training time: 16.28 minutes

Predictions saved to predictions_subtask1_simple/predictions_swa_afro-xlmr-base.csv

Results updated in results_subtask1_simple.csv

TRAINING COMPLETE

Successfully trained 2 out of 2 models
Results saved to: results_subtask1_simple.csv
Predictions saved to: predictions_subtask1_simple/

Best Model (by Validation Macro F1):
   Model: afro-xlmr-base
   Val Macro F1: 0.7933
   Training Time: 16.3 min
