# BERT baseline for POLAR

## Introduction

This part of the starter notebook walks you through the full process for all three subtasks.

In [None]:
from IPython.display import clear_output
import time
import os

In [None]:
# Install gdown (used by the download cell below to fetch the data archives),
# then clear the cell output so pip's log does not clutter the notebook.
! pip install -q gdown
clear_output()

In [None]:
! mkdir experiments
! mkdir finetuned_models

## Data Download and Imports

In [None]:
# Google Drive file IDs of the archives downloaded by this cell.
# The per-subtask IDs are kept for reference but are currently disabled.
dev_phase_id = "1tbAwUWN8X2JvXgdarjZ31f4XkqcpFVDk"
# subtask1_id = "1q_I6dw9ZbCg3MbQ1wnC-419s2ocCyqaa"
# subtask2_id = "1iHFDd_uihFi7vukWFq1hj32wfEH4dgBc"
# subtask3_id = "1JA7_BbJDYORbmH06gWzz4-UhXgRBe1eI"
translated_tasks_id = "1wHoKpZo8iMhHOm5TpvSS6Nr63Zk2w-P0"

# Fetch the archives into the current working directory.
! gdown --id $dev_phase_id
! gdown --id $translated_tasks_id
# ! gdown --id $subtask1_id
# ! gdown --id $subtask2_id
# ! gdown --id $subtask3_id

Downloading...
From: https://drive.google.com/uc?id=1tbAwUWN8X2JvXgdarjZ31f4XkqcpFVDk
To: /content/dev_phase.zip
100% 18.9M/18.9M [00:00<00:00, 117MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1wHoKpZo8iMhHOm5TpvSS6Nr63Zk2w-P0
To: /content/translated_subtasks.zip
100% 1.93M/1.93M [00:00<00:00, 81.4MB/s]


In [None]:
# Extract both downloaded archives into the working directory, then clear
# the (long) unzip listing from the cell output.
! unzip dev_phase.zip
! unzip translated_subtasks.zip
clear_output()

In [None]:
# @title Module Imports

import numpy as np
import pandas as pd

from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np

import torch

from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset

In [None]:
import wandb

# Start wandb in "disabled" mode so nothing is logged to Weights & Biases
# from this notebook.
wandb.init(mode="disabled")

  | |_| | '_ \/ _` / _` |  _/ -_)


In [None]:
# @title Dataset Class
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
  """Map-style dataset that pairs raw texts with integer class labels.

  Texts are tokenized lazily in ``__getitem__`` with truncation but without
  padding, so batches must be padded by the collator.

  Args:
    texts: Indexable collection of input texts.
    labels: Indexable collection of integer labels, aligned with ``texts``.
    tokenizer: Callable tokenizer invoked as
      ``tokenizer(text, truncation=..., padding=..., max_length=...,
      return_tensors='pt')``.
    max_length: Maximum number of tokens kept per text.
  """

  def __init__(self, texts, labels, tokenizer, max_length=128):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    """Return the tokenized text at ``idx`` plus its label under ``'labels'``."""
    text = self.texts[idx]
    label = self.labels[idx]
    encoding = self.tokenizer(
        text,
        truncation=True,
        padding=False,
        max_length=self.max_length,
        return_tensors='pt',
    )

    # Drop only the leading batch axis. A bare squeeze() would also collapse
    # a sequence of length 1 into a 0-d tensor, which cannot be padded.
    item = {key: tensor.squeeze(0) for key, tensor in encoding.items()}
    item['labels'] = torch.tensor(label, dtype=torch.long)
    return item

In [None]:
# @title Functions

# Define metrics function
# def compute_metrics(p):
#     preds = np.argmax(p.predictions, axis=1)
#     return {'f1_macro': f1_score(p.label_ids, preds, average='macro')}


# def compute_metrics(p):
#     print("=" * 50)
#     print(f"Type of p.predictions: {type(p.predictions)}")
#     print(f"Type of p.label_ids: {type(p.label_ids)}")

#     if isinstance(p.predictions, tuple):
#         print(f"Predictions is tuple with {len(p.predictions)} elements")
#         for i, item in enumerate(p.predictions):
#             print(f"  Element {i}: {type(item)}, shape: {getattr(item, 'shape', 'no shape')}")
#         logits = p.predictions[0]
#     elif isinstance(p.predictions, np.ndarray):
#         print(f"Predictions shape: {p.predictions.shape}")
#         logits = p.predictions
#     else:
#         print(f"Predictions: {p.predictions}")
#         logits = np.array(p.predictions)

#     print(f"Label_ids shape: {p.label_ids.shape}")
#     print(f"Logits shape after extraction: {logits.shape}")

#     preds = np.argmax(logits, axis=1)
#     print(f"Preds shape after argmax: {preds.shape}")
#     print("=" * 50)

#     return {'f1_macro': f1_score(p.label_ids, preds, average='macro')}

# def compute_metrics(p):
#     # Handle predictions
#     logits = p.predictions
#     preds = np.argmax(logits, axis=1)

#     # Handle labels - convert tuple to array if needed
#     if isinstance(p.label_ids, tuple):
#         labels = np.array(p.label_ids)
#     else:
#         labels = p.label_ids

#     # Flatten if needed
#     if labels.ndim > 1:
#         labels = labels.flatten()

#     return {'f1_macro': f1_score(labels, preds, average='macro')}

def compute_metrics(p):
    """Compute macro and weighted F1 from a prediction object.

    ``p.predictions`` is reduced with an argmax over axis 1 and compared to
    ``p.label_ids``. Labels given as a list/tuple are converted to an array,
    and labels with more than one dimension are flattened first.

    Returns:
        dict with the keys ``'f1_macro'`` and ``'f1_weighted'``.
    """
    predicted = np.argmax(p.predictions, axis=1)

    gold = p.label_ids
    if isinstance(gold, (list, tuple)):
        gold = np.array(gold)
    if gold.ndim > 1:
        gold = gold.flatten()

    macro = f1_score(gold, predicted, average='macro')
    weighted = f1_score(gold, predicted, average='weighted')
    return {'f1_macro': macro, 'f1_weighted': weighted}

In [None]:
# @title Experiment Class

from typing import List

import yaml
import numpy as np
import torch


class Experiment:
  """Record of one experiment (metadata plus parameters) that is saved as YAML.

  Parameters are stored in ``self.parameters``. A parameter whose class is
  ``None`` or ``'global'`` (case-insensitive) is stored at the top level;
  any other parameter is stored in a nested dict keyed by its class name.

  Args:
    name: Experiment identifier.
    dir: Path of the YAML file written by ``save``.
    description: Free-text description of the experiment.
    baseline: Optional identifier of the experiment this one is compared to.
  """

  # Value types accepted by ``update_param``. The tensor entry must be the
  # class ``torch.Tensor``: ``torch.tensor`` is a factory function and makes
  # ``isinstance`` raise for every value that reaches it.
  _ALLOWED_VALUE_TYPES = (
      int, float, str, dict, list,
      np.ndarray, torch.Tensor,
  )

  def __init__(self, name, dir, description, baseline=None):
    self.name = name
    self.dir = dir
    self.description = description
    self.parameters = dict()
    self.baseline = baseline

  def update_param(self, parameter: 'Parameter'):
    """Insert or overwrite a single parameter.

    Raises:
      TypeError: if the parameter value is not one of the accepted types.
    """
    var_name = parameter.get_var_name()
    parameter_class = parameter.get_parameter_class()
    value = parameter.get_value()

    if not isinstance(value, self._ALLOWED_VALUE_TYPES):
      raise TypeError(
          f"Unsupported value type for parameter {var_name!r}: "
          f"{type(value).__name__}"
      )

    if (parameter_class is None) or (parameter_class.lower() == 'global'):
      self.parameters[var_name] = value
      return

    if parameter_class not in self.parameters:
      self.parameters[parameter_class] = dict()
    self.parameters[parameter_class][var_name] = value

  def save(self):
    """Write name, baseline, description and parameters to ``self.dir`` as YAML."""
    experiment_dict = {
        'name': self.name,
        'baseline': self.baseline,
        'description': self.description,
        'parameters': self.parameters,
    }

    with open(self.dir, "w") as f:
      yaml.dump(experiment_dict, f, default_flow_style=False)

    print(f"Model saved to {self.dir}")

  def add_params(self, parameters: List['Parameter']):
    """Apply ``update_param`` to every parameter in ``parameters``, in order."""
    for parameter in parameters:
      self.update_param(parameter)


class Parameter:
  """A named value together with the parameter class (group) it belongs to.

  The three fields are kept private and are read and written only through
  the accessor methods below.
  """

  def __init__(self, value, var_name, parameter_class):
    self.__value = value
    self.__var_name = var_name
    self.__parameter_class = parameter_class

  # --- readers ---

  def get_value(self):
    """Return the stored value."""
    return self.__value

  def get_var_name(self):
    """Return the parameter's name."""
    return self.__var_name

  def get_parameter_class(self):
    """Return the parameter's class (group), which may be ``None``."""
    return self.__parameter_class

  # --- writers ---

  def set_value(self, value):
    """Replace the stored value."""
    self.__value = value

  def set_var_name(self, var_name):
    """Replace the parameter's name."""
    self.__var_name = var_name

  def set_parameter_class(self, parameter_class):
    """Replace the parameter's class (group)."""
    self.__parameter_class = parameter_class

# MultiTask Trainer Classes

In [None]:
# @title PolarPairs Approach

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from transformers import AutoModel, AutoTokenizer, Trainer
import numpy as np

class PolarPairsDataset(Dataset):
    """Source texts paired with a freshly sampled, class-balanced set of targets.

    Every item holds one tokenized source text and ``subset_size`` tokenized
    target texts, drawn at random on each access so that half of them (rounded
    down) carry label 1 and the rest label 0.

    Args:
        source_texts: Indexable collection of source texts.
        target_texts: Indexable collection of target texts.
        source_labels: Labels aligned with ``source_texts``.
        target_labels: Binary (0/1) labels aligned with ``target_texts``.
        tokenizer: Either a model name/path (loaded with
            ``AutoTokenizer.from_pretrained``) or an already constructed
            tokenizer, as accepted by the other dataset classes in this file.
        max_length: Length every sequence is padded/truncated to.
        subset_size: Number of target texts sampled per source text.
    """

    def __init__(self, source_texts, target_texts, source_labels, target_labels,
                 tokenizer, max_length=128, subset_size=20):
        self.source_labels = source_labels
        self.target_labels = np.array(target_labels)
        self.source_texts = source_texts
        self.target_texts = target_texts
        # Only a name/path needs loading; a tokenizer instance is used as is.
        if isinstance(tokenizer, str):
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.subset_size = subset_size

        # Pre-compute indices by class for stratified sampling
        self.pos_indices = np.where(self.target_labels == 1)[0]
        self.neg_indices = np.where(self.target_labels == 0)[0]

        print(f"Target distribution - Positive: {len(self.pos_indices)}, "
              f"Negative: {len(self.neg_indices)}")

    def __len__(self):
        return len(self.source_texts)

    def __getitem__(self, idx):
        """Return the encoded source at ``idx`` with a random balanced target set.

        The result has the keys ``x_input_ids`` / ``x_attention_mask`` (source,
        batch axis removed), ``x_hat_input_ids`` / ``x_hat_attention_mask``
        (one row per sampled target), ``polar_labels`` and ``hat_labels``.
        """
        source = str(self.source_texts[idx])
        source_label = int(self.source_labels[idx])

        # Stratified sampling: ensure balanced positive/negative targets
        n_pos = self.subset_size // 2
        n_neg = self.subset_size - n_pos

        # Sample with replacement only when a class has too few examples
        pos_sample = np.random.choice(self.pos_indices, size=n_pos,
                                      replace=len(self.pos_indices) < n_pos)
        neg_sample = np.random.choice(self.neg_indices, size=n_neg,
                                      replace=len(self.neg_indices) < n_neg)

        target_indices = np.concatenate([pos_sample, neg_sample])
        np.random.shuffle(target_indices)

        target_texts_batch = [str(self.target_texts[i]) for i in target_indices]
        target_labels_batch = [int(self.target_labels[i]) for i in target_indices]

        # Encode
        source_encoding = self.tokenizer(
            source, max_length=self.max_length, padding='max_length',
            truncation=True, return_tensors='pt'
        )

        target_encoding = self.tokenizer(
            target_texts_batch, max_length=self.max_length, padding='max_length',
            truncation=True, return_tensors='pt'
        )

        return {
            'x_input_ids': source_encoding['input_ids'].squeeze(0),
            'x_attention_mask': source_encoding['attention_mask'].squeeze(0),
            'x_hat_input_ids': target_encoding['input_ids'],
            'x_hat_attention_mask': target_encoding['attention_mask'],
            'polar_labels': torch.tensor(source_label, dtype=torch.long),
            'hat_labels': torch.tensor(target_labels_batch, dtype=torch.long)
        }


class MultiLingualPolarPairsAlignment(nn.Module):
  """Classifier whose source embedding is produced next to a frozen reference encoder.

  ``encoder`` is trained and embeds the source text; ``pretrained_encoder``
  is a frozen copy that embeds the sampled target texts. The source [CLS]
  vector goes through a residual alignment head (optionally layer-normalised)
  and is then classified; target [CLS] vectors are returned untouched.

  Args:
    encoder_model_name: Currently unused (see the note in ``__init__``).
    pretrained_encoder_name: Name/path both encoders are loaded from.
    num_labels: Number of output classes.
    alignment_normalization: Apply the alignment LayerNorm when True.
  """

  def __init__(self, encoder_model_name, pretrained_encoder_name, num_labels, alignment_normalization=True):
    super().__init__()

    # NOTE(review): encoder_model_name is accepted but never read; both
    # encoders are loaded from pretrained_encoder_name. Confirm whether the
    # trainable encoder was meant to use encoder_model_name.
    self.encoder = AutoModel.from_pretrained(pretrained_encoder_name)
    self.encoder.train()
    self.pretrained_encoder = AutoModel.from_pretrained(pretrained_encoder_name)

    # Freeze the reference encoder: without this its weights still receive
    # gradients through the alignment loss and drift during training.
    for param in self.pretrained_encoder.parameters():
      param.requires_grad = False
    self.pretrained_encoder.eval()

    encoder_config = self.encoder.config
    embedding_size = encoder_config.hidden_size

    self.alignment_head = nn.Linear(embedding_size, embedding_size)
    self.classification_head = nn.Linear(embedding_size, num_labels)
    self.alignment_layer_norm = nn.LayerNorm(embedding_size)
    self.alignment_normalization = alignment_normalization

  def train(self, mode: bool = True):
    """Switch train/eval mode while keeping the reference encoder in eval mode.

    ``nn.Module.train`` recurses into every submodule, so a plain
    ``model.train()`` would undo the ``eval()`` call made in ``__init__``
    and re-enable dropout in the reference encoder.
    """
    super().train(mode)
    self.pretrained_encoder.eval()
    return self

  def forward(self, x_input_ids, x_attention_mask, x_hat_input_ids, x_hat_attention_mask, polar_labels, hat_labels):
    """Embed sources and targets and classify the sources.

    ``x_hat_input_ids`` / ``x_hat_attention_mask`` are unpacked as
    (batch_size, subset_size, seq_len). ``polar_labels`` and ``hat_labels``
    are accepted for interface compatibility and are not used here.

    Returns:
      Tuple ``(logits, x1_aligned, x2_aligned)`` with shapes
      (batch_size, num_labels), (batch_size, hidden_size) and
      (batch_size, subset_size, hidden_size).
    """
    x1_hidden = self.encoder(
        input_ids=x_input_ids,
        attention_mask=x_attention_mask
    ).last_hidden_state  # (batch_size, seq_len, hidden_size)

    # Flatten the targets so the encoder sees an ordinary 2-D batch
    batch_size, subset_size, seq_len = x_hat_input_ids.shape
    x_hat_input_ids_flat = x_hat_input_ids.reshape(batch_size * subset_size, seq_len)
    x_hat_attention_mask_flat = x_hat_attention_mask.reshape(batch_size * subset_size, seq_len)

    # The reference encoder is frozen, so no autograd graph is needed for it
    with torch.no_grad():
      x2_hidden_flat = self.pretrained_encoder(
          input_ids=x_hat_input_ids_flat,
          attention_mask=x_hat_attention_mask_flat
      ).last_hidden_state  # (batch_size * subset_size, seq_len, hidden_size)

    # Restore the (batch, subset) layout
    hidden_size = x2_hidden_flat.size(-1)
    x2_hidden = x2_hidden_flat.view(batch_size, subset_size, seq_len, hidden_size)

    # Extract CLS tokens
    x1_cls = x1_hidden[:, 0, :]  # (batch_size, hidden_size)
    x2_cls = x2_hidden[:, :, 0, :]  # (batch_size, subset_size, hidden_size)

    # Residual alignment head on the source embedding
    x1_aligned = self.alignment_head(x1_cls) + x1_cls  # (batch_size, hidden_size)
    if self.alignment_normalization:
      # NOTE(review): x1_cls is added a second time here, on top of the
      # residual above. Kept as is; confirm the double skip is intended.
      x1_aligned = self.alignment_layer_norm(x1_cls + x1_aligned)

    # Use target embeddings directly (no transformation)
    x2_aligned = x2_cls  # (batch_size, subset_size, hidden_size)

    # Classification logits (only from source)
    logits = self.classification_head(x1_aligned)

    return logits, x1_aligned, x2_aligned


def polar_pairs_contrastive_loss(
    logits: torch.Tensor,
    x1_aligned: torch.Tensor,
    x2_aligned: torch.Tensor,
    polar_labels: torch.Tensor,
    hat_labels: torch.Tensor,
    lambda_align: float = 0.5,
    temperature: float = 0.07
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Combine a class-weighted classification loss with a contrastive alignment loss.

    The alignment term compares every source embedding with every target
    embedding in the batch (cosine similarity divided by ``temperature``).
    Targets sharing the source's label are positives, all others negatives.

    Args:
        logits: (batch_size, num_classes) classification scores.
        x1_aligned: (batch_size, hidden_size) source embeddings.
        x2_aligned: (batch_size, subset_size, hidden_size) target embeddings.
        polar_labels: (batch_size,) integer source labels.
        hat_labels: (batch_size, subset_size) integer target labels.
        lambda_align: Weight of the alignment term; the classification term
            is weighted by ``1 - lambda_align``.
        temperature: Softmax temperature for the similarities.

    Returns:
        Tuple ``(total_loss, classification_loss, alignment_loss)``.
    """

    # 1. Classification loss, weighted by inverse class frequency in the batch
    num_classes = logits.size(1)
    class_counts = torch.bincount(polar_labels, minlength=num_classes)
    class_weights = 1.0 / (class_counts.float() + 1e-6)
    class_weights = class_weights / class_weights.sum()
    class_weights = class_weights.to(logits.device)

    classification_loss = F.cross_entropy(logits, polar_labels, weight=class_weights)

    # 2. Contrastive alignment loss
    batch_size, subset_size, hidden_size = x2_aligned.shape
    targets_flat = x2_aligned.reshape(batch_size * subset_size, hidden_size)
    target_labels_flat = hat_labels.reshape(-1)

    # Sources form the rows and targets the columns, so no entry is a
    # "self" pair and nothing is masked out. (Masking the diagonal of a
    # source-by-target matrix drops genuine pairs and can zero the loss.)
    # One row per source suffices: repeating each source subset_size times
    # only duplicates rows and leaves the mean unchanged.
    source_norm = F.normalize(x1_aligned, p=2, dim=1)
    target_norm = F.normalize(targets_flat, p=2, dim=1)
    similarity = torch.mm(source_norm, target_norm.t()) / temperature

    # exp(1 / 0.07) is about 1.6e6, beyond the float16 range, so the
    # exponentials are always taken in float32.
    exp_sim = torch.exp(similarity.float())

    # Positive pairs: target label equals source label
    labels_equal = polar_labels.unsqueeze(1) == target_labels_flat.unsqueeze(0)
    pos_sim = (exp_sim * labels_equal.float()).sum(dim=1)
    all_sim = exp_sim.sum(dim=1)

    # InfoNCE-style loss: -log(pos / (pos + neg)), averaged over sources
    alignment_loss = -torch.log((pos_sim + 1e-8) / (all_sim + 1e-8))
    alignment_loss = alignment_loss.mean()

    # 3. Total loss
    total_loss = (1.0 - lambda_align) * classification_loss + lambda_align * alignment_loss

    return total_loss, classification_loss, alignment_loss


class PolarPairsTrainer(Trainer):
    """Trainer that optimises ``polar_pairs_contrastive_loss``.

    Args:
        lambda_align: Weight of the alignment term in the total loss.
        temperature: Temperature passed to the contrastive loss.
        *args, **kwargs: Forwarded to ``Trainer``.

    NOTE: ``lambda_align`` and ``temperature`` come first in the signature,
    so the usual ``Trainer`` arguments should be passed by keyword.
    """

    # Component losses are logged when global_step is a multiple of this.
    _COMPONENT_LOG_EVERY = 10

    def __init__(self, lambda_align=0.5, temperature=0.07, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.lambda_align = lambda_align
        self.temperature = temperature
        self.label_names = ['polar_labels', 'hat_labels']
        # Last global step at which component losses were logged.
        self._last_component_log_step = -1

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on a PolarPairs batch and return the combined loss.

        Returns ``(loss, {'logits': logits})`` when ``return_outputs`` is
        True, otherwise just the loss.
        """
        logits, x1_aligned, x2_aligned = model(
            x_input_ids=inputs['x_input_ids'],
            x_attention_mask=inputs['x_attention_mask'],
            x_hat_input_ids=inputs['x_hat_input_ids'],
            x_hat_attention_mask=inputs['x_hat_attention_mask'],
            polar_labels=inputs['polar_labels'],
            hat_labels=inputs['hat_labels']
        )

        total_loss, classification_loss, alignment_loss = polar_pairs_contrastive_loss(
            logits=logits,
            x1_aligned=x1_aligned,
            x2_aligned=x2_aligned,
            polar_labels=inputs['polar_labels'],
            hat_labels=inputs['hat_labels'],
            lambda_align=self.lambda_align,
            temperature=self.temperature
        )

        # Log component losses once per qualifying optimizer step, and only
        # while training. global_step does not advance between gradient
        # accumulation micro-batches or during evaluation, so without the
        # extra checks the same step would be logged many times.
        step = self.state.global_step
        last_logged = getattr(self, '_last_component_log_step', -1)
        if (model.training
                and step % self._COMPONENT_LOG_EVERY == 0
                and step != last_logged):
            self._last_component_log_step = step
            self.log({
                'classification_loss': classification_loss.item(),
                'alignment_loss': alignment_loss.item(),
                'lambda': self.lambda_align
            })

        if return_outputs:
            outputs = {'logits': logits}
            return total_loss, outputs
        else:
            return total_loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Evaluate one batch and return ``(loss, logits, labels)``.

        ``logits`` and ``labels`` are ``None`` when ``prediction_loss_only``
        is set; the labels reported are the source labels (``polar_labels``).
        """
        # Same input preparation (device placement) the stock Trainer does
        # before its own prediction step.
        inputs = self._prepare_inputs(inputs)

        with torch.no_grad():
            loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
            logits = outputs['logits']

        labels = inputs['polar_labels'].detach().cpu()

        if prediction_loss_only:
            return (loss.detach(), None, None)

        return (loss.detach(), logits.detach().cpu(), labels)


# Enhanced collator
from dataclasses import dataclass
from typing import Any, Dict, List

@dataclass
class PolarPairsCollator:
    """Stack per-example PolarPairs features into one batch dictionary."""

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Collate ``features`` into batched tensors.

        The four encoding entries and ``hat_labels`` are stacked along a new
        leading axis; ``polar_labels`` is rebuilt as a 1-D long tensor.
        """
        encoding_keys = (
            'x_input_ids',
            'x_attention_mask',
            'x_hat_input_ids',
            'x_hat_attention_mask',
        )
        batch = {
            key: torch.stack([example[key] for example in features])
            for key in encoding_keys
        }

        source_labels = [self._label_to_int(example['polar_labels']) for example in features]
        batch['polar_labels'] = torch.tensor(source_labels, dtype=torch.long)
        batch['hat_labels'] = torch.stack([example['hat_labels'] for example in features])

        return batch

    @staticmethod
    def _label_to_int(label):
        """Reduce a label (0-d tensor, longer tensor, or plain number) to an int."""
        if not isinstance(label, torch.Tensor):
            return int(label)
        if label.dim() == 0:
            return label.item()
        # For a tensor with one or more axes only its first element is used.
        return label[0].item()

In [None]:
# @title MT5

from transformers import MT5ForConditionalGeneration, T5Tokenizer
import torch.nn as nn

class MultiTaskDataset(Dataset):
    """Dataset yielding a classification example and a translation pair per index.

    Index ``i`` combines ``texts[i]`` / ``labels[i]`` (classification) with
    ``source_texts[i]`` / ``target_texts[i]`` (translation). All texts are
    padded/truncated to ``max_length`` by the same tokenizer.
    """

    def __init__(self, texts, labels, source_texts, target_texts, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def _encode(self, raw_text):
        """Tokenize one text to fixed length and return the tokenizer output."""
        return self.tokenizer(
            str(raw_text),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return classification and translation tensors for example ``idx``."""
        classification = self._encode(self.texts[idx])
        translation_source = self._encode(self.source_texts[idx])
        translation_target = self._encode(self.target_texts[idx])

        # Padding positions in the translation target are set to -100.
        target_ids = translation_target['input_ids'].squeeze(0)
        translation_labels = target_ids.masked_fill(
            target_ids == self.tokenizer.pad_token_id, -100
        )

        class_label = torch.tensor(int(self.labels[idx]), dtype=torch.long)

        return {
            'input_ids': classification['input_ids'].squeeze(0),
            'attention_mask': classification['attention_mask'].squeeze(0),
            'labels': class_label,
            'translation_input_ids': translation_source['input_ids'].squeeze(0),
            'translation_attention_mask': translation_source['attention_mask'].squeeze(0),
            'translation_labels': translation_labels
        }


class MultiTaskMT5(nn.Module):
    """mT5 wrapper with a classification head next to the translation model.

    Classification uses only the mT5 encoder: its last hidden states are
    mean-pooled over non-padding tokens and fed to a dropout + linear head.
    Translation runs the full ``MT5ForConditionalGeneration`` model.

    Args:
        model_name: Name/path of the pretrained mT5 checkpoint.
        num_labels: Number of classification classes.
    """
    def __init__(self, model_name='google/mt5-small', num_labels=3):
        super(MultiTaskMT5, self).__init__()

        # Load pretrained mT5
        self.mt5 = MT5ForConditionalGeneration.from_pretrained(model_name)
        self.config = self.mt5.config

        # Classification head (uses encoder output)
        self.classification_head = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(self.config.d_model, num_labels)
        )

    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
        """Enable gradient checkpointing for the underlying mT5 model"""
        self.mt5.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs)

    def gradient_checkpointing_disable(self):
        """Disable gradient checkpointing for the underlying mT5 model"""
        self.mt5.gradient_checkpointing_disable()

    def freeze_base_model(self):
        """Freeze the mT5 encoder and decoder; keep lm_head and the classification head trainable."""
        print("Freezing mT5 base model (encoder + decoder)...")

        # Freeze encoder
        for param in self.mt5.encoder.parameters():
            param.requires_grad = False

        # Freeze decoder
        for param in self.mt5.decoder.parameters():
            param.requires_grad = False

        # Keep lm_head (language modeling head) trainable for translation
        # This is the final projection layer that outputs vocabulary logits
        for param in self.mt5.lm_head.parameters():
            param.requires_grad = True

        # Classification head is trainable by default (not frozen)

        # Print trainable parameters
        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
        total_params = sum(p.numel() for p in self.parameters())
        print(f"Trainable parameters: {trainable_params:,} / {total_params:,} ({100 * trainable_params / total_params:.2f}%)")

    def forward(self,
                input_ids=None, attention_mask=None, labels=None,
                translation_input_ids=None, translation_attention_mask=None,
                translation_labels=None):
        """Run whichever tasks have inputs and return their losses and logits.

        Classification runs when ``input_ids`` is given (a missing
        ``attention_mask`` means "attend to every token"). Translation runs
        when both ``translation_input_ids`` and ``translation_labels`` are
        given. ``loss`` is the sum of the available task losses, or ``None``
        when neither task produced one.

        Returns:
            dict with the keys ``loss``, ``classification_loss``,
            ``translation_loss``, ``logits`` (alias of
            ``classification_logits``), ``classification_logits`` and
            ``translation_logits``.
        """
        classification_loss = None
        classification_logits = None

        # Classification task
        if input_ids is not None:
            # The signature allows attention_mask=None, so fall back to a
            # mask of ones; the pooling below needs an actual tensor.
            if attention_mask is None:
                attention_mask = torch.ones_like(input_ids)

            encoder_outputs = self.mt5.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            hidden_states = encoder_outputs.last_hidden_state

            # Mean over non-padding tokens. The count is clamped to at least
            # one so a row made only of padding yields zeros, not NaN.
            token_mask = attention_mask.unsqueeze(-1)
            token_counts = attention_mask.sum(1, keepdim=True).clamp(min=1)
            pooled_output = (hidden_states * token_mask).sum(1) / token_counts
            classification_logits = self.classification_head(pooled_output)

            if labels is not None:
                loss_fct = nn.CrossEntropyLoss()
                classification_loss = loss_fct(classification_logits, labels)

        # Translation task
        translation_loss = None
        translation_logits = None

        if translation_input_ids is not None and translation_labels is not None:
            translation_outputs = self.mt5(
                input_ids=translation_input_ids,
                attention_mask=translation_attention_mask,
                labels=translation_labels
            )
            translation_loss = translation_outputs.loss
            translation_logits = translation_outputs.logits

        # Combine losses
        total_loss = None
        if classification_loss is not None and translation_loss is not None:
            total_loss = classification_loss + translation_loss
        elif classification_loss is not None:
            total_loss = classification_loss
        elif translation_loss is not None:
            total_loss = translation_loss

        return {
            'loss': total_loss,
            'classification_loss': classification_loss,
            'translation_loss': translation_loss,
            'logits': classification_logits,
            'classification_logits': classification_logits,
            'translation_logits': translation_logits
        }

class MultiTaskTrainer(Trainer):
    """Trainer for models that return their combined loss under ``'loss'``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Forward a multi-task batch and read the loss from the model output.

        ``input_ids`` and ``attention_mask`` are required in ``inputs``; the
        label and translation entries are optional and passed as ``None``
        when absent. The model must accept these keyword arguments and
        return a mapping that contains ``'loss'``.
        """
        optional_keys = (
            'translation_input_ids',
            'translation_attention_mask',
            'labels',
            'translation_labels',
        )
        model_kwargs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        for key in optional_keys:
            model_kwargs[key] = inputs.get(key)

        outputs = model(**model_kwargs)
        loss = outputs['loss']

        if return_outputs:
            return (loss, outputs)
        return loss

# Subtask 1 - Polarization detection

This is a binary classification task: determine whether a post contains polarized content (Polarized or Not Polarized).

In [None]:
# @title Experiment Definition & Parameter Setting

import os

# Each setting is wrapped in a Parameter so it can be written to the
# experiment's YAML record. The third Parameter argument is the group the
# value is stored under; None stores it at the top level.
languages = ['amh', 'eng', 'hau', 'swa'] # @param
languages_param = Parameter(languages, 'language', None)

model_name = "microsoft/deberta-v3-small" # @param {type: "string"}
model_param = Parameter(model_name, 'model_name', None)

tokenizer_choice = "microsoft/deberta-v3-small" # @param {type:"string"}
tokenizer_param = Parameter(tokenizer_choice, 'tokenizer', 'Preprocessing')

n_labels = 2 # @param
n_labels_param = Parameter(n_labels, 'n_labels', None)

max_length = 128 # @param
max_length_param = Parameter(max_length, 'max_length', 'Hyperparameter')

num_epochs = 3 # @param
epochs_param = Parameter(num_epochs, 'epochs', 'Hyperparameter')

lr = 1e-4 # @param
lr_param = Parameter(lr, 'learning_rate', 'Hyperparameter')

train_batch = 8 # @param
train_batch_param = Parameter(train_batch, 'train_batch_size', 'Hyperparameter')

eval_batch = 8 # @param
eval_batch_param = Parameter(eval_batch, 'eval_batch_size', 'Hyperparameter')

eval_strategy = "epoch" # @param {type: "string"}
eval_strategy_param = Parameter(eval_strategy, 'eval_strategy', 'Hyperparameter')

# Refuse to overwrite the record of an experiment version that already exists.
experiment_version = "v1.1.0" # @param {type: "string"}
experiment_dir = f"experiments/{experiment_version}.yaml"
if os.path.exists(experiment_dir):
  raise ValueError(f"Experiment {experiment_version} already exists")
experiment_description = "Contrastive Polar Pairs Alignment using microsoft-deberta-v3-small" # @param {type: "string"}
experiment_baseline = "v1.0.0" # @param {type: "string"}
experiment = Experiment(
    experiment_version,
    experiment_dir,
    experiment_description,
    experiment_baseline
)
# n_labels_param is registered too: it was defined above but previously left
# out of this list, so the number of labels never reached the saved record.
experiment.add_params([
    languages_param, model_param, n_labels_param, max_length_param,
    tokenizer_param, epochs_param, lr_param,
    train_batch_param, eval_batch_param,
    eval_strategy_param
])

## Training & Evaluation

In [None]:
# Training setup. Hyperparameters are read from the Parameter objects defined
# in the experiment cell so the run matches what the experiment records.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs_param.get_value(),
    # The recorded learning rate was previously never passed on, so training
    # silently used the library default instead.
    learning_rate=lr_param.get_value(),
    per_device_train_batch_size=train_batch_param.get_value(),
    per_device_eval_batch_size=eval_batch_param.get_value(),
    # Gradients are accumulated over 8 batches per optimizer step.
    gradient_accumulation_steps=8,
    eval_strategy=eval_strategy_param.get_value(),
    save_strategy="no",
    logging_steps=50,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=0,
    load_best_model_at_end=False,
    eval_accumulation_steps=1,
    gradient_checkpointing=False,
)

In [None]:
# @title Data Loading and Splitting

import pandas as pd
import math
from sklearn.model_selection import StratifiedShuffleSplit


def load_and_split_bilingual_data(subtask, source_lang, target_lang, test_size=0.2, random_state=42, verbose=True):
    """
    Load and split bilingual polarization data with stratified sampling.

    The source and target training CSVs are paired row by row after the target
    has been tiled or truncated to the source length. The paired frame is then
    split into train/validation sets, stratified on the source label.

    The validation target columns are afterwards replaced by a separate sample
    drawn from the *full* target pool, stratified on the target label. As a
    consequence, validation target rows are not row-aligned with validation
    source rows and may also appear in the training split.

    Args:
        subtask (str): Subtask name (e.g., 'subtask1', 'subtask2')
        source_lang (str): Source language code (e.g., 'swa', 'eng')
        target_lang (str): Target language code (e.g., 'eng', 'swa')
        test_size (float): Proportion of validation set (default: 0.2)
        random_state (int): Random seed for reproducibility (default: 42)
        verbose (bool): Print distribution statistics (default: True)

    Returns:
        tuple: (train_df, val_df) with columns:
            - source_text
            - polarization (source label)
            - target_text
            - target_polarization (target label)

    Raises:
        ValueError: If the target CSV has no rows, or if the re-sampled
            validation targets do not match the validation set size.
    """

    # Load the DataFrames
    source = pd.read_csv(f'{subtask}/train/{source_lang}.csv')
    target = pd.read_csv(f'{subtask}/train/{target_lang}.csv')

    # --- 1. Shape Matching and Alignment ---
    source_len = source.shape[0]
    target_len = target.shape[0]

    if target_len == 0:
        # Tiling an empty frame is impossible (and would divide by zero below).
        raise ValueError(f"Target data for '{target_lang}' in {subtask} is empty.")

    if target_len < source_len:
        # Target is shorter than source: repeat it until it covers the source.
        repeat_factor = math.ceil(source_len / target_len)
        target_aligned = pd.concat([target] * repeat_factor, ignore_index=True).iloc[:source_len]
    else:
        # Target is at least as long as source: keep the first source_len rows.
        target_aligned = target.iloc[:source_len]

    # Positional index so the column-wise combination below pairs row i with row i.
    target_aligned = target_aligned.reset_index(drop=True)

    # --- 2. Data Combination ---
    data = pd.DataFrame({
        'source_text': source['text'],
        'polarization': source['polarization'],
        'target_text': target_aligned['text'],
        'target_polarization': target_aligned['polarization']
    })

    # --- 3. Stratified Splitting ---
    # n_splits=1, so the splitter yields exactly one (train, val) index pair.
    sss_source = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_index, val_index = next(sss_source.split(data, data['polarization']))
    train = data.iloc[train_index].reset_index(drop=True)
    val = data.iloc[val_index].reset_index(drop=True)

    # Stratified re-sample of the target language for the validation set.
    # An absolute (integer) test size is used instead of a float ratio: the
    # ratio val_size / n can round up inside the splitter and return one row
    # too many, which would trip the size check below.
    full_target_pool = data[['target_text', 'target_polarization']]
    val_size = val.shape[0]

    sss_target = StratifiedShuffleSplit(n_splits=1, test_size=val_size, random_state=random_state)
    _, new_val_target_index = next(
        sss_target.split(full_target_pool, full_target_pool['target_polarization'])
    )
    new_val_target = full_target_pool.iloc[new_val_target_index].reset_index(drop=True)

    # Verify and update validation target data
    if new_val_target.shape[0] != val_size:
        raise ValueError(f"Stratified target split size ({new_val_target.shape[0]}) "
                        f"does not match validation set size ({val.shape[0]}).")
    val['target_text'] = new_val_target['target_text'].values
    val['target_polarization'] = new_val_target['target_polarization'].values

    # --- 4. Print Results (if verbose) ---
    if verbose:
        total = data.shape[0]
        print("="*60)
        print(f"Dataset: {subtask} | {source_lang} → {target_lang}")
        print("="*60)
        print(f"Total samples: {total}")
        print(f"Train set size: {train.shape[0]} ({train.shape[0]/total*100:.1f}%)")
        print(f"Validation set size: {val.shape[0]} ({val.shape[0]/total*100:.1f}%)")

        print("\n--- Polarization Distribution ---")
        print("\nTrain Set (Source):")
        print(train['polarization'].value_counts().sort_index())
        print(f"  Ratio: {train['polarization'].value_counts(normalize=True).sort_index().to_dict()}")

        print("\nValidation Set (Source):")
        print(val['polarization'].value_counts().sort_index())
        print(f"  Ratio: {val['polarization'].value_counts(normalize=True).sort_index().to_dict()}")

        print("\nValidation Set (Target):")
        print(val['target_polarization'].value_counts().sort_index())
        print(f"  Ratio: {val['target_polarization'].value_counts(normalize=True).sort_index().to_dict()}")
        print("="*60)

    return train, val

In [None]:
# source = pd.read_csv(f'subtask1/train/amh.csv')
# target = pd.read_csv(f'subtask1/train/eng.csv')

# train_stop_idx = int(0.8*source.shape[0])

# data = pd.DataFrame()
# data['source_text'] = source['text']
# data['polarization'] = source['polarization']
# data['target_text'] = target['text']
# data['target_polarization'] = target['polarization']

# train = data[0: train_stop_idx]
# val = data[train_stop_idx: ]

In [None]:
# @title Search for Base Model (English Prioritized)

# Fine-tune one baseline classifier per language, log its validation metrics
# on the experiment and save the model under finetuned_models/.

np.random.seed(42)
torch.manual_seed(42)

# The tokenizer does not depend on the language, so load it once up front
# instead of on every loop iteration.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_param.get_value())

for language in languages_param.get_value():
  time.sleep(0.25)
  clear_output()

  train, val = load_and_split_bilingual_data(
      subtask = 'subtask1',
      source_lang = language,
      target_lang = 'eng'
  )

  # Create datasets; apply the logged max_length hyperparameter rather than
  # silently relying on the dataset class default.
  train_dataset = PolarizationDataset(
      train['source_text'].tolist(),
      train['polarization'].tolist(),
      tokenizer,
      max_length=max_length_param.get_value()
  )
  val_dataset = PolarizationDataset(
      val['source_text'].tolist(),
      val['polarization'].tolist(),
      tokenizer,
      max_length=max_length_param.get_value()
  )

  # Load a fresh model for every language.
  # NOTE(review): the checkpoint name is read from tokenizer_param, not
  # model_param — confirm both always hold the same value.
  model = AutoModelForSequenceClassification.from_pretrained(tokenizer_param.get_value(), num_labels=n_labels_param.get_value())

  # Initialize the Trainer
  trainer = Trainer(
      model=model,                         # the instantiated Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      eval_dataset=val_dataset,            # evaluation dataset
      compute_metrics=compute_metrics,     # the callback that computes metrics of interest
      data_collator=DataCollatorWithPadding(tokenizer) # Data collator for dynamic padding
  )

  # Train the model
  trainer.train()

  eval_results = trainer.evaluate()
  print(f"Macro F1 score on {language} validation set: {eval_results['eval_f1_macro']}")

  # ===== SAVE THE FINE-TUNED MODEL =====
  # The alignment stage later loads the 'eng' model from this location.
  save_path = f'finetuned_models/{language}_{tokenizer_param.get_value()}'
  model.save_pretrained(save_path)
  tokenizer.save_pretrained(save_path)
  print(f"Saved {language} model to {save_path}")
  # ====================================

  # Record the per-language metrics on the experiment.
  eval_results_param = Parameter(eval_results, f"{language}_eval_results", "Performance")
  experiment.add_params([eval_results_param])

In [None]:
# @title Update Parameters 4 Contrastive Alignment

# Hyperparameters for the contrastive alignment stage. The epoch and batch-size
# parameters replace the ones used for the baseline fine-tuning.

num_epochs = 3 # @param
epochs_param = Parameter(num_epochs, 'epochs', 'Hyperparameter')

train_batch = 8 # @param
train_batch_param = Parameter(train_batch, 'train_batch_size', 'Hyperparameter')

eval_batch = 8 # @param
eval_batch_param = Parameter(eval_batch, 'eval_batch_size', 'Hyperparameter')

contrastive_subset = 2 # @param
contrastive_subset_param = Parameter(
    contrastive_subset,
    'subset_size',
    'Hyperparameter'
)

contrastive_temperature = 0.07 # @param
contrastive_temperature_param = Parameter(
    contrastive_temperature,
    'tau',
    'Hyperparameter'
)

lambda_align = 0.1 # @param
lambda_align_param = Parameter(
    lambda_align,
    'lambda',
    'Hyperparameter'
)

encoder_lr = 1e-4 # @param
encoder_lr_param = Parameter(
    encoder_lr,
    'encoder_lr',
    'Hyperparameter'
)

alignment_lr = 5e-3 # @param
alignment_lr_param = Parameter(
    alignment_lr,
    'alignment_lr',
    'Hyperparameter'
)

classification_lr = 5e-3 # @param
# Registered under its own name; it previously reused 'encoder_lr', which
# collides with the encoder learning-rate parameter above.
classification_lr_param = Parameter(
    classification_lr,
    'classification_lr',
    'Hyperparameter'
)

alignment_normalization = False # @param {type: 'boolean'}
alignment_normalization_param = Parameter(
    alignment_normalization,
    'alignment_normalization',
    'Hyperparameter'
)

In [None]:
# @title Update Training Arguments

# Rebuild the training arguments so they pick up the epoch and batch-size
# parameters redefined for the contrastive alignment stage.
# No learning_rate is set here: the alignment run hands its own AdamW
# optimizer, with per-group learning rates, to the trainer.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs_param.get_value(),
    per_device_train_batch_size=train_batch_param.get_value(),  # effective batch = this x gradient_accumulation_steps
    per_device_eval_batch_size=eval_batch_param.get_value(),
    gradient_accumulation_steps=8,
    eval_strategy=eval_strategy_param.get_value(),
    save_strategy="no",  # no intermediate checkpoints are written
    logging_steps=50,
    fp16=True,
    dataloader_num_workers=0,
    load_best_model_at_end=False,
    eval_accumulation_steps=1,
    gradient_checkpointing=False,
)

In [None]:
# @title PolarPairsAlignment Training

# Contrastive alignment run: pairs source-language texts with target-language
# texts, trains MultiLingualPolarPairsAlignment on them and records the final
# evaluation metrics on the experiment.

from torch.optim import AdamW
np.random.seed(42)
torch.manual_seed(42)

SRC_LANG = 'swa'
TGT_LANG = 'eng'

train, val = load_and_split_bilingual_data(
      subtask = 'subtask1',
      source_lang = SRC_LANG,
      target_lang = TGT_LANG
)
# Overwrite the validation targets with the source columns, so each validation
# pair is (source, source) — presumably so that evaluation measures
# source-language performance only; confirm against PolarPairsTrainer.
val['target_text'] = val['source_text']
val['target_polarization'] = val['polarization']

# Initialize dataset
# NOTE(review): `tokenizer` receives the value of tokenizer_param, which is
# used elsewhere as a checkpoint name — presumably PolarPairsDataset loads the
# tokenizer itself; TODO confirm.
train_dataset = PolarPairsDataset(
    source_texts = train['source_text'].tolist(),
    target_texts = train['target_text'].tolist(),  # target-language side of each pair
    source_labels = train['polarization'].tolist(),
    target_labels = train['target_polarization'].tolist(),  # labels for targets
    tokenizer = tokenizer_param.get_value(),
    max_length = max_length_param.get_value(),
    subset_size = contrastive_subset_param.get_value()
)

val_dataset = PolarPairsDataset(
    source_texts = val['source_text'].tolist(),
    target_texts = val['target_text'].tolist(),  # equals the source texts after the overwrite above
    source_labels = val['polarization'].tolist(),
    target_labels = val['target_polarization'].tolist(),  # labels for targets
    tokenizer = tokenizer_param.get_value(),
    max_length = max_length_param.get_value(),
    subset_size = contrastive_subset_param.get_value()
)

# Initialize model
# pretrained_encoder_name points at the English baseline saved by the
# per-language fine-tuning loop.
model = MultiLingualPolarPairsAlignment(
    encoder_model_name=tokenizer_param.get_value(),
    pretrained_encoder_name=f'finetuned_models/eng_{tokenizer_param.get_value()}',  # or 'bert-base-multilingual-cased'
    num_labels=n_labels_param.get_value(),
    alignment_normalization=alignment_normalization_param.get_value()
)

# One parameter group per sub-module so each gets its own learning rate.
optimizer = AdamW([
    {'params': model.encoder.parameters(), 'lr': encoder_lr_param.get_value()},
    # {'params': model.translation_encoder.parameters(), 'lr': 1e-4},
    {'params': model.alignment_head.parameters(), 'lr': alignment_lr_param.get_value()},
    {'params': model.classification_head.parameters(), 'lr': classification_lr_param.get_value()}
])

# Freeze encoders (only train alignment + classification heads)
# for param in model.encoder.parameters():
#     param.requires_grad = False
# for param in model.translation_encoder.parameters():
#     param.requires_grad = False

collator = PolarPairsCollator()

# Train
# The optimizer is supplied explicitly; the scheduler slot is left as None.
trainer = PolarPairsTrainer(
    lambda_align = lambda_align_param.get_value(),
    temperature=contrastive_temperature_param.get_value(),
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    compute_metrics = compute_metrics,
    data_collator = collator,
    optimizers = (optimizer, None)
)

trainer.train()

# NOTE(review): the variable is called target_results but is stored under
# 'src_performance'; the validation set it comes from holds source-language
# texts on both sides (see the overwrite above).
target_results = trainer.evaluate()
ft_results = {
    'src_lang': SRC_LANG,
    'tgt_lang': TGT_LANG,
    'src_performance': target_results
}
_results_param = Parameter(ft_results, f"language_fineTuning_results", "Performance")
experiment.add_params([_results_param])

# ===== SAVE THE FINE-TUNED MODEL =====
# save_path = f'finetuned_models/{TGT_LANG}-{SRC_LANG}_{tokenizer_param.get_value()}'
# model.save_pretrained(save_path)
# tokenizer.save_pretrained(save_path)
# print(f"Saved {TGT_LANG}-{SRC_LANG} model to {save_path}")
# ====================================

Dataset: subtask1 | swa → eng
Total samples: 6991
Train set size: 5592 (80.0%)
Validation set size: 1399 (20.0%)

--- Polarization Distribution ---

Train Set (Source):
polarization
0    2789
1    2803
Name: count, dtype: int64
  Ratio: {0: 0.49874821173104433, 1: 0.5012517882689557}

Validation Set (Source):
polarization
0    698
1    701
Name: count, dtype: int64
  Ratio: {0: 0.498927805575411, 1: 0.501072194424589}

Validation Set (Target):
target_polarization
0    929
1    470
Name: count, dtype: int64
  Ratio: {0: 0.6640457469621158, 1: 0.3359542530378842}




Target distribution - Positive: 1903, Negative: 3689
Target distribution - Positive: 701, Negative: 698


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted
1,1.5172,0.701415,0.33381,0.334525
2,0.651,0.76967,0.558695,0.559013
3,0.4928,0.545295,0.756093,0.75608


In [None]:
experiment.parameters['Performance']

{'amh_eval_results': {'eval_loss': 0.516802966594696,
  'eval_f1_macro': 0.430401366353544,
  'eval_f1_weighted': 0.6504416451040065,
  'eval_runtime': 1.5397,
  'eval_samples_per_second': 433.208,
  'eval_steps_per_second': 54.557,
  'epoch': 3.0},
 'eng_eval_results': {'eval_loss': 0.46227237582206726,
  'eval_f1_macro': 0.7995317226803696,
  'eval_f1_weighted': 0.8141202139870262,
  'eval_runtime': 1.3686,
  'eval_samples_per_second': 471.293,
  'eval_steps_per_second': 59.186,
  'epoch': 3.0},
 'hau_eval_results': {'eval_loss': 0.16815398633480072,
  'eval_f1_macro': 0.8620973827395846,
  'eval_f1_weighted': 0.9477186029288843,
  'eval_runtime': 5.1359,
  'eval_samples_per_second': 142.332,
  'eval_steps_per_second': 17.913,
  'epoch': 3.0},
 'swa_eval_results': {'eval_loss': 0.5171564817428589,
  'eval_f1_macro': 0.7659861363539869,
  'eval_f1_weighted': 0.7660033537968377,
  'eval_runtime': 3.0066,
  'eval_samples_per_second': 465.309,
  'eval_steps_per_second': 58.205,
  'epoch'

In [None]:
# experiment.dir = f"experiments/{experiment_version}.yaml"
experiment.save()

Model saved to experiments/v1.1.0.yaml


In [None]:
! zip NLP_LLMS_subtask1_experiments.zip experiments/*

  adding: experiments/v1.0.0.yaml (deflated 61%)
  adding: experiments/v1.0.1.yaml (deflated 60%)
  adding: experiments/v1.0.2.yaml (deflated 61%)
  adding: experiments/v1.0.3.yaml (deflated 60%)
  adding: experiments/v1.0.4.yaml (deflated 61%)


In [None]:
from google.colab import files
files.download('NLP_LLMS_subtask1_experiments.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Rebuild the contrastive training dataset from the current `train` frame
# so that individual examples can be inspected below.
train_dataset = PolarPairsDataset(
    tokenizer = tokenizer_param.get_value(),
    max_length = max_length_param.get_value(),
    subset_size = contrastive_subset_param.get_value(),
    source_texts = train['source_text'].tolist(),
    source_labels = train['polarization'].tolist(),
    target_texts = train['target_text'].tolist(),
    target_labels = train['target_polarization'].tolist(),
)

Target distribution - Positive: 1903, Negative: 3689


In [None]:
train_dataset

<__main__.PolarPairsDataset at 0x79ebd75a8740>

In [None]:
# The dataset only supports integer (or slice) indexing; indexing with a list
# of positions raises TypeError. Fetch a single example for inspection.
example = train_dataset[0]

TypeError: list indices must be integers or slices, not list

In [None]:
example['x_hat_input_ids'].size(), example['x_input_ids'].size()

(torch.Size([20, 128]), torch.Size([128]))

# Subtask 2: Polarization Type Classification
Multi-label classification to identify the target(s) of polarization among the following categories (a text may belong to more than one): Gender/Sexual, Political, Religious, Racial/Ethnic, or Other.
For this task we will load the data for subtask 2.

In [None]:
# Load the labelled Subtask 2 data and hold out 20% of it for validation.
# Reading the same file into both `train` and `val` would make the reported
# validation F1 a measurement on the training data itself.
subtask2_data = pd.read_csv('subtask2/train/eng.csv')
val = subtask2_data.sample(frac=0.2, random_state=42)
train = subtask2_data.drop(index=val.index).reset_index(drop=True)
val = val.reset_index(drop=True)
train.head()

In [None]:
# Multi-label variant of the dataset class (float label vectors).
class PolarizationDataset(torch.utils.data.Dataset):
    """Tokenizes texts on access and pairs them with multi-label targets.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of per-example label vectors, aligned with `texts`.
        tokenizer: Callable invoked as
            `tokenizer(text, truncation=..., padding=..., max_length=...,
            return_tensors='pt')` that returns a mapping of batched tensors.
        max_length: Maximum sequence length passed to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at `idx` with a float `labels` tensor."""
        text = self.texts[idx]
        label = self.labels[idx]
        # No padding here; examples are padded per batch by the data collator.
        encoding = self.tokenizer(text, truncation=True, padding=False, max_length=self.max_length, return_tensors='pt')

        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse a length-1 sequence into a 0-d tensor.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        # Float targets, as required by the multi-label (BCE-with-logits) loss.
        item['labels'] = torch.tensor(label, dtype=torch.float)
        return item


In [None]:
# Tokenizer matching the BERT baseline checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Subtask 2 label columns, in the order they are fed to the model
subtask2_label_columns = ['gender/sexual','political','religious','racial/ethnic','other']
train_texts = train['text'].tolist()
val_texts = val['text'].tolist()

# Train, validation and dev datasets for multi-label classification
# (validation and dev are both built from `val`).
train_dataset = PolarizationDataset(train_texts, train[subtask2_label_columns].values.tolist(), tokenizer)
val_dataset = PolarizationDataset(val_texts, val[subtask2_label_columns].values.tolist(), tokenizer)
dev_dataset = PolarizationDataset(val_texts, val[subtask2_label_columns].values.tolist(), tokenizer)


In [None]:
# BERT classifier with one output per Subtask 2 category (5 labels),
# configured for multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    problem_type="multi_label_classification",
    num_labels=5,
)

In [None]:
# Metrics callback for multi-label classification
def compute_metrics_multilabel(p):
    """Return the macro F1 of thresholded sigmoid predictions.

    `p.predictions` holds the raw logits as a NumPy array and `p.label_ids`
    the reference labels. A label is predicted when its sigmoid probability
    exceeds 0.5.
    """
    logits = torch.from_numpy(p.predictions)
    predicted = (torch.sigmoid(logits) > 0.5).int().numpy()
    macro_f1 = f1_score(p.label_ids, predicted, average='macro')
    return {'f1_macro': macro_f1}

# Training configuration for the Subtask 2 baseline
training_args = TrainingArguments(
    output_dir="./",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
    eval_strategy="epoch",  # evaluate once per epoch
    save_strategy="no",     # no checkpoints are written
    disable_tqdm=False,
)

In [None]:
# Build the Trainer for the multi-label setup; batches are padded dynamically.
padding_collator = DataCollatorWithPadding(tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=padding_collator,
    compute_metrics=compute_metrics_multilabel,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune
trainer.train()

# Final evaluation on the validation set
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set for Subtask 2: {eval_results['eval_f1_macro']}")

# Subtask 3: Manifestation Identification
Multi-label classification to classify how polarization is expressed, with multiple possible labels including Vilification, Extreme Language, Stereotype, Invalidation, Lack of Empathy, and Dehumanization.



In [None]:
# Load the labelled Subtask 3 data and hold out 20% of it for validation.
# Reading the same file into both `train` and `val` would make the reported
# validation F1 a measurement on the training data itself.
subtask3_data = pd.read_csv('subtask3/train/eng.csv')
val = subtask3_data.sample(frac=0.2, random_state=42)
train = subtask3_data.drop(index=val.index).reset_index(drop=True)
val = val.reset_index(drop=True)

train.head()

In [None]:
# Multi-label variant of the dataset class (float label vectors).
class PolarizationDataset(torch.utils.data.Dataset):
    """Tokenizes texts on access and pairs them with multi-label targets.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of per-example label vectors, aligned with `texts`.
        tokenizer: Callable invoked as
            `tokenizer(text, truncation=..., padding=..., max_length=...,
            return_tensors='pt')` that returns a mapping of batched tensors.
        max_length: Maximum sequence length passed to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at `idx` with a float `labels` tensor."""
        text = self.texts[idx]
        label = self.labels[idx]
        # No padding here; examples are padded per batch by the data collator.
        encoding = self.tokenizer(text, truncation=True, padding=False, max_length=self.max_length, return_tensors='pt')

        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse a length-1 sequence into a 0-d tensor.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        # Float targets, as required by the multi-label (BCE-with-logits) loss.
        item['labels'] = torch.tensor(label, dtype=torch.float)
        return item

In [None]:
# Tokenizer matching the BERT baseline checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Subtask 3 label columns, in the order they are fed to the model
subtask3_label_columns = ['vilification','extreme_language','stereotype','invalidation','lack_of_empathy','dehumanization']

# Train and validation datasets for multi-label classification
train_dataset = PolarizationDataset(
    train['text'].tolist(),
    train[subtask3_label_columns].values.tolist(),
    tokenizer,
)
val_dataset = PolarizationDataset(
    val['text'].tolist(),
    val[subtask3_label_columns].values.tolist(),
    tokenizer,
)

In [None]:
# BERT classifier with one output per Subtask 3 manifestation (6 labels),
# configured for multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    problem_type="multi_label_classification",
    num_labels=6,
)

In [None]:
# Training configuration for the Subtask 3 baseline
training_args = TrainingArguments(
    output_dir="./",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
    eval_strategy="epoch",  # evaluate once per epoch
    save_strategy="no",     # no checkpoints are written
    disable_tqdm=False,
)

# Metrics callback for multi-label classification
def compute_metrics_multilabel(p):
    """Return the macro F1 of thresholded sigmoid predictions.

    `p.predictions` holds the raw logits as a NumPy array and `p.label_ids`
    the reference labels. A label is predicted when its sigmoid probability
    exceeds 0.5.
    """
    logits = torch.from_numpy(p.predictions)
    predicted = (torch.sigmoid(logits) > 0.5).int().numpy()
    macro_f1 = f1_score(p.label_ids, predicted, average='macro')
    return {'f1_macro': macro_f1}

In [None]:
# Build the Trainer for the multi-label setup; batches are padded dynamically.
padding_collator = DataCollatorWithPadding(tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=padding_collator,
    compute_metrics=compute_metrics_multilabel,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune
trainer.train()

# Final evaluation on the validation set
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set for Subtask 3: {eval_results['eval_f1_macro']}")