# Bert baseline for POLAR

## Introduction

This part of the starter notebook walks through the full pipeline — data loading, training, and evaluation — for all three subtasks.

In [None]:
from IPython.display import clear_output
import time
import os

In [None]:
# Install gdown, the command-line tool invoked by the download cell of this notebook.
! pip install -q gdown

In [None]:
! mkdir experiments

In [None]:
# @title Experiment Class

from typing import List

import yaml
import numpy as np
import torch


class Experiment:
  """Record of one experiment: metadata plus a (partly nested) parameter dict.

  Parameters whose class is ``None`` or ``'global'`` (case-insensitive) are
  stored at the top level of ``self.parameters``; every other parameter is
  stored in a sub-dict keyed by its parameter class.

  Args:
    name: experiment identifier written to the YAML file.
    dir: path of the YAML file that ``save`` writes.
    description: free-text description written to the YAML file.
    baseline: optional identifier of the experiment this one is compared to.
  """

  def __init__(self, name, dir, description, baseline=None):
    self.name = name
    self.dir = dir
    self.description = description
    self.parameters = dict()
    self.baseline = baseline

  def update_param(self, parameter: 'Parameter'):
    """Insert or overwrite a single parameter.

    Args:
      parameter: object exposing ``get_var_name()``, ``get_parameter_class()``
        and ``get_value()``.

    Raises:
      TypeError: if the parameter value is not one of the supported types.
    """
    var_name = parameter.get_var_name()
    parameter_class = parameter.get_parameter_class()
    value = parameter.get_value()

    # `torch.Tensor` is the tensor *type*; `torch.tensor` is a factory
    # function and is not a valid second argument to isinstance().
    allowed_types = (
        int, float, str, dict, list,
        np.ndarray, torch.Tensor,
    )
    # Explicit raise instead of `assert`, so the check survives `python -O`.
    if not isinstance(value, allowed_types):
      raise TypeError(
          f"Unsupported value type for parameter '{var_name}': "
          f"{type(value).__name__}"
      )

    if (parameter_class is None) or (parameter_class.lower() == 'global'):
      self.parameters[var_name] = value
      return

    if parameter_class not in self.parameters:
      self.parameters[parameter_class] = dict()
    self.parameters[parameter_class][var_name] = value

  def save(self):
    """Dump name, baseline, description and parameters as YAML to ``self.dir``."""
    experiment_dict = {
        'name': self.name,
        'baseline': self.baseline,
        'description': self.description,
        'parameters': self.parameters,
    }

    with open(self.dir, "w") as f:
      yaml.dump(experiment_dict, f, default_flow_style=False)

    print(f"Model saved to {self.dir}")

  def add_params(self, parameters: List['Parameter']):
    """Apply ``update_param`` to every parameter, in the given order."""
    for parameter in parameters:
      self.update_param(parameter)


class Parameter:
  """A named value together with the parameter class it is grouped under.

  The three fields live in name-mangled private attributes and are read and
  written through the accessor methods below.
  """

  def __init__(self, value, var_name, parameter_class):
    self.__parameter_class = parameter_class
    self.__value = value
    self.__var_name = var_name

  # ---- value ----
  def get_value(self):
    return self.__value

  def set_value(self, value):
    self.__value = value

  # ---- variable name ----
  def get_var_name(self):
    return self.__var_name

  def set_var_name(self, var_name):
    self.__var_name = var_name

  # ---- parameter class ----
  def get_parameter_class(self):
    return self.__parameter_class

  def set_parameter_class(self, parameter_class):
    self.__parameter_class = parameter_class

## Imports

In [None]:
dev_phase_id = "1tbAwUWN8X2JvXgdarjZ31f4XkqcpFVDk"
# subtask1_id = "1q_I6dw9ZbCg3MbQ1wnC-419s2ocCyqaa"
# subtask2_id = "1iHFDd_uihFi7vukWFq1hj32wfEH4dgBc"
# subtask3_id = "1JA7_BbJDYORbmH06gWzz4-UhXgRBe1eI"
translated_tasks_id = "1wHoKpZo8iMhHOm5TpvSS6Nr63Zk2w-P0"

! gdown --id $dev_phase_id
! gdown --id $translated_tasks_id
# ! gdown --id $subtask1_id
# ! gdown --id $subtask2_id
# ! gdown --id $subtask3_id

Downloading...
From: https://drive.google.com/uc?id=1tbAwUWN8X2JvXgdarjZ31f4XkqcpFVDk
To: /content/dev_phase.zip
100% 18.9M/18.9M [00:00<00:00, 214MB/s]


In [None]:
! unzip dev_phase.zip
clear_output()

In [None]:
import pandas as pd

from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np

import torch

from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset

In [None]:
import wandb

# Disable wandb logging for this notebook: wandb is initialised in "disabled"
# mode before any training cell runs.
wandb.init(mode="disabled")

  | |_| | '_ \/ _` / _` |  _/ -_)


In [None]:
# @title Dataset Class
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
  """Single-label text classification dataset that tokenizes on access.

  Args:
    texts: indexable collection of input strings.
    labels: indexable collection of class ids, aligned with ``texts``.
    tokenizer: callable invoked as ``tokenizer(text, truncation=True,
      padding=False, max_length=..., return_tensors='pt')`` that returns a
      mapping of tensors with a leading batch dimension of size 1.
    max_length: truncation length forwarded to the tokenizer.
  """

  def __init__(self, texts, labels, tokenizer, max_length=128):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    """Return the tokenized text at ``idx`` plus a ``labels`` tensor (long)."""
    encoding = self.tokenizer(
        self.texts[idx],
        truncation=True,
        padding=False,
        max_length=self.max_length,
        return_tensors='pt',
    )

    # Drop only the batch dimension. A bare squeeze() would also remove the
    # sequence dimension of a length-1 encoding and yield a 0-d tensor.
    item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
    item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
    return item

In [None]:
# @title Functions

# Define metrics function
def compute_metrics(p):
    """Return the macro-averaged F1 score of an evaluation prediction.

    ``p.predictions`` is arg-maxed along axis 1 to get predicted class ids,
    which are then scored against ``p.label_ids``.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    macro_f1 = f1_score(p.label_ids, predicted_ids, average='macro')
    return {'f1_macro': macro_f1}

# MultiTask Trainer Classes

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset

class MultiTaskDataset(Dataset):
    """Examples that pair a classification text/label with a translation pair.

    Each item carries three tokenizations made with identical settings
    (padded to ``max_length`` and truncated): the classification text, the
    translation source text and the translation target text.
    """

    def __init__(self, texts, labels, source_texts, target_texts, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = texts
        self.labels = labels
        self.source_texts = source_texts
        self.target_texts = target_texts

    def __len__(self):
        return len(self.texts)

    def _encode(self, text):
        """Tokenize one string with the settings shared by all three inputs."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        # Same call order as the fields appear: classification text, then
        # translation source, then translation target.
        cls_enc = self._encode(self.texts[idx])
        src_enc = self._encode(self.source_texts[idx])
        tgt_enc = self._encode(self.target_texts[idx])

        # squeeze(0) removes the batch dimension added by return_tensors='pt'.
        item = {
            'input_ids': cls_enc['input_ids'].squeeze(0),
            'attention_mask': cls_enc['attention_mask'].squeeze(0),
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        item['translation_input_ids'] = src_enc['input_ids'].squeeze(0)
        item['translation_attention_mask'] = src_enc['attention_mask'].squeeze(0)
        item['translation_labels'] = tgt_enc['input_ids'].squeeze(0)
        return item


class MultiTaskBERT(nn.Module):
    """Shared encoder with a classification head and a token-level translation head.

    Args:
        model_name: identifier passed to ``AutoModel.from_pretrained``.
        num_labels: output size of the classification head.
        vocab_size: output size of the translation head.
        hidden_size: input size of both heads; it has to equal the last
            dimension of the encoder's ``last_hidden_state``.
    """

    def __init__(self, model_name, num_labels, vocab_size, hidden_size=768):
        super().__init__()

        # Pre-trained encoder shared by both tasks.
        self.bert = AutoModel.from_pretrained(model_name)
        self.hidden_size = hidden_size

        # Classification head: applied to the first token's hidden state.
        self.classification_head = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(hidden_size, num_labels)
        )

        # Translation head: applied to every token's hidden state.
        self.translation_head = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(hidden_size, vocab_size)
        )

    def forward(self, input_ids, attention_mask,
                translation_input_ids=None, translation_attention_mask=None,
                labels=None, translation_labels=None):
        """Run one or both tasks and return their logits and losses.

        The translation branch runs only when ``translation_input_ids`` is
        given; each loss is computed only when its labels are given.

        Returns:
            dict with keys ``loss`` (sum of the available losses, or ``None``
            when there is none), ``classification_loss``,
            ``translation_loss``, ``classification_logits`` and
            ``translation_logits``.
        """
        # --- Classification branch ---
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # first token of every sequence
        classification_logits = self.classification_head(cls_output)

        classification_loss = None
        if labels is not None:
            classification_loss = nn.CrossEntropyLoss()(classification_logits, labels)

        # --- Translation branch ---
        translation_loss = None
        translation_logits = None

        if translation_input_ids is not None:
            translation_outputs = self.bert(
                input_ids=translation_input_ids,
                attention_mask=translation_attention_mask
            )
            translation_logits = self.translation_head(translation_outputs.last_hidden_state)

            if translation_labels is not None:
                # Padding positions in the target are excluded from the loss.
                # Some encoder configs define no pad token (None), which
                # CrossEntropyLoss cannot take as ignore_index; fall back to
                # its default of -100 in that case.
                pad_token_id = getattr(self.bert.config, 'pad_token_id', None)
                ignore_index = -100 if pad_token_id is None else pad_token_id
                loss_fct = nn.CrossEntropyLoss(ignore_index=ignore_index)
                # Flatten to (tokens, vocab) vs. (tokens,) for the loss.
                translation_loss = loss_fct(
                    translation_logits.reshape(-1, translation_logits.size(-1)),
                    translation_labels.reshape(-1)
                )

        # --- Combine losses (unweighted sum when both are present) ---
        if classification_loss is None:
            total_loss = translation_loss
        elif translation_loss is None:
            total_loss = classification_loss
        else:
            total_loss = classification_loss + translation_loss

        return {
            'loss': total_loss,
            'classification_loss': classification_loss,
            'translation_loss': translation_loss,
            'classification_logits': classification_logits,
            'translation_logits': translation_logits
        }


# Custom Trainer for multi-task learning
from transformers import Trainer

class MultiTaskTrainer(Trainer):
    """Trainer whose training loss is the ``'loss'`` entry of the model's output dict."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Forward the multi-task batch through ``model`` and return its loss.

        Args:
            model: module called with the classification and translation fields.
            inputs: batch dict; ``input_ids`` and ``attention_mask`` are
                required, the remaining fields are optional.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted because recent transformers releases
                pass this keyword to ``compute_loss``; an override without it
                fails with a TypeError there. It is not used here.
        """
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            translation_input_ids=inputs.get('translation_input_ids'),
            translation_attention_mask=inputs.get('translation_attention_mask'),
            labels=inputs.get('labels'),
            translation_labels=inputs.get('translation_labels')
        )

        loss = outputs['loss']
        return (loss, outputs) if return_outputs else loss

# Subtask 1 - Polarization detection

This is a binary classification task: decide whether a post contains polarized content (Polarized or Not Polarized).

In [None]:
# @title Experiment Definition & Parameter Setting

import os

# Each setting is wrapped in a Parameter so it can be recorded in the
# experiment YAML. The third argument is the parameter class it is grouped
# under (None = top level).

languages = ['amh', 'eng', 'hau', 'swa'] # @param
languages_param = Parameter(languages, 'language', None)

# NOTE(review): "RobertForSequenceClassification" looks like a typo for
# "RobertaForSequenceClassification", and the training loop builds a
# MultiTaskBERT rather than this class — confirm what should be recorded.
model_name = "RobertForSequenceClassification" # @param {type: "string"}
model_param = Parameter(model_name, 'model_name', None)

n_labels = 2 # @param
n_labels_param = Parameter(n_labels, 'n_labels', None)

max_length = 128 # @param
max_length_param = Parameter(max_length, 'max_length', 'Hyperparameter')

# NOTE(review): this string is passed to `from_pretrained` for both the
# tokenizer and the encoder. "roberta-base-case" does not look like a
# published checkpoint id (the RoBERTa base checkpoint is "roberta-base") —
# confirm before running.
tokenizer_choice = "roberta-base-case" # @param {type:"string"}
tokenizer_param = Parameter(tokenizer_choice, 'tokenizer', 'Preprocessing')

num_epochs = 3 # @param
epochs_param = Parameter(num_epochs, 'epochs', 'Hyperparameter')

lr = 2e-5 # @param
lr_param = Parameter(lr, 'learning_rate', 'Hyperparameter')

train_batch = 64 # @param
train_batch_param = Parameter(train_batch, 'train_batch_size', 'Hyperparameter')

eval_batch = 8 # @param
eval_batch_param = Parameter(eval_batch, 'eval_batch_size', 'Hyperparameter')

eval_strategy = "epoch" # @param {type: "string"}
eval_strategy_param = Parameter(eval_strategy, 'eval_strategy', 'Hyperparameter')

experiment_version = "v1.1.0" # @param {type: "string"}
experiment_dir = f"experiments/{experiment_version}.yaml"
# Refuse to overwrite the record of an earlier run with the same version.
if os.path.exists(experiment_dir):
  raise ValueError(f"Experiment {experiment_version} already exists")
experiment_description = "Testingif data augmentation improves amharic, " # @param {type: "string"}
experiment_baseline = "v1.0.0" # @param {type: "string"}
experiment = Experiment(
    experiment_version,
    experiment_dir,
    experiment_description,
    experiment_baseline
)
# n_labels_param is included so that every setting the training loop reads is
# also written to the experiment file.
experiment.add_params([
    languages_param, model_param, n_labels_param, max_length_param,
    tokenizer_param, epochs_param, lr_param,
    train_batch_param, eval_batch_param,
    eval_strategy_param
])

## Training & Evaluation

In [None]:
# Training configuration for Subtask 1, built from the experiment parameters
# so that the recorded values and the values actually used stay the same.
training_args = TrainingArguments(
    output_dir="./",
    learning_rate=lr_param.get_value(),
    num_train_epochs=epochs_param.get_value(),
    per_device_train_batch_size=train_batch_param.get_value(),
    per_device_eval_batch_size=eval_batch_param.get_value(),
    eval_strategy=eval_strategy_param.get_value(),
    save_strategy="no",
    logging_steps=100,
    disable_tqdm=False,
)


In [None]:
# Training loop: one multi-task model is trained and evaluated per language.

# The tokenizer does not depend on the language, so it is loaded once.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_param.get_value())

# Use the recorded max_length; without it MultiTaskDataset falls back to its
# own default of 512 and pads every example to that length.
max_len = max_length_param.get_value()

# Model output entries that the metric function does not need. Ignoring them
# during evaluation leaves the classification logits as the only prediction
# and avoids collecting the per-token vocabulary logits for the whole
# validation set.
eval_ignore_keys = ['classification_loss', 'translation_loss', 'translation_logits']

for language in languages_param.get_value():
    train = pd.read_csv(f'subtask1_translated/train/{language}.csv')
    val = pd.read_csv(f'subtask1_translated/dev/{language}.csv')

    # Create multi-task datasets
    # Assuming you have 'source_text' and 'target_text' columns for translation
    train_dataset = MultiTaskDataset(
        texts=train['text'].tolist(),
        labels=train['polarization'].tolist(),
        source_texts=train['source_text'].tolist(),
        target_texts=train['target_text'].tolist(),
        tokenizer=tokenizer,
        max_length=max_len
    )

    val_dataset = MultiTaskDataset(
        texts=val['text'].tolist(),
        labels=val['polarization'].tolist(),
        source_texts=val['source_text'].tolist(),
        target_texts=val['target_text'].tolist(),
        tokenizer=tokenizer,
        max_length=max_len
    )

    # A fresh model per language, so languages do not share weights.
    model = MultiTaskBERT(
        model_name=tokenizer_param.get_value(),
        num_labels=n_labels_param.get_value(),
        vocab_size=tokenizer.vocab_size
    )

    # Optional: Freeze BERT layers and only train the heads
    # for param in model.bert.parameters():
    #     param.requires_grad = False

    trainer = MultiTaskTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )
    # NOTE(review): Trainer appears to derive its label columns from the
    # forward() argument names containing "label", which would also pick up
    # `translation_labels`. Restricting it to `labels` lets compute_metrics
    # receive a single label array — confirm against the installed
    # transformers version.
    trainer.label_names = ['labels']

    # eval_strategy may trigger evaluation during training, so the ignore
    # list is passed to train() as well as evaluate().
    trainer.train(ignore_keys_for_eval=eval_ignore_keys)

    eval_results = trainer.evaluate(ignore_keys=eval_ignore_keys)
    print(f"Results on {language} validation set: {eval_results}")

    # Record this language's metrics under the 'Performance' parameter class.
    eval_results_param = Parameter(eval_results, f"{language}_eval_results", "Performance")
    experiment.add_params([eval_results_param])

    time.sleep(1)
    clear_output()

In [None]:
# for language in languages_param.get_value():

#   train = pd.read_csv(f'subtask1/train/{language}.csv')
#   val = pd.read_csv(f'subtask1/train/{language}.csv')
#   # Load the tokenizer
#   tokenizer = AutoTokenizer.from_pretrained(tokenizer_param.get_value())

#   # Create datasets
#   train_dataset = PolarizationDataset(train['text'].tolist(), train['polarization'].tolist(), tokenizer)
#   val_dataset = PolarizationDataset(val['text'].tolist(), val['polarization'].tolist(), tokenizer)

#   # Load the model
#   model = AutoModelForSequenceClassification.from_pretrained(tokenizer_param.get_value(), num_labels=n_labels_param.get_value())

#   # Initialize the Trainer
#   trainer = Trainer(
#       model=model,                         # the instantiated 🤗 Transformers model to be trained
#       args=training_args,                  # training arguments, defined above
#       train_dataset=train_dataset,         # training dataset
#       eval_dataset=val_dataset,            # evaluation dataset
#       compute_metrics=compute_metrics,     # the callback that computes metrics of interest
#       data_collator=DataCollatorWithPadding(tokenizer) # Data collator for dynamic padding
#   )

#   # Train the model
#   trainer.train()

#   eval_results = trainer.evaluate()
#   print(f"Macro F1 score on {language} validation set: {eval_results['eval_f1_macro']}")

#   eval_results_param = Parameter(eval_results, f"{language}_eval_results", "Performance")
#   experiment.add_params([eval_results_param])

#   time.sleep(1)
#   clear_output()

In [None]:
# Display the per-language evaluation results that the training loop stored
# under the 'Performance' parameter class.
experiment.parameters['Performance']

{'amh_eval_results': {'eval_loss': 0.5247534513473511,
  'eval_f1_macro': 0.4304273504273504,
  'eval_runtime': 12.3662,
  'eval_samples_per_second': 269.444,
  'eval_steps_per_second': 33.721,
  'epoch': 3.0},
 'eng_eval_results': {'eval_loss': 0.3241764307022095,
  'eval_f1_macro': 0.8575853816004853,
  'eval_runtime': 8.9028,
  'eval_samples_per_second': 361.91,
  'eval_steps_per_second': 45.267,
  'epoch': 3.0},
 'hau_eval_results': {'eval_loss': 0.20966212451457977,
  'eval_f1_macro': 0.6924550413608028,
  'eval_runtime': 21.9743,
  'eval_samples_per_second': 166.149,
  'eval_steps_per_second': 20.797,
  'epoch': 3.0},
 'swa_eval_results': {'eval_loss': 0.35833677649497986,
  'eval_f1_macro': 0.8517672616733353,
  'eval_runtime': 26.703,
  'eval_samples_per_second': 261.806,
  'eval_steps_per_second': 32.73,
  'epoch': 3.0}}

In [None]:
# experiment.dir = f"experiments/{experiment_version}.yaml"
# Write the experiment (name, baseline, description, parameters) as YAML to
# the path held in experiment.dir.
experiment.save()

Model saved to experiments/v1.0.3.yaml


In [None]:
# Bundle every file in experiments/ into a single zip archive.
! zip NLP_LLMS_subtask1_experiments.zip experiments/*

  adding: experiments/v1.0.0.yaml (deflated 59%)
  adding: experiments/v1.0.1.yaml (deflated 61%)
  adding: experiments/v1.0.2.yaml (deflated 61%)
  adding: experiments/v1.0.3.yaml (deflated 61%)


In [None]:
# Download the zipped experiment files through Colab's `files` helper
# (this cell needs the google.colab package).
from google.colab import files
files.download('NLP_LLMS_subtask1_experiments.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Subtask 2: Polarization Type Classification
Multi-label classification to identify the target(s) of polarization as one or more of the following categories: Gender/Sexual, Political, Religious, Racial/Ethnic, or Other.
For this task we will load the data for subtask 2.

In [None]:
# Only the training file is read here, so part of it is held out for
# validation. Reading the same file for both splits would score the model on
# the very rows it was trained on.
full_train = pd.read_csv('subtask2/train/eng.csv')
# 20% hold-out with a fixed seed so the split is reproducible.
val = full_train.sample(frac=0.2, random_state=42)
train = full_train.drop(val.index).reset_index(drop=True)
val = val.reset_index(drop=True)
train.head()

In [None]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
    """Multi-label text classification dataset that tokenizes on access.

    Args:
        texts: indexable collection of input strings.
        labels: indexable collection of per-example label vectors, aligned
            with ``texts``.
        tokenizer: callable invoked as ``tokenizer(text, truncation=True,
            padding=False, max_length=..., return_tensors='pt')`` that
            returns a mapping of tensors with a leading batch dimension of 1.
        max_length: truncation length forwarded to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text at ``idx`` plus a float ``labels`` tensor."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Drop only the batch dimension. A bare squeeze() would also remove
        # the sequence dimension of a length-1 encoding.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        # Float (not long) labels, as used for multi-label classification.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Label columns for Subtask 2; every dataset below selects the same five.
subtask2_label_columns = ['gender/sexual','political','religious','racial/ethnic','other']

# Multi-label datasets: texts as a list of strings, labels as a list of rows.
train_dataset = PolarizationDataset(
    train['text'].tolist(),
    train[subtask2_label_columns].values.tolist(),
    tokenizer,
)
val_dataset = PolarizationDataset(
    val['text'].tolist(),
    val[subtask2_label_columns].values.tolist(),
    tokenizer,
)
# dev_dataset is built from the same `val` frame as val_dataset.
dev_dataset = PolarizationDataset(
    val['text'].tolist(),
    val[subtask2_label_columns].values.tolist(),
    tokenizer,
)


In [None]:
# Sequence-classification model with one output per Subtask 2 label (5),
# configured for multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=5,
    problem_type="multi_label_classification",
)

In [None]:
# Define metrics function for multi-label classification
def compute_metrics_multilabel(p):
    """Return the macro F1 score for multi-label predictions.

    ``p.predictions`` is passed through a sigmoid and each label counts as
    predicted when its probability exceeds 0.5.
    """
    logits = torch.from_numpy(p.predictions)
    predicted = (torch.sigmoid(logits) > 0.5).int().numpy()
    macro_f1 = f1_score(p.label_ids, predicted, average='macro')
    return {'f1_macro': macro_f1}

# Training configuration for Subtask 2: evaluate once per epoch and write no
# checkpoints.
training_args = TrainingArguments(
    output_dir="./",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=100,
    disable_tqdm=False,
)

In [None]:
# Build the Trainer for Subtask 2. The collator pads each batch using the
# tokenizer, and the multi-label metric function is used for scoring.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics_multilabel,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fit the model
trainer.train()

# Score the model on the validation set and report macro F1
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set for Subtask 2: {eval_results['eval_f1_macro']}")

# Subtask 3: Manifestation Identification
Multi-label classification to classify how polarization is expressed, with multiple possible labels including Vilification, Extreme Language, Stereotype, Invalidation, Lack of Empathy, and Dehumanization.



In [None]:
# Only the training file is read here, so part of it is held out for
# validation. Reading the same file for both splits would score the model on
# the very rows it was trained on.
full_train = pd.read_csv('subtask3/train/eng.csv')
# 20% hold-out with a fixed seed so the split is reproducible.
val = full_train.sample(frac=0.2, random_state=42)
train = full_train.drop(val.index).reset_index(drop=True)
val = val.reset_index(drop=True)

train.head()

In [None]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset(torch.utils.data.Dataset):
    """Multi-label text classification dataset that tokenizes on access.

    Args:
        texts: indexable collection of input strings.
        labels: indexable collection of per-example label vectors, aligned
            with ``texts``.
        tokenizer: callable invoked as ``tokenizer(text, truncation=True,
            padding=False, max_length=..., return_tensors='pt')`` that
            returns a mapping of tensors with a leading batch dimension of 1.
        max_length: truncation length forwarded to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text at ``idx`` plus a float ``labels`` tensor."""
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Drop only the batch dimension. A bare squeeze() would also remove
        # the sequence dimension of a length-1 encoding.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        # Float (not long) labels, as used for multi-label classification.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Label columns for Subtask 3; both datasets select the same six.
subtask3_label_columns = ['vilification','extreme_language','stereotype','invalidation','lack_of_empathy','dehumanization']

# Multi-label datasets: texts as a list of strings, labels as a list of rows.
train_dataset = PolarizationDataset(
    train['text'].tolist(),
    train[subtask3_label_columns].values.tolist(),
    tokenizer,
)
val_dataset = PolarizationDataset(
    val['text'].tolist(),
    val[subtask3_label_columns].values.tolist(),
    tokenizer,
)

In [None]:
# Sequence-classification model with one output per Subtask 3 label (6),
# configured for multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=6,
    problem_type="multi_label_classification",
)

In [None]:
# Training configuration for Subtask 3: evaluate once per epoch and write no
# checkpoints.
training_args = TrainingArguments(
    output_dir="./",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=100,
    disable_tqdm=False,
)

# Define metrics function for multi-label classification
def compute_metrics_multilabel(p):
    """Return the macro F1 score for multi-label predictions.

    ``p.predictions`` is passed through a sigmoid and each label counts as
    predicted when its probability exceeds 0.5.
    """
    logits = torch.from_numpy(p.predictions)
    predicted = (torch.sigmoid(logits) > 0.5).int().numpy()
    macro_f1 = f1_score(p.label_ids, predicted, average='macro')
    return {'f1_macro': macro_f1}

In [None]:
# Build the Trainer for Subtask 3. The collator pads each batch using the
# tokenizer, and the multi-label metric function is used for scoring.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics_multilabel,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fit the model
trainer.train()

# Score the model on the validation set and report macro F1
eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set for Subtask 3: {eval_results['eval_f1_macro']}")