# HUHU@IberLEF2023 Task 2 (Multi-label Classification)

Task: https://sites.google.com/view/huhuatiberlef23/huhu

This notebook contains the code to fine-tune several pre-trained transformers for the task of hurtful humour detection (multi-label classification).

In particular, the models are:

* BERT Multilingual: ``bert-base-multilingual-cased`` and ``bert-base-multilingual-uncased``
* RoBERTa: ``roberta-base``
* BETO: ``dccuchile/bert-base-spanish-wwm-cased`` and ``dccuchile/bert-base-spanish-wwm-uncased``
* DistilBERT Multilingual: ``distilbert-base-multilingual-cased``

To take advantage of these transformer models, ensembles are built from all their possible combinations.

Experiments show that combining the prediction capabilities of these models achieves better results than using each model independently.

# Setting up the environment

In [None]:
import torch

# Check GPU availability on Google Colab
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

use_cuda = torch.cuda.is_available()

/bin/bash: nvidia-smi: command not found


In [None]:
# Install libraries
!pip install simpletransformers
!pip install datasets
!pip install ipywidgets
!pip install --upgrade huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Downloading simpletransformers-0.63.11-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.7/250.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers>=4.6.0
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m27.3 MB/

In [None]:
# Define global variables

SEED = 42 # random seed meant to make the experiments reproducible (NOTE(review): not referenced by the cells visible here)
WEIGHTED = True # use weighted ensemble (in favour of models with higher F1-score)

# Test split load

In [None]:
from huggingface_hub import notebook_login
# Notebook login via HF's token: the call displays a login widget, so the token
# is entered interactively instead of being hard-coded in the notebook
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import *
import pandas as pd

# Avoid warnings
# NOTE(review): `logging` is not imported explicitly; it presumably comes from
# the `from datasets import *` star import of this cell — confirm
logging.set_verbosity_error()

# Load test split of the "huhu2023/test-huhu2023" dataset as a pandas dataframe
test = pd.DataFrame(load_dataset("huhu2023/test-huhu2023", split="test"))

In [None]:
# Define label encoding: the position of a name in this list is the position
# used to index the per-label predictions when the output columns are built
labels = ["prejudice_group1", "prejudice_group2", "prejudice_group3", "prejudice_group4"]

# Function to rename fields and drop unnecessary ones
def get_text_and_label(df):
  """Return a new dataframe holding only the ``index`` and ``text`` columns.

  The ``tweet`` column of ``df`` is renamed to ``text`` before the selection;
  ``df`` itself is left untouched.
  """
  renamed = df.rename(columns={"tweet": "text"})
  kept_columns = ["index", "text"]
  return renamed[kept_columns]

# Get treated dataframe for test split: only the `index` and `text` columns
# are kept
test = get_text_and_label(test)

# Report the number of instances and preview the first rows
print("Test split size:", len(test.index))
test.head()

Test split size: 778


Unnamed: 0,index,text
0,52830,-Mamá en la escuela me dicen gorda -Pobresilla...
1,78883,"No te sientas diferente, da igual si eres negr..."
2,78926,Si esta asi.. SUPER SI.. y que se pongan celos...
3,61844,—Bebé ¿Me veo gorda con este vestido?\n—¡No mi...
4,78830,Las mujeres solo desean 2 cosas en la vida: co...


# Set-up the working environment

In [None]:
# Select the name of the experiment to be evaluated; it is also the name of
# the experiment's sub-folder inside the outputs directory
EXP = "lr-2e-05-optimizer-AdamW-epochs-10-RMSEduringTraining"
# Name of the sub-folder from which the transformers are loaded
TRANSFORMERS = "top2_transformers"

In [None]:
# Load the Google Colab Drive helper and mount Drive under /content/drive, the
# root of every path used to read models and write results
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Define the path to the experiment folder
PATH = "/content/drive/My Drive/HUHU-IberLEF2023/multi-label/outputs/"
EXP_PATH = os.path.join(PATH, EXP)
TRANSFORMERS_PATH = os.path.join(PATH, TRANSFORMERS)
print("Current working dir:", EXP_PATH)

# Create a folder for the test results
OUTPUT = os.path.join(EXP_PATH, "test")
# exist_ok keeps the cell re-runnable: os.mkdir raised FileExistsError as soon
# as the folder had been created by a previous run
os.makedirs(OUTPUT, exist_ok=True)

Current working dir: /content/drive/My Drive/HUHU-IberLEF2023/multi-label/outputs/lr-2e-05-optimizer-AdamW-epochs-10-RMSEduringTraining


# Models' load

In this section, the different transformers to be evaluated are loaded. For this purpose, the implementation mainly relies on the ``simpletransformers`` Python library, which allows training and testing transformers in a few steps.

For further information: https://simpletransformers.ai/

In [None]:
# Define transformers' initialization dictionary: every alias maps to the
# simpletransformers model type and the name of its pre-trained checkpoint
models = {
    alias: {"model_type": model_type, "model_name": model_name}
    for alias, model_type, model_name in [
        ("mbert-cased", "bert", "bert-base-multilingual-cased"),
        ("mbert-uncased", "bert", "bert-base-multilingual-uncased"),
        ("roberta", "roberta", "roberta-base"),
        ("beto-cased", "bert", "dccuchile/bert-base-spanish-wwm-cased"),
        ("beto-uncased", "bert", "dccuchile/bert-base-spanish-wwm-uncased"),
        ("distilbert-multi", "distilbert", "distilbert-base-multilingual-cased"),
    ]
}

In [None]:
# Import pre-trained simpletransformers models for classification
from simpletransformers.classification import MultiLabelClassificationModel, MultiLabelClassificationArgs

# Replace each initialization entry by its transformer, loaded from the folder
# that bears the model's key inside TRANSFORMERS_PATH
for model, fields in models.items():
  # Entries already replaced by a loaded model (cell run twice) are skipped:
  # only the initialization dictionaries carry a "model_type" to read
  if not isinstance(fields, dict):
    continue
  # use_cuda is forwarded so that loading also works on a CPU-only runtime
  models[model] = MultiLabelClassificationModel(
      fields["model_type"], os.path.join(TRANSFORMERS_PATH, model), use_cuda=use_cuda)

# Best ensemble's definition

The ensemble that performed the best in the selected experiment is defined.

This will be the one used for the predictions to be performed on the test set.

In [None]:
import json

# Read the description of the best ensemble stored in the experiment folder
best_ensemble = {}
best_ensemble_path = os.path.join(EXP_PATH, "best-ensemble.json")
with open(best_ensemble_path) as json_file:
  best_ensemble = json.load(json_file)

# Report its name, its member models and its metrics
print("----- BEST ENSEMBLE -----")
for field in ("name", "models", "metrics"):
  print(f"{field}:", best_ensemble.get(field))

----- BEST ENSEMBLE -----
name: ensemble56
models: ['mbert-cased', 'mbert-uncased', 'roberta', 'beto-cased', 'beto-uncased']
metrics: {'accuracy': 0.91011, 'macro_f1': 0.94293, 'macro_precision': 0.95054, 'macro_recall': 0.93577, 'weighted_f1': 0.94559}


# Best ensemble's predictions

In [None]:
# Load the evaluation JSON of every model in the best ensemble; each file
# lives in the model's own sub-folder of the experiment folder
model_evaluation = {}
for model in best_ensemble.get("models"):
  evaluation_path = os.path.join(EXP_PATH, model, "model-evaluation.json")
  with open(evaluation_path) as json_file:
    model_evaluation[model] = json.load(json_file)

In [None]:
from sklearn.preprocessing import normalize

# Function which determines the ensembler prediction based on its
# transformers' predictions. A weighted voting system may be used
def vote(predictions, weighted=False, weights=None):
  """Combine the per-label outputs of several models into binary votes.

  Args:
    predictions: one row per model, each row holding one score per label.
    weighted: when True, the scores of a label are combined as a weighted sum;
      otherwise their plain average is used.
    weights: one weight per model, in the same order as ``predictions``. Only
      read when ``weighted`` is True; the weights are expected to add up to 1
      because the combined score is compared against 0.5 as is.

  Returns:
    A list with one entry per label: 1 when the combined score reaches 0.5,
    0 otherwise.

  Raises:
    ValueError: if ``predictions`` is empty, or if a weighted vote is requested
      without exactly one weight per model.
  """
  if len(predictions) == 0:
    raise ValueError("vote() needs the predictions of at least one model")
  if weighted and (weights is None or len(weights) != len(predictions)):
    raise ValueError("a weighted vote needs exactly one weight per model")

  votes = list()
  # zip(*predictions) regroups the per-model rows into per-label columns, so
  # the number of votes follows the predictions instead of a global variable
  for curr_label_preds in zip(*predictions):
    if weighted:
      # Explicit products work for any sequence of weights (list or array)
      voting = sum(pred * weight for pred, weight in zip(curr_label_preds, weights))
    else:
      voting = sum(curr_label_preds) / len(curr_label_preds)
    # Calculate and append the binary for the current label
    votes.append(0 if voting < 0.5 else 1)
  return votes

# Predictions of the best ensemble, one list of binary labels per instance
test_predictions = list()

# Function to predict the label of the instances in a dataset split (validation
# ("val") or test ("test")) for each ensemble
def predict_ensemble(ensemble_name, dataset_name, dataset, weighted=False):
  """Fill ``test_predictions`` with the votes of the best ensemble.

  Args:
    ensemble_name: name of the ensemble; not used by the computation, kept so
      that existing calls remain valid.
    dataset_name: prefix of the ``"<dataset_name>_model_outputs"`` entry read
      from ``model_evaluation`` for every model of the ensemble.
    dataset: dataframe of the split; only its number of rows is used.
    weighted: when True, each model's vote is weighted by its L1-normalized
      weighted F1-score.

  Returns:
    The global ``test_predictions`` list, rebuilt from scratch on every call.

  Raises:
    KeyError: if a model of the ensemble has no stored outputs for the split.
  """
  ensemble_models = best_ensemble.get("models")

  # Raw outputs of every model for the requested split, looked up only once
  outputs_key = f"{dataset_name}_model_outputs"
  outputs_per_model = [model_evaluation[model_name].get(outputs_key)
                       for model_name in ensemble_models]
  if any(model_outputs is None for model_outputs in outputs_per_model):
    raise KeyError(f"'{outputs_key}' is missing for at least one model of the ensemble")

  # Define the list of weights if a weighted voting system must be used. It is
  # the same for every instance, hence computed before the loop
  weights = list()
  if weighted:
    # The weights' list is obtained by normalizing the weighted F1-scores of
    # the models in the ensemble
    f1_scores_list = [model_evaluation[model_name]["metrics"].get("weighted_f1")
                      for model_name in ensemble_models]
    weights = normalize([f1_scores_list], norm="l1")[0]

  # Start from an empty list: appending to the results of a previous call
  # would yield more predictions than there are instances
  test_predictions.clear()

  # Traverse each dataset instance
  for i in range(len(dataset.index)):
    # Get the raw output of each model in the ensemble for the instance at hand
    predictions = [model_outputs[i] for model_outputs in outputs_per_model]
    # Append the predicted label to the predictions of the ensemble
    test_predictions.append(vote(predictions, weighted, weights))
  return test_predictions

In [None]:
# Run every transformer of the best ensemble on the test texts and store both
# its raw outputs and its predictions next to its evaluation data
for model_name in best_ensemble.get("models"):
  transformer = models.get(model_name)
  model_predictions, model_raw_outputs = transformer.predict(test["text"].tolist())
  model_evaluation[model_name].update({
      "test_model_outputs": model_raw_outputs,
      "test_predictions": model_predictions,
  })

# Combine the individual outputs into the test predictions of the best ensemble
predict_ensemble(best_ensemble.get("name"), "test", test, weighted=WEIGHTED)

# Show some predictions
n = 5
print(f"First {n} predictions:", test_predictions[:n])

  0%|          | 0/778 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/778 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/778 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/778 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/778 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

First 5 predictions: [[0, 0, 0, 1], [0, 0, 1, 0], [1, 0, 0, 0], [0, 0, 0, 1], [1, 0, 0, 0]]


# Save test predictions

A new pandas dataframe is created. Then, the test predictions are saved to an output CSV file, as required by the competition.

In [None]:
# Start the output dataframe from the tweet identifiers of the test split
test_output = test[["index"]].rename(columns={"index": "tweet_id"})

# Add one column per label, filled with that label's position in each prediction
for column_position, label_name in enumerate(labels):
  test_output[label_name] = [pred[column_position] for pred in test_predictions]

test_output.head(10)

Unnamed: 0,tweet_id,prejudice_group1,prejudice_group2,prejudice_group3,prejudice_group4
0,52830,0,0,0,1
1,78883,0,0,1,0
2,78926,1,0,0,0
3,61844,0,0,0,1
4,78830,1,0,0,0
5,48393,1,0,0,1
6,11875,1,0,0,1
7,62245,1,0,0,1
8,62467,1,0,0,1
9,40664,1,0,0,1


In [None]:
# Save results in CSV file (results.csv inside the OUTPUT folder); the
# dataframe's row index is left out of the file
test_output.to_csv(os.path.join(OUTPUT, "results.csv"), index=False)