In [None]:
!pip install seqeval
!pip install evaluate
!pip install accelerate -U
!pip install ipywidgets==7.7.1

In [2]:
import json

import numpy as np
import torch
from datasets import load_dataset
from huggingface_hub import notebook_login
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch import nn
from transformers import RobertaTokenizerFast, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Log in to Hugging Face from inside the notebook.
# NOTE(review): presumably only required when pushing the model to the Hub or
# reading non-public datasets — confirm whether this cell can be skipped.
notebook_login()

In [None]:
# Label mapping for the binary task (0 = no marker, 1 = marker present).
id2label = {0:"No linguistic marker of sexism in sentence", 1: "Linguistic marker of sexism in sentence"}
label2id = {v:k for k,v in id2label.items()}
num_labels = len(id2label)

# Checkpoint shared by the model and the tokenizer.
MODEL_CHECKPOINT = "osiria/roberta-base-italian"

# Seed torch before the model is created: loading a base checkpoint for sequence
# classification initialises a new classification head, and that initialisation
# happens here, before any Trainer gets a chance to set its own seed.
SEED = 42
torch.manual_seed(SEED)

# NOTE: `model` is a module-level object that training updates in place; re-run
# this cell before training on a different fold to start from the base checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=num_labels, id2label=id2label, label2id=label2id)
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_CHECKPOINT, add_prefix_space=True)

# Training hyper-parameters and the index of the fold held out for evaluation.
LEARNING_RATE = 2e-5
EPOCHS = 10
CURRENT_FOLD = 4

In [None]:
# ------------------------------------------------------- Load dataset from local folder ------------------------------------------------

def get_dataset(path, test_fold=None, num_folds=5):
  """
      Reads the fold files "<path><i>.json" and returns the train and test set.

      Each file is expected to be a JSON object whose "Data" entry is a list of
      examples. The fold selected as test fold goes to the test set, all the
      other folds are concatenated (in fold order) into the train set.

      NOTE: the Hugging Face loader defined later in this notebook has the same
      name and replaces this function when both cells are executed.

      @:param path: The path prefix of the fold files (fold index and ".json" are appended).
      @:param test_fold: Index of the fold used as test set. Defaults to the
                         module-level CURRENT_FOLD when None.
      @:param num_folds: Number of fold files to read (default 5).

      @:return train_set, test_set: The train and test set in the dataset.

      @:raise ValueError: If test_fold is not a valid fold index; otherwise the
                          test set would silently come back empty.
  """
  if test_fold is None:
    test_fold = CURRENT_FOLD
  if not 0 <= test_fold < num_folds:
    raise ValueError("test_fold must be in [0, " + str(num_folds) + "), got " + str(test_fold))

  train = []
  test = []

  for i in range(num_folds):
    with open(path+str(i)+".json", mode="r", encoding="utf-8") as data:
      fold_examples = json.load(data)["Data"]

    # Report the size of every fold as it is read.
    print(len(fold_examples))
    if i == test_fold:
      test.extend(fold_examples)
    else:
      train.extend(fold_examples)
  return train, test

In [5]:
# ------------------------------------------------------- Load dataset from Hugging Face ------------------------------------------------
def get_dataset(name):
  """
      Loads the five fold configurations ("fold0" ... "fold4") of a Hugging Face
      dataset and splits them into a train and a test set.

      The fold whose index equals the module-level CURRENT_FOLD becomes the test
      set; the remaining folds are concatenated, in fold order, into the train set.
      For every fold, the first entry of the "Data" column of its "train" split
      is used as the list of examples.

      NOTE: this definition reuses the name of the local-folder loader defined
      earlier in the notebook and replaces it once this cell is executed.

      @:param name: The name of the dataset on the Hugging Face Hub.

      @:return train_set, test_set: The train and test set in the dataset.

  """
  train, test = [], []
  for fold_idx in range(5):
    fold = load_dataset(name, "fold"+str(fold_idx))
    # Pick the destination list first, then append this fold's examples to it.
    destination = test if fold_idx == CURRENT_FOLD else train
    destination.extend(fold["train"]["Data"][0])

  return train, test


In [10]:
# ---------------------------------------------------------- Train with RoBERTa ----------------------------------------------------------------
# CustomTrainer was used to experiment with different class weights in the loss, but it did not work, so the default Trainer is used instead.
class CustomTrainer(Trainer):
    """Trainer variant that uses a class-weighted cross-entropy loss.

    Class 0 gets weight 0.05 and every other class gets weight 0.50, so that
    errors on class 0 contribute less to the loss than errors on the others.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        @:param model: The model being trained; called as ``model(**inputs)``.
        @:param inputs: Batch dictionary; its "labels" entry is removed and used as target.
        @:param return_outputs: When True, return ``(loss, outputs)`` instead of only the loss.
        @:param num_items_in_batch: Accepted because newer Transformers releases pass it
                                    as a keyword to ``compute_loss``; it is not used here.

        @:return The loss, or a ``(loss, outputs)`` tuple when return_outputs is True.
        """
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        num_labels = self.model.config.num_labels
        # Build the weights on the device of the logits: a wrapped model
        # (e.g. data-parallel) does not necessarily expose `.device`.
        class_weights = torch.tensor([0.05] + [0.50] * (num_labels - 1), device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    """
    Computes accuracy, precision, recall and F1 for one evaluation pass.

    @:param eval_pred: A pair (predictions, labels); the predicted class is the
                       argmax of the predictions along axis 1.

    @:return A dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    accuracy = accuracy_score(gold, predicted)
    # 'binary' averaging: the scores are reported for the positive class only.
    precision, recall, f1, _support = precision_recall_fscore_support(gold, predicted, average='binary')

    # Echo the three scores, one per line.
    for score in (precision, recall, f1):
        print(score)

    metrics = {}
    metrics["precision"] = precision
    metrics["recall"] = recall
    metrics["f1"] = f1
    metrics["accuracy"] = accuracy
    return metrics

def tokenize_data(dataset):
    """
    Encodes every example of the dataset with the module-level tokenizer.

    Each example is tokenized on its own from its "text" entry, and its "label"
    entry is copied onto the resulting encoding.

    @:param dataset: An iterable of examples, each providing "text" and "label".

    @:return A list with one labelled encoding per example, in input order.
    """
    def _encode(example):
        # Tokenize a single example and attach its label to the encoding.
        encoding = tokenizer(example["text"], truncation=True, padding= True)
        encoding["label"] = example["label"]
        return encoding

    return [_encode(example) for example in dataset]


def train_RoBERTa(train, test):
    """
    Fine-tunes the module-level model on the given data and returns the trainer.

    Uses the module-level `model`, `tokenizer`, `LEARNING_RATE` and `EPOCHS`.
    Evaluation and checkpointing both happen once per epoch, and the best
    checkpoint is reloaded when training ends.

    NOTE(review): `test` is used as the evaluation set, so the checkpoint that is
    reloaded at the end is selected on the same data the metrics are reported on.

    @:param train: The tokenized training examples.
    @:param test: The tokenized examples evaluated after every epoch.

    @:return The Trainer after training, so callers can evaluate, predict or save.
             (Callers that ignore the return value are unaffected.)
    """
    # Pads every batch dynamically to the longest sequence it contains.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir="fgsd_models",
        learning_rate= LEARNING_RATE,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=8,
        num_train_epochs=EPOCHS,
        weight_decay=0.1,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args = training_args,
        train_dataset= train,
        eval_dataset= test,
        tokenizer = tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    return trainer


def main():
  """
      Runs the whole experiment: load the folds, tokenize both splits and
      fine-tune the model on them.
  """
  # Folds come from the Hugging Face Hub. To read them from a local folder
  # instead, call get_dataset("dataset/pipeline/data_sent_coref_fold").
  train_split, test_split = get_dataset("fede-m/setfit_dataset_coreference_folds")

  # Only tokenization happens here; the train split is encoded before the test split.
  train_RoBERTa(tokenize_data(train_split), tokenize_data(test_split))

  # Optional: push the model to the Hub
  #model.push_to_hub("fede-m/roberta_binary_"+str(CURRENT_FOLD))


# Run the full pipeline (load folds -> tokenize -> fine-tune) for the fold selected by CURRENT_FOLD.
main()

Output hidden; open in https://colab.research.google.com to view.