In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from datasets import load_metric, Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed
)
from sklearn.utils import class_weight

In [2]:
class WeightedTrainer(Trainer):
    """`Trainer` subclass that optimises a class-weighted cross-entropy loss.

    Args:
        class_weights: Tensor passed as ``weight`` to ``nn.CrossEntropyLoss``
            (one entry per class).
        **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights, **kwargs):
        super().__init__(**kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Mapping of model inputs; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted (and ignored) so the override stays
                callable by `Trainer` versions that pass this keyword.

        Returns:
            The scalar loss, or a ``(loss, outputs)`` tuple.
        """
        labels = inputs.get("labels").long()
        # Forward without the labels: the model's built-in (unweighted) loss is
        # not needed, and skipping it avoids feeding it labels that were not
        # cast to long. `inputs` itself is left untouched for the caller.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")

        # The weight tensor must live on the same device as the logits.
        weights = self.class_weights.to(logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
    

# Metric objects created once and reused by `compute_metrics` on every call.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases —
# confirm the pinned `datasets` version still provides it.
acc = load_metric("accuracy")
f1 = load_metric("f1")

def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and F1.

    `eval_pred` unpacks into ``(logits, labels)``; the predicted class is the
    arg-max over the last axis of the logits.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)

    accuracy = acc.compute(predictions=predicted, references=references)["accuracy"]
    f1_score = f1.compute(predictions=predicted, references=references)["f1"]
    return {"accuracy": accuracy, "f1": f1_score}

In [3]:
# Token length every example is padded/truncated to in `preprocess_text`.
max_seq_length = 128
# Fix the random seed (via `transformers.set_seed`) for the runs below.
set_seed(42)

## EVALITA 18

In [4]:
# Checkpoint to fine-tune; it also prefixes the output directory name.
# NOTE(review): the saved outputs of this section mention `roberta-base`
# (warnings and checkpoint paths), not this checkpoint — the cell was probably
# edited after the last run. Confirm which model the reported metrics belong to.
model_name = "distilbert-base-cased" 
output_dir = model_name + "_ami18"

# Tokenizer matching the checkpoint; read as a global by `preprocess_text`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_text(examples):
    """Tokenize the ``"text"`` field of a batch.

    Every sequence is padded and truncated to `max_seq_length` tokens using
    the module-level `tokenizer`.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=max_seq_length,
        truncation=True,
        padding="max_length",
    )
    return encoded

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [5]:
# Read the three tab-separated splits into DataFrames.
train, validation, test = (
    pd.read_csv(path, sep="\t")
    for path in ("data/miso_train.tsv", "data/miso_dev.tsv", "data/miso_test.tsv")
)

# Wrap the splits in a DatasetDict and expose the target column as "label".
raw_datasets = DatasetDict(
    {
        "train": Dataset.from_pandas(train),
        "validation": Dataset.from_pandas(validation),
        "test": Dataset.from_pandas(test),
    }
).rename_column("misogynous", "label")

# Tokenize every split in batches.
proc_datasets = raw_datasets.map(preprocess_text, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [6]:
# Load `model_name` with a sequence-classification head.
# NOTE(review): `num_labels` is left at the library default — confirm it
# matches the number of classes in the "label" column.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [7]:
training_args = TrainingArguments(
    # Where checkpoints go
    output_dir=output_dir,
    overwrite_output_dir=True,
    save_total_limit=2,
    # Batching and optimisation schedule
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    dataloader_num_workers=4,
    # Evaluate, log and save on the same 50-step cadence
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=50,
    save_steps=50,
    logging_steps=50,
    # Model selection and experiment tracking
    metric_for_best_model="loss",
    load_best_model_at_end=True,
    report_to="wandb",
)

In [8]:
# Materialise the training labels once; they are needed both for the class
# inventory and for the frequency count.
train_labels = np.array(proc_datasets["train"]["label"])

# scikit-learn's 'balanced' class weights, computed on the training split.
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(train_labels),
    y=train_labels
)
# Put the weights on the device the Trainer will use instead of hard-coding
# "cuda", so the notebook also runs on machines without a GPU.
class_weights = torch.tensor(class_weights, device=training_args.device, dtype=torch.float32)

In [9]:
# Early stopping with a patience of 2.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Trainer with the class-weighted loss, evaluated on the validation split.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    class_weights=class_weights,
    train_dataset=proc_datasets["train"],
    eval_dataset=proc_datasets["validation"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [10]:
# Fine-tune the model. An early-stopping callback is registered on the trainer,
# so the run may end before `num_train_epochs` is reached.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: id, text, misogyny_category, target.
***** Running training *****
  Num examples = 3600
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 675
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mg8a9[0m (use `wandb login --relogin` to force relogin)


Step,Training Loss,Validation Loss,Accuracy,F1
50,0.6812,0.519057,0.76,0.777778
100,0.4892,0.455647,0.7925,0.796069
150,0.5034,0.427674,0.81,0.806122
200,0.4568,0.416428,0.81,0.796791
250,0.3719,0.454887,0.825,0.798851
300,0.3093,0.582734,0.79,0.721854


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: id, text, misogyny_category, target.
***** Running Evaluation *****
  Num examples = 400
  Batch size = 8
Saving model checkpoint to roberta-base_ami18/checkpoint-50
Configuration saved in roberta-base_ami18/checkpoint-50/config.json
Model weights saved in roberta-base_ami18/checkpoint-50/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: id, text, misogyny_category, target.
***** Running Evaluation *****
  Num examples = 400
  Batch size = 8
Saving model checkpoint to roberta-base_ami18/checkpoint-100
Configuration saved in roberta-base_ami18/checkpoint-100/config.json
Model weights saved in roberta-base_ami18/checkpoint-100/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument 

TrainOutput(global_step=300, training_loss=0.46862095514933266, metrics={'train_runtime': 73.5776, 'train_samples_per_second': 146.784, 'train_steps_per_second': 9.174, 'total_flos': 315733266432000.0, 'train_loss': 0.46862095514933266, 'epoch': 1.33})

In [11]:
# Save the model currently held by the trainer to `output_dir`.
# NOTE(review): with `load_best_model_at_end=True` this is presumably the best
# checkpoint rather than the last one — confirm.
trainer.save_model(output_dir)

Saving model checkpoint to roberta-base_ami18
Configuration saved in roberta-base_ami18/config.json
Model weights saved in roberta-base_ami18/pytorch_model.bin


In [12]:
# Run the trained model on the test split; the bare last expression displays
# the resulting metrics.
predictions = trainer.predict(test_dataset=proc_datasets["test"])
predictions.metrics

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: id, text, misogyny_category, target.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 8


{'test_loss': 0.5903980135917664,
 'test_accuracy': 0.668,
 'test_f1': 0.6891385767790261,
 'test_runtime': 2.9371,
 'test_samples_per_second': 340.477,
 'test_steps_per_second': 42.56}

## EVALITA 20

In [5]:
# Checkpoint to fine-tune for this section.
# The model id contains a "/", so `output_dir` resolves to a nested path
# ("dbmdz/bert-base-italian-cased_ami20"), as the checkpoint logs show.
model_name = "dbmdz/bert-base-italian-cased"
output_dir = model_name + "_ami20"

# Tokenizer matching the checkpoint; read as a global by `preprocess_text`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_text(examples):
    """Tokenize the ``"text"`` field of a batch to a fixed length.

    Same body as the earlier definition of this name in the notebook; the
    global `tokenizer` is looked up at call time, and sequences are padded and
    truncated to `max_seq_length` tokens.
    """
    tokenizer_kwargs = dict(
        padding="max_length",
        truncation=True,
        max_length=max_seq_length,
    )
    return tokenizer(examples["text"], **tokenizer_kwargs)

In [6]:
# Train/validation are read with pandas' default separator; the test file is
# tab-separated.
train = pd.read_csv("data/AMI2020_training_raw_90.csv")
validation = pd.read_csv("data/AMI2020_validation_raw_10.csv")
test = pd.read_csv("data/AMI2020_test_raw_gt.tsv", sep="\t")

# Wrap the splits in a DatasetDict and expose the target column as "label".
raw_datasets = DatasetDict(
    {
        split: Dataset.from_pandas(frame)
        for split, frame in (("train", train), ("validation", validation), ("test", test))
    }
).rename_column("misogynous", "label")

# Tokenize every split in batches.
proc_datasets = raw_datasets.map(preprocess_text, batched=True)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [7]:
# Load `model_name` with a sequence-classification head.
# NOTE(review): `num_labels` is left at the library default — confirm it
# matches the number of classes in the "label" column.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at dbmdz/bert-base-italian-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model c

In [8]:
training_args = TrainingArguments(
    # Output location and checkpoint retention
    output_dir=output_dir,
    overwrite_output_dir=True,
    save_total_limit=2,
    # Batch sizes (effective train batch = 8 * 2 accumulation steps per device)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    dataloader_num_workers=4,
    # Schedule
    num_train_epochs=3,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    # Step-based evaluation / logging / checkpointing, every 50 steps
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    save_steps=50,
    # Best-model selection on the evaluation loss; no external reporting
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    report_to="none",
)

In [9]:
# Materialise the training labels once; they are needed both for the class
# inventory and for the frequency count.
train_labels = np.array(proc_datasets["train"]["label"])

# scikit-learn's 'balanced' class weights, computed on the training split.
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(train_labels),
    y=train_labels
)
# Put the weights on the device the Trainer will use instead of hard-coding
# "cuda", so the notebook also runs on machines without a GPU.
class_weights = torch.tensor(class_weights, device=training_args.device, dtype=torch.float32)

In [10]:
# Early stopping with a patience of 2.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Trainer with the class-weighted loss, evaluated on the validation split.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    class_weights=class_weights,
    train_dataset=proc_datasets["train"],
    eval_dataset=proc_datasets["validation"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [11]:
# Fine-tune the model. An early-stopping callback is registered on the trainer,
# so the run may end before `num_train_epochs` is reached.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: aggressiveness, id, text.
***** Running training *****
  Num examples = 4500
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 843


Step,Training Loss,Validation Loss,Accuracy,F1
50,0.6514,0.488335,0.778,0.792523
100,0.4187,0.388581,0.834,0.840691
150,0.3551,0.254121,0.884,0.882114
200,0.3373,0.235381,0.91,0.906054
250,0.2931,0.309616,0.888,0.890625
300,0.2446,0.257937,0.916,0.9125


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: aggressiveness, id, text.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
Saving model checkpoint to dbmdz/bert-base-italian-cased_ami20/checkpoint-50
Configuration saved in dbmdz/bert-base-italian-cased_ami20/checkpoint-50/config.json
Model weights saved in dbmdz/bert-base-italian-cased_ami20/checkpoint-50/pytorch_model.bin
Deleting older checkpoint [dbmdz/bert-base-italian-cased_ami20/checkpoint-200] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: aggressiveness, id, text.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
Saving model checkpoint to dbmdz/bert-base-italian-cased_ami20/checkpoint-100
Configuration saved in dbmdz/bert-base-italian-cased_ami20/checkpoint-100/con

TrainOutput(global_step=300, training_loss=0.3833550294240316, metrics={'train_runtime': 66.8984, 'train_samples_per_second': 201.799, 'train_steps_per_second': 12.601, 'total_flos': 315996377487360.0, 'train_loss': 0.3833550294240316, 'epoch': 1.07})

In [12]:
# Save the model currently held by the trainer to `output_dir`.
# NOTE(review): with `load_best_model_at_end=True` this is presumably the best
# checkpoint rather than the last one — confirm.
trainer.save_model(output_dir)

Saving model checkpoint to dbmdz/bert-base-italian-cased_ami20
Configuration saved in dbmdz/bert-base-italian-cased_ami20/config.json
Model weights saved in dbmdz/bert-base-italian-cased_ami20/pytorch_model.bin


In [13]:
# Run the trained model on the test split; the bare last expression displays
# the resulting metrics.
predictions = trainer.predict(test_dataset=proc_datasets["test"])
predictions.metrics

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: aggressiveness, id, text.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 8


{'test_loss': 0.4890109896659851,
 'test_accuracy': 0.769,
 'test_f1': 0.7979002624671916,
 'test_runtime': 3.103,
 'test_samples_per_second': 322.264,
 'test_steps_per_second': 40.283}

## EXPERT ANNOTATIONS