<a href="https://colab.research.google.com/github/elylaila/seminario/blob/main/bertita_adam_hyperparameter_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import sys

# Install the notebook's dependencies with the pip that belongs to the running
# kernel's interpreter (sys.executable), so the packages end up in the same
# environment the cells below import from.
!{sys.executable} -m pip install transformers[torch] datasets evaluate torch

Collecting transformers[torch]
  Using cached transformers-4.34.0-py3-none-any.whl (7.7 MB)
Collecting datasets
  Using cached datasets-2.14.5-py3-none-any.whl (519 kB)
Collecting evaluate
  Using cached evaluate-0.4.0-py3-none-any.whl (81 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Using cached huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.20.3 (from transformers[torch])
  Down

In [5]:
from datasets import load_dataset, Features, Value, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, BertConfig, AutoConfig
import numpy as np
import evaluate
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# Data loading

In [6]:
# Mount Google Drive at /content/drive so the project files stored there are
# reachable from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Switch to the project folder on Drive; the relative paths used in later
# cells (e.g. "dataset/df_train_clean.csv") are resolved against it.
%cd /content/drive/MyDrive/Seminario

/content/drive/MyDrive/Seminario


In [8]:
# Read the cleaned CSV through the `datasets` CSV loader. The result is indexed
# by split name in the next section (`df_dev['train']`).
df_dev = load_dataset("csv", data_files="dataset/df_train_clean.csv")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

# Data partitioning

In [9]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible. The held-out part is returned under the key "test".
dev = df_dev['train'].train_test_split(test_size=0.3, seed=42)
dev

DatasetDict({
    train: Dataset({
        features: ['text', 'homotransphobic'],
        num_rows: 3500
    })
    test: Dataset({
        features: ['text', 'homotransphobic'],
        num_rows: 1500
    })
})

In [10]:
# Unpack the two partitions; the "test" part of the split serves as the
# validation set in this notebook.
df_train, df_val = dev["train"], dev["test"]

In [11]:
# Rename the target column to "labels" in both partitions.
# The renamed datasets are derived from `dev` instead of from `df_train` /
# `df_val` themselves, so the cell is idempotent: re-running it does not fail
# because the "homotransphobic" column has already been renamed.
df_train = dev["train"].rename_column("homotransphobic", "labels")
df_val = dev["test"].rename_column("homotransphobic", "labels")

In [12]:
# Inspect the validation partition after the rename (columns and row count).
df_val

Dataset({
    features: ['text', 'labels'],
    num_rows: 1500
})

# Model loading

In [13]:
# Short run identifier ("nome" = "name"); it is used to build the training
# output directory and the file names of the saved figures.
nome = 'bert_base_italian'
# Pretrained checkpoint that the tokenizer and the model are loaded from.
model_name = 'dbmdz/bert-base-italian-cased'

In [14]:
# Load the tokenizer that belongs to the pretrained checkpoint `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

# Data preprocessing

In [15]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is padded with ``padding="max_length"`` and truncated when
    it is too long, so all encoded examples share one length.

    Args:
        examples: mapping with a "text" entry holding the raw strings.

    Returns:
        The tokenizer output for those strings.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

In [16]:
# Tokenize both partitions; batched=True passes the rows to
# `tokenize_function` in batches instead of one at a time.
tokenized_train = df_train.map(tokenize_function, batched=True)
tokenized_val = df_val.map(tokenize_function, batched=True)
# Disabled: `df_test` is not defined in the cells shown in this notebook.
# tokenized_test = df_test.map(tokenize_function, batched=True)

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [17]:
# Check which columns tokenization added next to the text and the labels.
tokenized_train

Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3500
})

# Training

In [18]:
def compute_metrics(eval_pred):
  """Compute the macro-averaged F1 score for one evaluation pass.

  Args:
    eval_pred: pair ``(predictions, labels)``. ``predictions`` holds one score
      per class for every example; ``labels`` holds the reference class ids.

  Returns:
    The result of the "f1" metric's ``compute`` call with ``average="macro"``.
  """
  # The metric is loaded on the first call only and then reused: this function
  # runs at every evaluation, and re-loading the metric each time is wasted work.
  f1_metric = getattr(compute_metrics, "_f1_metric", None)
  if f1_metric is None:
    f1_metric = evaluate.load("f1")
    compute_metrics._f1_metric = f1_metric
  predictions, labels = eval_pred
  # Turn the per-class scores into a predicted class id per example.
  predictions = np.argmax(predictions, axis=1)
  return f1_metric.compute(predictions=predictions, references=labels, average="macro")

In [19]:
def hyperparameter_search(model_name=model_name, optim='adamw_hf', learning_rate=0.00005, dropout=0.1):
    """Build the training arguments and a fresh model for one hyperparameter setting.

    Args:
        model_name: pretrained checkpoint to fine-tune.
        optim: optimizer name forwarded to ``TrainingArguments``.
        learning_rate: initial learning rate forwarded to ``TrainingArguments``.
        dropout: dropout probability written into the model configuration.

    Returns:
        Tuple ``(training_args, model)`` ready to be handed to a ``Trainer``.
    """
    # TODO: fill in the model's default parameters.
    # NOTE: every configuration writes its checkpoints to the same output_dir,
    # because the directory name does not depend on the hyperparameters.
    training_args = TrainingArguments(
            output_dir=f"{nome}-finetuned",
            evaluation_strategy = "epoch", # monitor the evaluation metrics during fine-tuning at the end of each epoch
            save_strategy = "epoch",
            logging_strategy="epoch",
            num_train_epochs=10,
            load_best_model_at_end=True, # the best model might not be the one at the end of training => we load the best saved model
            metric_for_best_model='eval_f1',
            seed=42,
            optim=optim,
            learning_rate=learning_rate # default 0.00005
            #per_device_train_batch_size # default 8
            #per_device_eval_batch_size # default 8
            #weight_decay=0.0001 # default 0 # gave disastrous results
            )
    config = AutoConfig.from_pretrained(model_name)
    # BERT configurations store their dropout probabilities under
    # `hidden_dropout_prob` and `attention_probs_dropout_prob`; a bare
    # `dropout` attribute is not read by BERT, so setting only that one would
    # leave the requested value without effect.
    for dropout_attr in ("hidden_dropout_prob", "attention_probs_dropout_prob"):
        if hasattr(config, dropout_attr):
            setattr(config, dropout_attr, dropout)
    # Still set for configurations that do use the plain `dropout` name.
    config.dropout = dropout
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
    return training_args, model

In [None]:
%%time
dropout_values = [0.1]
lr_values = [0.00005, 0.00003, 0.00007]
histories = {}
for dropout_value in dropout_values:
  for lr_value in lr_values:
    if not (dropout_value == 0.05 and lr_value == 0.00005):
      config_name = f'Dropout_{dropout_value}_lr_{lr_value}'
      print(config_name)
      training_args, model = hyperparameter_search(dropout=dropout_value, learning_rate=lr_value)
      trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics)
      trainer.train()
      log_history = trainer.state.log_history
      histories[config_name] = log_history

Dropout_0.1_lr_5e-05


Downloading model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-italian-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.5415,0.654014,0.784903
2,0.3956,0.547316,0.806504
3,0.2724,0.671902,0.800292
4,0.1625,0.804821,0.803225
5,0.0785,1.01451,0.814691
6,0.0471,1.116435,0.818402
7,0.026,1.141669,0.822343
8,0.0095,1.215717,0.826201
9,0.0042,1.289132,0.823548
10,0.0064,1.294548,0.822906


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Dropout_0.1_lr_3e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-italian-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.5745,0.605612,0.773085
2,0.4197,0.507378,0.805513
3,0.293,0.715431,0.819473
4,0.1764,0.802997,0.814094
5,0.1201,0.913703,0.810014
6,0.0678,0.992671,0.827737
7,0.0537,1.083185,0.821734
8,0.0242,1.181608,0.81914
9,0.0206,1.238689,0.820118
10,0.0139,1.271285,0.823772


Dropout_0.1_lr_7e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-italian-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


In [None]:
def model_init():
    """Return a sequence-classification model loaded from ``model_name``.

    Every call loads the pretrained checkpoint again with its default
    configuration. NOTE(review): not referenced by the other cells shown in
    this notebook.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return fresh_model

# Evaluation

In [None]:
# `trainer` is rebound on every iteration of the search loop, so this is the
# log history of the most recently trained configuration only.
log_history = trainer.state.log_history

In [None]:
# Collect the logged training and validation losses into one long-format
# table (one row per epoch and dataset) for plotting.
# The rows are gathered in a list and converted to a DataFrame once, instead
# of growing the frame with pd.concat on every iteration. No `df_train` /
# `df_val` names are assigned here, so the datasets of the same name defined
# in the data-partitioning section are not overwritten.
loss_records = []
for log_data in log_history:
  epoch = int(log_data["epoch"])
  if "loss" in log_data:
    loss_records.append({"Epoch": epoch, "Loss": log_data["loss"], "Dataset": "Training"})
  if "eval_loss" in log_data:
    loss_records.append({"Epoch": epoch, "Loss": log_data["eval_loss"], "Dataset": "Validation"})
# Passing `columns` keeps the expected schema even when no loss was logged.
df = pd.DataFrame(loss_records, columns=["Epoch", "Loss", "Dataset"])
df

In [None]:
# loss lineplot
sns.lineplot(data=df, x="Epoch", y="Loss", hue="Dataset")
plt.xticks(range(1, 16))
plt.xlabel('Epoca')
plt.savefig(f'loss_curves_{nome}.pdf', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
from transformers.modelcard import parse_log_history

In [None]:
# Parse the raw Trainer log with the helper imported from transformers.modelcard.
his = parse_log_history(log_history)

In [None]:
# Tabulate the second element of the parsed result — presumably the list of
# per-evaluation rows (TODO confirm against parse_log_history's return value).
df_his = pd.DataFrame(his[1], columns=['Step','Epoch', 'Training Loss', 'Validation Loss', 'F1'])

In [None]:
# Display the table built from the parsed log.
df_his

In [None]:
# Run the current `trainer` (the last one created by the search loop) on the
# validation set to obtain its raw predictions.
output_predictions = trainer.predict(tokenized_val)

In [None]:
# Reference labels of the validation set.
y_val = tokenized_val["labels"]
# Predicted class = index of the highest score along axis 1.
y_val_pred = np.argmax(output_predictions.predictions, axis=1)

# Text summary comparing the predictions with the reference labels.
report = classification_report(y_val, y_val_pred)
print(report)

In [None]:
# Larger base font for this figure. This updates matplotlib's global rcParams,
# so it also applies to any figure drawn afterwards in the same session.
plt.rcParams.update({'font.size': 18})
# Confusion matrix of validation predictions, drawn with normalize='true'.
ConfusionMatrixDisplay.from_predictions(y_val, y_val_pred, cmap='Blues', normalize='true')
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Italian axis labels: "Predicted label" / "True label".
plt.xlabel('Etichetta predetta', fontsize=16)
plt.ylabel('Etichetta reale', fontsize=16)
# Save before plt.show() so the written file contains the drawn figure.
plt.savefig(f'conf_matrix_{nome}.pdf', dpi=300, bbox_inches='tight')
plt.show()