# Fine-tuning BERT for Multi-class Classification


In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be read and written
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Basic Python modules
import os
import re
from collections import defaultdict, Counter
import random
import pickle

# For data manipulation and analysis
import pandas as pd
import numpy as np

# For machine learning tools and evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split

# For deep learning
# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
import torch

In [None]:
# Project root on the mounted Drive, and the folder holding the poem data
incerto_dir = '/content/drive/MyDrive/incerto-autore'
new_poems_dir = os.path.join(incerto_dir, 'data', 'poems')

# Load the poem table and show how many rows it contains
poems_split_df = pd.read_csv(os.path.join(new_poems_dir, 'poems_split.csv'))
poems_split_df.shape[0]

In [None]:
!pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# BERT tokenizer and sequence-classification model (the code below loads BERT checkpoints, not DistilBERT)
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [None]:
# Run on the GPU when one is available; otherwise fall back to the CPU so the notebook still executes
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'

# Choose the BERT model that we want to use (make sure to keep the cased/uncased consistent)
# NOTE: `model` holds the checkpoint path/name here; the "load pre-trained model" cell later
# rebinds the same name to the loaded model object.
#model = 'dbmdz/bert-base-italian-xxl-uncased'
model = os.path.join(incerto_dir, 'contbertoldo-all', 'checkpoint')

# This is the maximum number of tokens in any document sent to BERT
max_length = 512

In [None]:
# Pick the output sub-folder from the name of the base checkpoint.
# The second branch must test membership in `model`; a bare non-empty string is always truthy.
if 'contbertoldo' in model:
  finetuned_subdir = 'bertoldo'
elif 'italian' in model:
  finetuned_subdir = 'bert-ita'
else:
  # Fail loudly instead of silently writing an unrecognised model into another model's folder
  raise ValueError(f"Cannot derive an output folder for model {model!r}: expected 'contbertoldo' or 'italian' in its name")

finetuned_path = os.path.join(incerto_dir, 'output', 'finetuned-models', 'multi-class', finetuned_subdir)
# exist_ok makes the cell safe to re-run
os.makedirs(finetuned_path, exist_ok=True)

### BERT setup

In [None]:
# Keep only the poems whose author is known; rows labelled 'Unknown' are left out
annotations_df = poems_split_df[poems_split_df['author'].ne('Unknown')]
len(annotations_df)

622

In [None]:
# Poem texts and their author labels
X = annotations_df['poem'].tolist()
y = annotations_df['author'].tolist()

# Stratify so every author (including the rare ones) keeps the same share in both splits,
# and fix the seed so the split, and every metric computed from it, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)
print('Y train', Counter(y_train))
print('Y test', Counter(y_test))
print(X_test[0:3])

Y train Counter({'Franco': 174, 'Petrarca': 121, 'AntonGiacomoCorso': 45, 'CelioMagno': 39, 'PietroBembo': 39, 'DomenicoVenier': 31, 'GiorgioGradenigo': 9, 'MarcoVenier': 8})
Y test Counter({'Franco': 50, 'Petrarca': 42, 'CelioMagno': 18, 'PietroBembo': 15, 'AntonGiacomoCorso': 14, 'DomenicoVenier': 9, 'MarcoVenier': 4, 'GiorgioGradenigo': 4})
['Mie venture al venir son tarde e pigre La speme incerta e l desir monta e cresce Onde l lasciar e l aspettar m incresce E poi al partir son piu levi che tigre Lasso le nevi fien tepide e nigre E l mar senz onda e per l alpe ogni pesce E corcherassi l Sol la oltre ond esce D un medesimo fonte Eufrate e Tigre Prima ch i trovi in cio pace ne triegua O Amor o Madonna altr uso impari Che m hanno congiurato a torto incontra E s i ho alcun dolce e dopo tanti amari Che per disdegno il gusto si dilegua Altro mai di lor gratie non m incontra', 'Questa col canto suo frenar s udio Spesso i fiumi nel corso e i monti e i sassi Seguaci far di sua rara dolcezz

In [None]:
# Build the author <-> integer id lookup tables (ids follow the sorted author names)
unique_labels = sorted(set(annotations_df['author'].tolist()))
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))
print(label2id)
print(id2label)

{'AntonGiacomoCorso': 0, 'CelioMagno': 1, 'DomenicoVenier': 2, 'Franco': 3, 'GiorgioGradenigo': 4, 'MarcoVenier': 5, 'Petrarca': 6, 'PietroBembo': 7}
{0: 'AntonGiacomoCorso', 1: 'CelioMagno', 2: 'DomenicoVenier', 3: 'Franco', 4: 'GiorgioGradenigo', 5: 'MarcoVenier', 6: 'Petrarca', 7: 'PietroBembo'}


In [None]:
# load the encoder/tokenizer
# It is loaded from the same checkpoint path/name (`model`) that the classifier is later loaded from
tokenizer = BertTokenizer.from_pretrained(model)

In [None]:
# Torch dataset wrapping tokenizer output and integer labels
class SCDataset(torch.utils.data.Dataset):
    """Sequence-classification dataset.

    `encodings` is a mapping from field name to a per-example indexable collection;
    `labels` is an indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One example per label
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoded field of example `idx` to a tensor, then attach its label
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Hyper-parameters and logging/evaluation schedule for the Trainer
training_args = TrainingArguments(
    # where outputs and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # length of training and batch sizes
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=20,
    # optimizer settings: initial learning rate, warmup steps, weight decay
    learning_rate=5e-5,
    warmup_steps=70,
    weight_decay=0.01,
    # log every 10 steps; evaluate on a step-based schedule
    logging_steps=10,
    evaluation_strategy='steps',
)

In [None]:
# load pre-trained model
# The sequence-classification model is created with one output per author.
# Passing id2label/label2id stores the author names in the model config, so the saved
# model carries real names instead of generic LABEL_<n> placeholders.
# NOTE: this rebinds `model` from the checkpoint path (str) to the loaded model object, so this
# cell cannot be re-run without first re-running the cell that sets the path.
model = BertForSequenceClassification.from_pretrained(
    model,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
).to(device_name)

Some weights of the model checkpoint at /content/drive/MyDrive/incerto-autore/contbertoldo-all/checkpoint were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

In [None]:
# Sanity check: the number of labels configured on the model should match the number of authors
print(model.config.num_labels)

8


In [None]:
# Custom evaluation function handed to the Trainer (can be extended to return further metrics)
def compute_metrics(pred):
  """Return the accuracy of the predictions as ``{'accuracy': value}``.

  `pred` must expose `label_ids` (true class ids) and `predictions`
  (per-class scores; the highest-scoring class along the last axis is the prediction).
  """
  predicted_ids = pred.predictions.argmax(-1)
  return {'accuracy': accuracy_score(pred.label_ids, predicted_ids)}

## Classification task setup

In [None]:
# Tokenize both splits: sequences longer than max_length are truncated, and padding=True
# pads the shorter ones to the longest sequence in the tokenized batch (not to a fixed 512)
train_encodings = tokenizer(X_train, max_length=max_length, padding=True, truncation=True)
test_encodings = tokenizer(X_test, max_length=max_length, padding=True, truncation=True)

# Map author names to their integer class ids
train_labels_encoded = [label2id[author] for author in y_train]
test_labels_encoded = [label2id[author] for author in y_test]
# Number of distinct classes present in each split
print(len(set(train_labels_encoded)), len(set(test_labels_encoded)))

8 8


In [None]:
# Wrap the tokenized texts and their integer labels in torch datasets, one per split
train_dataset = SCDataset(encodings=train_encodings, labels=train_labels_encoded)
test_dataset = SCDataset(encodings=test_encodings, labels=test_labels_encoded)

In [None]:
# Assemble the Trainer from the pieces defined above: the model, the training arguments,
# the two datasets, and the custom metric function.
trainer = Trainer(
    model=model,                       # the loaded sequence-classification model
    args=training_args,                # hyper-parameters and schedule
    compute_metrics=compute_metrics,   # accuracy on the evaluation set
    train_dataset=train_dataset,       # data used for fine-tuning
    eval_dataset=test_dataset,         # data used for evaluation (the test split)
)

In [None]:
# Fine-tune the model on our dataset/labels. During training the trainer periodically reports
# the training loss, the validation loss and the accuracy from compute_metrics.
trainer.train()



Step,Training Loss,Validation Loss,Accuracy
10,1.9688,1.879503,0.320513
20,1.7735,1.76624,0.320513
30,1.7467,1.744009,0.320513
40,1.7279,1.63346,0.487179
50,1.4391,1.536915,0.49359
60,1.5236,1.43711,0.532051
70,1.3082,1.291756,0.557692
80,1.1544,1.300723,0.538462
90,1.1142,1.152556,0.583333
100,0.8326,1.255529,0.596154


TrainOutput(global_step=150, training_loss=1.1838725725809733, metrics={'train_runtime': 287.1846, 'train_samples_per_second': 8.113, 'train_steps_per_second': 0.522, 'total_flos': 613081784893440.0, 'train_loss': 1.1838725725809733, 'epoch': 5.0})

In [None]:
# Built-in evaluation: scores the fine-tuned model on the eval_dataset given to the Trainer
# (here the test split) and returns the eval loss together with the compute_metrics output
trainer.evaluate()

{'eval_loss': 1.0586260557174683,
 'eval_accuracy': 0.6538461538461539,
 'eval_runtime': 4.996,
 'eval_samples_per_second': 31.225,
 'eval_steps_per_second': 1.601,
 'epoch': 5.0}

In [None]:
# Save the fine-tuned model together with its tokenizer, so that finetuned_path is a
# self-contained checkpoint that can be reloaded with from_pretrained()
model.save_pretrained(finetuned_path)
tokenizer.save_pretrained(finetuned_path)

In [None]:
# Number of test poems per author, for reference when reading the per-class results
print(Counter(y_test))

Counter({'Franco': 50, 'Petrarca': 42, 'CelioMagno': 18, 'PietroBembo': 15, 'AntonGiacomoCorso': 14, 'DomenicoVenier': 9, 'MarcoVenier': 4, 'GiorgioGradenigo': 4})


In [None]:
# Run the fine-tuned model on the test split; the result carries the raw per-class scores
# (.predictions) and the true class ids (.label_ids)
predicted_labels = trainer.predict(test_dataset)
# Highest-scoring class id for each poem
actual_predicted_labels = np.argmax(predicted_labels.predictions, axis=-1)

In [None]:
# Per-class precision/recall/F1 on the test split.
# - labels/target_names: report rows by author name instead of the numeric class id, and keep
#   a row for every author even if one is absent from this particular split.
# - zero_division=0: classes that were never predicted get precision 0 without the
#   repeated undefined-metric warnings.
report_kwargs = dict(
    y_true=predicted_labels.label_ids.flatten(),
    y_pred=actual_predicted_labels.flatten(),
    labels=sorted(id2label),
    target_names=[id2label[class_id] for class_id in sorted(id2label)],
    zero_division=0,
)
# Dict form is kept for saving; text form is printed for reading
class_report = classification_report(output_dict=True, **report_kwargs)
print(classification_report(**report_kwargs))

              precision    recall  f1-score   support

           0       0.31      0.57      0.40        14
           1       0.60      0.67      0.63        18
           2       0.33      0.11      0.17         9
           3       0.77      0.94      0.85        50
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         4
           6       0.89      0.60      0.71        42
           7       0.50      0.60      0.55        15

    accuracy                           0.65       156
   macro avg       0.43      0.44      0.41       156
weighted avg       0.65      0.65      0.63       156



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Store the classification report as a CSV next to the fine-tuned model
# (transposed so that each class/average is a row and each metric a column)
class_report_df = pd.DataFrame(class_report).T
class_report_df.to_csv(os.path.join(finetuned_path, 'classification_report.csv'))