### Training BERT on Labeled Endometriosis Dataset
This script trains, tests, and saves a BERT model on our labeled paragraphs (or posts) from the endometriosis dataset.

Additional resources for this code:


*   HuggingFace's docs on [fine-tuning a pre-trained model](https://huggingface.co/docs/transformers/training)
*   BERT for Humanist's [Fine-Tuning for Classification](https://colab.research.google.com/drive/19jDqa5D5XfxPU6NQef17BC07xQdRnaKU?usp=sharing) tutorial



In [1]:
# Mount the Google drive for access to files
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Basic Python modules
import os
import re
from collections import defaultdict, Counter
import random
import pickle

# For data manipulation and analysis
import pandas as pd
import numpy as np

# For machine learning tools and evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split

# For deep learning
# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
import torch

In [3]:
incerto_dir = '/content/drive/MyDrive/incerto-autore'
new_poems_dir = os.path.join(incerto_dir, 'data', 'poems')
poems_split_df = pd.read_csv(os.path.join(new_poems_dir, 'poems_split.csv'))
len(poems_split_df)

1176

In [4]:
!pip3 install transformers



In [5]:
# using DistilBERT for testing --> can switch to BERT once set up
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [6]:
# Choose the GPU we want to process this script
device_name = 'cuda'

# Choose the BERT model that we want to use (make sure to keep the cased/uncased consistent)
#model = 'dbmdz/bert-base-italian-xxl-uncased'
model = os.path.join(incerto_dir, 'contbertoldo-all', 'checkpoint')

# This is the maximum number of tokens in any document sent to BERT
max_length = 512

In [7]:
if 'contbertoldo' in model:
  finetuned_path = os.path.join(incerto_dir, 'output','finetuned-models', 'multi-class', 'bertoldo')
elif 'italian':
  finetuned_path = os.path.join(incerto_dir, 'output','finetuned-models', 'multi-class', 'bert-ita')
if not os.path.exists(finetuned_path):
  os.makedirs(finetuned_path)

### BERT setup

In [8]:
annotations_df = poems_split_df.loc[poems_split_df['author'] != 'UnknownAuthor']
len(annotations_df)

1116

In [9]:
X = annotations_df['poem'].tolist()
y = annotations_df['author'].tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
print('Y train', Counter(y_train))
print('Y test', Counter(y_test))
print(X_test[0:3])

Y train Counter({'VeronicaFranco': 172, 'Petrarca': 168, 'OrsattoGiustinian': 125, 'PietroBembo': 63, 'DomenicoVenier': 61, 'MuzioManfredi': 50, 'CelioMagno': 43, 'AntonGiacomoCorso': 43, 'ValerioSali': 39, 'MaffioVenier': 32, 'MarcoStecchini': 22, 'GiorgioGradenigo': 8, 'MarcoVenier': 8, 'BartolomeoZacco': 3})
Y test Counter({'VeronicaFranco': 61, 'Petrarca': 54, 'OrsattoGiustinian': 27, 'DomenicoVenier': 24, 'MuzioManfredi': 20, 'PietroBembo': 19, 'MaffioVenier': 17, 'AntonGiacomoCorso': 15, 'CelioMagno': 14, 'ValerioSali': 11, 'GiorgioGradenigo': 5, 'MarcoVenier': 5, 'MarcoStecchini': 5, 'BartolomeoZacco': 2})
['Poiche tradirmi Amor veggio da quella Che tu gia per si fida a me donasti Ha gia sett anni e nel mio petto entrasti Rendendo l alma a le sue voglie ancella Hor per me rotto hai l arco e la facella Del tutto spenta e i tuoi dissegni guasti Te piu non curo in darno hora contrasti Lei mostrando d ogn altra esser piu bella Che ragion ch in me parve un tempo morta A te cedendo ho

In [10]:
unique_labels = list(set(annotations_df['author'].tolist()))
unique_labels.sort()
label2id = {label: id for id, label in enumerate(unique_labels)}
id2label = {id: label for label, id in label2id.items()}
print(label2id)
print(id2label)

{'AntonGiacomoCorso': 0, 'BartolomeoZacco': 1, 'CelioMagno': 2, 'DomenicoVenier': 3, 'GiorgioGradenigo': 4, 'MaffioVenier': 5, 'MarcoStecchini': 6, 'MarcoVenier': 7, 'MuzioManfredi': 8, 'OrsattoGiustinian': 9, 'Petrarca': 10, 'PietroBembo': 11, 'ValerioSali': 12, 'VeronicaFranco': 13}
{0: 'AntonGiacomoCorso', 1: 'BartolomeoZacco', 2: 'CelioMagno', 3: 'DomenicoVenier', 4: 'GiorgioGradenigo', 5: 'MaffioVenier', 6: 'MarcoStecchini', 7: 'MarcoVenier', 8: 'MuzioManfredi', 9: 'OrsattoGiustinian', 10: 'Petrarca', 11: 'PietroBembo', 12: 'ValerioSali', 13: 'VeronicaFranco'}


In [11]:
# load the encoder/tokenizer
tokenizer = BertTokenizer.from_pretrained(model)

In [12]:
# class for Torch dataset
class SCDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [13]:
from sklearn.utils.class_weight import compute_class_weight

# Calculate class weights for all classes
unique_classes = np.unique(y_train)
class_weights = compute_class_weight(
    'balanced',
    classes=unique_classes,
    y=y_train
)

print("Class distribution:")
from collections import Counter
print(Counter(y_train))
print(f"Class weights: {dict(zip(unique_classes, class_weights))}")

# Custom trainer with multi-class weights
class MultiClassWeightedTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = torch.tensor(class_weights, dtype=torch.float32)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Multi-class weighted loss
        loss_fct = torch.nn.CrossEntropyLoss(
            weight=self.class_weights.to(logits.device)
        )
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

Class distribution:
Counter({'VeronicaFranco': 172, 'Petrarca': 168, 'OrsattoGiustinian': 125, 'PietroBembo': 63, 'DomenicoVenier': 61, 'MuzioManfredi': 50, 'CelioMagno': 43, 'AntonGiacomoCorso': 43, 'ValerioSali': 39, 'MaffioVenier': 32, 'MarcoStecchini': 22, 'GiorgioGradenigo': 8, 'MarcoVenier': 8, 'BartolomeoZacco': 3})
Class weights: {np.str_('AntonGiacomoCorso'): np.float64(1.3903654485049834), np.str_('BartolomeoZacco'): np.float64(19.928571428571427), np.str_('CelioMagno'): np.float64(1.3903654485049834), np.str_('DomenicoVenier'): np.float64(0.9800936768149883), np.str_('GiorgioGradenigo'): np.float64(7.473214285714286), np.str_('MaffioVenier'): np.float64(1.8683035714285714), np.str_('MarcoStecchini'): np.float64(2.7175324675324677), np.str_('MarcoVenier'): np.float64(7.473214285714286), np.str_('MuzioManfredi'): np.float64(1.1957142857142857), np.str_('OrsattoGiustinian'): np.float64(0.47828571428571426), np.str_('Petrarca'): np.float64(0.35586734693877553), np.str_('PietroBe

In [14]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,              # Increased from 3
    per_device_train_batch_size=16,
    per_device_eval_batch_size=20,
    learning_rate=2e-5,              # Lowered from 5e-5
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='steps',
    eval_steps=100,                  # Added
    save_strategy='steps',           # Added
    save_steps=100,                  # Added
    load_best_model_at_end=True,     # Added
    metric_for_best_model='eval_f1_macro', # Added
    greater_is_better=True,          # Added
)

In [15]:
# load pre-trained model
model = BertForSequenceClassification.from_pretrained(model,  num_labels=len(id2label))#.to(device_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/incerto-autore/contbertoldo-all/checkpoint and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
print(model.config.num_labels)

14


In [17]:
def compute_multiclass_metrics(eval_pred):
    from sklearn.metrics import classification_report, f1_score
    import numpy as np

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Get per-class and overall metrics
    report = classification_report(labels, predictions, output_dict=True, zero_division=0)

    # Extract metrics
    metrics = {
        'accuracy': report['accuracy'],
        'f1_macro': f1_score(labels, predictions, average='macro', zero_division=0),
        'f1_micro': f1_score(labels, predictions, average='micro', zero_division=0),
        'f1_weighted': f1_score(labels, predictions, average='weighted', zero_division=0),
    }

    # Add per-class F1 scores
    for class_id in np.unique(labels):
        if str(class_id) in report:
            metrics[f'f1_class_{class_id}'] = report[str(class_id)]['f1-score']

    return metrics

## Classification task setup

In [18]:
# Pass training/testing sentences to tokenizer, truncate them if over max length, and add padding (PAD tokens up to 512)
train_encodings = tokenizer(X_train,  truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(X_test,  truncation=True, padding=True, max_length=max_length)

# Encoding labels as integer numbers
train_labels_encoded = [label2id[y] for y in y_train]
test_labels_encoded  = [label2id[y] for y in y_test]
print(len(set(train_labels_encoded)),len(set(test_labels_encoded)))

14 14


In [19]:
# Combine encoded text and labels into a torch dataset object.
train_dataset = SCDataset(train_encodings, train_labels_encoded)
test_dataset = SCDataset(test_encodings, test_labels_encoded)

In [20]:
# Create the trainer object based on what we've set up prior to this point! This combines our model, training_args, train_dataset and test_dataset, and custom evaluation function compute_metrics.
trainer = MultiClassWeightedTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,            # evaluation dataset
    compute_metrics=compute_multiclass_metrics      # custom evaluation function
)

In [21]:
# Fine-tune the model on our dataset/labels. The trainer object will periodically output the state of the model.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mfb265[0m ([33mfedericabologna[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro,F1 Weighted,F1 Class 0,F1 Class 1,F1 Class 2,F1 Class 3,F1 Class 4,F1 Class 5,F1 Class 6,F1 Class 7,F1 Class 8,F1 Class 9,F1 Class 10,F1 Class 11,F1 Class 12,F1 Class 13
100,2.438,2.505495,0.311828,0.180104,0.311828,0.280875,0.0,0.0,0.068966,0.08,0.0,0.604651,0.421053,0.0,0.181818,0.0,0.508475,0.0,0.16092,0.495575
200,1.8695,2.113059,0.383513,0.314774,0.383513,0.37824,0.416667,0.0,0.142857,0.333333,0.0,0.557377,0.727273,0.285714,0.451613,0.227273,0.405405,0.066667,0.238806,0.553846


TrainOutput(global_step=265, training_loss=2.2040239190155604, metrics={'train_runtime': 154.4864, 'train_samples_per_second': 27.09, 'train_steps_per_second': 1.715, 'total_flos': 335533576463280.0, 'train_loss': 2.2040239190155604, 'epoch': 5.0})

In [22]:
# built in evaluation function
trainer.evaluate()

{'eval_loss': 2.113058567047119,
 'eval_accuracy': 0.3835125448028674,
 'eval_f1_macro': 0.3147736451136069,
 'eval_f1_micro': 0.3835125448028674,
 'eval_f1_weighted': 0.37824016565124063,
 'eval_f1_class_0': 0.4166666666666667,
 'eval_f1_class_1': 0.0,
 'eval_f1_class_2': 0.14285714285714285,
 'eval_f1_class_3': 0.3333333333333333,
 'eval_f1_class_4': 0.0,
 'eval_f1_class_5': 0.5573770491803278,
 'eval_f1_class_6': 0.7272727272727273,
 'eval_f1_class_7': 0.2857142857142857,
 'eval_f1_class_8': 0.45161290322580644,
 'eval_f1_class_9': 0.22727272727272727,
 'eval_f1_class_10': 0.40540540540540543,
 'eval_f1_class_11': 0.06666666666666667,
 'eval_f1_class_12': 0.23880597014925373,
 'eval_f1_class_13': 0.5538461538461539,
 'eval_runtime': 2.3961,
 'eval_samples_per_second': 116.437,
 'eval_steps_per_second': 5.843,
 'epoch': 5.0}

In [23]:
#save model
model.save_pretrained(finetuned_path)

In [24]:
print(Counter(y_test))

Counter({'VeronicaFranco': 61, 'Petrarca': 54, 'OrsattoGiustinian': 27, 'DomenicoVenier': 24, 'MuzioManfredi': 20, 'PietroBembo': 19, 'MaffioVenier': 17, 'AntonGiacomoCorso': 15, 'CelioMagno': 14, 'ValerioSali': 11, 'GiorgioGradenigo': 5, 'MarcoVenier': 5, 'MarcoStecchini': 5, 'BartolomeoZacco': 2})


In [25]:
predicted_labels = trainer.predict(test_dataset)
actual_predicted_labels = predicted_labels.predictions.argmax(-1)

In [26]:
class_report = classification_report(predicted_labels.label_ids.flatten(), actual_predicted_labels.flatten(), output_dict=True)
print(classification_report(predicted_labels.label_ids.flatten(), actual_predicted_labels.flatten()))

              precision    recall  f1-score   support

           0       0.56      0.33      0.42        15
           1       0.00      0.00      0.00         2
           2       0.11      0.21      0.14        14
           3       0.83      0.21      0.33        24
           4       0.00      0.00      0.00         5
           5       0.39      1.00      0.56        17
           6       0.67      0.80      0.73         5
           7       0.50      0.20      0.29         5
           8       0.64      0.35      0.45        20
           9       0.29      0.19      0.23        27
          10       0.75      0.28      0.41        54
          11       0.09      0.05      0.07        19
          12       0.14      0.73      0.24        11
          13       0.52      0.59      0.55        61

    accuracy                           0.38       279
   macro avg       0.39      0.35      0.31       279
weighted avg       0.50      0.38      0.38       279



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
# New + simple save of classification report
class_report_df = pd.DataFrame(class_report).transpose()
class_report_df.to_csv(os.path.join(finetuned_path, 'classification_report.csv'))