<a href="https://colab.research.google.com/github/cheatham1/EU-JAV/blob/main/Model_Selection_EU_JAV_Finetuning_and_Evaluating_a_BERT_model_for_classification_using_PyTorch_and_Huggingface_Trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## EU-JAV - Finetuning and Evaluating a BERT model for Classification



In this notebook we will finetune BERT base models. 


After training, we will save the model, evaluate it and use it for predictions.


Thanks to Per from the National Library of Norway

In [None]:
# Runtime > Change runtime type menu - GPU

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
!pip install --upgrade transformers
!pip install sentencepiece

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from transformers import EarlyStoppingCallback
#from transformers import BertForPreTraining
#from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
#from transformers import CamembertModel, CamembertTokenizer
#from transformers import T5Tokenizer, T5ForConditionalGeneration

from transformers import RobertaTokenizer, XLMRobertaXLForSequenceClassification

#from transformers import ElectraForSequenceClassification
#from transformers import XLMTokenizer, XLMWithLMHeadModel
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification


In [None]:
#@markdown Set the main model that the training should start from
#
#model_name = 'bert-base-multilingual-uncased' #@param ["NbAiLab/nb-bert-base", "NbAiLab/nb-bert-large", "bert-base-multilingual-cased", "bert-base-multilingual-uncased"]
#model_name = 'dbmdz/bert-base-italian-xxl-uncased' #@param ["dbmdz/bert-base-italian-xxl-uncased", "dbmdz/bert-base-italian-xxl-cased","m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0","bert-base-multilingual-cased","bert-base-multilingual-uncased"]
#model_name = 'digitalepidemiologylab/covid-twitter-bert-v2'
#model_name = 'dbmdz/bert-base-italian-xxl-uncased' #@param ["dbmdz/bert-base-italian-xxl-uncased","m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0","bert-base-uncased", "bert-base-multilingual-cased"]
#model_name = 'xlm-roberta-base' #@param['roberta-base','xlm-roberta-base','roberta-large-mnli','cardiffnlp/twitter-roberta-base', 'dbmdz/bert-base-italian-uncased']


#model_name = "google/electra-small-discriminator"
#model_name = "dbmdz/electra-base-italian-xxl-cased-discriminator"
# Hub checkpoint that fine-tuning starts from; it is loaded below with the
# XLM-RoBERTa tokenizer and sequence-classification classes.
model_name = "xlm-roberta-large"

#@markdown ---
#@markdown Set training parameters
# batch_size is passed to the Trainer as both the train and eval per-device batch size.
batch_size =  16 #16@param {type: "integer"} 
learning_rate = 2e-5 #@param {type: "number"}
# Fraction of the total planned training steps used as learning-rate warmup
# (converted to an absolute step count in the preprocessing cell).
warmup_proportion = 0.15 #@param {type: "number"}

num_epochs = 13 #@param {type: "integer"} # 13
# Maximum number of tokens per example; longer texts are truncated by the tokenizer calls.
max_seq_length = 96 #256 , 128, 98, 128@param {type: "integer"}
weight_decay = 0.01 #@param {type: "number"} # 0.01


#tokenizer = BertTokenizer.from_pretrained(model_name)
#model = BertModel.from_pretrained(model_name, num_labels=3)

#tokenizer = AutoTokenizer.from_pretrained(model_name)
#model = AutoModel.from_pretrained(model_name)

# xlm-roberta-large and base **
# The classification head is created with 3 output labels, matching the
# three-category datasets loaded later in the notebook.
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)


In [None]:
addData = False  # False = use just dataset 1 True = use dataset1+2
#useRemovedDifficultTrainingData = False # datset saved after initial run to remove difficult tweets

In [None]:
vocab = tokenizer.convert_ids_to_tokens(range(tokenizer.vocab_size))
print(len(vocab))

In [None]:
#words = ["maschera", "mask", "covid", "coronavirus", "virus", "isolation", "confinement", "vaccination", "vaccine"]
#for word in words:
#  print(word, (word in vocab))

In [None]:
#print(len(tokenizer.vocab))

## Load and Prepare the Dataset used for Finetuning
The selected dataset is loaded directly from a web resource. It is coded with labels and text in a comma-separated file. You can replace this with any other data source. This data is here converted into the pytorch data format. 

In [None]:
# ----- Data set 1: already split into train, dev, test -----#

# The three splits share one column layout and one URL pattern, so they are
# read in a single pass; column names are supplied explicitly via `names`.
dataset1_columns = ['Annotator1','Annotator2','Annotator3','label','text','index']
dataset1_url = 'https://raw.githubusercontent.com/cheatham1/EU-JAV-AB/main/3categories/datasetA_{}_3categories.csv'

train_data1, dev_data1, test_data1 = [
    pd.read_csv(dataset1_url.format(split), names=dataset1_columns)
    for split in ('train', 'dev', 'test')
]

print("Dataset1: ", train_data1.shape, dev_data1.shape, test_data1.shape)



In [None]:
# ----- Data set 2: already split into train, dev, test -----#

train_data2 = pd.read_csv(
    'https://raw.githubusercontent.com/cheatham1/EU-JAV-AB/main/3categories/datasetB_train_3categories.csv',
    names=['Annotator1','Annotator2','Annotator3','label','text','index']
    )

dev_data2 = pd.read_csv(
    'https://raw.githubusercontent.com/cheatham1/EU-JAV-AB/main/3categories/datasetB_dev_3categories.csv',
    names=['Annotator1','Annotator2','Annotator3','label','text','index']
    )

# The test split must come from its own file. Previously this read the *dev*
# CSV a second time, so every "test" number for dataset B was really a dev number.
# NOTE(review): assumes datasetB_test_3categories.csv exists next to the
# train/dev files, mirroring the dataset A naming - confirm in the repository.
test_data2 = pd.read_csv(
    'https://raw.githubusercontent.com/cheatham1/EU-JAV-AB/main/3categories/datasetB_test_3categories.csv',
    names=['Annotator1','Annotator2','Annotator3','label','text','index']
    )

In [None]:
# Build the working train/dev/test frames: dataset 1 alone, or dataset 1 + 2
# stacked row-wise when addData is set.
if addData:
  print("adding datasets")

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
  # and, like append, keeps each frame's original index values.
  train_data = pd.concat([train_data1, train_data2])
  dev_data  = pd.concat([dev_data1, dev_data2])
  test_data  = pd.concat([test_data1, test_data2])

else:
  # Work on copies so later in-place edits do not touch the raw frames.
  train_data= train_data1.copy()
  dev_data= dev_data1.copy()
  test_data= test_data1.copy()

In [None]:
train_data.groupby(['label']).count()

In [None]:
#!pip install emoji
#import emoji
#train_data['text'] = train_data['text'].apply(lambda x: emoji.demojize(x))
#dev_data['text']   = dev_data['text'].apply(lambda x: emoji.demojize(x))
#test_data['text']  = test_data['text'].apply(lambda x: emoji.demojize(x))

In [None]:
train_data.groupby(['label']).count()

In [None]:
dev_data.groupby(['label']).count()

In [None]:
print(f'The dataset is imported.\n\nThe training dataset has {len(train_data)} items.\nThe development dataset has {len(dev_data)} items. \nThe test dataset has {len(test_data)} items')


In [None]:
# Cast every split's label column to float first and then to pandas'
# nullable Int64 dtype.
for _split_frame in (train_data, dev_data, test_data):
    _split_frame["label"] = _split_frame["label"].astype('float').astype('Int64')

In [None]:
train_data.head()

In [None]:
#train_data['text'] = train_data.text.str.replace('\n',' ')
#dev_data['text'] = dev_data.text.str.replace('\n',' ')
#test_data['text'] = test_data.text.str.replace('\n',' ')

In [None]:
# Shuffle the rows and renumber the index from 0. A fixed random_state keeps
# the shuffled order (and therefore the row numbers printed and saved later
# in the notebook) identical between runs.
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
dev_data = dev_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# ----- Preprocess data -----#
# Preprocess data
# Pull texts and labels out of the shuffled frames as plain Python lists.
X_train = list(train_data["text"])
y_train = list(train_data["label"])
X_dev = list(dev_data["text"])
y_dev = list(dev_data["label"])
#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
# Tokenize each split in one call, padding and truncating to at most max_seq_length tokens.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=max_seq_length)
X_dev_tokenized = tokenizer(X_dev, padding=True, truncation=True, max_length=max_seq_length)

# Create torch dataset


class Dataset(torch.utils.data.Dataset):
    """Torch dataset wrapping tokenizer encodings and, optionally, labels.

    Args:
        encodings: mapping from feature name (must include "input_ids") to a
            per-example sequence of values, as returned by the tokenizer.
        labels: optional per-example labels. When omitted (or empty) the
            items carry no "labels" entry, which is how the prediction-only
            datasets in this notebook are built.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors (plus "labels" if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of truthiness: `if self.labels:`
        # raises for array-like labels (NumPy arrays, pandas Series).
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" feature."""
        return len(self.encodings["input_ids"])

# Labelled datasets handed to the Trainer.
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_dev_tokenized, y_dev)

print(f'The dataset is imported.\n\nThe training dataset has {len(train_data)} items.\nThe development dataset has {len(dev_data)} items. \nThe test dataset has {len(test_data)} items')
# Batches per epoch. Ceiling division counts the final partial batch, which
# round() dropped whenever the remainder was at most half a batch.
# Assumes one device and no gradient accumulation.
steps = -(-len(train_data) // batch_size)

# Warmup length as an absolute step count over the whole planned run.
num_warmup_steps = round(steps*warmup_proportion*num_epochs)
print(f'You are planning to train for a total of {steps} steps * {num_epochs} epochs = {num_epochs*steps} steps. Warmup is {num_warmup_steps} steps or {round(100*num_warmup_steps/(steps*num_epochs))}%. We recommend at least 10%.')


In [None]:
totalsize = len(train_data) + len(dev_data) + len(test_data)
print(f'The dataset total size {totalsize}')

In [None]:
#X_train[0]

# Start Training
We are here using the HuggingFace Trainer interface. An alternative implementation could be to use Tensorflow/Keras or native PyTorch.

Please note that training the large BERT model on a GPU might be a challenge. The two critical parameters are batch_size and sequence_length. Reduce these until you are no longer getting out-of-memory (OOM) errors. The political speeches corpus above has very long sequences. You might want to truncate them at 128 tokens. This makes the task harder since the model is allowed to see less of each sequence. Reducing batch_size below 8 might lead to instability and very long training times.

In [None]:
#from transformers.optimization import Adafactor, AdafactorSchedule

#optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
#lr_scheduler = AdafactorSchedule(optimizer)

In [None]:
#def forward(self, input_ids, attention_mask=None, token_type_ids=None,
#            position_ids=None, head_mask=None, labels=None):

In [None]:
# ----- Fine-tune pretrained model -----#
# Define Trainer parameters
def compute_metrics(p):
    """Turn a (predictions, labels) pair into evaluation metrics.

    The predictions are reduced to class ids with an argmax over axis 1.
    Returns a dict with accuracy and the weighted-average precision, recall
    and F1 under the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=1)

    # Shared keyword arguments for the three weighted-average metrics.
    weighted = {"y_true": gold, "y_pred": predicted, "average": 'weighted'}

    return {
        "accuracy": accuracy_score(y_true=gold, y_pred=predicted),
        "precision": precision_score(**weighted),
        "recall": recall_score(**weighted),
        "f1": f1_score(**weighted),
    }

# Define Trainer
# Training configuration: evaluate and checkpoint once per epoch, keep the
# checkpoint with the best "accuracy" (as reported by compute_metrics) and
# reload it when training ends.
# NOTE(review): newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`; the install cell upgrades transformers without a pin, so
# confirm this keyword is still accepted by the installed version.
args = TrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    #evaluation_strategy="steps",
    #eval_steps=round(steps/2),
    #logging_steps=round(steps/10),
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate, #The default here is linear decay to 0. 
    warmup_steps=num_warmup_steps,
    num_train_epochs=num_epochs,
    weight_decay = weight_decay,
    #save_steps=steps, #Only saves at the end
    #seed= 3, #3,
    metric_for_best_model= "accuracy",
    load_best_model_at_end=True,
    #push_to_hub=True,
    #push_to_hub_model_id=f"{model_name}-finetuned-EUJAV",
)
# The Trainer trains on train_dataset and evaluates on val_dataset (the dev split).
# Early stopping is configured with a patience of 3; presumably counted in
# evaluation rounds, i.e. epochs here - confirm against the callback docs.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    #optimizers=(optimizer, lr_scheduler),  ##### do we want this?
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    #callbacks = [tboard_callback]
)



# Train pre-trained model
trainer.train()

In [None]:
from google.colab import files
#files.download('output/checkpoint-66/trainer_state.json')

In [None]:
#print(model)

In [None]:
trainer.evaluate()

# Run Predictions and print Evaluation Report
The code below first runs predictions on the validation (dev) dataset and then on the train dataset. After each run it prints an evaluation report using a tool from sklearn.

Typically there are two numbers you want from this: the accuracy score (the first number on the "accuracy" line) and the F1-macro score. Most journals want you to report the F1-macro score since this is a sequence classification task. This is the number beneath accuracy (at the intersection of the f1-score column and the macro avg row).

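One of the tasks above is a balanced dataset. Both are binary classification. In a balanced binary classification the F1-macro and the average are basically the same (rounding differences only). In the unbalanced set, these values will vary greatly. The F1-macro is typically a much better measurement of how well your network is doing. (Note: this paragraph was written for binary tasks; the model in this notebook is trained with three labels, but the same reasoning about balanced versus unbalanced data applies.)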

We repeat the same report for eval and test as well.

In [None]:
#Print report
from sklearn.metrics import classification_report

# Predictions use label-free datasets. They get their own names so the
# labelled `train_dataset` that the Trainer cell depends on is not overwritten
# (re-running the Trainer cell afterwards would otherwise train without labels).

print("\nValidation-set Evaluation")
dev_dataset = Dataset(X_dev_tokenized)
dev_pred, _, _ = trainer.predict(dev_dataset)
y_pred_bool_dev = np.argmax(dev_pred, axis=1)
#print(classification_report(dev_data["label"], y_pred_bool, digits=3))
print(classification_report(dev_data["label"].to_numpy().astype("int"), y_pred_bool_dev, digits=3))


print("\nTrain-set Evaluation")
train_dataset_unlabeled = Dataset(X_train_tokenized)
train_pred, _, _ = trainer.predict(train_dataset_unlabeled)
y_pred_bool_train = np.argmax(train_pred, axis=1)
#print(classification_report(train_data["label"], y_pred_bool, digits=3))
print(classification_report(train_data["label"].to_numpy().astype("int"), y_pred_bool_train, digits=3))


In [None]:
df_label_prediction = train_data.copy()
df_label_prediction['prediction'] = y_pred_bool_train.tolist()

In [None]:
df_label_prediction.head()

In [None]:
# Number wrongly labelled 
indexNames = df_label_prediction[ df_label_prediction['label'] != df_label_prediction['prediction'] ].index
print(len(indexNames))

In [None]:
for i in indexNames:
    print(i, df_label_prediction.text.iloc[i], " : ",df_label_prediction.label.iloc[i], " : ",df_label_prediction.prediction.iloc[i],)

In [None]:
train_data.shape

In [None]:
from google.colab import files

df_label_prediction.to_csv('trainingDataLabelsPredictions.csv') 
#files.download('trainingDataLabelsPredictions.csv')

In [None]:
# Remove difficult to label data from training dataset
#indexNames = df_label_prediction[ df_label_prediction['label'] != df_label_prediction['prediction'] ].index

# Remove easy to label data from training dataset
#indexNames = df_label_prediction[ df_label_prediction['label'] == df_label_prediction['prediction'] ].index

#df_label_prediction.drop(indexNames , inplace=True)
#size=df_label_prediction.shape[0]
#orig_size = train_data.shape[0]
#print(size, (orig_size-size)/orig_size)

In [None]:
from sklearn.metrics import f1_score
f1  = f1_score(y_dev, y_pred_bool_dev, average='macro')
f1

In [None]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_dev, y_pred_bool_dev)
acc

In [None]:
def evaluateModelwDataset(dataset):
  """Evaluate the trained model on a labelled DataFrame.

  Args:
      dataset: DataFrame with a "text" column (model inputs) and a "label"
          column (gold labels, convertible to int).

  Prints the sklearn classification report followed by macro F1 and accuracy.

  Returns:
      A two-element list: [macro F1, accuracy].
  """
  # Tokenize the texts exactly as for training, then predict without labels.
  encodings = tokenizer(list(dataset["text"]), padding=True, truncation=True, max_length=max_seq_length)
  scores, _, _ = trainer.predict(Dataset(encodings))
  predicted = np.argmax(scores, axis=1)
  gold = dataset["label"].to_numpy().astype("int")

  print(classification_report(gold, predicted, digits=3))

  macro_f1 = f1_score(gold, predicted, average='macro')
  accuracy = accuracy_score(gold, predicted)

  print("f1: ",macro_f1, "  accuracy: ",accuracy)

  return [macro_f1, accuracy]


In [None]:
# Eval with dataset A. 
results_dataset1 = evaluateModelwDataset(dev_data1)

In [None]:
# Eval with dataset B. 
results_dataset2 = evaluateModelwDataset(dev_data2)


In [None]:
## Validate with dataset A+B
# Stack both dev splits (pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0), then shuffle and renumber the rows.
dev_data3 = pd.concat([dev_data1, dev_data2])
dev_data3 = dev_data3.sample(frac=1).reset_index(drop=True)

results_dataset3 =evaluateModelwDataset(dev_data3)


In [None]:
print("Results from validation dataset")

# Each result is [macro F1, accuracy]; print one section per metric.
_dev_results = (
    ("DatasetA", results_dataset1),
    ("DatasetB", results_dataset2),
    ("DatasetA+B", results_dataset3),
)
for _heading, _position in (("F1", 0), ("Accuracy", 1)):
    print(_heading)
    for _dataset_name, _scores in _dev_results:
        print("{}: {:.3f}".format(_dataset_name, _scores[_position]))



In [None]:
# Eval with test data
results_test_dataset1 = evaluateModelwDataset(test_data1)
results_test_dataset2 = evaluateModelwDataset(test_data2)

# Combined A+B test set: stack both splits (pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0), then shuffle and renumber.
test_data3 = pd.concat([test_data1, test_data2])
test_data3 = test_data3.sample(frac=1).reset_index(drop=True)
results_test_dataset3 = evaluateModelwDataset(test_data3)




In [None]:
print("Results from TEST dataset")

# Each result is [macro F1, accuracy]; print one section per metric.
_test_results = (
    ("DatasetA", results_test_dataset1),
    ("DatasetB", results_test_dataset2),
    ("DatasetA+B", results_test_dataset3),
)
for _heading, _position in (("F1", 0), ("Accuracy", 1)):
    print(_heading)
    for _dataset_name, _scores in _test_results:
        print("{}: {:.3f}".format(_dataset_name, _scores[_position]))

In [None]:
from matplotlib import rcParams
rcParams['figure.figsize'] = 8, 8
font_size = 22

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

import itertools
class_names= ["Promotional","Neutral","Discouraging"]

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Args:
        cm: 2-D array laid out as sklearn's confusion_matrix returns it:
            rows are true classes, columns are predicted classes.
        classes: tick labels, one per class, in matrix order.
        normalize: if True, divide each row by its total so every row shows
            the distribution of predictions for one true class.
        title: title of the plot.
        cmap: matplotlib colormap used for the heat map.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows with no samples stay all-zero instead of becoming NaN, which
        # would also break the colour threshold computed below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    print(cm)
    print("\n")

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar(shrink=0.7)
    tick_marks = np.arange(len(classes))
    #plt.xticks(tick_marks, classes, rotation=45)
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so it stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black",
                 fontsize=font_size)

    # imshow puts matrix rows on the y axis and columns on the x axis, so the
    # true classes (rows) label y and the predicted classes (columns) label x.
    plt.ylabel('True label',fontsize=font_size)
    plt.xlabel('Predicted label',fontsize=font_size)
    # Lay out after the axis labels exist so they are included in the fit.
    plt.tight_layout()

In [None]:
# Confusion matrix for the dataset A test split.
# Tokenize the test texts the same way as the training data.
X_test = list(test_data1["text"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=max_seq_length)

# Predict without labels and take the arg-max class per example.
test_dataset = Dataset(X_test_tokenized)
test_pred, _, _ = trainer.predict(test_dataset)
y_pred = np.argmax(test_pred, axis=1)
y_test = test_data1["label"].to_numpy().astype("int")

# True labels are passed first, predictions second.
cnf_matrix = confusion_matrix(y_test, y_pred)

plot_confusion_matrix(cnf_matrix, classes= class_names, normalize=True)
# Replaces the default title set inside plot_confusion_matrix.
plt.title('Test datasetA',fontsize=font_size)

In [None]:
# Confusion matrix for the dataset B test split.
# Tokenize the test texts the same way as the training data.
X_test = list(test_data2["text"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=max_seq_length)

# Predict without labels and take the arg-max class per example.
test_dataset = Dataset(X_test_tokenized)
test_pred, _, _ = trainer.predict(test_dataset)
y_pred = np.argmax(test_pred, axis=1)
y_test = test_data2["label"].to_numpy().astype("int")

# True labels are passed first, predictions second.
cnf_matrix = confusion_matrix(y_test, y_pred)

plot_confusion_matrix(cnf_matrix, classes= class_names, normalize=True)
# Replaces the default title set inside plot_confusion_matrix.
plt.title('Test datasetB',fontsize=font_size)


In [None]:
# Confusion matrix for the combined A+B test set (test_data3 is built in the
# "Eval with test data" cell, which must have been run first).
X_test = list(test_data3["text"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=max_seq_length)

# Predict without labels and take the arg-max class per example.
test_dataset = Dataset(X_test_tokenized)
test_pred, _, _ = trainer.predict(test_dataset)
y_pred = np.argmax(test_pred, axis=1)
y_test = test_data3["label"].to_numpy().astype("int")

# True labels are passed first, predictions second.
cnf_matrix = confusion_matrix(y_test, y_pred)

plot_confusion_matrix(cnf_matrix, classes= class_names, normalize=True)
# Replaces the default title set inside plot_confusion_matrix.
plt.title('Test datasetA+B',fontsize=font_size)

In [None]:
# Deliberate stop: halts "Run all" before the save/publish cells below.
# An explicit exception states the intent instead of relying on a syntax error.
raise RuntimeError("Stop here: run the save/publish cells below manually.")

# **Save the model**

In [None]:
model_name_to_save = "xlm-roberta-large-finetuned-dAB-002"

In [None]:
### Install git lfs
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash 
!sudo apt-get install git-lfs
!git lfs install

In [None]:
# save on huggingface

In [None]:
!git config --global user.email "Susancheatham1@gmail.com"

In [None]:
!huggingface-cli login

In [None]:
!git config --global credential.helper store

In [None]:
dir_to_save = './' + model_name_to_save + '/'
dir_to_save

In [None]:
# Save locally first
model.save_pretrained(dir_to_save)
tokenizer.save_pretrained(dir_to_save)
trainer.save_model(dir_to_save)

In [None]:
#!ls 

In [None]:
    # Files to expect...
    # a config.json file, which saves the configuration of your model ;
    # a pytorch_model.bin file, which is the PyTorch checkpoint (unless you can’t have it for some reason) ;
    # a tf_model.h5 file, which is the TensorFlow checkpoint (unless you can’t have it for some reason) ;
    # a special_tokens_map.json, which is part of your tokenizer save;
    # a tokenizer_config.json, which is part of your tokenizer save;
    # files named vocab.json, vocab.txt, merges.txt, or similar, which contain the vocabulary of your tokenizer, part of your tokenizer save;
    # maybe a added_tokens.json, which is part of your tokenizer save.


In [None]:
!huggingface-cli repo create model_name_to_save --yes


In [None]:
# Web address of the model page, printed for reference only.
hface_dir_to_save = 'https://huggingface.co/Cheatham/'+ model_name_to_save + '/'
print(hface_dir_to_save)

# push_to_hub takes a repository id of the form "<namespace>/<repo name>",
# not the web URL of the repository.
hub_repo_id = 'Cheatham/' + model_name_to_save
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)


In [None]:
#####import os
#### Mount Google Drive to this Notebook instance.

from google.colab import drive
drive.mount('/content/drive')

In [None]:
ls drive/MyDrive/EU-JAV/Models

In [None]:
cd MyDrive/EU-JAV/Models

In [None]:
trainer.save_model(model_name_to_save)

In [None]:
#tokenizer.save_pretrained("EU-JAV-finetuned-xlmroberta-tokenizer")


In [None]:
#model.save_pretrained("EU-JAV-finetuned-xlmroberta-model")

In [None]:
###drive.flush_and_unmount()

In [None]:
#ls EU-JAV-models/EUJAV-tokenizer

In [None]:
#ls EU-JAV-models/EUJAV-finetuned-roberta-model_uncased