Code references:  
[1] Joshi, P. (2020). Transfer Learning for NLP: Fine-Tuning BERT for Text Classification. https://www.analyticsvidhya.com/blog/2020/07/transfer-learning-for-nlp-fine-tuning-bert-for-text-classification/  
[2] Tran, C. (2021). Tutorial: Fine tuning BERT for Sentiment Analysis. https://skimai.com/fine-tuning-bert-for-sentiment-analysis/

In [1]:
!pip install transformers

## Import libraries

In [1]:
import numpy as np
import torch
import pickle
import time
from hyperopt import tpe, fmin, STATUS_OK, Trials
from hyperopt.pyll.base import scope
from sklearn.metrics import accuracy_score
import fine_tuning_functions
import model_tokenizer_loaders

# Prefer the GPU, but fall back to the CPU when CUDA is not available so
# that tensors moved with .to(device) do not fail on a machine without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load data

In [2]:
# Deserialize the pickled dataset from the working directory.
# NOTE: pickle executes arbitrary code on load; only open trusted files.
with open("data_nl_english_french_23300", mode="rb") as data_file:
    data_english_french_nl = pickle.load(data_file)

## Define model to fine-tune

In [3]:
# Name of the model to fine-tune; it is passed to
# fine_tuning_functions.split_dataset and
# model_tokenizer_loaders.models_hyperparameters in the cells below.
model_name_ = "RobBERT"

## Split dataset into train, validation and test sets

In [4]:
# Split the loaded data into train / validation / test texts and labels.
# The resulting names are read as globals by objective_function.
(
    train_text,
    train_labels,
    val_text,
    val_labels,
    test_text,
    test_labels,
) = fine_tuning_functions.split_dataset(data_english_french_nl, model_name_)

## Load the hyperparameter search space for the chosen model

In [5]:
# Fetch the search space, the number of TPE trials and the model name that
# are used by objective_function and by the fmin call.
(
    space,
    nb_trials,
    model_name,
) = model_tokenizer_loaders.models_hyperparameters(model_name_)

## Define the TPE objective function

In [6]:
def _batched_accuracy(model, seqs, masks, labels, model_name, batch_size=500):
    """ Compute the accuracy of the model on one set of test tensors.

    The model is too big to predict on the whole test set at once, so the
    rows are scored batch_size at a time. Correct predictions are counted
    over all batches and divided by the total number of rows, so a shorter
    last batch is weighted by its real size instead of counting as much as
    a full batch.

    Input:
    - model: fine-tuned model, called as model(seq, mask, model_name)
    - seqs, masks: test tensors returned by to_tensor_test (presumably token
      ids and attention masks -- confirm in fine_tuning_functions)
    - labels: true labels, aligned row by row with seqs
    - model_name: forwarded unchanged as third argument of the model call
    - batch_size: number of rows scored per forward pass

    Raises ZeroDivisionError if seqs is empty.
    """
    correct = 0
    for start in range(0, len(seqs), batch_size):
        stop = start + batch_size
        with torch.no_grad():
            scores = model(seqs[start:stop].to(device), masks[start:stop].to(device), model_name)
        # Keep the label number with the highest score
        preds = np.argmax(scores.detach().cpu().numpy(), axis=1)
        # normalize=False returns the number of correct predictions
        correct += accuracy_score(labels[start:stop], preds, normalize=False)
    return correct / len(seqs)


def objective_function(params):
    """ Define objective function for TPE algorithm.
    The input has to be only the model parameters to be able to work with
    the TPE algorithm.
    train_text, val_text, test_text, train_labels, val_labels, test_labels,
    model_name and device have to be defined beforehand (they are read as
    globals).

    Fine-tunes the model with the given hyperparameters, reloads the weights
    saved in folder_name and measures the accuracy on the test set. For a
    multilingual model (model_name containing "test" or "all") the accuracy
    is the mean of the Dutch, English and French accuracies.

    Input:
    - params: parameters of BERT model and architecture

    Returns a dict {"loss": -accuracy on test set, "status": STATUS_OK},
    which is the format minimised by fmin.
    """

    # Get the time when the model starts running
    start_time = time.time()

    # Print selected hyperparameters by TPE
    print(params)

    # Get the hyperparameters
    # NOTE(review): this relies on params iterating in the order
    # dropout, epochs, folder_name, lr (presumably the key order of the
    # search space) -- verify against models_hyperparameters.
    dropout, epochs, folder_name, lr = params.values()

    # Fine-tunes the model with the associated hyperparameters
    model = fine_tuning_functions.fine_tuning_model(epochs, lr, dropout, folder_name, train_text, val_text, test_text, train_labels, val_labels, model_name)

    # The objective function returns -accuracy on test set
    # To be able to test, we need the test tensors:
    tokenizer = model_tokenizer_loaders.load_tokenizer(model_name)
    is_multilingual = "test" in model_name or "all" in model_name

    if is_multilingual:
        test_seq_nl, test_mask_nl, test_seq_en, test_mask_en, test_seq_fr, test_mask_fr, test_y = fine_tuning_functions.to_tensor_test(tokenizer, test_text, test_labels)
    else:
        test_seq, test_mask, test_y = fine_tuning_functions.to_tensor_test(tokenizer, test_text, test_labels)

    # Load best model just fine-tuned with the previous hyperparameters
    path = folder_name + '/saved_weights_lr{:}_dropout{:}_epochs{:}.pt'.format(lr, dropout, epochs)
    model.load_state_dict(torch.load(path))

    # Switch to evaluation mode so that dropout is disabled while predicting
    model.eval()

    # Compute the accuracy on test set
    # Distinguish two cases: unilingual and multilingual model
    if is_multilingual:
        # Each language is scored over its own tensors; the original loop
        # bound was len(test_seq), a name that does not exist in this branch.
        language_tensors = (
            (test_seq_nl, test_mask_nl),
            (test_seq_en, test_mask_en),
            (test_seq_fr, test_mask_fr),
        )
        acc_per_language = [
            _batched_accuracy(model, seqs, masks, test_y, model_name)
            for seqs, masks in language_tensors
        ]
        acc = sum(acc_per_language) / len(acc_per_language)
    else:
        acc = _batched_accuracy(model, test_seq, test_mask, test_y, model_name)

    print("Accuracy:")
    print(acc)
    print("With one combination of parameters:")
    print("--- %s seconds ---" % (time.time() - start_time))

    return {"loss": -acc, "status": STATUS_OK}

## Run TPE

In [None]:
# Record when the TPE search starts so the total duration can be reported
start_time = time.time()

# Trials object that collects the outcome of every evaluation
trials = Trials()

# Run the TPE search over the space for nb_trials evaluations of
# objective_function
best = fmin(
    objective_function,
    space=space,
    algo=tpe.suggest,
    max_evals=nb_trials,
    return_argmin=False,
    trials=trials,
)

print("Total time:")
print("--- %s seconds ---" % (time.time() - start_time))
print("Best: {}".format(best))