In [1]:
#from google.colab import drivedrive.mount('/content/drive')
#%cd /content/drive/MyDrive/tfm_code/03 Training
#%pwd
# Install `transformers` from master
#!pip install transformers==4.5.1
# !pip install git+https://github.com/huggingface/transformers
#!pip install torch
#!pip install sklearn
# transformers version at notebook update --- 2.11.0
# tokenizers version at notebook update --- 0.8.0rc1

## 1. Load the base models and the tokenizers


In [2]:
# Check that we have a GPU
!nvidia-smi

Thu May 13 09:52:59 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.03   Driver Version: 450.119.03   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla M60           Off  | 00000000:00:1E.0 Off |                    0 |
| N/A   34C    P0    38W / 150W |      0MiB /  7618MiB |     99%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

True

### Model checkpoint

In [4]:
# Select the model baseline to perform the transfer learning from
model_checkpoint = 'distilbert-base-uncased'
new_tokens = ['covid', 'covid-19', 'coronavirus', 'sars', 'sars-cov-2', 'pandemic', 'outbreak']

### Tokenizer

In [5]:
from transformers import DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)
tokenizer.add_tokens(new_tokens)
tokenizer.save_pretrained('../data/03_models/tokenizer')

('../data/03_models/tokenizer/tokenizer_config.json',
 '../data/03_models/tokenizer/special_tokens_map.json',
 '../data/03_models/tokenizer/vocab.txt',
 '../data/03_models/tokenizer/added_tokens.json')

### Model
Finally let's initialize our model. We are looking to train from a pretrained model

In [6]:
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model.resize_token_embeddings(len(tokenizer)) 

Embedding(30528, 768)

In [7]:
model.num_parameters()

66990144

## 2. Model training

### 2.1 Define a grid search function for the training

Finally, in order to perform the training, a grid search function that allows random search is created. The functions does the following steps:
* It finds all possible combinations of parameters among the parameters grid
* Then, if a maximum number of fits is provided, it selects n_combinations random parameter combinations from the total list
* It creates a directory to store the temporary trained models so they don't have to be loaded in memory
* It splits the data into train and validations sets
* Then, for every parameter combination in the list:
    * It creates a transformers.Trainer object with a transfomers.TrainingArguments, that is created from the parameter combination dictionary, the baseline model, the datacollator to created the training batches by masking random tokens in the training set
    * It trains the model
    * It evaluates the perplexity of the model on the validation set
    * It saves the score and the parameters used in a list
    * It writes the model to disk
* After all the models are trained, it finds the one with the lowest perplexity
* It moves the model to the output folder
* Finally, it returns all the models and perplexitys cores

In [8]:
from transformers import Trainer, TrainingArguments
import time
from itertools import product
from sklearn.model_selection import train_test_split
import math
import os
from random import sample
from shutil import rmtree, move
def grid_search_for_language_modeling(baseline_model=None, param_grid={}, n_combinations=None, X=None, data_collator=None, validation_size=0.15, random_state=42, model_name='model', out_dir='models/', tmp_dir='grid_search/'): 
    """
    Performs grid search over a grid of parameters for an ML model and another grid of parameters for a function applied to training data in order to augment it
    It uses a custom cross validation function that only applies the function to the training data and validates on clean data
    -------------------------------------------------------------------------------
    Parameters:
        - model: scikit-learn like model
        - model_grid: dictionary of parameters to perform gird search on the _model
        - X (dataframe): train data (np.array)
        - validation_size

    Returns:
        - best_model: a dictionaty that contains the results with the best model found by performing the grid search over the _model and _function
            + _best_model'
            + _best_model_params
            + _best_function_params
            + _best_score
    """
    # Get all combinations of parameters in grid
    keys, values = zip(*param_grid.items())
    param_combinations_list = [dict(zip(keys, v)) for v in product(*values)]
    
    for param_combination in param_combinations_list:
        param_combination['eval_steps'] = (len(train_dataset) // param_combination["per_device_train_batch_size"]) + 1

    # If a max number of combinations is provided then n_combinations random param combinations are selected from the list
    if n_combinations:
        param_combinations_list = sample(param_combinations_list, n_combinations)
    total_fits = len(param_combinations_list)
    model_params, scores = [], []

    # Create directory to save temporary models
    if os.path.isdir(tmp_dir):
        rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    # Divide the dataset for validation
    X_train, X_test = train_test_split(X, test_size=validation_size, random_state=random_state)

    # Start grid search
    start = time.time() # Get initial time of training
    print(f'- Starting grid search, totalling {total_fits} jobs -')
    for param_combination in param_combinations_list:

        # Instantiate a model with the given param combination in the iteration
        print(f'  - Training model {param_combination}')
        training_args = TrainingArguments(**param_combination) # Unpacking the param grid
        trainer = Trainer(
                            model=baseline_model,
                            args=training_args,
                            train_dataset=X_train,
                            eval_dataset=X_test,
                            data_collator=data_collator,
                        )
        # Train the model
        trainer.train()

        # Save the model
        trainer.save_model(f'{tmp_dir}/{len(os.listdir(tmp_dir))}')

        # Evaluate performance
        eval_results = trainer.evaluate()
        model_score = math.exp(eval_results['eval_loss'])

        model_params.append(param_combination)
        scores.append(model_score)

        elapsed_time = time.time() - start
        print(f'------ - Perplexity: {model_score:.2f} | Fitted {len(scores)} jobs out of {total_fits}. Elapsed {time.strftime("%H:%M:%S", time.gmtime(elapsed_time))} ------') 
            
    elapsed_time = time.time() - start
    print('--- Ending grid search, totalling {} jobs. Elapsed {} ---'.format(total_fits, time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))) 
    
    # Get index og the model with the best score
    best_index = scores.index(min(scores))
    
    # Move model to output_dir
    os.rename(f'{tmp_dir}/{best_index}', f'{tmp_dir}/{model_name}')

    # Remove the model in the output dir
    if os.path.isdir(f'{out_dir}/{model_name}'):
        rmtree(f'{out_dir}/{model_name}')
    
    # Move the contents of the new best model to the output dir
    move(f'{tmp_dir}/{model_name}', f'{out_dir}/{model_name}')
    # Remove working dir
    rmtree(tmp_dir)

    # Return the best params and score in a dict
    return model_params, scores

### 2.2 Model training on European COVID texts


#### Dataset build

We'll build our dataset by applying our tokenizer to our text file.

TextDataset: reads the full input text, tokenizes it and cuts it in block_sized chunks. Then adds special tokens (here just <s> or ["SEP"]/["CLS"])

LineByLineTextDataset: reads each line separately, tokenizes and truncates the lines to block_size. Adds special tokens.

use TextDataset because --line-by-line will throw away a lot of data if not used correctly.

In [9]:
%%time
from transformers import TextDataset

dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="../data/02_preprocessed/full_eu_text.txt",
    block_size=128,
)
print(len(dataset))



26388
CPU times: user 1min 10s, sys: 685 ms, total: 1min 10s
Wall time: 1min 10s


In [10]:
from sklearn.model_selection import train_test_split
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
print(type(train_dataset), type(test_dataset))
print(len(train_dataset), len(test_dataset))

<class 'list'> <class 'list'>
21110 5278


#### Data collator
[Data collators](https://huggingface.co/transformers/master/main_classes/data_collator.html) are objects that will form a batch by using a list of dataset elements as input. These elements are of the same type as the elements of train_dataset or eval_dataset. This is just a small helper that will help us batch different samples of the dataset together into an object that PyTorch knows how to perform backprop on.

[DataCollatorForLanguageModeling](https://huggingface.co/transformers/master/main_classes/data_collator.html#datacollatorforlanguagemodeling): Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they are not all of the same length.

For best performance, this data collator should be used with a dataset having items that are dictionaries or BatchEncoding, with the "special_tokens_mask" key, as returned by a PreTrainedTokenizer or a PreTrainedTokenizerFast with the argument return_special_tokens_mask=True.

In [11]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, #The tokenizer used for encoding the data.
    mlm=True, #Whether or not to use masked language modeling. The labels are -100 for non-masked tokens and the value to predict for the masked token.
    mlm_probability=0.15 #The probability with which to (randomly) mask tokens in the input, when mlm is set to True
)

#### Train the model using the gird search
First the parameters to perform the search over the training are defined

In [12]:
param_grid = dict(
            output_dir=["../data/03_models/trainer/"],
            overwrite_output_dir=[True],
            num_train_epochs=[3],
            learning_rate=[2e-5, 3e-5, 5e-5],
            weight_decay=[0.01, 0.005],
            per_device_train_batch_size=[16],
            per_device_eval_batch_size=[32],
            save_steps=[0],
            warmup_steps=[0],
            save_total_limit=[1],
            prediction_loss_only=[True],
            eval_accumulation_steps=[1]
            )

Then the model is trained

In [13]:
eu_models_params, eu_scores = grid_search_for_language_modeling(
                    baseline_model=model,
                    param_grid=param_grid,
                    n_combinations=None,
                    X=train_dataset,
                    data_collator=data_collator,
                    model_name='eu_bert_model',
                    out_dir='../data/03_models/',
                    tmp_dir='../data/03_models/tmp/'
                    )

import pickle
pickle.dump(eu_models_params, open('../data/03_models/eu_models_params.p', 'wb'))
pickle.dump(eu_scores, open('../data/03_models/eu_scores.p', 'wb'))

- Starting grid search, totalling 6 jobs -
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1320}


Step,Training Loss
500,2.3139
1000,2.125
1500,2.0443
2000,1.9903
2500,1.9452
3000,1.9135


------ - Perplexity: 6.09 | Fitted 1 jobs out of 6. Elapsed 00:22:18 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1320}


Step,Training Loss
500,1.6602
1000,1.6529
1500,1.6642
2000,1.6849
2500,1.7015
3000,1.7267


------ - Perplexity: 5.50 | Fitted 2 jobs out of 6. Elapsed 00:44:34 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 3e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1320}


Step,Training Loss
500,1.2932
1000,1.3472
1500,1.4039
2000,1.472
2500,1.5326
3000,1.5969


------ - Perplexity: 5.17 | Fitted 3 jobs out of 6. Elapsed 01:06:50 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 3e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1320}


Step,Training Loss
500,0.8414
1000,0.9431
1500,1.058
2000,1.1984
2500,1.3355
3000,1.4772


------ - Perplexity: 5.10 | Fitted 4 jobs out of 6. Elapsed 01:29:06 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 5e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1320}


Step,Training Loss
500,0.5212
1000,0.6656
1500,0.8184
2000,1.0115
2500,1.2014
3000,1.3972


------ - Perplexity: 5.09 | Fitted 5 jobs out of 6. Elapsed 01:51:21 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 5e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1320}


Step,Training Loss
500,0.2421
1000,0.343
1500,0.4708
2000,0.6814
2500,0.9452
3000,1.2584


------ - Perplexity: 5.26 | Fitted 6 jobs out of 6. Elapsed 02:13:37 ------
--- Ending grid search, totalling 6 jobs. Elapsed 02:13:37 ---


We evaluate the results obtained in training

In [14]:
print('- Training results: ')
for model_params, score in zip(eu_models_params, eu_scores):
    print(f'  - Model Perplexity: {score:.2f} | Params: {model_params}')

- Training results: 
  - Model Perplexity: 6.09 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1320}
  - Model Perplexity: 5.50 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1320}
  - Model Perplexity: 5.17 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 3e-05, 'weight_decay': 0.01, 'per_device_train_b

And finally evaluate the results of the final best found model

In [15]:
model=AutoModelForMaskedLM.from_pretrained("../data/03_models/eu_bert_model")
trainer = Trainer(
                    model=model,
                    eval_dataset=test_dataset,
                    data_collator=data_collator,
                        )
eval_results = trainer.evaluate()
print(f" Best model perplexity on test: {math.exp(eval_results['eval_loss']):.2f}")

 Best model perplexity on test: 5.09


### 2.2 Model training on United States COVID texts
The same steps are followed for the US data model as for the European model. New training parameters need to be found as the datasets differ in size.


#### Dataset build

In [16]:
%%time
from transformers import TextDataset
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('../data/03_models/tokenizer/')

#tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)
#tokenizer.add_tokens(new_tokens)
#tokenizer.save_pretrained('../data/03_models/tokenizer')

dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="../data/02_preprocessed/full_us_text.txt",
    block_size=128,
)
print(len(dataset))

17103
CPU times: user 39 s, sys: 458 ms, total: 39.5 s
Wall time: 39.5 s


In [17]:
from sklearn.model_selection import train_test_split
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
print(type(train_dataset), type(test_dataset))
print(len(train_dataset), len(test_dataset))

<class 'list'> <class 'list'>
13682 3421


#### Data collator

In [18]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, #The tokenizer used for encoding the data.
    mlm=True, #Whether or not to use masked language modeling. The labels are -100 for non-masked tokens and the value to predict for the masked token.
    mlm_probability=0.15 #The probability with which to (randomly) mask tokens in the input, when mlm is set to True
)

#### Train the model using the gird search

The model is trained using the same parameters as before

In [19]:
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model.resize_token_embeddings(len(tokenizer)) 

us_models_params, us_scores = grid_search_for_language_modeling(
                    baseline_model=model,
                    param_grid=param_grid,
                    n_combinations=None,
                    X=train_dataset,
                    data_collator=data_collator,
                    model_name='us_bert_model',
                    out_dir='../data/03_models/',
                    tmp_dir='../data/03_models/tmp/'
                    )

import pickle
pickle.dump(us_models_params, open('../data/03_models/us_models_params.p', 'wb'))
pickle.dump(us_scores, open('../data/03_models/us_scores.p', 'wb'))

- Starting grid search, totalling 6 jobs -
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 856}


Step,Training Loss
500,2.1375
1000,1.7943
1500,1.676
2000,1.6062


------ - Perplexity: 4.48 | Fitted 1 jobs out of 6. Elapsed 00:14:25 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 856}


Step,Training Loss
500,1.3913
1000,1.3609
1500,1.3717
2000,1.395


------ - Perplexity: 3.94 | Fitted 2 jobs out of 6. Elapsed 00:28:51 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 3e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 856}


Step,Training Loss
500,1.0511
1000,1.0982
1500,1.1765
2000,1.2591


------ - Perplexity: 3.67 | Fitted 3 jobs out of 6. Elapsed 00:43:16 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 3e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 856}


Step,Training Loss
500,0.6607
1000,0.7758
1500,0.9465
2000,1.1385


------ - Perplexity: 3.63 | Fitted 4 jobs out of 6. Elapsed 00:57:42 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 5e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 856}


Step,Training Loss
500,0.4047
1000,0.5602
1500,0.7885
2000,1.0543


------ - Perplexity: 3.62 | Fitted 5 jobs out of 6. Elapsed 01:12:08 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 5e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 856}


Step,Training Loss
500,0.196
1000,0.299
1500,0.5282
2000,0.9099


------ - Perplexity: 3.74 | Fitted 6 jobs out of 6. Elapsed 01:26:34 ------
--- Ending grid search, totalling 6 jobs. Elapsed 01:26:34 ---


We evaluate the results obtained in training

In [20]:
print('- Training results: ')
for model_params, score in zip(us_models_params, us_scores):
    print(f'  - Model Perplexity: {score:.2f} | Params: {model_params}')

- Training results: 
  - Model Perplexity: 4.48 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 856}
  - Model Perplexity: 3.94 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 856}
  - Model Perplexity: 3.67 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 3e-05, 'weight_decay': 0.01, 'per_device_train_bat

And finally evaluate the results of the final best found model

In [21]:
model=AutoModelForMaskedLM.from_pretrained("../data/03_models/us_bert_model")
trainer = Trainer(
                    model=model,
                    eval_dataset=test_dataset,
                    data_collator=data_collator,
                        )
eval_results = trainer.evaluate()
print(f" Best model perplexity on test: {math.exp(eval_results['eval_loss']):.2f}")

 Best model perplexity on test: 3.70
