In [1]:
#from google.colab import drivedrive.mount('/content/drive')
#%cd /content/drive/MyDrive/tfm_code/03 Training
#%pwd
# Install `transformers` from master
#!pip install transformers==4.5.1
# !pip install git+https://github.com/huggingface/transformers
#!pip install torch
#!pip install sklearn
# transformers version at notebook update --- 2.11.0
# tokenizers version at notebook update --- 0.8.0rc1

## 1. Load the base models and the tokenizers


In [2]:
# Check that we have a GPU
!nvidia-smi

Mon May  3 21:32:09 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.03   Driver Version: 450.119.03   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla M60           Off  | 00000000:00:1E.0 Off |                    0 |
| N/A   31C    P0    38W / 150W |      0MiB /  7618MiB |    100%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

True

### Model checkpoint

In [8]:
# Select the model baseline to perform the transfer learning from
model_checkpoint = 'distilbert-base-uncased'
new_tokens = ['covid', 'covid-19', 'coronavirus', 'sars', 'sars-cov-2', 'pandemic', 'outbreak']

### Tokenizer

In [9]:
from transformers import DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)
tokenizer.add_tokens(new_tokens)
tokenizer.save_pretrained('../data/03_models/tokenizer')

('../data/03_models/tokenizer/tokenizer_config.json',
 '../data/03_models/tokenizer/special_tokens_map.json',
 '../data/03_models/tokenizer/vocab.txt',
 '../data/03_models/tokenizer/added_tokens.json')

### Model
Finally let's initialize our model. We are looking to train from a pretrained model

In [10]:
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model.resize_token_embeddings(len(tokenizer)) 

Embedding(30528, 768)

In [7]:
model.num_parameters()

66990144

## 2. Model training

### 2.1 Define a grid search function for the training

Finally, in order to perform the training, a grid search function that allows random search is created. The functions does the following steps:
* It finds all possible combinations of parameters among the parameters grid
* Then, if a maximum number of fits is provided, it selects n_combinations random parameter combinations from the total list
* It creates a directory to store the temporary trained models so they don't have to be loaded in memory
* It splits the data into train and validations sets
* Then, for every parameter combination in the list:
    * It creates a transformers.Trainer object with a transfomers.TrainingArguments, that is created from the parameter combination dictionary, the baseline model, the datacollator to created the training batches by masking random tokens in the training set
    * It trains the model
    * It evaluates the perplexity of the model on the validation set
    * It saves the score and the parameters used in a list
    * It writes the model to disk
* After all the models are trained, it finds the one with the lowest perplexity
* It moves the model to the output folder
* Finally, it returns all the models and perplexitys cores

In [5]:
from transformers import Trainer, TrainingArguments
import time
from itertools import product
from sklearn.model_selection import train_test_split
import math
import os
from random import sample
from shutil import rmtree, move
def grid_search_for_language_modeling(baseline_model=None, param_grid={}, n_combinations=None, X=None, data_collator=None, validation_size=0.15, random_state=42, model_name='model', out_dir='models/', tmp_dir='grid_search/'): 
    """
    Performs grid search over a grid of parameters for an ML model and another grid of parameters for a function applied to training data in order to augment it
    It uses a custom cross validation function that only applies the function to the training data and validates on clean data
    -------------------------------------------------------------------------------
    Parameters:
        - model: scikit-learn like model
        - model_grid: dictionary of parameters to perform gird search on the _model
        - X (dataframe): train data (np.array)
        - validation_size

    Returns:
        - best_model: a dictionaty that contains the results with the best model found by performing the grid search over the _model and _function
            + _best_model'
            + _best_model_params
            + _best_function_params
            + _best_score
    """
    # Get all combinations of parameters in grid
    keys, values = zip(*param_grid.items())
    param_combinations_list = [dict(zip(keys, v)) for v in product(*values)]
    
    for param_combination in param_combinations_list:
        param_combination['eval_steps'] = (len(train_dataset) // param_combination["per_device_train_batch_size"]) + 1

    # If a max number of combinations is provided then n_combinations random param combinations are selected from the list
    if n_combinations:
        param_combinations_list = sample(param_combinations_list, n_combinations)
    total_fits = len(param_combinations_list)
    model_params, scores = [], []

    # Create directory to save temporary models
    if os.path.isdir(tmp_dir):
        rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    # Divide the dataset for validation
    X_train, X_test = train_test_split(X, test_size=validation_size, random_state=random_state)

    # Start grid search
    start = time.time() # Get initial time of training
    print(f'- Starting grid search, totalling {total_fits} jobs -')
    for param_combination in param_combinations_list:

        # Instantiate a model with the given param combination in the iteration
        print(f'  - Training model {param_combination}')
        training_args = TrainingArguments(**param_combination) # Unpacking the param grid
        trainer = Trainer(
                            model=baseline_model,
                            args=training_args,
                            train_dataset=X_train,
                            eval_dataset=X_test,
                            data_collator=data_collator,
                        )
        # Train the model
        trainer.train()

        # Save the model
        trainer.save_model(f'{tmp_dir}/{len(os.listdir(tmp_dir))}')

        # Evaluate performance
        eval_results = trainer.evaluate()
        model_score = math.exp(eval_results['eval_loss'])

        model_params.append(param_combination)
        scores.append(model_score)

        elapsed_time = time.time() - start
        print(f'------ - Perplexity: {model_score:.2f} | Fitted {len(scores)} jobs out of {total_fits}. Elapsed {time.strftime("%H:%M:%S", time.gmtime(elapsed_time))} ------') 
            
    elapsed_time = time.time() - start
    print('--- Ending grid search, totalling {} jobs. Elapsed {} ---'.format(total_fits, time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))) 
    
    # Get index og the model with the best score
    best_index = scores.index(min(scores))
    
    # Move model to output_dir
    os.rename(f'{tmp_dir}/{best_index}', f'{tmp_dir}/{model_name}')

    # Remove the model in the output dir
    if os.path.isdir(f'{out_dir}/{model_name}'):
        rmtree(f'{out_dir}/{model_name}')
    
    # Move the contents of the new best model to the output dir
    move(f'{tmp_dir}/{model_name}', f'{out_dir}/{model_name}')
    # Remove working dir
    rmtree(tmp_dir)

    # Return the best params and score in a dict
    return model_params, scores

### 2.2 Model training on European COVID texts


#### Dataset build

We'll build our dataset by applying our tokenizer to our text file.

TextDataset: reads the full input text, tokenizes it and cuts it in block_sized chunks. Then adds special tokens (here just <s> or ["SEP"]/["CLS"])

LineByLineTextDataset: reads each line separately, tokenizes and truncates the lines to block_size. Adds special tokens.

use TextDataset because --line-by-line will throw away a lot of data if not used correctly.

In [35]:
%%time
from transformers import TextDataset

dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="../data/02_preprocessed/full_eu_text.txt",
    block_size=128,
)
print(len(dataset))

24477
CPU times: user 160 ms, sys: 48.1 ms, total: 208 ms
Wall time: 206 ms


In [36]:
from sklearn.model_selection import train_test_split
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
print(type(train_dataset), type(test_dataset))
print(len(train_dataset), len(test_dataset))

<class 'list'> <class 'list'>
19581 4896


#### Data collator
[Data collators](https://huggingface.co/transformers/master/main_classes/data_collator.html) are objects that will form a batch by using a list of dataset elements as input. These elements are of the same type as the elements of train_dataset or eval_dataset. This is just a small helper that will help us batch different samples of the dataset together into an object that PyTorch knows how to perform backprop on.

[DataCollatorForLanguageModeling](https://huggingface.co/transformers/master/main_classes/data_collator.html#datacollatorforlanguagemodeling): Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they are not all of the same length.

For best performance, this data collator should be used with a dataset having items that are dictionaries or BatchEncoding, with the "special_tokens_mask" key, as returned by a PreTrainedTokenizer or a PreTrainedTokenizerFast with the argument return_special_tokens_mask=True.

In [37]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, #The tokenizer used for encoding the data.
    mlm=True, #Whether or not to use masked language modeling. The labels are -100 for non-masked tokens and the value to predict for the masked token.
    mlm_probability=0.15 #The probability with which to (randomly) mask tokens in the input, when mlm is set to True
)

#### Train the model using the gird search
First the parameters to perform the search over the training are defined

In [14]:
param_grid = dict(
            output_dir=["../data/03_models/trainer/"],
            overwrite_output_dir=[True],
            num_train_epochs=[3],
            learning_rate=[2e-5, 3e-5, 5e-5],
            weight_decay=[0.01, 0.005],
            per_device_train_batch_size=[16],
            per_device_eval_batch_size=[32],
            save_steps=[0],
            warmup_steps=[0],
            save_total_limit=[1],
            prediction_loss_only=[True],
            eval_accumulation_steps=[1]
            )

Then the model is trained

In [13]:
eu_models_params, eu_scores = grid_search_for_language_modeling(
                    baseline_model=model,
                    param_grid=param_grid,
                    n_combinations=None,
                    X=train_dataset,
                    data_collator=data_collator,
                    model_name='eu_bert_model',
                    out_dir='../data/03_models/',
                    tmp_dir='../data/03_models/tmp/'
                    )

import pickle
pickle.dump(eu_models_params, open('../data/03_models/eu_models_params.p', 'wb'))
pickle.dump(eu_scores, open('../data/03_models/eu_scores.p', 'wb'))

- Starting grid search, totalling 6 jobs -
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1224}


Step,Training Loss
500,2.5093
1000,2.2976
1500,2.1996
2000,2.1569
2500,2.1069
3000,2.0759


------ - Perplexity: 7.36 | Fitted 1 jobs out of 6. Elapsed 00:20:46 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1224}


Step,Training Loss
500,1.8199
1000,1.8047
1500,1.8081
2000,1.8487
2500,1.8669
3000,1.9026


------ - Perplexity: 6.61 | Fitted 2 jobs out of 6. Elapsed 00:41:33 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 3e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1224}


Step,Training Loss
500,1.4302
1000,1.4813
1500,1.5407
2000,1.6334
2500,1.6978
3000,1.7828


------ - Perplexity: 6.18 | Fitted 3 jobs out of 6. Elapsed 01:02:21 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 3e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1224}


Step,Training Loss
500,0.9497
1000,1.0547
1500,1.1904
2000,1.363
2500,1.5144
3000,1.6914


------ - Perplexity: 6.09 | Fitted 4 jobs out of 6. Elapsed 01:23:08 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 5e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1224}


Step,Training Loss
500,0.599
1000,0.7572
1500,0.946
2000,1.176
2500,1.3888
3000,1.6288


------ - Perplexity: 6.07 | Fitted 5 jobs out of 6. Elapsed 01:43:56 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 5e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1224}


Step,Training Loss
500,0.2804
1000,0.3985
1500,0.5669
2000,0.8374
2500,1.1561
3000,1.5388


------ - Perplexity: 6.27 | Fitted 6 jobs out of 6. Elapsed 02:04:44 ------
--- Ending grid search, totalling 6 jobs. Elapsed 02:04:44 ---


We evaluate the results obtained in training

In [14]:
print('- Training results: ')
for model_params, score in zip(eu_models_params, eu_scores):
    print(f'  - Model Perplexity: {score:.2f} | Params: {model_params}')

- Training results: 
  - Model Perplexity: 7.36 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1224}
  - Model Perplexity: 6.61 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 1224}
  - Model Perplexity: 6.18 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 3e-05, 'weight_decay': 0.01, 'per_device_train_b

And finally evaluate the results of the final best found model

In [15]:
model=AutoModelForMaskedLM.from_pretrained("../data/03_models/eu_bert_model")
trainer = Trainer(
                    model=model,
                    eval_dataset=test_dataset,
                    data_collator=data_collator,
                        )
eval_results = trainer.evaluate()
print(f" Best model perplexity on test: {math.exp(eval_results['eval_loss']):.2f}")

 Best model perplexity on test: 6.12


### 2.2 Model training on United States COVID texts
The same steps are followed for the US data model as for the European model. New training parameters need to be found as the datasets differ in size.


#### Dataset build

In [1]:
%%time
from transformers import TextDataset
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('../data/03_models/tokenizer/')

#tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)
#tokenizer.add_tokens(new_tokens)
#tokenizer.save_pretrained('../data/03_models/tokenizer')

dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="../data/02_preprocessed/full_us_text.txt",
    block_size=128,
)
print(len(dataset))

14748
CPU times: user 3.43 s, sys: 2.05 s, total: 5.48 s
Wall time: 2.1 s




In [11]:
from sklearn.model_selection import train_test_split
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
print(type(train_dataset), type(test_dataset))
print(len(train_dataset), len(test_dataset))

<class 'list'> <class 'list'>
11798 2950


#### Data collator

In [12]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, #The tokenizer used for encoding the data.
    mlm=True, #Whether or not to use masked language modeling. The labels are -100 for non-masked tokens and the value to predict for the masked token.
    mlm_probability=0.15 #The probability with which to (randomly) mask tokens in the input, when mlm is set to True
)

#### Train the model using the gird search

The model is trained using the same parameters as before

In [15]:
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model.resize_token_embeddings(len(tokenizer)) 

us_models_params, us_scores = grid_search_for_language_modeling(
                    baseline_model=model,
                    param_grid=param_grid,
                    n_combinations=None,
                    X=train_dataset,
                    data_collator=data_collator,
                    model_name='us_bert_model',
                    out_dir='../data/03_models/',
                    tmp_dir='../data/03_models/tmp/'
                    )

import pickle
pickle.dump(us_models_params, open('../data/03_models/us_models_params.p', 'wb'))
pickle.dump(us_scores, open('../data/03_models/us_scores.p', 'wb'))

- Starting grid search, totalling 6 jobs -
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 738}


Step,Training Loss
500,2.339
1000,1.9639
1500,1.8565


------ - Perplexity: 5.66 | Fitted 1 jobs out of 6. Elapsed 00:12:32 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 738}


Step,Training Loss
500,1.5652
1000,1.5288
1500,1.5658


------ - Perplexity: 4.93 | Fitted 2 jobs out of 6. Elapsed 00:25:02 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 3e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 738}


Step,Training Loss
500,1.1979
1000,1.2574
1500,1.3765


------ - Perplexity: 4.56 | Fitted 3 jobs out of 6. Elapsed 00:37:31 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 3e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 738}


Step,Training Loss
500,0.7702
1000,0.9268
1500,1.1699


------ - Perplexity: 4.49 | Fitted 4 jobs out of 6. Elapsed 00:50:02 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 5e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 738}


Step,Training Loss
500,0.4779
1000,0.6986
1500,1.0234


------ - Perplexity: 4.48 | Fitted 5 jobs out of 6. Elapsed 01:02:30 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 5e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 738}


Step,Training Loss
500,0.2278
1000,0.3947
1500,0.771


------ - Perplexity: 4.63 | Fitted 6 jobs out of 6. Elapsed 01:15:01 ------
--- Ending grid search, totalling 6 jobs. Elapsed 01:15:01 ---


We evaluate the results obtained in training

In [16]:
print('- Training results: ')
for model_params, score in zip(us_models_params, us_scores):
    print(f'  - Model Perplexity: {score:.2f} | Params: {model_params}')

- Training results: 
  - Model Perplexity: 5.66 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.01, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 738}
  - Model Perplexity: 4.93 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 2e-05, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'save_steps': 0, 'warmup_steps': 0, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1, 'eval_steps': 738}
  - Model Perplexity: 4.56 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'num_train_epochs': 3, 'learning_rate': 3e-05, 'weight_decay': 0.01, 'per_device_train_bat

And finally evaluate the results of the final best found model

In [17]:
model=AutoModelForMaskedLM.from_pretrained("../data/03_models/us_bert_model")
trainer = Trainer(
                    model=model,
                    eval_dataset=test_dataset,
                    data_collator=data_collator,
                        )
eval_results = trainer.evaluate()
print(f" Best model perplexity on test: {math.exp(eval_results['eval_loss']):.2f}")

 Best model perplexity on test: 4.40


## 3. Compare the model results

Aside from looking at the training and eval losses going down, the easiest way to check whether our language model is learning anything interesting is via the `FillMaskPipeline`.

Pipelines are simple wrappers around tokenizers and models, and the 'fill-mask' one will let you input a sequence containing a masked token (here, transformers.pipeline.tokenizer.mask_token) and return a list of the most probable filled sequences, with their probabilities.

The predictions are compared to the predictions outputed from the base model that was used to fine tune this model.

In [21]:
from transformers import pipeline, AutoModelForMaskedLM, DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('../data/03_models/tokenizer/')

model=AutoModelForMaskedLM.from_pretrained("../data/03_models/eu_bert_model")

eu_model_pipeline = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer
)

model=AutoModelForMaskedLM.from_pretrained("../data/03_models/us_bert_model")
us_model_pipeline = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer
)

tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
old_model_pipeline = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer
)

In [23]:
sequence = f"coronavirus is a very bad {tokenizer.mask_token}"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': 'coronavirus is a very bad time', 'score': 0.17173822224140167, 'token': 2051, 'token_str': 't i m e'}
UNITED STATES MODEL 0 token is  {'sequence': 'coronavirus is a very bad ;', 'score': 0.36263328790664673, 'token': 1025, 'token_str': ';'}
OLD MODEL 0 token is            {'sequence': 'coronavirus is a very bad virus', 'score': 0.34118759632110596, 'token': 7865, 'token_str': 'v i r u s'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': 'coronavirus is a very bad ;', 'score': 0.06552662700414658, 'token': 1025, 'token_str': ';'}
UNITED STATES MODEL 1 token is  {'sequence': 'coronavirus is a very bad.', 'score': 0.19779327511787415, 'token': 1012, 'token_str': '.'}
OLD MODEL 1 token is            {'sequence': 'coronavirus is a very bad.', 'score': 0.22734670341014862, 'token': 1012, 'token_str': '.'}
--------------------------------
EUROPEAN MODEL 2 token is       {'sequence': 'coronavirus is a very bad.', 'score

In [24]:
sequence = f"coronavirus is a very {tokenizer.mask_token} disease"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': 'coronavirus is a very common disease', 'score': 0.4286090135574341, 'token': 2691, 'token_str': 'c o m m o n'}
UNITED STATES MODEL 0 token is  {'sequence': 'coronavirus is a very common disease', 'score': 0.6938325762748718, 'token': 2691, 'token_str': 'c o m m o n'}
OLD MODEL 0 token is            {'sequence': 'coronavirus is a very rare disease', 'score': 0.5813637375831604, 'token': 4678, 'token_str': 'r a r e'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': 'coronavirus is a very serious disease', 'score': 0.16115660965442657, 'token': 3809, 'token_str': 's e r i o u s'}
UNITED STATES MODEL 1 token is  {'sequence': 'coronavirus is a very serious disease', 'score': 0.13249681890010834, 'token': 3809, 'token_str': 's e r i o u s'}
OLD MODEL 1 token is            {'sequence': 'coronavirus is a very common disease', 'score': 0.23531213402748108, 'token': 2691, 'token_str': 'c o m m o n'}
----------------------

In [25]:
sequence = f"covid is a very bad {tokenizer.mask_token}"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': 'covid is a very bad time', 'score': 0.11618148535490036, 'token': 2051, 'token_str': 't i m e'}
UNITED STATES MODEL 0 token is  {'sequence': 'covid is a very bad ;', 'score': 0.2179058939218521, 'token': 1025, 'token_str': ';'}
OLD MODEL 0 token is            {'sequence': 'covid is a very bad.', 'score': 0.19394788146018982, 'token': 1012, 'token_str': '.'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': 'covid is a very bad.', 'score': 0.05466793105006218, 'token': 1012, 'token_str': '.'}
UNITED STATES MODEL 1 token is  {'sequence': 'covid is a very bad.', 'score': 0.1583063155412674, 'token': 1012, 'token_str': '.'}
OLD MODEL 1 token is            {'sequence': 'covid is a very bad ;', 'score': 0.17415544390678406, 'token': 1025, 'token_str': ';'}
--------------------------------
EUROPEAN MODEL 2 token is       {'sequence': 'covid is a very bad ;', 'score': 0.03962734714150429, 'token': 1025, 'token_str': ';'}

In [26]:
sequence = f"covid is a very {tokenizer.mask_token} disease"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': 'covid is a very common disease', 'score': 0.3215981125831604, 'token': 2691, 'token_str': 'c o m m o n'}
UNITED STATES MODEL 0 token is  {'sequence': 'covid is a very common disease', 'score': 0.6336857676506042, 'token': 2691, 'token_str': 'c o m m o n'}
OLD MODEL 0 token is            {'sequence': 'covid is a very rare disease', 'score': 0.7391716837882996, 'token': 4678, 'token_str': 'r a r e'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': 'covid is a very infectious disease', 'score': 0.09421249479055405, 'token': 16514, 'token_str': 'i n f e c t i o u s'}
UNITED STATES MODEL 1 token is  {'sequence': 'covid is a very serious disease', 'score': 0.1463167667388916, 'token': 3809, 'token_str': 's e r i o u s'}
OLD MODEL 1 token is            {'sequence': 'covid is a very common disease', 'score': 0.13259394466876984, 'token': 2691, 'token_str': 'c o m m o n'}
--------------------------------
EUROPEAN MODEL 2

In [27]:
sequence = f"{tokenizer.mask_token} is a disease"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': 'it is a disease', 'score': 0.14260460436344147, 'token': 2009, 'token_str': 'i t'}
UNITED STATES MODEL 0 token is  {'sequence': 'it is a disease', 'score': 0.04115577042102814, 'token': 2009, 'token_str': 'i t'}
OLD MODEL 0 token is            {'sequence': 'tuberculosis is a disease', 'score': 0.08622989058494568, 'token': 15877, 'token_str': 't u b e r c u l o s i s'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': 'this is a disease', 'score': 0.09274052828550339, 'token': 2023, 'token_str': 't h i s'}
UNITED STATES MODEL 1 token is  {'sequence': 'tuberculosis is a disease', 'score': 0.026136085391044617, 'token': 15877, 'token_str': 't u b e r c u l o s i s'}
OLD MODEL 1 token is            {'sequence': 'malaria is a disease', 'score': 0.03631953150033951, 'token': 19132, 'token_str': 'm a l a r i a'}
--------------------------------
EUROPEAN MODEL 2 token is       {'sequence': 'which is a disease', 'score':

In [28]:
sequence = f"The {tokenizer.mask_token} of the coronavirus pandemic are very serious"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': 'the consequences of the coronavirus pandemic are very serious', 'score': 0.5420258045196533, 'token': 8465, 'token_str': 'c o n s e q u e n c e s'}
UNITED STATES MODEL 0 token is  {'sequence': 'the effects of the coronavirus pandemic are very serious', 'score': 0.4934997260570526, 'token': 3896, 'token_str': 'e f f e c t s'}
OLD MODEL 0 token is            {'sequence': 'the symptoms of the coronavirus pandemic are very serious', 'score': 0.29923003911972046, 'token': 8030, 'token_str': 's y m p t o m s'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': 'the effects of the coronavirus pandemic are very serious', 'score': 0.385225772857666, 'token': 3896, 'token_str': 'e f f e c t s'}
UNITED STATES MODEL 1 token is  {'sequence': 'the impacts of the coronavirus pandemic are very serious', 'score': 0.13947957754135132, 'token': 14670, 'token_str': 'i m p a c t s'}
OLD MODEL 1 token is            {'sequence': 'the ef

In [29]:
sequence = f"the consequences of the {tokenizer.mask_token} are very serious"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': 'the consequences of the earthquake are very serious', 'score': 0.39475271105766296, 'token': 8372, 'token_str': 'e a r t h q u a k e'}
UNITED STATES MODEL 0 token is  {'sequence': 'the consequences of the disaster are very serious', 'score': 0.1754169911146164, 'token': 7071, 'token_str': 'd i s a s t e r'}
OLD MODEL 0 token is            {'sequence': 'the consequences of the accident are very serious', 'score': 0.08996063470840454, 'token': 4926, 'token_str': 'a c c i d e n t'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': 'the consequences of the pandemic are very serious', 'score': 0.12387777119874954, 'token': 30527, 'token_str': 'p a n d e m i c'}
UNITED STATES MODEL 1 token is  {'sequence': 'the consequences of the accident are very serious', 'score': 0.09676043689250946, 'token': 4926, 'token_str': 'a c c i d e n t'}
OLD MODEL 1 token is            {'sequence': 'the consequences of the earthquake are v

In [30]:
sequence = f"coronavirus disease is an infectious disease caused by a {tokenizer.mask_token}"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': 'coronavirus disease is an infectious disease caused by a.', 'score': 0.23049484193325043, 'token': 1012, 'token_str': '.'}
UNITED STATES MODEL 0 token is  {'sequence': 'coronavirus disease is an infectious disease caused by a.', 'score': 0.2763156294822693, 'token': 1012, 'token_str': '.'}
OLD MODEL 0 token is            {'sequence': 'coronavirus disease is an infectious disease caused by a.', 'score': 0.4726460874080658, 'token': 1012, 'token_str': '.'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': 'coronavirus disease is an infectious disease caused by a virus', 'score': 0.1717226505279541, 'token': 7865, 'token_str': 'v i r u s'}
UNITED STATES MODEL 1 token is  {'sequence': 'coronavirus disease is an infectious disease caused by a novel', 'score': 0.17340105772018433, 'token': 3117, 'token_str': 'n o v e l'}
OLD MODEL 1 token is            {'sequence': 'coronavirus disease is an infectious disease caused b

In [31]:
sequence = f"the covid virus spreads primarily through {tokenizer.mask_token}"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': 'the covid virus spreads primarily through the', 'score': 0.136428102850914, 'token': 1996, 'token_str': 't h e'}
UNITED STATES MODEL 0 token is  {'sequence': 'the covid virus spreads primarily through.', 'score': 0.10988763719797134, 'token': 1012, 'token_str': '.'}
OLD MODEL 0 token is            {'sequence': 'the covid virus spreads primarily through :', 'score': 0.2401067316532135, 'token': 1024, 'token_str': ':'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': 'the covid virus spreads primarily through.', 'score': 0.12120738625526428, 'token': 1012, 'token_str': '.'}
UNITED STATES MODEL 1 token is  {'sequence': 'the covid virus spreads primarily through ;', 'score': 0.10381394624710083, 'token': 1025, 'token_str': ';'}
OLD MODEL 1 token is            {'sequence': 'the covid virus spreads primarily through.', 'score': 0.17415806651115417, 'token': 1012, 'token_str': '.'}
--------------------------------
EURO

In [32]:
def create_text(sequence, pipeline, steps=15):
    for step in range(steps):
        prediction_sequence = f'{sequence} {tokenizer.mask_token}'
        results = pipeline(prediction_sequence)
        token = results[0]['token_str']
        sequence = f'{sequence}{token}'
    return sequence

In [33]:
old_sequence = "the covid virus spreads primarily through" 
new_sequence = create_text(old_sequence, eu_model_pipeline, steps=15)   
print(f'{old_sequence}\n{new_sequence}')
print('--------------------------------')
new_sequence = create_text(old_sequence, us_model_pipeline, steps=15)   
print(f'{old_sequence}\n{new_sequence}')
print('--------------------------------')
new_sequence = create_text(old_sequence, old_model_pipeline, steps=15)   
print(f'{old_sequence}\n{new_sequence}')

the covid virus spreads primarily through
the covid virus spreads primarily throught h ev i r u s.c.c.c.c.c.c.
--------------------------------
the covid virus spreads primarily through
the covid virus spreads primarily through...............
--------------------------------
the covid virus spreads primarily through
the covid virus spreads primarily through:..............


In [34]:
old_sequence = "The consequenes of the pandemic need to be" 
new_sequence = create_text(old_sequence, eu_model_pipeline, steps=40)   
print(f'{old_sequence}\n{new_sequence}')
print('--------------------------------')
new_sequence = create_text(old_sequence, us_model_pipeline, steps=40)   
print(f'{old_sequence}\n{new_sequence}')
print('--------------------------------')
new_sequence = create_text(old_sequence, old_model_pipeline, steps=40)   
print(f'{old_sequence}\n{new_sequence}')

The consequenes of the pandemic need to be
The consequenes of the pandemic need to bea d d r e s s e del ar# # r of;4.4.4.4.4.4.4.4.4.4.4.4.4.4.4.4.4
--------------------------------
The consequenes of the pandemic need to be
The consequenes of the pandemic need to beo v e r c o m ew.i.op.4 5.1.32.1.1.23...................
--------------------------------
The consequenes of the pandemic need to be
The consequenes of the pandemic need to be."......".......".......................
