In [32]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/tfm_code/03 Training
%pwd
# Install `transformers` from master
!pip install transformers==4.5.1
# !pip install git+https://github.com/huggingface/transformers
!pip install torch
!pip install sklearn
# transformers version at notebook update --- 2.11.0
# tokenizers version at notebook update --- 0.8.0rc1

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/tfm_code/03 Training


## 1. Load the base models and the tokenizers


In [33]:
# Check that we have a GPU
!nvidia-smi

Mon Apr 26 20:55:32 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0    34W /  70W |   1784MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [34]:
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

True

### Model checkpoint

In [35]:
# Select the model baseline to perform the transfer learning from
model_checkpoint = 'roberta-base'

### Tokenizer

In [36]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

### Model
Finally let's initialize our model. We are looking to train from a pretrained model

In [37]:
from transformers import RobertaForMaskedLM
model = RobertaForMaskedLM.from_pretrained(model_checkpoint)

In [38]:
model.num_parameters()
# => 84 million parameters

124697433

## 2. Model training

### 2.1 Define a grid search function for the training

Finally, in order to perform the training, a grid search function that allows random search is created. The functions does the following steps:
* It finds all possible combinations of parameters among the parameters grid
* Then, if a maximum number of fits is provided, it selects n_combinations random parameter combinations from the total list
* It creates a directory to store the temporary trained models so they don't have to be loaded in memory
* It splits the data into train and validations sets
* Then, for every parameter combination in the list:
    * It creates a transformers.Trainer object with a transfomers.TrainingArguments, that is created from the parameter combination dictionary, the baseline model, the datacollator to created the training batches by masking random tokens in the training set
    * It trains the model
    * It evaluates the perplexity of the model on the validation set
    * It saves the score and the parameters used in a list
    * It writes the model to disk
* After all the models are trained, it finds the one with the lowest perplexity
* It moves the model to the output folder
* Finally, it returns all the models and perplexitys cores

In [39]:
from transformers import Trainer, TrainingArguments
import time
from itertools import product
from sklearn.model_selection import train_test_split
import math
import os
from random import sample
from shutil import rmtree, move
def grid_search_for_language_modeling(baseline_model=None, param_grid={}, n_combinations=None, X=None, data_collator=None, validation_size=0.15, random_state=42, model_name='model', out_dir='models/', tmp_dir='grid_search/'): 
    """
    Performs grid search over a grid of parameters for an ML model and another grid of parameters for a function applied to training data in order to augment it
    It uses a custom cross validation function that only applies the function to the training data and validates on clean data
    -------------------------------------------------------------------------------
    Parameters:
        - model: scikit-learn like model
        - model_grid: dictionary of parameters to perform gird search on the _model
        - X (dataframe): train data (np.array)
        - validation_size

    Returns:
        - best_model: a dictionaty that contains the results with the best model found by performing the grid search over the _model and _function
            + _best_model'
            + _best_model_params
            + _best_function_params
            + _best_score
    """
    # Get all combinations of parameters in grid
    keys, values = zip(*param_grid.items())
    param_combinations_list = [dict(zip(keys, v)) for v in product(*values)]

    # If a max number of combinations is provided then n_combinations random param combinations are selected from the list
    if n_combinations:
        param_combinations_list = sample(param_combinations_list, n_combinations)
    total_fits = len(param_combinations_list)
    model_params, scores = [], []

    # Create directory to save temporary models
    if os.path.isdir(tmp_dir):
        rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    # Divide the dataset for validation
    X_train, X_test = train_test_split(X, test_size=validation_size, random_state=random_state)

    # Start grid search
    start = time.time() # Get initial time of training
    print(f'- Starting grid search, totalling {total_fits} jobs -')
    for param_combination in param_combinations_list:

        # Instantiate a model with the given param combination in the iteration
        print(f'  - Training model {param_combination}')
        training_args = TrainingArguments(**param_combination) # Unpacking the param grid
        trainer = Trainer(
                            model=baseline_model,
                            args=training_args,
                            train_dataset=X_train,
                            eval_dataset=X_test,
                            data_collator=data_collator,
                        )
        # Train the model
        trainer.train()

        # Save the model
        trainer.save_model(f'{tmp_dir}/{len(os.listdir(tmp_dir))}')

        # Evaluate performance
        eval_results = trainer.evaluate()
        model_score = math.exp(eval_results['eval_loss'])

        model_params.append(param_combination)
        scores.append(model_score)

        elapsed_time = time.time() - start
        print(f'------ - Perplexity: {model_score:.2f} | Fitted {len(scores)} jobs out of {total_fits}. Elapsed {time.strftime("%H:%M:%S", time.gmtime(elapsed_time))} ------') 
            
    elapsed_time = time.time() - start
    print('--- Ending grid search, totalling {} jobs. Elapsed {} ---'.format(total_fits, time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))) 
    
    # Get index og the model with the best score
    best_index = scores.index(min(scores))
    
    # Move model to output_dir
    os.rename(f'{tmp_dir}/{best_index}', f'{tmp_dir}/{model_name}')

    # Remove the model in the output dir
    if os.path.isdir(f'{out_dir}/{model_name}'):
        rmtree(f'{out_dir}/{model_name}')
    
    # Move the contents of the new best model to the output dir
    move(f'{tmp_dir}/{model_name}', f'{out_dir}/{model_name}')
    # Remove working dir
    rmtree(tmp_dir)

    # Return the best params and score in a dict
    return model_params, scores

### 2.2 Model training on European COVID texts


#### Dataset build

We'll build our dataset by applying our tokenizer to our text file.

TextDataset: reads the full input text, tokenizes it and cuts it in block_sized chunks. Then adds special tokens (here just <s> or ["SEP"]/["CLS"])

LineByLineTextDataset: reads each line separately, tokenizes and truncates the lines to block_size. Adds special tokens.

use TextDataset because --line-by-line will throw away a lot of data if not used correctly.

In [40]:
%%time
from transformers import TextDataset

dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="../data/02_preprocessed/full_eu_text.txt",
    block_size=128,
)
print(len(dataset))



24701
CPU times: user 165 ms, sys: 81 ms, total: 246 ms
Wall time: 264 ms


In [41]:
from sklearn.model_selection import train_test_split
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
print(type(train_dataset), type(test_dataset))
print(len(train_dataset), len(test_dataset))

<class 'list'> <class 'list'>
19760 4941


#### Data collator
[Data collators](https://huggingface.co/transformers/master/main_classes/data_collator.html) are objects that will form a batch by using a list of dataset elements as input. These elements are of the same type as the elements of train_dataset or eval_dataset. This is just a small helper that will help us batch different samples of the dataset together into an object that PyTorch knows how to perform backprop on.

[DataCollatorForLanguageModeling](https://huggingface.co/transformers/master/main_classes/data_collator.html#datacollatorforlanguagemodeling): Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they are not all of the same length.

For best performance, this data collator should be used with a dataset having items that are dictionaries or BatchEncoding, with the "special_tokens_mask" key, as returned by a PreTrainedTokenizer or a PreTrainedTokenizerFast with the argument return_special_tokens_mask=True.

In [42]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, #The tokenizer used for encoding the data.
    mlm=True, #Whether or not to use masked language modeling. The labels are -100 for non-masked tokens and the value to predict for the masked token.
    mlm_probability=0.15 #The probability with which to (randomly) mask tokens in the input, when mlm is set to True
)

#### Train the model using the gird search
First the parameters to perform the search over the training are defined

In [43]:
param_grid = dict(
            output_dir=["../data/03_models/trainer/"],
            overwrite_output_dir=[True],
            evaluation_strategy = ["epoch"],
            num_train_epochs=[1,2],
            learning_rate=[2e-4, 2e-5, 2e-6],
            weight_decay=[0.01, 0.005],
            per_device_train_batch_size=[16],
            per_device_eval_batch_size=[64],
            save_steps=[10_000],
            save_total_limit=[1],
            prediction_loss_only=[True],
            eval_accumulation_steps=[1]
            )

Then the model is trained

In [44]:
eu_models_params, eu_scores = grid_search_for_language_modeling(
                    baseline_model=model,
                    param_grid=param_grid,
                    n_combinations=3,
                    X=train_dataset,
                    data_collator=data_collator,
                    model_name='eu_bert_model',
                    out_dir='../data/03_models/',
                    tmp_dir='../data/03_models/tmp/'
                    )

import pickle
pickle.dump(eu_models_params, open('../data/03_models/eu_models_params.p', 'wb'))
pickle.dump(eu_scores, open('../data/03_models/eu_scores.p', 'wb'))

- Starting grid search, totalling 3 jobs -
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'evaluation_strategy': 'epoch', 'num_train_epochs': 1, 'learning_rate': 2e-06, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 64, 'save_steps': 10000, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1}


Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,2.0689,1.942543,37.4391,79.169


------ - Perplexity: 6.93 | Fitted 1 jobs out of 3. Elapsed 00:13:41 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'evaluation_strategy': 'epoch', 'num_train_epochs': 1, 'learning_rate': 0.0002, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 64, 'save_steps': 10000, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1}


Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,1.8295,1.597354,37.4379,79.171


------ - Perplexity: 4.97 | Fitted 2 jobs out of 3. Elapsed 00:27:20 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'evaluation_strategy': 'epoch', 'num_train_epochs': 2, 'learning_rate': 2e-06, 'weight_decay': 0.001, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 64, 'save_steps': 10000, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1}


Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,0.9042,1.657105,37.5294,78.978
2,1.6412,1.584923,36.6797,80.808


------ - Perplexity: 4.80 | Fitted 3 jobs out of 3. Elapsed 00:53:56 ------
--- Ending grid search, totalling 3 jobs. Elapsed 00:53:56 ---


We evaluate the results obtained in training

In [45]:
print('- Training results: ')
for model_params, score in zip(eu_models_params, eu_scores):
    print(f'  - Model Perplexity: {score:.2f} | Params: {model_params}')

- Training results: 
  - Model Perplexity: 6.93 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'evaluation_strategy': 'epoch', 'num_train_epochs': 1, 'learning_rate': 2e-06, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 64, 'save_steps': 10000, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1}
  - Model Perplexity: 4.97 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'evaluation_strategy': 'epoch', 'num_train_epochs': 1, 'learning_rate': 0.0002, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 64, 'save_steps': 10000, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1}
  - Model Perplexity: 4.80 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'evaluation_strategy': 'epoch', 'num_train_epochs': 2, 'learning_rate': 2e-06, 'weight_decay

And finally evaluate the results of the final best found model

In [46]:
model=RobertaForMaskedLM.from_pretrained("../data/03_models/eu_bert_model")
trainer = Trainer(
                    model=model,
                    eval_dataset=test_dataset,
                    data_collator=data_collator,
                        )
eval_results = trainer.evaluate()
print(f" Best model perplexity on test: {math.exp(eval_results['eval_loss']):.2f}")

 Best mode perplexity: 4.80


### 2.2 Model training on United States COVID texts
The same steps are followed for the US data model as for the European model. New training parameters need to be found as the datasets differ in size.


#### Dataset build

In [47]:
%%time
from transformers import TextDataset

dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="../data/02_preprocessed/full_us_text.txt",
    block_size=128,
)
print(len(dataset))

15632
CPU times: user 149 ms, sys: 42.6 ms, total: 191 ms
Wall time: 194 ms




In [48]:
from sklearn.model_selection import train_test_split
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
print(type(train_dataset), type(test_dataset))
print(len(train_dataset), len(test_dataset))

<class 'list'> <class 'list'>
12505 3127


#### Data collator

In [49]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, #The tokenizer used for encoding the data.
    mlm=True, #Whether or not to use masked language modeling. The labels are -100 for non-masked tokens and the value to predict for the masked token.
    mlm_probability=0.15 #The probability with which to (randomly) mask tokens in the input, when mlm is set to True
)

#### Train the model using the gird search

The model is trained using the same parameters as before

In [50]:
us_models_params, us_scores = grid_search_for_language_modeling(
                    baseline_model=model,
                    param_grid=param_grid,
                    n_combinations=3,
                    X=train_dataset,
                    data_collator=data_collator,
                    model_name='us_bert_model',
                    out_dir='../data/03_models/',
                    tmp_dir='../data/03_models/tmp/'
                    )

import pickle
pickle.dump(us_models_params, open('../data/03_models/us_models_params.p', 'wb'))
pickle.dump(us_scores, open('../data/03_models/us_scores.p', 'wb'))

- Starting grid search, totalling 3 jobs -
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'evaluation_strategy': 'epoch', 'num_train_epochs': 1, 'learning_rate': 2e-06, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 64, 'save_steps': 10000, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1}


Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,2.5851,2.244523,23.0119,81.523


------ - Perplexity: 9.33 | Fitted 1 jobs out of 3. Elapsed 00:08:31 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'evaluation_strategy': 'epoch', 'num_train_epochs': 1, 'learning_rate': 0.0002, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 64, 'save_steps': 10000, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1}


Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,1.7496,1.275229,23.9914,78.195


------ - Perplexity: 3.56 | Fitted 2 jobs out of 3. Elapsed 00:17:05 ------
  - Training model {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'evaluation_strategy': 'epoch', 'num_train_epochs': 2, 'learning_rate': 2e-06, 'weight_decay': 0.001, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 64, 'save_steps': 10000, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1}


Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,0.6339,1.337269,23.798,78.83
2,1.2054,1.261774,23.7518,78.984


------ - Perplexity: 3.56 | Fitted 3 jobs out of 3. Elapsed 00:34:00 ------
--- Ending grid search, totalling 3 jobs. Elapsed 00:34:00 ---


We evaluate the results obtained in training

In [51]:
print('- Training results: ')
for model_params, score in zip(us_models_params, us_scores):
    print(f'  - Model Perplexity: {score:.2f} | Params: {model_params}')

- Training results: 
  - Model Perplexity: 9.33 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'evaluation_strategy': 'epoch', 'num_train_epochs': 1, 'learning_rate': 2e-06, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 64, 'save_steps': 10000, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1}
  - Model Perplexity: 3.56 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'evaluation_strategy': 'epoch', 'num_train_epochs': 1, 'learning_rate': 0.0002, 'weight_decay': 0.005, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 64, 'save_steps': 10000, 'save_total_limit': 1, 'prediction_loss_only': True, 'eval_accumulation_steps': 1}
  - Model Perplexity: 3.56 | Params: {'output_dir': '../data/03_models/trainer/', 'overwrite_output_dir': True, 'evaluation_strategy': 'epoch', 'num_train_epochs': 2, 'learning_rate': 2e-06, 'weight_decay

And finally evaluate the results of the final best found model

In [52]:
model=RobertaForMaskedLM.from_pretrained("../data/03_models/us_bert_model")
trainer = Trainer(
                    model=model,
                    eval_dataset=test_dataset,
                    data_collator=data_collator,
                        )
eval_results = trainer.evaluate()
print(f" Best model perplexity on test: {math.exp(eval_results['eval_loss']):.2f}")

 Best mode perplexity: 3.60


## 3. Compare the model results

Aside from looking at the training and eval losses going down, the easiest way to check whether our language model is learning anything interesting is via the `FillMaskPipeline`.

Pipelines are simple wrappers around tokenizers and models, and the 'fill-mask' one will let you input a sequence containing a masked token (here, transformers.pipeline.tokenizer.mask_token) and return a list of the most probable filled sequences, with their probabilities.

The predictions are compared to the predictions outputed from the base model that was used to fine tune this model.

In [53]:
from transformers import pipeline, RobertaForMaskedLM
model=RobertaForMaskedLM.from_pretrained("../data/03_models/eu_bert_model")
eu_model_pipeline = pipeline(
    "fill-mask",
    model=model,
    tokenizer=model_checkpoint
)

model=RobertaForMaskedLM.from_pretrained("../data/03_models/us_bert_model")
us_model_pipeline = pipeline(
    "fill-mask",
    model=model,
    tokenizer=model_checkpoint
)

model = RobertaForMaskedLM.from_pretrained(model_checkpoint)
old_model_pipeline = pipeline(
    "fill-mask",
    model=model,
    tokenizer=model_checkpoint
)

In [61]:
sequence = f"Coronavirus is a very bad <mask>"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': 'Coronavirus is a very bad virus', 'score': 0.2691511809825897, 'token': 6793, 'token_str': ' virus'}
UNITED STATES MODEL 0 token is  {'sequence': 'Coronavirus is a very bad disease', 'score': 0.25462937355041504, 'token': 2199, 'token_str': ' disease'}
OLD MODEL 0 token is            {'sequence': 'Coronavirus is a very bad virus', 'score': 0.7009516954421997, 'token': 6793, 'token_str': ' virus'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': 'Coronavirus is a very bad case', 'score': 0.16394412517547607, 'token': 403, 'token_str': ' case'}
UNITED STATES MODEL 1 token is  {'sequence': 'Coronavirus is a very bad virus', 'score': 0.12848496437072754, 'token': 6793, 'token_str': ' virus'}
OLD MODEL 1 token is            {'sequence': 'Coronavirus is a very bad thing', 'score': 0.1592380851507187, 'token': 631, 'token_str': ' thing'}
--------------------------------
EUROPEAN MODEL 2 token is       {'sequence': 'Cor

In [62]:
sequence = f"Coronavirus is a very <mask> disease"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': 'Coronavirus is a very rare disease', 'score': 0.24148690700531006, 'token': 3159, 'token_str': ' rare'}
UNITED STATES MODEL 0 token is  {'sequence': 'Coronavirus is a very rare disease', 'score': 0.28560319542884827, 'token': 3159, 'token_str': ' rare'}
OLD MODEL 0 token is            {'sequence': 'Coronavirus is a very rare disease', 'score': 0.19685015082359314, 'token': 3159, 'token_str': ' rare'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': 'Coronavirus is a very serious disease', 'score': 0.21718955039978027, 'token': 1473, 'token_str': ' serious'}
UNITED STATES MODEL 1 token is  {'sequence': 'Coronavirus is a very infectious disease', 'score': 0.08091650158166885, 'token': 19166, 'token_str': ' infectious'}
OLD MODEL 1 token is            {'sequence': 'Coronavirus is a very contagious disease', 'score': 0.17108966410160065, 'token': 27432, 'token_str': ' contagious'}
--------------------------------
EU

In [63]:
sequence = f"Covid is a very bad <mask>"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': 'Covid is a very bad disease', 'score': 0.2005854845046997, 'token': 2199, 'token_str': ' disease'}
UNITED STATES MODEL 0 token is  {'sequence': 'Covid is a very bad virus', 'score': 0.35650452971458435, 'token': 6793, 'token_str': ' virus'}
OLD MODEL 0 token is            {'sequence': 'Covid is a very bad guy', 'score': 0.0871887058019638, 'token': 2173, 'token_str': ' guy'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': 'Covid is a very bad virus', 'score': 0.15339580178260803, 'token': 6793, 'token_str': ' virus'}
UNITED STATES MODEL 1 token is  {'sequence': 'Covid is a very bad disease', 'score': 0.10436339676380157, 'token': 2199, 'token_str': ' disease'}
OLD MODEL 1 token is            {'sequence': 'Covid is a very bad dog', 'score': 0.05724494904279709, 'token': 2335, 'token_str': ' dog'}
--------------------------------
EUROPEAN MODEL 2 token is       {'sequence': 'Covid is a very bad drug', 'score': 0.

In [64]:
sequence = f"Covid is a very <mask> disease"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': 'Covid is a very serious disease', 'score': 0.18287736177444458, 'token': 1473, 'token_str': ' serious'}
UNITED STATES MODEL 0 token is  {'sequence': 'Covid is a very rare disease', 'score': 0.2775634825229645, 'token': 3159, 'token_str': ' rare'}
OLD MODEL 0 token is            {'sequence': 'Covid is a very rare disease', 'score': 0.3451760709285736, 'token': 3159, 'token_str': ' rare'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': 'Covid is a very dangerous disease', 'score': 0.17433346807956696, 'token': 2702, 'token_str': ' dangerous'}
UNITED STATES MODEL 1 token is  {'sequence': 'Covid is a very infectious disease', 'score': 0.15055875480175018, 'token': 19166, 'token_str': ' infectious'}
OLD MODEL 1 token is            {'sequence': 'Covid is a very serious disease', 'score': 0.10931653529405594, 'token': 1473, 'token_str': ' serious'}
--------------------------------
EUROPEAN MODEL 2 token is       {'seq

In [65]:
sequence = f"<mask> is a disease"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': ' it is a disease', 'score': 0.14300011098384857, 'token': 24, 'token_str': ' it'}
UNITED STATES MODEL 0 token is  {'sequence': ' there is a disease', 'score': 0.36707645654678345, 'token': 89, 'token_str': ' there'}
OLD MODEL 0 token is            {'sequence': 'Life is a disease', 'score': 0.18752367794513702, 'token': 12116, 'token_str': 'Life'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': ' this is a disease', 'score': 0.0931367352604866, 'token': 42, 'token_str': ' this'}
UNITED STATES MODEL 1 token is  {'sequence': '  is a disease', 'score': 0.28947973251342773, 'token': 1437, 'token_str': ' '}
OLD MODEL 1 token is            {'sequence': 'It is a disease', 'score': 0.0876104012131691, 'token': 243, 'token_str': 'It'}
--------------------------------
EUROPEAN MODEL 2 token is       {'sequence': ' disease is a disease', 'score': 0.08356322348117828, 'token': 2199, 'token_str': ' disease'}
UNITED STATES MO

In [66]:
sequence = f"The <mask> of the coronavirus pandemic are very serious"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': 'The consequences of the coronavirus pandemic are very serious', 'score': 0.7281331419944763, 'token': 4914, 'token_str': ' consequences'}
UNITED STATES MODEL 0 token is  {'sequence': 'The consequences of the coronavirus pandemic are very serious', 'score': 0.5790644288063049, 'token': 4914, 'token_str': ' consequences'}
OLD MODEL 0 token is            {'sequence': 'The consequences of the coronavirus pandemic are very serious', 'score': 0.6111682057380676, 'token': 4914, 'token_str': ' consequences'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': 'The effects of the coronavirus pandemic are very serious', 'score': 0.1942291110754013, 'token': 3038, 'token_str': ' effects'}
UNITED STATES MODEL 1 token is  {'sequence': 'The effects of the coronavirus pandemic are very serious', 'score': 0.3185788691043854, 'token': 3038, 'token_str': ' effects'}
OLD MODEL 1 token is            {'sequence': 'The implications of t

In [67]:
sequence = f"The consequences of the <mask> are very serious"
eu_model_pipeline_results = eu_model_pipeline(sequence)
us_model_pipeline_results = us_model_pipeline(sequence)
old_model_pipeline_results = old_model_pipeline(sequence)

for idx,(eu_prediction, us_prediction, old_prediction) in enumerate(zip(eu_model_pipeline_results, us_model_pipeline_results, old_model_pipeline_results)):
    print(f'EUROPEAN MODEL {idx} token is       {eu_prediction}')
    print(f'UNITED STATES MODEL {idx} token is  {us_prediction}')
    print(f'OLD MODEL {idx} token is            {old_prediction}')
    print('--------------------------------')

EUROPEAN MODEL 0 token is       {'sequence': 'The consequences of the crisis are very serious', 'score': 0.6593908071517944, 'token': 1486, 'token_str': ' crisis'}
UNITED STATES MODEL 0 token is  {'sequence': 'The consequences of the disaster are very serious', 'score': 0.3059934079647064, 'token': 4463, 'token_str': ' disaster'}
OLD MODEL 0 token is            {'sequence': 'The consequences of the attack are very serious', 'score': 0.07320790737867355, 'token': 908, 'token_str': ' attack'}
--------------------------------
EUROPEAN MODEL 1 token is       {'sequence': 'The consequences of the measures are very serious', 'score': 0.0319034680724144, 'token': 1797, 'token_str': ' measures'}
UNITED STATES MODEL 1 token is  {'sequence': 'The consequences of the act are very serious', 'score': 0.09415481239557266, 'token': 1760, 'token_str': ' act'}
OLD MODEL 1 token is            {'sequence': 'The consequences of the decision are very serious', 'score': 0.04880741238594055, 'token': 568, 't