In [None]:
!pip install -r requirements.txt

## Pretrain dataset text creation 
These commands convert the CSV input file into `.txt` files ready to be used in the pre-training stages. You can create a single file or a version already split into train and test partitions (use the `--split` flag).

In [23]:
import os
# Folders holding the generated text datasets, one per stage:
# NSP (also used for MLM+NSP), MLM-only, and fine-tuning.
nsp_data_folder, mlm_only_data_folder, finetune_data_folder = (
    'data/nsp',
    'data/mlm_only',
    'data/finetune',
)

In [None]:
# Build the text dataset shared by the `nsp` and `mlm_nsp` pre-training runs.
# Files are written to nsp_data_folder; --split requests separate train/test
# partitions instead of a single file.
!python3 bert_medical_records/preprocessing_python/text_generator.py \
    --file_path bert_medical_records/data/trajectories_training_set.csv \
    --output_folder {nsp_data_folder} \
    --create_nsp_class_text_data \
    --split

In [None]:
# Build the MLM-only text dataset consumed by the `mlm` pre-training runs.
# Files are written to mlm_only_data_folder; --split requests separate
# train/test partitions instead of a single file.
!python3 bert_medical_records/preprocessing_python/text_generator.py \
    --file_path bert_medical_records/data/trajectories_training_set.csv \
    --output_folder {mlm_only_data_folder} \
    --create_mlm_only_dataset \
    --split

In [None]:
# Build the fine-tuning text dataset read by every run_glue.py call below.
# Files are written to finetune_data_folder; --split requests separate
# train/test partitions instead of a single file.
!python3 bert_medical_records/preprocessing_python/text_generator.py \
    --file_path bert_medical_records/data/trajectories_training_set.csv \
    --output_folder {finetune_data_folder} \
    --create_finetuning_text_data \
    --split

In [None]:
# COMPLETE GRID
#
# Full search space: the training hyperparameters (pre-training task, learning
# rate, warm-up, scheduler, batch size, epochs, sequence length) together with
# the model-architecture sizes (hidden size, layers, heads, intermediate size).
grid = dict(
    already_pretrained=[True, False],
    pre_train_tasks=['mlm_nsp', 'nsp', 'mlm'],
    learning_rate=[1e-5, 3e-5, 5e-5],
    warmup_step=[500, 1000],
    max_seq_length=[128, 256, 512],
    type_of_scheduler=['constant_with_warmup', 'linear'],
    batch_size=[16, 32],
    num_epochs=[16, 32],
    hidden_size=[512, 1024],
    num_hidden_layers=[18, 24],
    num_attention_heads=[8, 16],
    intermediate_size=[2048, 4096],
)

# Training grid
This grid varies only the hyperparameters of the training procedure; the model architecture is left unchanged.

In [15]:
# TRAINING GRID
#
# Search space restricted to training hyperparameters; no key here touches
# the model architecture.
grid = dict(
    pre_train_tasks=['mlm_nsp', 'nsp'],
    learning_rate=[3e-5, 5e-5],
    warmup_step=[1000],
    max_seq_length=[128, 512],
    type_of_scheduler=['constant_with_warmup', 'linear'],
    batch_size=[16, 32],
    num_epochs=[16, 32],
)

## Run PRETRAIN & FINETUNING
This set of commands runs pre-training and fine-tuning of the BERT model on the selected dataset. The fine-tuning step serves as an evaluation of the performance of the pre-trained model.

In [19]:
from datetime import datetime

### NSP_&_MLM
- `batch_size = 16`
- `num_epochs = 16`
- `learning_rate = 3e-5`
- `scheduler = constant_with_warmup && linear`
- `max_seq_length = 128`
- `use_pretrained = False && True`

In [None]:
# Run: mlm_nsp pre-training | scheduler constant_with_warmup | lr 3e-5 |
# batch 16 | 16 epochs | max_seq_length 128 | --use_pretrained_bert not set.
# The minute-resolution timestamp names this run's output directory; the same
# path is then passed to run_glue.py as --model_input.
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 16 \
    --input_file {nsp_data_folder} \
    --num_epochs 16 \
    --pre_train_tasks mlm_nsp \
    --learning_rate 3e-5 \
    --scheduler_name constant_with_warmup \
    --num_warmup_step 1000 \
    --max_seq_length 128 \
    --output_dir output/{current_time}
# Fine-tune from the directory written above (8 epochs, batch 16).
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 8 \
    --train_batch_size 16

In [None]:
# Run: mlm_nsp pre-training | scheduler linear | lr 3e-5 |
# batch 16 | 16 epochs | max_seq_length 128 | --use_pretrained_bert not set.
# The minute-resolution timestamp names this run's output directory; the same
# path is then passed to run_glue.py as --model_input.
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 16 \
    --input_file {nsp_data_folder} \
    --num_epochs 16 \
    --pre_train_tasks mlm_nsp \
    --learning_rate 3e-5 \
    --scheduler_name linear \
    --num_warmup_step 1000 \
    --max_seq_length 128 \
    --output_dir output/{current_time}
# Fine-tune from the directory written above (8 epochs, batch 16).
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 8 \
    --train_batch_size 16

In [None]:
# Run: mlm_nsp pre-training | scheduler linear | lr 3e-5 |
# batch 16 | 16 epochs | max_seq_length 128 | --use_pretrained_bert set.
# The minute-resolution timestamp names this run's output directory; the same
# path is then passed to run_glue.py as --model_input.
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 16 \
    --input_file {nsp_data_folder} \
    --num_epochs 16 \
    --pre_train_tasks mlm_nsp \
    --learning_rate 3e-5 \
    --scheduler_name linear \
    --num_warmup_step 1000 \
    --max_seq_length 128 \
    --use_pretrained_bert \
    --output_dir output/{current_time}
# Fine-tune from the directory written above (8 epochs, batch 16).
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 8 \
    --train_batch_size 16

- `batch_size = 32`
- `num_epochs = 32`
- `learning_rate = 5e-5`
- `scheduler = constant_with_warmup`
- `max_seq_length = 256 && 512`
- `use_pretrained = False && True`

In [None]:
# Run: mlm_nsp pre-training | scheduler constant_with_warmup | lr 5e-5 |
# batch 32 | 32 epochs | max_seq_length 256 | --use_pretrained_bert not set.
# The minute-resolution timestamp names this run's output directory; the same
# path is then passed to run_glue.py as --model_input.
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 32 \
    --input_file {nsp_data_folder} \
    --num_epochs 32 \
    --pre_train_tasks mlm_nsp \
    --learning_rate 5e-5 \
    --scheduler_name constant_with_warmup \
    --num_warmup_step 1000 \
    --max_seq_length 256 \
    --output_dir output/{current_time}
# Fine-tune from the directory written above (16 epochs, batch 32).
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 16 \
    --train_batch_size 32

In [None]:
# Run: mlm_nsp pre-training | scheduler constant_with_warmup | lr 5e-5 |
# batch 32 | 32 epochs | max_seq_length 256 | --use_pretrained_bert set.
# The minute-resolution timestamp names this run's output directory; the same
# path is then passed to run_glue.py as --model_input.
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 32 \
    --input_file {nsp_data_folder} \
    --num_epochs 32 \
    --pre_train_tasks mlm_nsp \
    --learning_rate 5e-5 \
    --scheduler_name constant_with_warmup \
    --num_warmup_step 1000 \
    --max_seq_length 256 \
    --use_pretrained_bert \
    --output_dir output/{current_time}
# Fine-tune from the directory written above (16 epochs, batch 32).
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 16 \
    --train_batch_size 32

In [None]:
# Run: mlm_nsp pre-training | scheduler constant_with_warmup | lr 5e-5 |
# batch 32 | 32 epochs | max_seq_length 512 | --use_pretrained_bert not set.
# The minute-resolution timestamp names this run's output directory; the same
# path is then passed to run_glue.py as --model_input.
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 32 \
    --input_file {nsp_data_folder} \
    --num_epochs 32 \
    --pre_train_tasks mlm_nsp \
    --learning_rate 5e-5 \
    --scheduler_name constant_with_warmup \
    --num_warmup_step 1000 \
    --max_seq_length 512 \
    --output_dir output/{current_time}
# Fine-tune from the directory written above (16 epochs, batch 32).
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 16 \
    --train_batch_size 32

### NSP_ONLY
- `batch_size = 16`
- `num_epochs = 16`
- `learning_rate = 3e-5`
- `scheduler = constant_with_warmup && linear`
- `max_seq_length = 128`
- `use_pretrained = False && True`

In [None]:
# Run: nsp pre-training | scheduler constant_with_warmup | lr 3e-5 |
# batch 16 | 16 epochs | max_seq_length 128 | --use_pretrained_bert not set.
# The minute-resolution timestamp names this run's output directory; the same
# path is then passed to run_glue.py as --model_input.
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 16 \
    --input_file {nsp_data_folder} \
    --num_epochs 16 \
    --pre_train_tasks nsp \
    --learning_rate 3e-5 \
    --scheduler_name constant_with_warmup \
    --num_warmup_step 1000 \
    --max_seq_length 128 \
    --output_dir output/{current_time}
# Fine-tune from the directory written above (8 epochs, batch 16).
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 8 \
    --train_batch_size 16

In [None]:
# Run: nsp pre-training | scheduler linear | lr 3e-5 |
# batch 16 | 16 epochs | max_seq_length 128 | --use_pretrained_bert not set.
# The minute-resolution timestamp names this run's output directory; the same
# path is then passed to run_glue.py as --model_input.
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 16 \
    --input_file {nsp_data_folder} \
    --num_epochs 16 \
    --pre_train_tasks nsp \
    --learning_rate 3e-5 \
    --scheduler_name linear \
    --num_warmup_step 1000 \
    --max_seq_length 128 \
    --output_dir output/{current_time}
# Fine-tune from the directory written above (8 epochs, batch 16).
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 8 \
    --train_batch_size 16

In [None]:
# Run: nsp pre-training | scheduler linear | lr 3e-5 |
# batch 16 | 16 epochs | max_seq_length 128 | --use_pretrained_bert set.
# The minute-resolution timestamp names this run's output directory; the same
# path is then passed to run_glue.py as --model_input.
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 16 \
    --input_file {nsp_data_folder} \
    --num_epochs 16 \
    --pre_train_tasks nsp \
    --learning_rate 3e-5 \
    --scheduler_name linear \
    --num_warmup_step 1000 \
    --max_seq_length 128 \
    --use_pretrained_bert \
    --output_dir output/{current_time}
# Fine-tune from the directory written above (8 epochs, batch 16).
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 8 \
    --train_batch_size 16

- `batch_size = 32`
- `num_epochs = 32`
- `learning_rate = 5e-5`
- `scheduler = constant_with_warmup`
- `max_seq_length = 256 && 512`
- `use_pretrained = False && True`

In [None]:
# Run: nsp pre-training | scheduler constant_with_warmup | lr 5e-5 |
# batch 32 | 32 epochs | max_seq_length 256 | --use_pretrained_bert not set.
# The minute-resolution timestamp names this run's output directory; the same
# path is then passed to run_glue.py as --model_input.
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 32 \
    --input_file {nsp_data_folder} \
    --num_epochs 32 \
    --pre_train_tasks nsp \
    --learning_rate 5e-5 \
    --scheduler_name constant_with_warmup \
    --num_warmup_step 1000 \
    --max_seq_length 256 \
    --output_dir output/{current_time}
# Fine-tune from the directory written above (16 epochs, batch 32).
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 16 \
    --train_batch_size 32

In [None]:
# Run: nsp pre-training | scheduler constant_with_warmup | lr 5e-5 |
# batch 32 | 32 epochs | max_seq_length 256 | --use_pretrained_bert set.
# The minute-resolution timestamp names this run's output directory; the same
# path is then passed to run_glue.py as --model_input.
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 32 \
    --input_file {nsp_data_folder} \
    --num_epochs 32 \
    --pre_train_tasks nsp \
    --learning_rate 5e-5 \
    --scheduler_name constant_with_warmup \
    --num_warmup_step 1000 \
    --max_seq_length 256 \
    --use_pretrained_bert \
    --output_dir output/{current_time}
# Fine-tune from the directory written above (16 epochs, batch 32).
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 16 \
    --train_batch_size 32

In [None]:
# Run: nsp pre-training | scheduler constant_with_warmup | lr 5e-5 |
# batch 32 | 32 epochs | max_seq_length 512 | --use_pretrained_bert not set.
# The minute-resolution timestamp names this run's output directory; the same
# path is then passed to run_glue.py as --model_input.
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 32 \
    --input_file {nsp_data_folder} \
    --num_epochs 32 \
    --pre_train_tasks nsp \
    --learning_rate 5e-5 \
    --scheduler_name constant_with_warmup \
    --num_warmup_step 1000 \
    --max_seq_length 512 \
    --output_dir output/{current_time}
# Fine-tune from the directory written above (16 epochs, batch 32).
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 16 \
    --train_batch_size 32

### MLM_ONLY
- `batch_size = 16`
- `num_epochs = 16`
- `learning_rate = 3e-5`
- `scheduler = constant_with_warmup && linear`
- `max_seq_length = 128`
- `use_pretrained = False && True`

In [None]:
# Run: mlm pre-training on the MLM-only dataset | scheduler constant_with_warmup |
# lr 3e-5 | batch 16 | 16 epochs | max_seq_length 128 | --use_pretrained_bert not set.
# The minute-resolution timestamp names this run's output directory; the same
# path is then passed to run_glue.py as --model_input.
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 16 \
    --input_file {mlm_only_data_folder} \
    --num_epochs 16 \
    --pre_train_tasks mlm \
    --learning_rate 3e-5 \
    --scheduler_name constant_with_warmup \
    --num_warmup_step 1000 \
    --max_seq_length 128 \
    --output_dir output/{current_time}
# Fine-tune from the directory written above (8 epochs, batch 16).
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 8 \
    --train_batch_size 16

In [None]:
# Run: mlm pre-training on the MLM-only dataset | scheduler linear |
# lr 3e-5 | batch 16 | 16 epochs | max_seq_length 128 | --use_pretrained_bert not set.
# The minute-resolution timestamp names this run's output directory; the same
# path is then passed to run_glue.py as --model_input.
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 16 \
    --input_file {mlm_only_data_folder} \
    --num_epochs 16 \
    --pre_train_tasks mlm \
    --learning_rate 3e-5 \
    --scheduler_name linear \
    --num_warmup_step 1000 \
    --max_seq_length 128 \
    --output_dir output/{current_time}
# Fine-tune from the directory written above (8 epochs, batch 16).
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 8 \
    --train_batch_size 16

In [None]:
# Run: mlm pre-training on the MLM-only dataset | scheduler linear |
# lr 3e-5 | batch 16 | 16 epochs | max_seq_length 128 | --use_pretrained_bert set.
# The minute-resolution timestamp names this run's output directory; the same
# path is then passed to run_glue.py as --model_input.
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 16 \
    --input_file {mlm_only_data_folder} \
    --num_epochs 16 \
    --pre_train_tasks mlm \
    --learning_rate 3e-5 \
    --scheduler_name linear \
    --num_warmup_step 1000 \
    --max_seq_length 128 \
    --use_pretrained_bert \
    --output_dir output/{current_time}
# Fine-tune from the directory written above (8 epochs, batch 16).
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 8 \
    --train_batch_size 16

- `batch_size = 32`
- `num_epochs = 32`
- `learning_rate = 5e-5`
- `scheduler = constant_with_warmup`
- `max_seq_length = 256 && 512`
- `use_pretrained = False && True`

In [None]:
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 32 \
    --input_file {mlm_only_data_folder} \
    --num_epochs 32 \
    --pre_train_tasks mlm_nsp \
    --learning_rate 5e-5 \
    --scheduler_name constant_with_warmup \
    --num_warmup_step 1000 \
    --max_seq_length 256 \
    --output_dir output/{current_time}
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 16 \
    --train_batch_size 32

In [None]:
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 32 \
    --input_file {mlm_only_data_folder} \
    --num_epochs 32 \
    --pre_train_tasks mlm_nsp \
    --learning_rate 5e-5 \
    --scheduler_name constant_with_warmup \
    --num_warmup_step 1000 \
    --max_seq_length 256 \
    --use_pretrained_bert \
    --output_dir output/{current_time}
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 16 \
    --train_batch_size 32

In [None]:
current_time = datetime.now().strftime("%d-%m-%Y_%H-%M")
!python3 run_pre_train.py \
    --do_train \
    --do_eval \
    --train_batch_size 32 \
    --input_file {mlm_only_data_folder} \
    --num_epochs 32 \
    --pre_train_tasks mlm_nsp \
    --learning_rate 5e-5 \
    --scheduler_name constant_with_warmup \
    --num_warmup_step 1000 \
    --max_seq_length 512 \
    --output_dir output/{current_time}
!python3 run_glue.py \
    --input_file {finetune_data_folder} \
    --model_input output/{current_time} \
    --do_train \
    --do_eval \
    --num_epochs 16 \
    --train_batch_size 32

# Architecture grid
This grid changes the model architecture. After the first grid search, it can be used for a second search that tries to improve the best model found.

In [46]:
# ARCHITECTURE GRID
#
# Search space over the model-architecture sizes only. Intended for an
# optional second grid search that refines the best model found by the
# previous (training) grid search.
grid = dict(
    hidden_size=[512, 1024],
    num_hidden_layers=[18, 24],
    num_attention_heads=[8, 16],
    intermediate_size=[2048, 4096],
)