<a href="https://colab.research.google.com/github/eduseiti/ia368v_dd_class_05/blob/main/LM_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Causal Language Model (CLM) fine-tuning

This notebook executes the fine-tuning of **facebook/opt-125m** model, over the mc4 pt dataset samples prepared by the `LM_training_dataset_preparation.ipynb` notebook.

In [1]:
!pip install transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
WORKING_FOLDER="drive/MyDrive/unicamp/ia368v_dd/aula_05"

API_KEYS_FILE="/content/drive/MyDrive/unicamp/ia368v_dd/api_keys_20230324.json"

TRAIN_OUTPUT_FOLDER="./trained_model"

NORMALIZED_DATA_BLOCKS_PARTIAL_FILENAME="normalized_samples_block_??.pkl"

In [3]:
import os
from google.colab import drive
import json

In [4]:
drive.mount('/content/drive', force_remount=True)
os.chdir(WORKING_FOLDER)

Mounted at /content/drive


In [5]:
# with open(API_KEYS_FILE) as inputFile:
#     api_keys = json.load(inputFile)

# os.environ["COMET_API_KEY"] = api_keys['comet_ml']
# os.environ["COMET_LOG_ASSETS"] = "True"
# os.environ['COMET_MODE'] = "ONLINE"

In [6]:
# from comet_ml import Experiment

from transformers import (AutoTokenizer, 
                          AutoModelForCausalLM, 
                          Trainer, 
                          TrainingArguments, 
                          TrainerCallback, 
                          get_cosine_with_hard_restarts_schedule_with_warmup)

from multiprocessing import Pool

import pickle

import torch

from tqdm.auto import tqdm

import glob

import numpy as np

from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
device

device(type='cuda')

In [9]:
!nvidia-smi

Wed Mar 29 23:15:26 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8    10W /  70W |      3MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [10]:
MODEL_NAME="facebook/opt-125m"

## Link to Comet ML reporting

In [11]:
# Experiment(api_key=api_keys['comet_ml'], 
#            project_name="causal-language-model-fine-tuning",
#            workspace="eduseiti")

## Get the list of normalized-tokenized samples data blocked_samples

The mc4 pt dataset sample has already been tokenized and size-normalized to 512, which is the model input size.

Each data block contains a list of prepared samples, each of which can be directly fed to the model:

```
    {'input_ids': <list-of-512-tokens>,
     'attention_masks': <list-of-512-attention-masks>}
```





In [12]:
data_blocks = glob.glob(NORMALIZED_DATA_BLOCKS_PARTIAL_FILENAME)

In [13]:
data_blocks

['normalized_samples_block_00.pkl',
 'normalized_samples_block_01.pkl',
 'normalized_samples_block_02.pkl',
 'normalized_samples_block_03.pkl',
 'normalized_samples_block_04.pkl',
 'normalized_samples_block_05.pkl',
 'normalized_samples_block_06.pkl',
 'normalized_samples_block_07.pkl',
 'normalized_samples_block_08.pkl',
 'normalized_samples_block_09.pkl',
 'normalized_samples_block_10.pkl',
 'normalized_samples_block_11.pkl',
 'normalized_samples_block_12.pkl',
 'normalized_samples_block_13.pkl',
 'normalized_samples_block_14.pkl',
 'normalized_samples_block_15.pkl',
 'normalized_samples_block_16.pkl',
 'normalized_samples_block_17.pkl',
 'normalized_samples_block_18.pkl',
 'normalized_samples_block_19.pkl',
 'normalized_samples_block_20.pkl',
 'normalized_samples_block_21.pkl',
 'normalized_samples_block_22.pkl',
 'normalized_samples_block_23.pkl',
 'normalized_samples_block_24.pkl']

Define the dataset class

In [14]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, samples_blocks_filenames, fixed_data_block_index=None, sampling_size=None):
        self.samples_blocks_filenames = samples_blocks_filenames

        if fixed_data_block_index is not None:
            self.current_file_index = fixed_data_block_index
            self.change_file_index = False
        else:
            self.current_file_index = 0
            self.change_file_index = True

        with open(samples_blocks_filenames[self.current_file_index], "rb") as inputFile:
            self.db = pickle.load(inputFile)

        print("Dataset loading samples block {}; change_file_index={}...".format(self.current_file_index, self.change_file_index))

        self.sampling_size = sampling_size
    
        if self.sampling_size is not None:
            self.dataset_size = self.sampling_size
            self.sample_data()
        else:
            self.dataset_size = len(self.db)



    def sample_data(self):
        self.selected_samples = np.random.choice(list(range(len(self.db))), self.sampling_size, replace=False)
        self.sampled_db = [self.db[i] for i in self.selected_samples]

        print("Updating the sampled dataset itens; sample DB size: {}".format(len(self.sampled_db)))



    def update_dataset(self):
        if self.change_file_index:
            self.current_file_index = (self.current_file_index + 1) % len(self.samples_blocks_filenames)

            with open(self.samples_blocks_filenames[self.current_file_index], "rb") as inputFile:
                self.db = pickle.load(inputFile)

            print("Updating dataset loading samples block {}; change_file_index={}...".format(self.current_file_index, self.change_file_index))

        if self.sampling_size is not None:
            self.sample_data()
        else:
            self.dataset_size = len(self.db)



    def __len__(self):
        return self.dataset_size



    def __getitem__(self, idx):

        if self.sampling_size is not None:
            return {'input_ids': self.sampled_db[idx]['input_ids'],
                    'attention_mask': self.sampled_db[idx]['attention_mask'],
                    'labels': self.sampled_db[idx]['input_ids'].copy()}
        else:
            return {'input_ids': self.db[idx]['input_ids'],
                    'attention_mask': self.db[idx]['attention_mask'],
                    'labels': self.db[idx]['input_ids'].copy()}

Create a callback to update the datasets and save a checkpoint of the best epoch yet.

In [15]:
class CustomTrainerCallback(TrainerCallback):

    def __init__(self, best_validation_yet=99999, model=None, train_dataset=None, eval_dataset=None) -> None:
        super().__init__()

        self.best_validation_loss = best_validation_yet
        self.model = model
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset


    def on_epoch_end(self, args, state, control, train_dataloader=None, eval_dataloader=None, **kwargs):
        self.train_dataset.update_dataset()
        self.eval_dataset.update_dataset()


    def on_evaluate(self, args, state, control, model=None, metrics=None, **kwargs):
        # print(metrics.keys())

        try:
            perplexity = np.exp(metrics["eval_loss"])
        except OverflowError:
            perplexity = float("inf")

        metrics['perplexity'] = perplexity

        # print("perplexity={}".format(metrics['perplexity']))

        if metrics['eval_loss'] < self.best_validation_loss:
            self.model.save_pretrained(os.path.join(TRAIN_OUTPUT_FOLDER, 
                                                    "checkpoint-{}-{:.4f}".format(state.global_step,
                                                                                  metrics['eval_loss'])))
            self.best_validation_loss = metrics['eval_loss']

Prepare the data

In [16]:
train_dataset = Dataset(data_blocks[6:-1])#, sampling_size=5000)

Dataset loading samples block 0; change_file_index=True...


In [None]:
eval_dataset = Dataset(data_blocks, len(data_blocks) - 1, sampling_size=3000)

Dataset loading samples block 24; change_file_index=False...
Updating the sampled dataset itens; sample DB size: 3000


Prepare the model

In [None]:
model = AutoModelForCausalLM.from_pretrained("trained_model/checkpoint-after-05-complete-data-blocks").to(device)
print('Parameters', model.num_parameters())

Parameters 125239296


Prepare the trainer

In [None]:
batch_size=12
epochs=19

In [None]:
training_params = TrainingArguments(output_dir=TRAIN_OUTPUT_FOLDER,
                                    num_train_epochs=epochs,
                                    per_device_train_batch_size=batch_size,
                                    per_device_eval_batch_size=batch_size,
                                    evaluation_strategy='epoch',
                                    save_strategy='steps',
                                    save_steps=10000,
                                    logging_strategy='steps',
                                    logging_steps=10,
                                    save_total_limit=10,
                                    # report_to='comet_ml',
                                    dataloader_num_workers=2,
                                    dataloader_pin_memory=False,
                                    fp16=True)

In [None]:
trainer_callback = CustomTrainerCallback(best_validation_yet=1.946620, 
                                         model=model, 
                                         train_dataset=train_dataset, 
                                         eval_dataset=eval_dataset)

In [None]:
num_training_steps = epochs * len(train_dataset)

optimzer = torch.optim.AdamW(model.parameters(), lr=5e-6, weight_decay=1e-3)
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimzer, 
                                                               num_training_steps * 0.1, 
                                                               num_training_steps, 
                                                               num_cycles=80)

In [None]:
trainer = Trainer(model=model,
                  args=training_params,
                  train_dataset=train_dataset,
                  eval_dataset=eval_dataset,
                  callbacks=[trainer_callback],
                  optimizers=(optimzer, scheduler)
                 )

In [None]:
train_result = trainer.train()

Epoch,Training Loss,Validation Loss,Unnamed: 3
1,2.0542,1.975033,7.206854
2,2.0019,1.991585,7.327138
3,2.1393,1.990324,7.317907
4,1.9967,2.006491,7.437178
5,2.0364,1.980272,7.244715
6,1.9502,1.985172,7.280299
7,2.0456,1.991627,7.327446
8,1.9708,1.990485,7.319085
9,1.9699,1.992419,7.333249
10,1.9787,1.983041,7.264799


Updating dataset loading samples block 1; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 2; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 3; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 4; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 5; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 6; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 7; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 8; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000


Epoch,Training Loss,Validation Loss,Unnamed: 3
1,2.0542,1.975033,7.206854
2,2.0019,1.991585,7.327138
3,2.1393,1.990324,7.317907
4,1.9967,2.006491,7.437178
5,2.0364,1.980272,7.244715
6,1.9502,1.985172,7.280299
7,2.0456,1.991627,7.327446
8,1.9708,1.990485,7.319085
9,1.9699,1.992419,7.333249
10,1.9787,1.983041,7.264799


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-24-a6eca412ee9e>", line 1, in <module>
    train_result = trainer.train()
  File "/usr/local/lib/python3.9/dist-packages/transformers/trainer.py", line 1633, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.9/dist-packages/transformers/trainer.py", line 1979, in _inner_training_loop
    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
  File "/usr/local/lib/python3.9/dist-packages/transformers/trainer.py", line 2224, in _maybe_log_save_evaluate
    self.log(logs)
  File "/usr/local/lib/python3.9/dist-packages/transformers/trainer.py", line 2558, in log
    self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
  File "/usr/local/lib/python3.9/dist-packages/transformers/trainer_callback.

FailedPreconditionError: ignored

# Just evaluate against the entire evaluation dataset

In [17]:
eval_dataset = Dataset(data_blocks, len(data_blocks) - 1)

Dataset loading samples block 24; change_file_index=False...


In [18]:
len(eval_dataset)

36869

In [19]:
model = AutoModelForCausalLM.from_pretrained("trained_model/checkpoint-50000").to(device)
print('Parameters', model.num_parameters())

Parameters 125239296


Prepare the trainer

In [20]:
batch_size=4
epochs=1

In [21]:
training_params = TrainingArguments(output_dir=TRAIN_OUTPUT_FOLDER,
                                    num_train_epochs=epochs,
                                    per_device_train_batch_size=batch_size,
                                    per_device_eval_batch_size=batch_size,
                                    evaluation_strategy='epoch',
                                    save_strategy='steps',
                                    save_steps=10000,
                                    logging_strategy='steps',
                                    logging_steps=10,
                                    save_total_limit=10,
                                    # report_to='comet_ml',
                                    dataloader_num_workers=2,
                                    dataloader_pin_memory=False,
                                    fp16=True)

In [22]:
trainer_callback = CustomTrainerCallback(best_validation_yet=1.94, 
                                         model=model, 
                                         train_dataset=train_dataset, 
                                         eval_dataset=eval_dataset)

In [23]:
num_training_steps = epochs * len(train_dataset)

optimzer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-3)
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimzer, 
                                                               num_training_steps * 0.1, 
                                                               num_training_steps, 
                                                               num_cycles=80)

In [24]:
trainer = Trainer(model=model,
                  args=training_params,
                  train_dataset=train_dataset,
                  eval_dataset=eval_dataset,
                  callbacks=[trainer_callback],
                  optimizers=(optimzer, scheduler)
                 )

In [25]:
evaluation_result = trainer.evaluate()

In [26]:
evaluation_result

{'eval_loss': 1.9811569452285767,
 'eval_runtime': 641.3556,
 'eval_samples_per_second': 57.486,
 'eval_steps_per_second': 14.373,
 'perplexity': 7.251127291255752}