# Causal Language Model (CLM) fine-tuning

This notebook executes the fine-tuning of **facebook/opt-125m** model, over the mc4 pt dataset samples prepared by the `LM_training_dataset_preparation.ipynb` notebook.

In [1]:
!pip install transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m69.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
WORKING_FOLDER="drive/MyDrive/unicamp/ia368v_dd/aula_05"

API_KEYS_FILE="/content/drive/MyDrive/unicamp/ia368v_dd/api_keys_20230324.json"

TRAIN_OUTPUT_FOLDER="./trained_model_2048"

NORMALIZED_DATA_BLOCKS_PARTIAL_FILENAME="normalized_samples_block_2048_??.pkl"

In [3]:
import os
from google.colab import drive
import json

In [4]:
drive.mount('/content/drive', force_remount=True)
os.chdir(WORKING_FOLDER)

Mounted at /content/drive


In [5]:
# with open(API_KEYS_FILE) as inputFile:
#     api_keys = json.load(inputFile)

# os.environ["COMET_API_KEY"] = api_keys['comet_ml']
# os.environ["COMET_LOG_ASSETS"] = "True"
# os.environ['COMET_MODE'] = "ONLINE"

In [6]:
# from comet_ml import Experiment

from transformers import (AutoTokenizer, 
                          AutoModelForCausalLM, 
                          Trainer, 
                          TrainingArguments, 
                          TrainerCallback, 
                          get_cosine_with_hard_restarts_schedule_with_warmup)

from multiprocessing import Pool

import pickle

import torch

from tqdm.auto import tqdm

import glob

import numpy as np

from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
device

device(type='cuda')

In [9]:
!nvidia-smi

Wed Mar 29 23:02:19 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    45W / 400W |      3MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [10]:
MODEL_NAME="facebook/opt-125m"

## Link to Comet ML reporting

In [11]:
# Experiment(api_key=api_keys['comet_ml'], 
#            project_name="causal-language-model-fine-tuning",
#            workspace="eduseiti")

## Get the list of normalized-tokenized samples data blocked_samples

The mc4 pt dataset sample has already been tokenized and size-normalized to 2048, which is the model input size.

Each data block contains a list of prepared samples, each of which can be directly fed to the model:

```
    {'input_ids': <list-of-2048-tokens>,
     'attention_masks': <list-of-2048-attention-masks>}
```





In [12]:
data_blocks = glob.glob(NORMALIZED_DATA_BLOCKS_PARTIAL_FILENAME)

In [13]:
data_blocks

['normalized_samples_block_2048_00.pkl',
 'normalized_samples_block_2048_01.pkl',
 'normalized_samples_block_2048_02.pkl',
 'normalized_samples_block_2048_03.pkl',
 'normalized_samples_block_2048_04.pkl',
 'normalized_samples_block_2048_05.pkl',
 'normalized_samples_block_2048_06.pkl',
 'normalized_samples_block_2048_07.pkl',
 'normalized_samples_block_2048_08.pkl',
 'normalized_samples_block_2048_09.pkl',
 'normalized_samples_block_2048_10.pkl',
 'normalized_samples_block_2048_11.pkl',
 'normalized_samples_block_2048_12.pkl',
 'normalized_samples_block_2048_13.pkl',
 'normalized_samples_block_2048_14.pkl',
 'normalized_samples_block_2048_15.pkl',
 'normalized_samples_block_2048_16.pkl',
 'normalized_samples_block_2048_17.pkl',
 'normalized_samples_block_2048_18.pkl',
 'normalized_samples_block_2048_19.pkl',
 'normalized_samples_block_2048_20.pkl',
 'normalized_samples_block_2048_21.pkl',
 'normalized_samples_block_2048_22.pkl',
 'normalized_samples_block_2048_23.pkl',
 'normalized_sam

Define the dataset class

In [14]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, samples_blocks_filenames, fixed_data_block_index=None, sampling_size=None):
        self.samples_blocks_filenames = samples_blocks_filenames

        if fixed_data_block_index is not None:
            self.current_file_index = fixed_data_block_index
            self.change_file_index = False
        else:
            self.current_file_index = 0
            self.change_file_index = True

        with open(samples_blocks_filenames[self.current_file_index], "rb") as inputFile:
            self.db = pickle.load(inputFile)

        print("Dataset loading samples block {}; change_file_index={}...".format(self.current_file_index, self.change_file_index))

        self.sampling_size = sampling_size
    
        if self.sampling_size is not None:
            self.dataset_size = self.sampling_size
            self.sample_data()
        else:
            self.dataset_size = len(self.db)



    def sample_data(self):
        self.selected_samples = np.random.choice(list(range(len(self.db))), self.sampling_size, replace=False)
        self.sampled_db = [self.db[i] for i in self.selected_samples]

        print("Updating the sampled dataset itens; sample DB size: {}".format(len(self.sampled_db)))



    def update_dataset(self):
        if self.change_file_index:
            self.current_file_index = (self.current_file_index + 1) % len(self.samples_blocks_filenames)

            with open(self.samples_blocks_filenames[self.current_file_index], "rb") as inputFile:
                self.db = pickle.load(inputFile)

            print("Updating dataset loading samples block {}; change_file_index={}...".format(self.current_file_index, self.change_file_index))

        if self.sampling_size is not None:
            self.sample_data()
        else:
            self.dataset_size = len(self.db)



    def __len__(self):
        return self.dataset_size



    def __getitem__(self, idx):

        if self.sampling_size is not None:
            return {'input_ids': self.sampled_db[idx]['input_ids'],
                    'attention_mask': self.sampled_db[idx]['attention_mask'],
                    'labels': self.sampled_db[idx]['input_ids'].copy()}
        else:
            return {'input_ids': self.db[idx]['input_ids'],
                    'attention_mask': self.db[idx]['attention_mask'],
                    'labels': self.db[idx]['input_ids'].copy()}

Create a callback to update the datasets and save a checkpoint of the best epoch yet.

In [15]:
class CustomTrainerCallback(TrainerCallback):

    def __init__(self, best_validation_yet=99999, model=None, train_dataset=None, eval_dataset=None) -> None:
        super().__init__()

        self.best_validation_loss = best_validation_yet
        self.model = model
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset


    def on_epoch_end(self, args, state, control, train_dataloader=None, eval_dataloader=None, **kwargs):
        self.train_dataset.update_dataset()
        self.eval_dataset.update_dataset()


    def on_evaluate(self, args, state, control, model=None, metrics=None, **kwargs):
        # print(metrics.keys())

        try:
            perplexity = np.exp(metrics["eval_loss"])
        except OverflowError:
            perplexity = float("inf")

        metrics['perplexity'] = perplexity

        # print("perplexity={}".format(metrics['perplexity']))

        if metrics['eval_loss'] < self.best_validation_loss:
            self.model.save_pretrained(os.path.join(TRAIN_OUTPUT_FOLDER, 
                                                    "checkpoint-{}-{:.4f}".format(state.global_step,
                                                                                  metrics['eval_loss'])))
            self.best_validation_loss = metrics['eval_loss']

Prepare the data

In [27]:
train_dataset = Dataset(data_blocks[:-1])#, sampling_size=3000)

Dataset loading samples block 0; change_file_index=True...


In [None]:
eval_dataset = Dataset(data_blocks, len(data_blocks) - 1, sampling_size=3000)

Dataset loading samples block 24; change_file_index=False...
Updating the sampled dataset itens; sample DB size: 3000


Prepare the model

In [16]:
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
print('Parameters', model.num_parameters())

Downloading (…)lve/main/config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Parameters 125239296


Prepare the trainer

In [17]:
batch_size=4
epochs=24

In [18]:
training_params = TrainingArguments(output_dir=TRAIN_OUTPUT_FOLDER,
                                    num_train_epochs=epochs,
                                    per_device_train_batch_size=batch_size,
                                    per_device_eval_batch_size=batch_size,
                                    evaluation_strategy='epoch',
                                    save_strategy='steps',
                                    save_steps=10000,
                                    logging_strategy='steps',
                                    logging_steps=10,
                                    save_total_limit=10,
                                    # report_to='comet_ml',
                                    dataloader_num_workers=2,
                                    dataloader_pin_memory=False,
                                    fp16=True)

In [None]:
trainer_callback = CustomTrainerCallback(best_validation_yet=999999, 
                                         model=model, 
                                         train_dataset=train_dataset, 
                                         eval_dataset=eval_dataset)

In [None]:
num_training_steps = epochs * len(train_dataset)

optimzer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-3)
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimzer, 
                                                               num_training_steps * 0.1, 
                                                               num_training_steps, 
                                                               num_cycles=80)

In [None]:
trainer = Trainer(model=model,
                  args=training_params,
                  train_dataset=train_dataset,
                  eval_dataset=eval_dataset,
                  callbacks=[trainer_callback],
                  optimizers=(optimzer, scheduler)
                 )

In [None]:
train_result = trainer.train()

Epoch,Training Loss,Validation Loss,Unnamed: 3
1,1.5739,1.536855,4.649945
2,1.582,1.457014,4.293122
3,1.4895,1.405658,4.078208
4,1.4134,1.403137,4.06794
5,1.6132,1.38387,3.990313
6,1.4654,1.380773,3.977976
7,1.4063,1.38314,3.987402
8,1.4429,1.365949,3.919441
9,1.4538,1.368323,3.928757
10,1.3829,1.381435,3.980608


Updating dataset loading samples block 1; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 2; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 3; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 4; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 5; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 6; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 7; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 8; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000


Epoch,Training Loss,Validation Loss,Unnamed: 3
1,1.5739,1.536855,4.649945
2,1.582,1.457014,4.293122
3,1.4895,1.405658,4.078208
4,1.4134,1.403137,4.06794
5,1.6132,1.38387,3.990313
6,1.4654,1.380773,3.977976
7,1.4063,1.38314,3.987402
8,1.4429,1.365949,3.919441
9,1.4538,1.368323,3.928757
10,1.3829,1.381435,3.980608


Updating dataset loading samples block 15; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 16; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 17; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 18; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 19; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 20; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 21; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating dataset loading samples block 22; change_file_index=True...
Updating the sampled dataset itens; sample DB siz

# Just evaluate against the entire evaluation dataset

In [20]:
eval_dataset = Dataset(data_blocks, len(data_blocks) - 1)

Dataset loading samples block 24; change_file_index=False...


In [21]:
model = AutoModelForCausalLM.from_pretrained("trained_model_2048/checkpoint-89685-1.2310").to(device)
print('Parameters', model.num_parameters())

Parameters 125239296


Prepare the trainer

In [22]:
batch_size=4
epochs=1

In [23]:
training_params = TrainingArguments(output_dir=TRAIN_OUTPUT_FOLDER,
                                    num_train_epochs=epochs,
                                    per_device_train_batch_size=batch_size,
                                    per_device_eval_batch_size=batch_size,
                                    evaluation_strategy='epoch',
                                    save_strategy='steps',
                                    save_steps=10000,
                                    logging_strategy='steps',
                                    logging_steps=10,
                                    save_total_limit=10,
                                    # report_to='comet_ml',
                                    dataloader_num_workers=2,
                                    dataloader_pin_memory=False,
                                    fp16=True)

In [28]:
trainer_callback = CustomTrainerCallback(best_validation_yet=1.230991, 
                                         model=model, 
                                         train_dataset=train_dataset, 
                                         eval_dataset=eval_dataset)

In [29]:
num_training_steps = epochs * len(train_dataset)

optimzer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-3)
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimzer, 
                                                               num_training_steps * 0.1, 
                                                               num_training_steps, 
                                                               num_cycles=80)

In [31]:
trainer = Trainer(model=model,
                  args=training_params,
                  train_dataset=train_dataset,
                  eval_dataset=eval_dataset,
                  callbacks=[trainer_callback],
                  optimizers=(optimzer, scheduler)
                 )

In [32]:
evaluation_result = trainer.evaluate()

In [33]:
evaluation_result

{'eval_loss': 1.2417575120925903,
 'eval_runtime': 471.2032,
 'eval_samples_per_second': 31.12,
 'eval_steps_per_second': 7.78,
 'perplexity': 3.4616920872759387}