<a href="https://colab.research.google.com/github/eduseiti/ia368v_dd_class_05/blob/main/LM_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Causal Language Model (CLM) fine-tuning

This notebook fine-tunes the **facebook/opt-125m** model on the mC4 Portuguese (pt) dataset samples prepared by the `LM_training_dataset_preparation.ipynb` notebook.

In [1]:
!pip install transformers -q

In [2]:
# Folder holding the prepared data blocks. It is absolute (like API_KEYS_FILE)
# because the notebook os.chdir()s into it: a relative path would stop
# resolving if the mount/chdir cell were executed a second time.
WORKING_FOLDER = "/content/drive/MyDrive/unicamp/ia368v_dd/aula_05"

# JSON file with service API keys (only read by the commented-out Comet ML setup).
API_KEYS_FILE = "/content/drive/MyDrive/unicamp/ia368v_dd/api_keys_20230324.json"

# Output directory handed to TrainingArguments; relative to the working folder.
TRAIN_OUTPUT_FOLDER = "./trained_model"

# Glob pattern matching the pickled, tokenized sample block files.
NORMALIZED_DATA_BLOCKS_PARTIAL_FILENAME = "normalized_samples_block_*"

In [3]:
import os
from google.colab import drive
import json

In [4]:
# Attach Google Drive to the Colab runtime, then make the course folder the
# current directory so the data blocks can be addressed by bare file name.
drive.mount(mountpoint='/content/drive', force_remount=True)
os.chdir(WORKING_FOLDER)

Mounted at /content/drive


In [5]:
# with open(API_KEYS_FILE) as inputFile:
#     api_keys = json.load(inputFile)

# os.environ["COMET_API_KEY"] = api_keys['comet_ml']
# os.environ["COMET_LOG_ASSETS"] = "True"
# os.environ['COMET_MODE'] = "ONLINE"

In [6]:
# from comet_ml import Experiment

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, default_data_collator, TrainerCallback

from multiprocessing import Pool

import pickle

import torch

from tqdm.auto import tqdm

import glob

import numpy as np

from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

In [7]:
# Train on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [8]:
# Display the selected device as the cell output.
device

device(type='cuda')

In [9]:
# Show the GPU model, driver version and memory usage of the runtime.
!nvidia-smi

Sun Mar 26 00:03:37 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8    10W /  70W |      3MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [10]:
# Hugging Face Hub identifier of the pretrained checkpoint to fine-tune.
MODEL_NAME = "facebook/opt-125m"

## Link to Comet ML reporting

In [11]:
# Experiment(api_key=api_keys['comet_ml'], 
#            project_name="causal-language-model-fine-tuning",
#            workspace="eduseiti")

## Get the list of normalized, tokenized sample data blocks

The mC4 pt dataset sample has already been tokenized and size-normalized to 512 tokens, which is the input sequence length used for fine-tuning the model.

Each data block contains a list of prepared samples, each of which can be directly fed to the model:

```
    {'input_ids': <list-of-512-tokens>,
     'attention_mask': <list-of-512-attention-mask-values>}
```





In [12]:
# glob returns matches in arbitrary order. The train/eval split below is made
# by position (all blocks but the last vs. the last), so sort the names to make
# the split deterministic.
data_blocks = sorted(glob.glob(NORMALIZED_DATA_BLOCKS_PARTIAL_FILENAME))

In [13]:
# Display the list of block files found by the glob above.
data_blocks

['normalized_samples_block_00.pkl',
 'normalized_samples_block_01.pkl',
 'normalized_samples_block_02.pkl',
 'normalized_samples_block_03.pkl',
 'normalized_samples_block_04.pkl',
 'normalized_samples_block_05.pkl',
 'normalized_samples_block_06.pkl',
 'normalized_samples_block_07.pkl',
 'normalized_samples_block_08.pkl',
 'normalized_samples_block_09.pkl',
 'normalized_samples_block_10.pkl',
 'normalized_samples_block_11.pkl',
 'normalized_samples_block_12.pkl',
 'normalized_samples_block_13.pkl',
 'normalized_samples_block_14.pkl',
 'normalized_samples_block_15.pkl',
 'normalized_samples_block_16.pkl',
 'normalized_samples_block_17.pkl',
 'normalized_samples_block_18.pkl',
 'normalized_samples_block_19.pkl',
 'normalized_samples_block_20.pkl',
 'normalized_samples_block_21.pkl',
 'normalized_samples_block_22.pkl',
 'normalized_samples_block_23.pkl',
 'normalized_samples_block_24.pkl']

Define the dataset class

In [14]:
class Dataset(torch.utils.data.Dataset):
    """Serve pre-tokenized samples from pickled data blocks, one block at a time.

    Each block file is unpickled into `self.db`, an indexable collection of
    samples; every sample is read through its 'input_ids' and 'attention_mask'
    entries. Only one block is kept in memory. `update_dataset()` optionally
    moves to the next block and/or draws a fresh random subset of it.

    Args:
        samples_blocks_filenames: paths of the pickled block files.
        fixed_data_block_index: when given, always use this block and never
            rotate; when None, start at block 0 and advance to the next block
            (wrapping around) on every `update_dataset()` call.
        sampling_size: when given, expose only a random subset of at most this
            many samples of the current block, drawn without replacement;
            when None, expose the whole block.
    """

    def __init__(self, samples_blocks_filenames, fixed_data_block_index=None, sampling_size=None):
        self.samples_blocks_filenames = samples_blocks_filenames

        if fixed_data_block_index is not None:
            self.current_file_index = fixed_data_block_index
            self.change_file_index = False
        else:
            self.current_file_index = 0
            self.change_file_index = True

        self._load_current_block()

        print("Dataset loading samples block {}; change_file_index={}...".format(self.current_file_index, self.change_file_index))

        self.sampling_size = sampling_size
        self._refresh_view()



    def _load_current_block(self):
        """Unpickle the block at `current_file_index` into `self.db`."""
        # NOTE(review): pickle.load executes arbitrary code from the file; only
        # point this class at block files produced by a trusted source.
        with open(self.samples_blocks_filenames[self.current_file_index], "rb") as inputFile:
            self.db = pickle.load(inputFile)



    def _refresh_view(self):
        """Recompute what the dataset exposes for the currently loaded block."""
        if self.sampling_size is not None:
            self.sample_data()
        else:
            self.dataset_size = len(self.db)



    def sample_data(self):
        """Draw a new random subset of the current block into `self.sampled_db`."""
        # Clamp to the block size: choice(replace=False) raises ValueError when
        # asked for more items than the population holds.
        sample_count = min(self.sampling_size, len(self.db))

        self.selected_samples = np.random.choice(len(self.db), sample_count, replace=False)
        self.sampled_db = [self.db[i] for i in self.selected_samples]

        # Keep __len__ consistent with the subset actually drawn.
        self.dataset_size = len(self.sampled_db)

        print("Updating the sampled dataset itens; sample DB size: {}".format(len(self.sampled_db)))



    def update_dataset(self):
        """Advance to the next block (unless pinned) and refresh the exposed samples."""
        if self.change_file_index:
            self.current_file_index = (self.current_file_index + 1) % len(self.samples_blocks_filenames)

            self._load_current_block()

            print("Updating dataset loading samples block {}; change_file_index={}...".format(self.current_file_index, self.change_file_index))

        self._refresh_view()



    def __len__(self):
        """Return the number of samples currently exposed."""
        return self.dataset_size



    def __getitem__(self, idx):
        """Return sample `idx` as a dict with 'input_ids', 'attention_mask' and 'labels'.

        'labels' is a copy of 'input_ids', so the returned dict never aliases
        the same object under both keys.
        """
        source = self.sampled_db if self.sampling_size is not None else self.db
        sample = source[idx]

        return {'input_ids': sample['input_ids'],
                'attention_mask': sample['attention_mask'],
                'labels': sample['input_ids'].copy()}

Create a callback to update the datasets

In [15]:
class DatasetUpdaterCallback(TrainerCallback):
    """Trainer callback that refreshes the datasets at the end of every epoch.

    Args:
        *datasets: objects exposing an `update_dataset()` method. When none are
            given, the callback falls back to the notebook-level
            `train_dataset` and `eval_dataset` variables, looked up when the
            epoch ends.
    """

    def __init__(self, *datasets):
        self.datasets = datasets

    def on_epoch_end(self, args, state, control, **kwargs):
        """Call `update_dataset()` on every tracked dataset."""
        # An empty tuple means "use the notebook globals" (the original behavior).
        targets = self.datasets or (train_dataset, eval_dataset)

        for dataset in targets:
            dataset.update_dataset()

Prepare the data

In [16]:
# Training data: every block except the last one, exposing a random subset of
# 3000 samples of the current block.
train_dataset = Dataset(samples_blocks_filenames=data_blocks[:-1], sampling_size=3000)

Dataset loading samples block 0; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000


In [17]:
# Evaluation data: pinned to the last block (never rotated), exposing a random
# subset of 1000 samples.
eval_dataset = Dataset(samples_blocks_filenames=data_blocks,
                       fixed_data_block_index=len(data_blocks) - 1,
                       sampling_size=1000)

Dataset loading samples block 24; change_file_index=False...
Updating the sampled dataset itens; sample DB size: 1000


Prepare the model

In [18]:
# Load the pretrained causal LM, then move its weights to the selected device.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model = model.to(device)

# Report the model size.
print('Parameters', model.num_parameters())

Parameters 125239296


Prepare the trainer

In [19]:
# Per-device batch size, shared by training and evaluation.
batch_size = 14

In [20]:
training_params = TrainingArguments(
    output_dir=TRAIN_OUTPUT_FOLDER,

    # Schedule and batching
    num_train_epochs=100,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,

    # Optimization
    optim='adamw_torch',
    learning_rate=2e-4,
    weight_decay=1e-2,
    fp16=True,

    # Evaluate and checkpoint once per epoch, keeping at most 10 checkpoints
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=10,

    # Logging
    logging_strategy='steps',
    logging_steps=10,
    # report_to='comet_ml',

    # Data loading
    dataloader_num_workers=4,
    dataloader_pin_memory=False,
)

In [21]:
# Instantiate the callback that updates the datasets at the end of each epoch.
trainer_callback = DatasetUpdaterCallback()

In [22]:
# Wire the model, arguments, datasets and the dataset-refresh callback together.
trainer = Trainer(
    model=model,
    args=training_params,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[trainer_callback],
)

In [None]:
# Run the fine-tuning loop; the returned object is kept in train_result.
train_result = trainer.train()

Epoch,Training Loss,Validation Loss
1,2.7945,2.785501


Updating dataset loading samples block 1; change_file_index=True...
Updating the sampled dataset itens; sample DB size: 3000
Updating the sampled dataset itens; sample DB size: 1000
