In [1]:
# !pip install peft -U

# Supervised Fine Tuning

In [24]:
import json
import time
import uuid
from pathlib import Path

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_from_disk
from peft import LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

tqdm.pandas()


# --- Run configuration: every value a reader may want to tune for an experiment. ---
TRAIN_BATCH_SIZE = 2
LEARNING_RATE = 1e-3
LORA_PARAM_R = 16
LORA_PARAM_ALPHA = 32
# Names of the attention projection modules that receive LoRA adapters, per checkpoint.
LORA_PARAM_TARGET_MODULES = {
    "bigscience/mt0-small": ["q", "v"],
    "microsoft/phi-1_5": ["q_proj", "v_proj"],
}
# Name of the floating point precision; it is turned into PRECISION by
# precision_enumerator once the nyx helpers are imported.
PRECISION_NAME = 'float32'

# Pick the best accelerator that is actually present instead of assuming Apple
# Metal, so the notebook also runs on CUDA machines and on CPU-only hosts.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

CHOSEN_MODEL = "microsoft/phi-1_5"  # "bigscience/mt0-small" #"google/flan-t5-large"
# When True the dataset splits are cut down to a handful of rows for a quick dry run.
TESTING = True
# Fresh identifier per run; it is substituted into the output path templates.
RUN_ID = uuid.uuid4().hex
print(RUN_ID)


print(f"Model: {CHOSEN_MODEL} will be trained on device: {DEVICE}.")

a9688be2d1414c5d9e0fb99dbe9ad30c
Model: microsoft/phi-1_5 will be trained on device: mps.


### Developed utility functions
- For details see [Bath github link](https://github.bath.ac.uk/gt566/ai-msc-dissertation/blob/dissertation-experienced-ft/nyx/dissertation/utils.py)

In [25]:
from nyx.utils import (
    precision_enumerator,
    download_and_save_reddit_data,
    round_dictionary_values,
    get_task_type,
    print_number_of_trainable_model_parameters,
)

from nyx.constants import (
    SFT_DATA_OUTPUT_PATH,
    COMMON_OUTPUT_PATHS,
    SFT_OUTPUT_DIR,
    SFT_PEFT_MERGED_MODEL_PATH,
    SFT_PEFT_ADAPTER_PATH,
    METRICS_PATH,
)


# Fill in the path templates imported from nyx.constants for this run.
# COMMON_OUTPUT_PATHS is resolved first (and only once) because every other
# template takes the resolved value as its COMMON_OUTPUT_PATHS field.
COMMON_OUTPUT_PATHS = COMMON_OUTPUT_PATHS.format(RUN_ID=RUN_ID)
METRICS_PATH = METRICS_PATH.format(COMMON_OUTPUT_PATHS=COMMON_OUTPUT_PATHS)
SFT_OUTPUT_DIR = SFT_OUTPUT_DIR.format(COMMON_OUTPUT_PATHS=COMMON_OUTPUT_PATHS)
SFT_PEFT_ADAPTER_PATH = SFT_PEFT_ADAPTER_PATH.format(
    COMMON_OUTPUT_PATHS=COMMON_OUTPUT_PATHS
)
SFT_PEFT_MERGED_MODEL_PATH = SFT_PEFT_MERGED_MODEL_PATH.format(
    COMMON_OUTPUT_PATHS=COMMON_OUTPUT_PATHS
)


# Turn the configured precision name into the value passed as torch_dtype below.
PRECISION = precision_enumerator(PRECISION_NAME)
PRECISION

torch.float32

## Load model and data

In [26]:
# Attempt the encoder-decoder loader first; a checkpoint it rejects with a
# ValueError is loaded through the causal language model loader instead.
try:
    original_model = AutoModelForSeq2SeqLM.from_pretrained(
        CHOSEN_MODEL,
        torch_dtype=PRECISION,
    )
except ValueError:
    original_model = AutoModelForCausalLM.from_pretrained(
        CHOSEN_MODEL,
        torch_dtype=PRECISION,
    )

tokenizer = AutoTokenizer.from_pretrained(CHOSEN_MODEL)

# Place the model on the configured device before any training or generation.
original_model = original_model.to(torch.device(DEVICE))
print("Push models and data to GPU for efficiency.")

parameter_report = print_number_of_trainable_model_parameters(original_model)
print(parameter_report)

Push models and data to GPU for efficiency.
trainable model parameters: 1418270720
all model parameters: 1418270720
percentage of trainable model parameters: 100.00%


In [27]:
# The filtered Reddit data is fetched only when its directory is not on disk yet;
# afterwards it is always read back from SFT_DATA_OUTPUT_PATH.
filtered_reddit_summarisation_data = Path(SFT_DATA_OUTPUT_PATH)
needs_download = not filtered_reddit_summarisation_data.is_dir()
if needs_download:
    print("Downloading and saving filtered reddit data.")
    download_and_save_reddit_data()

dataset = load_from_disk(SFT_DATA_OUTPUT_PATH)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'subreddit', 'title', 'post', 'summary'],
        num_rows: 116722
    })
    test: Dataset({
        features: ['id', 'subreddit', 'title', 'post', 'summary'],
        num_rows: 6553
    })
    validation: Dataset({
        features: ['id', 'subreddit', 'title', 'post', 'summary'],
        num_rows: 6447
    })
})

In [28]:
# Dry-run mode: keep only the first few rows of every split so the whole notebook
# executes quickly. Sizes are clamped to the split length, so a split that is
# already smaller than its target is kept whole instead of raising.
TESTING_SPLIT_SIZES = {"train": 10, "test": 50, "validation": 50}

if TESTING:
    for split_name, n_rows in TESTING_SPLIT_SIZES.items():
        n_rows = min(n_rows, len(dataset[split_name]))
        dataset[split_name] = dataset[split_name].select(range(n_rows))
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'subreddit', 'title', 'post', 'summary'],
        num_rows: 10
    })
    test: Dataset({
        features: ['id', 'subreddit', 'title', 'post', 'summary'],
        num_rows: 50
    })
    validation: Dataset({
        features: ['id', 'subreddit', 'title', 'post', 'summary'],
        num_rows: 50
    })
})

In [38]:
# When the tokenizer defines no padding token, reuse its end-of-sequence token so
# that padding="max_length" can be applied.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(example):
    """Build summarisation prompts for a batch and tokenize prompts and summaries.

    Written for ``Dataset.map(..., batched=True)``: ``example`` maps column names
    to lists with one entry per row.

    Args:
        example: Batch holding at least a "post" and a "summary" column.

    Returns:
        The same batch with three added columns: "check" (the prompt text),
        "input_ids" (token ids of the prompt) and "labels" (token ids of the
        summary, with padding positions replaced by -100).
    """
    start_prompt = "Summarize the following reddit post.\n\n"
    end_prompt = "\n\nSummary: "
    prompt = [start_prompt + post + end_prompt for post in example["post"]]
    # Keep the rendered prompt next to the ids so it can be inspected by eye.
    example['check'] = prompt

    # Plain Python lists are returned and nothing is moved to an accelerator:
    # Dataset.map serialises the returned columns, so a device placement made
    # here would not persist, and it would fail on hosts without that device.
    example["input_ids"] = tokenizer(
        prompt, padding="max_length", truncation=True
    ).input_ids
    label_ids = tokenizer(
        example["summary"], padding="max_length", truncation=True
    ).input_ids

    # -100 is the index ignored by the Hugging Face loss, so padded label
    # positions no longer contribute to the training loss.
    # NOTE(review): when pad_token was set to eos_token, a genuine trailing EOS
    # in a summary is masked as well - confirm this is acceptable.
    pad_id = tokenizer.pad_token_id
    example["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in row]
        for row in label_ids
    ]

    return example


# dataset holds three splits (train, validation, test); map() runs
# tokenize_function over each of them in batches. The raw text columns listed
# below are dropped straight afterwards.
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(
    ["id", "subreddit", "post", "summary"]
)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [39]:
# Report (rows, columns) for each tokenized split, then the full DatasetDict.
print("Shapes of the datasets:")
for label, split in (
    ("Training", "train"),
    ("Validation", "validation"),
    ("Test", "test"),
):
    print(f"{label}: {tokenized_datasets[split].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (10, 4)
Validation: (50, 4)
Test: (50, 4)
DatasetDict({
    train: Dataset({
        features: ['title', 'check', 'input_ids', 'labels'],
        num_rows: 10
    })
    test: Dataset({
        features: ['title', 'check', 'input_ids', 'labels'],
        num_rows: 50
    })
    validation: Dataset({
        features: ['title', 'check', 'input_ids', 'labels'],
        num_rows: 50
    })
})


## Train PEFT adapter

In [31]:
# Print the module tree to find the layers LoRA can be applied to. Adapting only the
# attention query and value projections is the most basic setup according to the LoRA
# paper; their module names depend on the checkpoint (see LORA_PARAM_TARGET_MODULES).
print(original_model)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2048, out_features=8192, bias=True)
          (fc2): Linear(in_features=8192, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((2048,

In [34]:
# LoRA settings for the adapter trained in this notebook.
lora_config = LoraConfig(
    task_type=get_task_type(model=original_model),
    # Layers that are fine-tuned (see the architecture printed above). Query and
    # value projections only: the simplest scenario from the original paper.
    target_modules=LORA_PARAM_TARGET_MODULES[CHOSEN_MODEL],
    # Size of the LoRA matrices: an (x, r) times (r, y) product gives an (x, y) update.
    r=LORA_PARAM_R,
    # Scaling coefficient. The paper notes it matters because the adjustments are
    # small compared to the rest of the model.
    lora_alpha=LORA_PARAM_ALPHA,
    lora_dropout=0.05,
    bias="none",
)

In [35]:
# Wrap the base model with the adapters described by lora_config and report how many
# parameters are trainable afterwards.
# NOTE(review): PEFT documents get_peft_model as modifying the passed model in place -
# confirm original_model is not relied on as an untouched baseline after this cell.
peft_model = get_peft_model(original_model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3145728
all model parameters: 1421416448
percentage of trainable model parameters: 0.22%


In [40]:
# Training configuration for the LoRA adapter; checkpoints and logs are written
# under SFT_OUTPUT_DIR.
peft_training_args = TrainingArguments(
    output_dir=SFT_OUTPUT_DIR,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    learning_rate=LEARNING_RATE,  # Higher learning rate than full fine-tuning.
    save_steps=5_000,
    logging_steps=1,
    # One pass over the training split: number of rows divided by the batch size.
    # Clamped to at least 1, because a non-positive max_steps makes the Trainer
    # fall back to its epoch-based default instead of this single pass.
    max_steps=max(1, len(tokenized_datasets["train"]) // TRAIN_BATCH_SIZE),
)

peft_trainer = Trainer(
    model=peft_model.to(
        torch.device(DEVICE)
    ),  # Important to train on Mac Chip GPU equivalent
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

max_steps is given, it will override any value given in num_train_epochs


In [41]:
# Time one training run, then write the adapter weights and the tokenizer to the
# same adapter directory.
start = time.time()
peft_trainer.train()
end = time.time()

duration = end - start
print(end)
print(f"Training for 1 epoch took {round(duration, 2)} seconds to execute.")

for saveable in (peft_trainer.model, tokenizer):
    saveable.save_pretrained(SFT_PEFT_ADAPTER_PATH)

2024/06/04 20:51:13 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id 80eb83e0af1a4bf0ab11ac93f29c33a3: Failed to log run data: Exception: Changing param values is not allowed. Param with key='logging_dir' was already logged with value='./experiments/a9688be2d1414c5d9e0fb99dbe9ad30c/models/supervised-fine-tuning/events/runs/Jun04_20-49-38_Owners-iMac.local' for run ID='80eb83e0af1a4bf0ab11ac93f29c33a3'. Attempted logging new value './experiments/a9688be2d1414c5d9e0fb99dbe9ad30c/models/supervised-fine-tuning/events/runs/Jun04_20-51-12_Owners-iMac.local'.


RuntimeError: MPS backend out of memory (MPS allocated: 13.06 GB, other allocations: 147.45 MB, max allowed: 13.57 GB). Tried to allocate 1024.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

## Load PEFT adapter

In [15]:
# Historical note: older checkpoints needed an adapter_config.json added to the
# folder and pytorch_model.bin renamed to adapter_model.bin before they could load.

# Reload a fresh base model with the same seq2seq-then-causal fallback that is used
# for original_model, so causal checkpoints do not fail here with a ValueError.
try:
    trained_model = AutoModelForSeq2SeqLM.from_pretrained(
        CHOSEN_MODEL, torch_dtype=PRECISION
    )
except ValueError:
    trained_model = AutoModelForCausalLM.from_pretrained(
        CHOSEN_MODEL, torch_dtype=PRECISION
    )

# Attach the adapter saved under SFT_PEFT_ADAPTER_PATH to the fresh base model.
peft_checkpoint_model = PeftModel.from_pretrained(trained_model, SFT_PEFT_ADAPTER_PATH)

peft_config = PeftConfig.from_pretrained(SFT_PEFT_ADAPTER_PATH)
# to initiate with random weights
# NOTE(review): adding an adapter with init_lora_weights=False after the saved
# adapter was loaded looks like it may replace the trained weights with random
# ones - confirm this is intended before trusting the evaluation below.
peft_config.init_lora_weights = False
trained_model.add_adapter(peft_config)
trained_model.enable_adapters()
trained_model.to(torch.device(DEVICE))
# print("ok")

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): lora.Linear(
                (base_layer): Linear(in_features=512, out_features=384, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=512, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=384, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k): Linear(in_features=512, out_features=384, bias=False)
       

## _Comparing PEFT and Baseline model generations (with ROUGE)_

In [18]:
# %timeit quantitative_comparison(peft_model)

from nyx.evaluation import quantitative_comparison

In [20]:
# Evaluate on 15% of the test split (rounded down).
N_EVAL_SAMPLES = int(len(tokenized_datasets['test']) * 0.15)

# Both models are generated with exactly the same settings.
comparison_kwargs = {
    "n_samples_to_evaluate": N_EVAL_SAMPLES,
    "batch_size": 2,
    "device": DEVICE,
}

start = time.time()
peft_checkpoint_generation = quantitative_comparison(
    trained_model, dataset, tokenizer, **comparison_kwargs
)
# NOTE(review): original_model was passed to get_peft_model earlier, which PEFT
# documents as modifying the model in place - confirm it is still the un-adapted
# baseline at this point.
baseline_model_generation = quantitative_comparison(
    original_model, dataset, tokenizer, **comparison_kwargs
)
end = time.time()

duration = end - start
print(
    f"Evaluating N={N_EVAL_SAMPLES} samples took {round(duration, 2)} seconds to execute."
)

# Human-written reference summaries for the same leading test rows.
human_baseline_answer = dataset["test"][0:N_EVAL_SAMPLES]["summary"]

zipped_summaries = list(
    zip(human_baseline_answer, peft_checkpoint_generation, baseline_model_generation)
)

df = pd.DataFrame(
    zipped_summaries,
    columns=[
        "human_baseline_answer",
        "peft_checkpoint_generation",
        "baseline_model_generation",
    ],
)
print(df.shape)
# Last expression of the cell, so the preview is actually rendered (a bare
# df.head() in the middle of a cell is evaluated and then discarded).
df.head()

Evaluating N=7 samples took 21.3 seconds to execute.
(7, 3)


In [21]:
# Score both sets of generations against the human summaries with ROUGE, using
# identical options, and round the scores for display.
rouge = evaluate.load("rouge")
rouge_options = {"use_aggregator": True, "use_stemmer": True}

original_model_results = round_dictionary_values(
    rouge.compute(
        predictions=baseline_model_generation,
        references=human_baseline_answer[: len(baseline_model_generation)],
        **rouge_options,
    )
)
peft_model_results = round_dictionary_values(
    rouge.compute(
        predictions=peft_checkpoint_generation,
        references=human_baseline_answer[: len(peft_checkpoint_generation)],
        **rouge_options,
    )
)

for heading, scores in (
    ("ORIGINAL MODEL:", original_model_results),
    ("PEFT MODEL:", peft_model_results),
):
    print(heading)
    print(scores)

ORIGINAL MODEL:
{'rouge1': 0.02, 'rouge2': 0.0, 'rougeL': 0.02, 'rougeLsum': 0.02}
PEFT MODEL:
{'rouge1': 0.09, 'rouge2': 0.03, 'rougeL': 0.09, 'rougeLsum': 0.09}


In [27]:
# Display the metrics directory that the results file is written to below.
METRICS_PATH

'./experiments/{RUN_ID}/metrics'

In [29]:
# Write the ROUGE scores of both models to <METRICS_PATH>/sft-results.json.
# Path.mkdir creates missing parent directories and is a no-op when the directory
# already exists, so no separate existence check (and no `os` import) is needed.
metrics_dir = Path(METRICS_PATH)
metrics_dir.mkdir(parents=True, exist_ok=True)

data_path = f'{METRICS_PATH}/sft-results.json'

results_dict = {
    'baseline-model': original_model_results,
    'sft-model': peft_model_results,
}
with open(data_path, 'w') as file:
    json.dump(results_dict, file)

In [22]:
print("Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL")

# Pair the scores by metric name rather than by position, so the subtraction stays
# correct even if the two result dictionaries list their metrics in a different
# order; a metric missing from the baseline raises a KeyError instead of being
# silently misaligned.
improvement = np.array(
    [peft_model_results[key] - original_model_results[key] for key in peft_model_results]
)
for key, value in zip(peft_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL
rouge1: 7.00%
rouge2: 3.00%
rougeL: 7.00%
rougeLsum: 7.00%


## Merge and save peft model (with base model)
So that it can be loaded as a Reward Model.

In [30]:
# Merge the adapter into its base model and save the result to
# SFT_PEFT_MERGED_MODEL_PATH.
# NOTE(review): the tokenizer is not saved next to the merged weights here - confirm
# whether whatever loads this path needs it.
model = peft_model.merge_and_unload()
model.save_pretrained(SFT_PEFT_MERGED_MODEL_PATH)

# END