<a href="https://colab.research.google.com/github/ekrombouts/gcai_zuster_fietje/blob/main/notebooks/310_GenCareAIFietjePeftFinetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Zuster Fietje, PEFT finetuning ekrombouts/zuster_fietje

**Author:** Eva Rombouts  
**Date:** 2024-07-28  
**Updated:** 2024-10-18

### Description
This notebook is almost fully copied from: [Optimizing Phi-2: A Deep Dive into Fine-Tuning Small Language Models](https://medium.com/thedeephub/optimizing-phi-2-a-deep-dive-into-fine-tuning-small-language-models-9d545ac90a99), by Praveen Yerneni. Thank you!!
It trains the chat version of [Fietje](https://huggingface.co/BramVanroy/fietje-2-chat), an adapted version of microsoft/phi-2, trained on Dutch texts.

## Setup

In [None]:
# Install the extra packages used by this notebook.
# NOTE(review): transformers, accelerate and torch are imported later but not installed here —
# presumably they are preinstalled on the Colab runtime; confirm.
!pip install -q bitsandbytes flash_attn datasets peft

# When True, the optional diagnostic cells print model/dataset details and a sample generation.
verbose = False

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

from google.colab import drive, runtime
import time
from datasets import load_dataset, Dataset, DatasetDict

from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

# Mount Google Drive; the training checkpoints and logs are written under /content/drive/MyDrive.
drive.mount('/content/drive')


In [None]:
# Hugging Face model id of the checkpoint that will be finetuned.
# Alternative starting point, kept for reference:
# base_model = "BramVanroy/fietje-2-instruct"
base_model = "ekrombouts/zuster_fietje"

# Hub repository name under which the finetuned model and tokenizer are pushed
finetuned_model = "zuster_fietje_peft"

# Commit message attached to the Hub uploads
commit_message = "full finetuned on Gardenia_instruct_dataset for 10 epochs, now PEFT finetuned on Olympia_SAMPC_dataset for 5 epochs"

# Hugging Face dataset id used for finetuning (passed to load_dataset)
path_dataset = "ekrombouts/Olympia_SAMPC_dataset"


## Load model and tokenizer

The model is loaded in `4-bit`, which is the "Quantization" part of QLoRA. The memory footprint of this is much smaller than the default.


In [None]:
# 4-bit NF4 quantization with double quantization; computations run in float16
quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype='float16',
)
bnb_config = BitsAndBytesConfig(**quantization_options)

# Load the quantized base model, letting device_map='auto' decide the placement
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    attn_implementation='flash_attention_2',
    device_map='auto',
    trust_remote_code=True,
)

# Matching tokenizer for the base model
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    trust_remote_code=True,
    add_eos_token=True,
)

# Over-long inputs are cut from the left; the EOS token doubles as padding token
tokenizer.truncation_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

if verbose:
    footprint_gb = model.get_memory_footprint() / 1e9
    print(f"Memory footprint: {footprint_gb} GB")

In [None]:
# Fetch the dataset from the Hub and pick out the two splits used for finetuning
dataset = load_dataset(path_dataset)
train_dataset, val_dataset = dataset['train'], dataset['validation']

if verbose:
    print(dataset)

In [None]:
def create_prompt(row: dict, add_response: bool = True) -> str:
    """
    Build the "Context / Instructie / Antwoord" prompt for one dataset row.

    Args:
        row (dict): Mapping with the keys 'context' and 'instruction'. The key
                    'response' is only read when `add_response` is True.
        add_response (bool): If True, the response is appended on its own line
                             after the "Antwoord:" header. If False, the prompt
                             ends directly after "Antwoord:".

    Returns:
        str: The formatted prompt.
    """
    question_part = f"""Context:
{row['context']}

Instructie:
{row['instruction']}

Antwoord:"""

    # Without a response the prompt stops at the answer header
    if not add_response:
        return question_part

    # With a response the answer follows on its own line
    return question_part + f"\n{row['response']}\n"


In [None]:
# Show a random example of the model's output before training
if verbose:
    import random

    row = random.choice(train_dataset)  # Select a random row from the training dataset
    prompt = create_prompt(row, False)  # Prompt without the reference response
    print(prompt)

    # Tokenize the prompt once; input_ids and attention_mask come from the same encoding
    encoded_prompt = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = encoded_prompt.input_ids
    attention_mask = encoded_prompt.attention_mask

    # Enable the model's cache for faster generation and switch to evaluation mode
    model.config.use_cache = True
    model.eval()

    # Generate a response based on the input prompt
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=150,  # Limit the number of new tokens generated to 150
        do_sample=True,      # Use sampling to introduce randomness into the generation
        top_p=0.95,          # Use nucleus sampling with a probability threshold of 0.95
        top_k=50,            # Consider the top 50 tokens when sampling for each step
        temperature=0.7,     # Set the temperature to 0.7 to control randomness (lower = more conservative)
        num_return_sequences=1,  # Generate only one sequence
        eos_token_id=tokenizer.eos_token_id,  # End the generation when the EOS token is reached
        pad_token_id=tokenizer.eos_token_id   # Use the EOS token for padding
    )

    # Full decoded text (prompt + continuation), kept for inspection
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Decode only the newly generated tokens. Slicing the decoded text by
    # len(prompt) is fragile because decoding does not always reproduce the
    # prompt character-for-character.
    prompt_length = input_ids.shape[1]
    generated_response = tokenizer.decode(
        output[0][prompt_length:],
        skip_special_tokens=True,
    ).strip()

    # Display the generated response and the actual reference response from the dataset
    print("GENERATED RESPONSE:")
    print(generated_response)
    print("\nREFERENCE RESPONSE:")
    print(row['response'])


In [None]:
def collate_and_tokenize(row):
    """
    Tokenizes one dataset sample, or a batch of samples, for training.

    Args:
        row (dict): Either a single sample (column name -> value) or a batch as
                    handed over by `Dataset.map(..., batched=True)` (column
                    name -> list of values). It must provide the columns that
                    `create_prompt` reads.

    Returns:
        dict: The tokenizer output with one row per sample, with keys:
              - 'input_ids': Token IDs, padded/truncated to 2048 tokens.
              - 'attention_mask': Attention mask indicating which tokens should be attended to.
              - 'labels': Copy of 'input_ids', used as prediction targets.
    """
    # In batched mode every column arrives as a list, even with batch_size=1.
    # Formatting such a list directly would put its repr ("['...']") into the
    # prompt, so the batch is split back into one dict per sample first.
    columns = list(row.keys())
    is_batch = bool(columns) and all(
        isinstance(row[column], (list, tuple)) for column in columns
    )
    if is_batch:
        n_samples = len(row[columns[0]])
        samples = [
            {column: row[column][i] for column in columns}
            for i in range(n_samples)
        ]
    else:
        samples = [row]

    # One full prompt (including the response) per sample
    prompts = [create_prompt(row=sample, add_response=True) for sample in samples]

    # Tokenize the prompts and prepare input tensors
    encoded = tokenizer(
        prompts,
        return_tensors="pt",   # Return tensors in PyTorch format
        padding="max_length",  # Pad the input to the maximum length
        truncation=True,       # Truncate inputs that are longer than the max length
        max_length=2048,       # Set the maximum length for input tokens
    )

    # Labels are a copy of the input IDs.
    # NOTE(review): padding positions are part of the labels, so they count
    # towards the loss. Masking them with -100 is the usual remedy, but the pad
    # token is the EOS token here, so masking could also remove the only EOS
    # targets — confirm whether the tokenizer appends EOS before changing this.
    encoded["labels"] = encoded["input_ids"].clone()

    return encoded


In [None]:
# The raw columns are dropped after tokenization; only the tokenizer outputs
# (input_ids, attention_mask) and the added labels remain.
columns_to_remove = ['client_id', 'week', 'context', 'instruction', 'response']

# Both splits are tokenized with identical settings. The samples are passed in
# batches of one because collate_and_tokenize returns tensors with a leading
# batch dimension.
tokenize_options = dict(
    batched=True,
    batch_size=1,
    remove_columns=columns_to_remove,
)

tokenized_dataset_train = train_dataset.map(collate_and_tokenize, **tokenize_options)
tokenized_dataset_val = val_dataset.map(collate_and_tokenize, **tokenize_options)


In [None]:
if verbose:
    # Sanity check: turn one tokenized validation sample back into text
    input_ids = tokenized_dataset_val[1]['input_ids']
    decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
    print(decoded)

In [None]:
# State-dict settings for model and optimizer: offloaded to CPU, not restricted to rank 0
model_state_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=False)
optimizer_state_config = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False)

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=model_state_config,
    optim_state_dict_config=optimizer_state_config,
)

# Accelerator that is later used to prepare the LoRA model
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Sums `numel()` over everything yielded by `model.named_parameters()` and
    reports the trainable count (parameters with `requires_grad` set), the
    total count and the trainable share in percent.
    """
    sizes = [
        (param.numel(), param.requires_grad)
        for _, param in model.named_parameters()
    ]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total:.2f}"
    )

In [None]:
# Parameter counts before the LoRA adapters are attached
if verbose:
    print_trainable_parameters(model)

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# Freeze base model layers and cast layernorm in fp32
# NOTE(review): use_gradient_checkpointing=True is passed here as well, so the
# explicit gradient_checkpointing_enable() call above may be redundant — confirm.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
# Print the module tree; it shows the module names that can be used as LoRA target_modules
if verbose:
    print(model)

In [None]:
# Module names that receive LoRA adapters (print(model) will show the modules to use)
lora_target_modules = ['q_proj', 'k_proj', 'v_proj', 'dense', 'fc1', 'fc2']

config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the prepared base model with the LoRA adapters
lora_model = get_peft_model(model, config)
if verbose:
    print_trainable_parameters(lora_model)

# Hand the adapter model to the accelerator before it goes to the Trainer
lora_model = accelerator.prepare_model(lora_model)

In [None]:
training_args = TrainingArguments(
    # --- Output locations (Google Drive) ---
    output_dir='/content/drive/MyDrive/results_full',  # Checkpoints and outputs
    overwrite_output_dir=True,                         # Overwrite existing content of output_dir
    logging_dir='/content/drive/MyDrive/logs',         # Log files
    report_to='none',
    # --- Batching: 8 samples x 5 accumulation steps per optimizer step and device ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=5,
    # --- Memory savers ---
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    optim="paged_adamw_8bit",  # Paged AdamW with 8-bit optimizer state
    fp16=True,                 # Mixed precision training
    # --- Optimization schedule ---
    num_train_epochs=5,
    learning_rate=5e-5,
    warmup_steps=50,
    weight_decay=0.01,
    # --- Log, evaluate and checkpoint every 50 steps ---
    logging_strategy="steps",
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,           # Limit the total number of checkpoints
    load_best_model_at_end=True,  # Load the best model at the end of training
)

# Trainer over the LoRA model with the tokenized train/validation splits
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
)


In [None]:
# Disable the cache during training to prevent a warning; re-enable it for inference
model.config.use_cache = False

# Measure the wall-clock duration of the complete training run
start_time = time.time()
trainer.train()
end_time = time.time()

# Elapsed training time in seconds
training_time = end_time - start_time

print(f"Training completed in {training_time} seconds.")


In [None]:
# Save the finetuned model and its tokenizer to the same private Hub repository
# so the work is not lost when the runtime is released.
# `token=True` uses the locally stored Hugging Face credentials; it replaces the
# deprecated `use_auth_token=True`.
lora_model.push_to_hub(
    finetuned_model,
    token=True,
    commit_message=commit_message,
    private=True
)

tokenizer.push_to_hub(
    finetuned_model,
    token=True,
    commit_message=commit_message,
    private=True
)

In [None]:
# Terminate the Colab session once everything is saved so no further cost is incurred
runtime.unassign()