<a href="https://colab.research.google.com/github/ekrombouts/gcai_zuster_fietje/blob/main/notebooks/322_fietje_finetuning_instruct.ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finetuning Fietje Instruct

**Author:** Eva Rombouts  
**Date:** 2024-10-14  

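### Description

This notebook finetunes the `BramVanroy/fietje-2-instruct` model on the `ekrombouts/Gardenia_instruct_dataset` dataset using the Hugging Face `Trainer`, stores checkpoints on Google Drive, and pushes the finetuned model and tokenizer to the Hugging Face Hub.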


In [None]:
!pip install -q transformers datasets

# Toggle for the optional diagnostic output in the cells below (memory footprint,
# dataset summary, sample generation before training, trainable-parameter count).
# Set to False to skip those steps.
verbose = True

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_dataset, Dataset, DatasetDict
import time
from google.colab import runtime, drive

# Mount Google Drive at /content/drive; the checkpoint and log directories
# configured further down live under /content/drive/MyDrive.
drive.mount('/content/drive')


In [None]:
# Hugging Face identifier of the pretrained model that serves as the starting point
base_model = "BramVanroy/fietje-2-instruct"

# Hugging Face identifier of the instruction dataset used for finetuning
path_dataset = "ekrombouts/Gardenia_instruct_dataset"

# Repository name under which the finetuned model is saved
finetuned_model = "zuster_fietje"

# Commit message for the upload, derived from the two identifiers above
commit_message = f"Finetuned {base_model} on {path_dataset}"

In [None]:
# Instantiate the causal language model in bfloat16 (to save memory) and let
# `device_map='auto'` decide on which device(s) the weights are placed.
model_load_kwargs = {"torch_dtype": torch.bfloat16, "device_map": 'auto'}
model = AutoModelForCausalLM.from_pretrained(base_model, **model_load_kwargs)

# Fetch the tokenizer that belongs to the base model and reuse the
# end-of-sequence token as padding token (avoids missing-pad-token warnings).
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token

if verbose:
    # Report how much memory the loaded weights occupy, in GB
    footprint_gb = model.get_memory_footprint() / 1e9
    print(f"Memory footprint: {footprint_gb} GB")

In [None]:
# Download the dataset and pick out the splits used for training and validation
dataset = load_dataset(path_dataset)
train_dataset, val_dataset = dataset['train'], dataset['validation']

# Optionally show the available splits, columns and row counts
if verbose:
    print(dataset)

In [None]:
def create_prompt(row: dict, add_response: bool = True) -> str:
    """
    Generates a prompt based on the input data in 'row'.

    The prompt consists of a 'Context:' section, an 'Instructie:' section and a
    single 'Antwoord:' header. With add_response=True the reference response is
    placed directly under that header, so the training prompt is exactly the
    inference prompt followed by the response.

    Args:
        row (dict): A dictionary containing 'context', 'instruction', and
                    (when add_response is True) 'response'.
        add_response (bool): If True, the prompt will include the 'response'.
                             If False, only 'context' and 'instruction' are included
                             and the prompt ends right after the 'Antwoord:' header.

    Returns:
        str: The generated prompt in text format.

    Raises:
        KeyError: If a required key is missing from 'row'.
    """
    # Base prompt (without response); it already ends with the 'Antwoord:' header
    prompt = (
        f"Context:\n{row['context']}\n\n"
        f"Instructie:\n{row['instruction']}\n\n"
        "Antwoord:\n"
    )

    # Append only the response text: repeating the 'Antwoord:' header here would
    # make the training format differ from the format used at inference time.
    if add_response:
        prompt += f"{row['response']}\n"

    return prompt


In [None]:
# Show a random example of the model's output before training
if verbose:
    import random

    row = random.choice(train_dataset)  # Select a random row from the training dataset
    prompt = create_prompt(row, False)  # Prompt without the reference response
    print(prompt)

    # Tokenize the prompt once and move both tensors to the model's device
    encoded_prompt = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = encoded_prompt.input_ids
    attention_mask = encoded_prompt.attention_mask

    # Enable the model's cache for faster generation and switch to evaluation mode
    model.config.use_cache = True
    model.eval()

    # Generate a response based on the input prompt
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=150,  # Limit the number of new tokens generated to 150
        do_sample=True,      # Use sampling to introduce randomness into the generation
        top_p=0.95,          # Use nucleus sampling with a probability threshold of 0.95
        top_k=50,            # Consider the top 50 tokens when sampling for each step
        temperature=0.7,     # Set the temperature to 0.7 to control randomness (lower = more conservative)
        num_return_sequences=1,  # Generate only one sequence
        eos_token_id=tokenizer.eos_token_id,  # End the generation when the EOS token is reached
        pad_token_id=tokenizer.eos_token_id   # Use the EOS token for padding
    )

    # Full decoded sequence (prompt + continuation)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Isolate the continuation by dropping the prompt *tokens*. Slicing the decoded
    # text by len(prompt) is fragile because decoding does not always reproduce the
    # prompt character-for-character.
    prompt_length = input_ids.shape[-1]
    generated_response = tokenizer.decode(
        output[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Display the generated response and the actual reference response from the dataset
    print("GENERATED RESPONSE:")
    print(generated_response)
    print("\nREFERENCE RESPONSE:")
    print(row['response'])

In [None]:
def collate_and_tokenize(row):
    """
    Tokenizes and prepares dataset samples for training.

    The function accepts either a single example (each field is a scalar) or a
    batch as produced by `Dataset.map(..., batched=True)` (each field is a list).
    In the batched case one prompt is built per example; formatting the lists
    directly would embed their Python repr (e.g. "['...']") in the prompt.

    Args:
        row (dict): A single sample, or a batch of samples in column format,
                    containing the fields required by `create_prompt`.

    Returns:
        dict: A dictionary containing tokenized input tensors and labels, with keys:
              - 'input_ids': Tokenized input IDs, shape (num_examples, 2048).
              - 'attention_mask': Attention mask indicating which tokens should be attended to.
              - 'labels': Copy of input_ids in which padding positions are set to -100
                          so that they are ignored by the loss.
    """
    first_column = row['context']

    if isinstance(first_column, (list, tuple)):
        # Batched (column-wise) input: rebuild one dict per example
        columns = list(row.keys())
        prompts = [
            create_prompt({column: row[column][index] for column in columns})
            for index in range(len(first_column))
        ]
    else:
        # Single example
        prompts = [create_prompt(row)]

    # Tokenize the prompts and prepare input tensors
    encoded = tokenizer(
        prompts,
        return_tensors="pt",  # Return tensors in PyTorch format
        padding="max_length",  # Pad the input to the maximum length
        truncation=True,       # Truncate inputs that are longer than the max length
        max_length=2048,       # Set the maximum length for input tokens
    )

    # Labels are the input IDs, except on padding positions. The mask is taken from
    # the attention mask rather than the token id, because the padding token is the
    # same token as EOS.
    labels = encoded["input_ids"].clone()
    labels[encoded["attention_mask"] == 0] = -100
    encoded["labels"] = labels

    return encoded  # Return the tokenized data with labels


In [None]:
# Raw text columns; they are dropped once the tokenized fields have been added
columns_to_remove = ['context', 'instruction', 'response']

# Settings shared by both splits: the mapping function receives batches that
# contain exactly one sample, and the raw text columns are removed afterwards.
tokenize_map_kwargs = dict(
    batched=True,
    batch_size=1,
    remove_columns=columns_to_remove,
)

# Tokenize the training and validation splits with identical settings
tokenized_dataset_train = train_dataset.map(collate_and_tokenize, **tokenize_map_kwargs)
tokenized_dataset_val = val_dataset.map(collate_and_tokenize, **tokenize_map_kwargs)


In [None]:
if verbose:
    def print_trainable_parameters(model):
        """
        Reports how many of the model's parameters are trainable.

        Prints the number of trainable parameters, the total number of parameters
        and the trainable share as a percentage.

        Args:
            model (torch.nn.Module): The model to inspect.
        """
        # Collect (size, is_trainable) for every named parameter in a single pass
        param_sizes = [
            (param.numel(), param.requires_grad)
            for _, param in model.named_parameters()
        ]
        all_param = sum(size for size, _ in param_sizes)
        trainable_params = sum(size for size, is_trainable in param_sizes if is_trainable)

        print(
            f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}%"
        )

    # Report the parameter counts of the loaded model
    print_trainable_parameters(model)


In [None]:
# Enable gradient checkpointing to reduce memory usage during training, at the cost of some speed.
# NOTE(review): gradient checkpointing is generally described as recomputing activations during the
# backward pass instead of storing them all — confirm against the transformers documentation.
model.gradient_checkpointing_enable()

# Set the model to training mode, so that layers such as dropout behave accordingly during training.
model.train()

In [None]:
# The keyword that selects the evaluation schedule was renamed from `evaluation_strategy`
# to `eval_strategy` in newer transformers releases. Because transformers is installed
# without a version pin, pick whichever name the installed version declares.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/results_full',  # Directory where the model checkpoints and outputs will be saved
    report_to='none',  # Disable reporting to any platform (e.g., TensorBoard, WandB)
    overwrite_output_dir=True,  # Overwrite the contents of the output directory if it already exists
    num_train_epochs=1,  # Number of training epochs
    per_device_train_batch_size=1,  # Batch size per device for training
    per_device_eval_batch_size=1,  # Batch size per device for evaluation
    gradient_accumulation_steps=8,  # Accumulate gradients over 8 steps before updating the model
    gradient_checkpointing=True,  # Enable gradient checkpointing to reduce memory usage during backpropagation
    warmup_steps=50,  # Number of warmup steps for the learning rate scheduler
    logging_dir='/content/drive/MyDrive/logs',  # Directory for saving logs
    logging_strategy="steps",  # Log training information every few steps
    logging_steps=50,  # Log every 50 steps
    save_strategy="steps",  # Save the model at regular step intervals
    save_steps=100,  # Save the model every 100 steps
    save_total_limit=2,  # Limit the number of saved checkpoints
    eval_steps=100,  # Evaluate the model every 100 steps
    load_best_model_at_end=True,  # Automatically load the best model when training is finished
    bf16=True,  # Use bfloat16 precision for faster training with less memory usage
    learning_rate=5e-5,  # Initial learning rate for the optimizer
    weight_decay=0.01,  # Apply weight decay to the optimizer to avoid overfitting
    # NOTE: according to the transformers documentation this field is not consumed by
    # Trainer itself; to actually resume, pass `resume_from_checkpoint` to `trainer.train()`.
    resume_from_checkpoint=True,
    **{eval_strategy_key: "steps"},  # Evaluate the model at regular step intervals
)

In [None]:
# The generation cache is switched off for training to save memory.
model.config.use_cache = False

# Stop training early after 3 evaluations without improvement
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Bundle model, training arguments, tokenized datasets and callbacks in a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    callbacks=[early_stopping],
)

In [None]:
# Run the training loop and measure how long it takes (wall-clock, via time.time()).
start_time = time.time()  # Timestamp taken immediately before training starts
trainer.train()  # Run training with the Trainer configured above
end_time = time.time()  # Timestamp taken immediately after training returns

# Elapsed time is the difference between the two timestamps, in seconds
training_time = end_time - start_time
print(f"Training completed in {training_time} seconds.")

In [None]:
# Push trained model and tokenizer to Hugging Face Hub.
# `token=True` uses the locally stored Hugging Face login token; it replaces the
# deprecated `use_auth_token` argument. Both uploads request a private repository,
# so the repository is private regardless of which call creates it.
model.push_to_hub(finetuned_model, token=True, commit_message=commit_message, private=True)
tokenizer.push_to_hub(finetuned_model, token=True, commit_message=commit_message, private=True)


In [None]:
# Stop the Colab runtime (if applicable) now that the notebook has finished
runtime.unassign()