<a href="https://colab.research.google.com/github/ekrombouts/GenCareAI/blob/zuster_fietje/notebooks/300_zuster_fietje/320_fietje_finetuning_sampc.ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finetuning Fietje Instruct

**Author:** Eva Rombouts  
**Date:** 2024-10-14  

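### Description

This notebook performs a full fine-tune of `BramVanroy/fietje-2` on the `ekrombouts/Galaxy_SAMPC_long` dataset using the Hugging Face `Trainer` in Google Colab. Checkpoints are written to Google Drive, and the resulting model and tokenizer are pushed to the Hugging Face Hub as `gcai_sampc_fietje`.
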

In [None]:
# Cell 1: Install necessary libraries
!pip install -q transformers datasets

In [None]:
# Cell 2: Import required libraries and mount Google Drive
import os
import time

import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, EarlyStoppingCallback
from transformers.trainer_utils import get_last_checkpoint

from google.colab import runtime, drive

# Mount Google Drive at /content/drive; the training output_dir and logging_dir
# configured later in this notebook are located under this mount point
drive.mount('/content/drive')

In [None]:
# Cell 3: Load pre-trained model and tokenizer

# Identifiers used throughout the notebook
path_hf_sampc = "ekrombouts/Galaxy_SAMPC_long"   # dataset repo on the Hugging Face Hub
model_name = "BramVanroy/fietje-2"               # base checkpoint to fine-tune
model_finetuned = "gcai_sampc_fietje"            # target repo name for the result
commit_message = "Finetuned BramVanroy/fietje-2 on Galaxy_SAMPC_long"

# bfloat16 weights trade numerical precision for a smaller memory footprint;
# device_map='auto' leaves device placement to the library
model_load_options = {
    "torch_dtype": torch.bfloat16,
    "device_map": 'auto',
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)

# The tokenizer comes from the same checkpoint; EOS doubles as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Report how much memory the loaded weights occupy
footprint_gb = model.get_memory_footprint() / 1e9
print(f"Memory footprint: {footprint_gb} GB")


In [None]:
# Cell 4: Load dataset
# Fetch the dataset and bind its train and validation splits to their own names
dataset = load_dataset(path_hf_sampc)
train_dataset, val_dataset = dataset['train'], dataset['validation']

In [None]:
# Cell 5: Inspect the training split (a bare expression makes the notebook display it)
train_dataset

In [None]:
# Cell 6: Show the results of the untrained model

sample = val_dataset[6]
prompt = sample['prompt']

# Tokenize the prompt once; the same encoding supplies both the ids and the mask
encoded_prompt = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = encoded_prompt.input_ids
attention_mask = encoded_prompt.attention_mask

# Enable cache and set model to evaluation mode
model.config.use_cache = True
model.eval()

# Generate output (sampling is enabled, so the text differs between runs)
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_new_tokens=150,
    do_sample=True,
    top_p=0.95,
    top_k=50,
    temperature=0.7,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)

# Decode generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Isolate the response by token position instead of by character offset:
# the decoded text is not guaranteed to reproduce the prompt character for
# character, so slicing `generated_text[len(prompt):]` can cut at the wrong place.
prompt_token_count = input_ids.shape[-1]
generated_response = tokenizer.decode(
    output[0][prompt_token_count:],
    skip_special_tokens=True,
).strip()

# Display generated response and actual response
print("GENERATED RESPONSE:")
print(generated_response)
print("\nREFERENCE RESPONSE:")
print(sample['reference'])

In [None]:
# Cell 7: Function to tokenize dataset samples
def collate_and_tokenize(examples, max_length=2048):
    """Tokenize prompt+reference pairs into fixed-length causal-LM training rows.

    Args:
        examples: Batch dict as handed over by `Dataset.map(batched=True)`; it
            must hold equally long lists under the keys "prompt" and "reference".
        max_length: Number of tokens every row is truncated or padded to.

    Returns:
        The tokenizer output with an added "labels" entry: a copy of
        "input_ids" in which every padded position is replaced by -100.
    """
    # Handle every example in the batch (not only the first one) and close each
    # text with an explicit EOS token so there is a real end-of-text target.
    texts = [
        prompt + reference + tokenizer.eos_token
        for prompt, reference in zip(examples["prompt"], examples["reference"])
    ]

    # Tokenize and create labels
    encoded = tokenizer(
        texts,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # The pad token is the EOS token in this notebook, so padding cannot be told
    # apart by id. Use the attention mask to exclude padded positions from the
    # loss (-100 is the index the loss function ignores).
    labels = encoded["input_ids"].clone()
    labels[encoded["attention_mask"] == 0] = -100
    encoded["labels"] = labels
    return encoded


In [None]:
# Cell 8: Tokenize the dataset and remove unnecessary columns
columns_to_remove = ['notes', 'prompt', 'reference', 'category']

# Both splits go through the same mapping: one example per call, with the
# listed source columns dropped from the result
map_options = dict(batched=True, batch_size=1, remove_columns=columns_to_remove)
tokenized_dataset_train = train_dataset.map(collate_and_tokenize, **map_options)
tokenized_dataset_val = val_dataset.map(collate_and_tokenize, **map_options)


In [None]:
# Cell 9: Print trainable parameters in the model
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of `model` plus their ratio.

    Args:
        model: Any object whose `named_parameters()` yields `(name, param)`
            pairs where `param` offers `numel()` and `requires_grad`.
    """
    # Collect (element count, requires-grad flag) once, then aggregate twice
    param_stats = [
        (param.numel(), param.requires_grad)
        for _, param in model.named_parameters()
    ]
    total_count = sum(count for count, _ in param_stats)
    trainable_count = sum(count for count, needs_grad in param_stats if needs_grad)

    summary = f"trainable params: {trainable_count} || all params: {total_count} || trainable%: {100 * trainable_count / total_count:.2f}%"
    print(summary)

# Report how many of the loaded model's parameters require gradients
print_trainable_parameters(model)


In [None]:
# Cell 10: Enable gradient checkpointing and set model to training mode
# Gradient checkpointing is also requested through `gradient_checkpointing=True`
# in the training arguments defined in the next cell.
model.gradient_checkpointing_enable()
# The baseline-generation cell put the model in eval mode; switch back to training mode.
# As the last expression of the cell, the returned object is displayed by the notebook.
model.train()

In [None]:
# Cell 11: Define training arguments
training_args = TrainingArguments(
    # Output and logging locations (both on the mounted Google Drive)
    output_dir='/content/drive/MyDrive/results_full',
    report_to='none',
    overwrite_output_dir=True,
    logging_dir='/content/drive/MyDrive/logs',
    logging_strategy="steps",
    logging_steps=50,
    # Batching: 1 sample per step, accumulated over 8 steps per optimizer update
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    # Optimisation
    warmup_steps=50,
    learning_rate=5e-5,
    weight_decay=0.01,
    bf16=True,
    # Checkpointing and evaluation run on the same 100-step cadence, which
    # `load_best_model_at_end=True` needs in order to compare checkpoints
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    # argument (requires transformers >= 4.41)
    eval_strategy="steps",
    eval_steps=100,
    load_best_model_at_end=True,
    # Spell out the metric that best-model selection and early stopping compare
    # (these are the values used by default when load_best_model_at_end is set)
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # NOTE: Trainer does not act on this field by itself; it only has an effect
    # when its value is forwarded to `trainer.train(resume_from_checkpoint=...)`
    resume_from_checkpoint=True
)

In [None]:
# Cell 12: Initialize Trainer with EarlyStoppingCallback and disable cache for training

# The generation cache was switched on for the baseline sample; turn it off for training
model.config.use_cache = False

# Stop training once evaluation has failed to improve 3 times in a row
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    callbacks=[early_stopping],
)

In [None]:
# Cell 13: Train the model and measure training time

# `TrainingArguments.resume_from_checkpoint` is only a configuration field:
# Trainer ignores it unless a checkpoint is passed to `train()` explicitly.
# Resolve it here so an interrupted Colab session can continue from Drive.
resume_setting = training_args.resume_from_checkpoint
if isinstance(resume_setting, str):
    # An explicit checkpoint path was configured; use it as given
    resume_checkpoint = resume_setting
elif resume_setting and os.path.isdir(training_args.output_dir):
    # Newest `checkpoint-*` folder in output_dir, or None when there is none yet.
    # NOTE(review): a checkpoint left over from an unrelated earlier run in the
    # same output_dir would also be picked up here; clear the folder if unwanted.
    resume_checkpoint = get_last_checkpoint(training_args.output_dir)
else:
    resume_checkpoint = None

if resume_checkpoint is not None:
    print(f"Resuming training from checkpoint: {resume_checkpoint}")

start_time = time.time()  # Start time
trainer.train(resume_from_checkpoint=resume_checkpoint)  # None starts from scratch
end_time = time.time()  # End time

training_time = end_time - start_time  # Total training time
print(f"Training completed in {training_time} seconds.")


In [None]:
# Cell 14: Push trained model and tokenizer to Hugging Face Hub
# `token=True` (use the locally stored Hugging Face credentials) replaces the
# deprecated `use_auth_token=True`. Both uploads request a private repository,
# so the visibility no longer depends on which of the two pushes runs first.
hub_push_options = dict(token=True, commit_message=commit_message, private=True)
model.push_to_hub(model_finetuned, **hub_push_options)
tokenizer.push_to_hub(model_finetuned, **hub_push_options)


In [None]:
# Cell 15: Stop Colab runtime (if applicable)
# Runs after the Hub upload above, so the runtime is only stopped once the
# preceding cells have finished.
# NOTE(review): presumably anything not saved to Drive or the Hub is lost once
# the runtime is released — confirm before relying on local files.
runtime.unassign()