### Import dependencies

In [None]:
# Hugging Face access token passed to HfApi / upload_folder later in this
# notebook. It is read from the HF_TOKEN environment variable so the secret
# does not have to be typed into (and saved with) the notebook; when the
# variable is unset the value is "" exactly as before.
import os

HF_TOKEN = os.environ.get("HF_TOKEN", "")

In [None]:
!pip install -q wandb

In [None]:
# !pip install -q -U "transformers==4.38.0" --upgrade
# !pip install -q datasets peft trl accelerate
# !pip install -i https://pypi.org/simple/ bitsandbytes

In [None]:
from dataclasses import dataclass, field
from typing import Optional

import torch

from transformers import AutoTokenizer, HfArgumentParser, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer

In [None]:
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Layout
import os
import wandb
from datetime import datetime
from jinja2 import Template
from huggingface_hub import ModelCard, HfApi

### Hugging Face Hub

In [None]:
def _show(widget):
    """Set the widget's description width to 'initial', display it, and return it."""
    widget.style.description_width = 'initial'
    display(widget)
    return widget


# Hub user name; used as the namespace of the output directory and target repo.
username = _show(widgets.Text(value="CorticalStack", description='HF hub user', disabled=False))

# Checkpoint id of the base model that is loaded for fine-tuning.
model_id = _show(widgets.Text(value="google/gemma-7b", description='Model id', disabled=False))

# Name of the fine-tuned model (repo name, wandb project suffix, card title).
new_model_id = _show(widgets.Text(value="gemma-7b", description='New model id', disabled=False))

# License identifier written into the model card front matter.
# NOTE: this variable shadows the interactive `license` builtin.
license = _show(widgets.Text(value="apache-2.0", description='License', disabled=False))

### Training parameters

In [None]:
def _show(widget):
    """Set the widget's description width to 'initial', display it, and return it."""
    widget.style.description_width = 'initial'
    display(widget)
    return widget


# Dataset id handed to load_dataset().
training_dataset = _show(widgets.Text(value="stingning/ultrachat", description='Training dataset', disabled=False))

# Batch size and gradient accumulation forwarded to TrainingArguments.
per_device_train_batch_size = _show(widgets.IntText(value=4, description='Per device train batch size', disabled=False))
gradient_accumulation_steps = _show(widgets.IntText(value=6, description='Gradient accumulation steps', disabled=False))

# Optimisation hyper-parameters. The initial values are given as strings, as
# in the original cell; the FloatText widgets expose them as floats via `.value`.
learning_rate = _show(widgets.FloatText(value="2e-4", description='Learning rate', disabled=False))
max_grad_norm = _show(widgets.FloatText(value="0.3", description='Max grad norm', disabled=False))
weight_decay = _show(widgets.FloatText(value="0.001", description='Weight decay', disabled=False))

# LoRA adapter hyper-parameters consumed by LoraConfig.
lora_alpha = _show(widgets.IntText(value=16, description='lora alpha', disabled=False))
lora_dropout = _show(widgets.FloatText(value=0.1, description='lora dropout', disabled=False))
lora_r = _show(widgets.IntText(value=8, description='LoRA r', disabled=False))

# Maximum sequence length passed to SFTTrainer.
max_seq_length = _show(widgets.IntText(value=2048, description='Max seq length', disabled=False))

# Checkbox for FP16 training. The label is 'FP16 training enabled'; it used to
# read 'BF16 training enabled', which made it indistinguishable from the bf16
# checkbox shown right after it.
fp16 = widgets.Checkbox(
    value=False,
    description='FP16 training enabled',
    disabled=False,
    indent=False
)
fp16.style.description_width = 'initial'
display(fp16)

def _show(widget):
    """Set the widget's description width to 'initial', display it, and return it."""
    widget.style.description_width = 'initial'
    display(widget)
    return widget


# --- Boolean switches -------------------------------------------------------
bf16 = _show(widgets.Checkbox(value=False, description='BF16 training enabled', disabled=False, indent=False))
packing = _show(widgets.Checkbox(value=True, description='Use packing dataset creating', disabled=False, indent=False))
gradient_checkpointing = _show(
    widgets.Checkbox(value=True, description='Enable gradient checkpointing', disabled=False, indent=False)
)
use_flash_attn_2 = _show(widgets.Checkbox(value=False, description='Use flash attention 2', disabled=False, indent=False))

# --- Optimizer and schedule ---------------------------------------------------
optim = _show(widgets.Text(value="paged_adamw_32bit", description='Optimizer', disabled=False))
lr_scheduler_type = _show(widgets.Text(value="constant", description='LR schedule type', disabled=False))
max_steps = _show(widgets.IntText(value=100, description='Max number of optimizer update steps', disabled=False))
warmup_ratio = _show(widgets.FloatText(value=0.01, description='Warmup ratios', disabled=False))

# --- Evaluation / checkpoint / logging cadence (in optimizer steps) -----------
eval_steps = _show(widgets.IntText(value=10, description='Eval steps', disabled=False))
save_steps = _show(widgets.IntText(value=50, description='Save steps', disabled=False))
logging_steps = _show(widgets.IntText(value=1, description='Logging steps', disabled=False))

# --- Seed value ---------------------------------------------------------------
random_state = _show(widgets.IntText(value=3407, description='random state', disabled=False))

### Tracking training with Weights & Biases (wandb)

In [None]:
# Authenticate this session with Weights & Biases.
wandb.login()

# The project name is "ft-" followed by the new model id. Because of the
# fixed prefix the string is never empty, so the guard below always passes.
wandb_project = f"ft-{new_model_id.value}"
if wandb_project:
    os.environ["WANDB_PROJECT"] = wandb_project

In [None]:
# Load the first 25% of the "train" split of the dataset chosen in the widget.
dataset = load_dataset(path=training_dataset.value, split="train[:25%]")

In [None]:
# Hold out 1% of the loaded records for evaluation (99% train / 1% test).
train_test_ratio = 0.99
# The test fraction is written out directly: `1 - 0.99` evaluates to
# 0.010000000000000009 in floating point, which can push the computed test
# size up by one record.
test_fraction = 0.01
# Seed the shuffle with the "random state" widget so the split is reproducible.
split_dataset = dataset.train_test_split(test_size=test_fraction, seed=random_state.value)
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']
print(f"train dataset records: {len(train_dataset)}")
print(f"test dataset records: {len(test_dataset)}")

In [None]:
# Split the dataset into training and testing sets (0.1% held out for testing).
# `dataset` was loaded with an explicit `split=` argument, so it is already a
# single Dataset rather than a dict of splits; it is split directly instead of
# being indexed with ["train"], which would be treated as a column lookup.
train_test_split = dataset.train_test_split(test_size=0.001)

# Extract the training and testing datasets
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

print(f"train dataset records: {len(train_dataset)}")
print(f"test dataset records: {len(test_dataset)}")
# Show the first training record as the cell output.
train_dataset[0]

In [None]:
# Prompt template with two positional slots: the first is rendered after
# "### USER:", the second after "### ASSISTANT:". Filled in by format_prompts().
prompt = """### USER: {}\n### ASSISTANT: {}"""

In [None]:
def format_prompts(examples):
    """Render each conversation in a batch into a single prompt string.

    Args:
        examples: A batch mapping column names to lists of values. Its
            "data" entry must hold one indexable conversation per record;
            only elements 0 and 1 of each conversation are used.

    Returns:
        A dict with a single "text" key holding one formatted string per
        conversation, in input order.

    Raises:
        IndexError: If a conversation is a sequence with fewer than two
            elements.
    """
    # Element 0 fills the USER slot and element 1 the ASSISTANT slot of the
    # module-level `prompt` template. No local is named `input`, so the
    # builtin is no longer shadowed.
    return {
        "text": [
            prompt.format(convo[0], convo[1])
            for convo in examples["data"]
        ],
    }

In [None]:
# Run format_prompts over both splits in batched mode (train first, then
# test) so each record gains a "text" column with the rendered prompt.
train_dataset_in_prompt_format, test_dataset_in_prompt_format = (
    subset.map(format_prompts, batched=True)
    for subset in (train_dataset, test_dataset)
)

# Print the first formatted training example.
first_example = train_dataset_in_prompt_format[0]
print(first_example['text'])

In [None]:
from huggingface_hub import login
# Log in to the Hugging Face Hub. Called without arguments, so no token is
# passed from this notebook at this point.
login()

In [None]:
# 4-bit NF4 quantization settings for loading the base model.
# The compute dtype follows the same rule the training arguments use for
# mixed precision (bf16 when the GPU supports it, fp16 otherwise), so the
# quantized matmuls and the training autocast dtype agree.
_bnb_compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=_bnb_compute_dtype,
    bnb_4bit_quant_type="nf4",
)

In [None]:
# Attention backend: flash-attention 2 when the checkbox is ticked, SDPA otherwise.
attn_implementation = "flash_attention_2" if use_flash_attn_2.value else "sdpa"

# Load the base checkpoint with the quantization config defined earlier.
model = AutoModelForCausalLM.from_pretrained(
    model_id.value,
    torch_dtype=torch.float32,
    quantization_config=quantization_config,
    attn_implementation=attn_implementation,
)


In [None]:
# Tokenizer belonging to the base checkpoint selected in the widget.
base_checkpoint = model_id.value
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Reuse the end-of-sequence token id as the padding token id.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
# Names of the modules that receive LoRA adapters.
lora_target_modules = ["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"]

# LoRA settings for causal-LM fine-tuning; rank, alpha and dropout come from
# the widgets, and bias terms are not trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r.value,
    lora_alpha=lora_alpha.value,
    lora_dropout=lora_dropout.value,
    bias="none",
    target_modules=lora_target_modules,
)

In [None]:
# Training output directory: "<HF hub user>/gemma-7b-ultrachat-sft".
output_dir = "/".join([username.value, "gemma-7b-ultrachat-sft"])

In [None]:
# Mixed precision is chosen from the hardware: bf16 when the GPU supports it,
# fp16 otherwise. The fp16/bf16 checkboxes are not consulted here.
HAS_BFLOAT16 = torch.cuda.is_bf16_supported()

# Hyper-parameters come from the widgets defined earlier in the notebook.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size.value,
    gradient_accumulation_steps=gradient_accumulation_steps.value,
    optim=optim.value,
    # Evaluate on the held-out split every `eval_steps` optimizer steps.
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=eval_steps.value,
    # Log to Weights & Biases under a timestamped run name.
    report_to="wandb",
    run_name=f"{wandb_project}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    save_steps=save_steps.value,
    logging_steps=logging_steps.value,
    learning_rate=learning_rate.value,
    # Weight decay is reported on the generated model card, so it has to be
    # applied to the run as well (it was previously left at the default).
    weight_decay=weight_decay.value,
    max_grad_norm=max_grad_norm.value,
    max_steps=max_steps.value,
    warmup_ratio=warmup_ratio.value,
    lr_scheduler_type=lr_scheduler_type.value,
    gradient_checkpointing=gradient_checkpointing.value,
    # Use the "random state" widget as the training seed for reproducibility.
    seed=random_state.value,
    fp16=not HAS_BFLOAT16,
    bf16=HAS_BFLOAT16,
)

In [None]:
# Supervised fine-tuning trainer: the base model plus the LoRA config, trained
# on the "text" column produced by format_prompts.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=lora_config,
    train_dataset=train_dataset_in_prompt_format,
    eval_dataset=test_dataset_in_prompt_format,
    dataset_text_field="text",
    max_seq_length=max_seq_length.value,
    packing=packing.value,
)

In [None]:
def _bytes_to_gb(num_bytes):
    """Convert a byte count to GB (1024**3 bytes), rounded to 3 decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


# Snapshot of GPU 0 before training starts; `start_gpu_memory` is the baseline
# that the post-training report subtracts from the peak.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = _bytes_to_gb(gpu_stats.total_memory)
start_gpu_memory = _bytes_to_gb(torch.cuda.max_memory_reserved())

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run the fine-tuning loop; the returned object's `.metrics` dict is read by
# the report cell below (e.g. 'train_runtime').
trainer_stats = trainer.train()

In [None]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
!nvidia-smi

In [None]:
# Save the trained model via the trainer (no path is given here, so the
# trainer's own configured output location is used).
trainer.save_model()

In [None]:
# Local directory that receives the fine-tuned model below; the merge step
# later loads the adapter weights back from this path. (The Hub repo name
# itself comes from the `new_model_id` widget, not from this variable.)
new_model = "gemma-7b-ultrachat-sft" # local save path for the fine-tuned model
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)

In [None]:
from peft import LoraConfig, PeftModel

In [None]:
# Merge the model with LoRA weights.
# The base checkpoint is reloaded in fp16 without a quantization config and
# placed entirely on device 0, then the saved adapter is applied and folded
# into the base weights.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id.value,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
merged_model = PeftModel.from_pretrained(base_model, new_model)
merged_model = merged_model.merge_and_unload()

# Configure padding BEFORE saving the tokenizer. These assignments used to
# run after save_pretrained(), so they never reached the files written to
# "merged_model".
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Save the merged model and its tokenizer side by side for upload.
merged_model.save_pretrained("merged_model", safe_serialization=True)
tokenizer.save_pretrained("merged_model")

### Create HF model card

In [None]:
# Jinja2 source for the model card (README.md): YAML front matter with the
# license, a short description linking the base model and dataset, and the
# LoRA / training hyper-parameters. Placeholders are filled in by
# jinja_template.render(...) in the next cell.
template_text = """
---
license: {{ license }}
---

# {{ new_model_id }}

{{ new_model_id }} is an SFT fine-tuned version of [{{ model_id }}](https://huggingface.co/{{ model_id }}) using the [{{ train_dataset_name }}](https://huggingface.co/datasets/{{ train_dataset_name }}) dataset.

## Fine-tuning configuration
### LoRA
- LoRA r: {{ r }}
- LoRA alpha: {{ lora_alpha }}
- LoRA dropout: {{ lora_dropout }}

### Training arguments
- Epochs: {{ num_epochs }}
- Batch size: {{ per_device_train_batch_size }}
- Gradient accumulation steps: {{ gradient_accumulation_steps }}
- Optimizer: {{ optim }}
- Max steps: {{ max_steps }}
- Learning rate: {{ learning_rate }}
- Weight decay: {{ weight_decay }}
- Learning rate scheduler type: {{ lr_scheduler_type }}
- Max seq length: {{ max_seq_length }}

"""

# Create a Jinja template object. The text is stripped first so the rendered
# card starts directly with the '---' front-matter delimiter.
jinja_template = Template(template_text.strip())

In [None]:
# Values substituted into the model-card template, keyed by placeholder name.
card_fields = {
    "license": license.value,
    "new_model_id": new_model_id.value,
    "model_id": model_id.value,
    "train_dataset_name": training_dataset.value,
    # LoRA section
    "r": lora_r.value,
    "lora_alpha": lora_alpha.value,
    "lora_dropout": lora_dropout.value,
    # Training-arguments section
    "num_epochs": 1,  # fixed literal; not derived from the training run
    "per_device_train_batch_size": per_device_train_batch_size.value,
    "gradient_accumulation_steps": gradient_accumulation_steps.value,
    "optim": optim.value,
    "max_steps": max_steps.value,
    "learning_rate": learning_rate.value,
    "weight_decay": weight_decay.value,
    "lr_scheduler_type": lr_scheduler_type.value,
    "max_seq_length": max_seq_length.value,
}

# Fill the template
content = jinja_template.render(**card_fields)

# Write the rendered card next to the merged model so it is uploaded with it.
card = ModelCard(content)
card.save("merged_model/README.md")

In [None]:
# Hub client. An empty HF_TOKEN placeholder is passed as None rather than as
# an empty string, so the client can fall back to the locally stored
# credentials (e.g. from the earlier login()) instead of sending a blank token.
api = HfApi(token=HF_TOKEN or None)

# Create the target model repository; exist_ok=True makes re-running the
# cell harmless when the repo already exists.
api.create_repo(
    repo_id=f"{username.value}/{new_model_id.value}",
    repo_type="model",
    exist_ok=True,
)

In [None]:
# Upload the contents of the local "merged_model" folder to the model repo.
# An empty HF_TOKEN is passed as None so a blank string is never sent as an
# explicit token; the client's own / locally stored token is used instead.
api.upload_folder(
    repo_id=f"{username.value}/{new_model_id.value}",
    folder_path="merged_model",
    token=HF_TOKEN or None,
)

In [None]:
new_model_id.value