### Import dependencies

In [1]:
from unsloth import FastLanguageModel
import torch
from transformers import TrainingArguments
from trl import SFTTrainer
from datasets import load_dataset
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Layout
import os
import wandb
from dotenv import load_dotenv
from datetime import datetime
from jinja2 import Template
from huggingface_hub import ModelCard, HfApi

### Set variables like HuggingFace token

In [2]:
# Pull variables from a local .env file into the process environment.
load_dotenv()
# Hugging Face access token; None when HF_TOKEN is not set in the environment.
HF_TOKEN = os.getenv("HF_TOKEN")

### Hugging Face Hub

In [3]:
# Text inputs describing where the fine-tuned model will be published.
username = widgets.Text(value="CorticalStack", description='HF hub user', disabled=False)
new_model_id = widgets.Text(value="mistral-7b-alpaca-sft", description='New model id', disabled=False)
license = widgets.Text(value="apache-2.0", description='License', disabled=False)

# Apply the shared label width and render each field, in declaration order.
for _hub_field in (username, new_model_id, license):
    _hub_field.style.description_width = 'initial'
    display(_hub_field)

Text(value='CorticalStack', description='HF hub user', style=TextStyle(description_width='initial'))

Text(value='mistral-7b-alpaca-sft', description='New model id', style=TextStyle(description_width='initial'))

Text(value='apache-2.0', description='License', style=TextStyle(description_width='initial'))

### LoRA parameters

In [4]:
# Numeric inputs for the LoRA adapter configuration.
r = widgets.IntText(value=256, description='r', disabled=False)
lora_alpha = widgets.IntText(value=128, description='lora alpha', disabled=False)
lora_dropout = widgets.FloatText(value=0, description='lora dropout', disabled=False)
random_state = widgets.IntText(value=3407, description='random state', disabled=False)

# Apply the shared label width and render each field, in declaration order.
for _lora_field in (r, lora_alpha, lora_dropout, random_state):
    _lora_field.style.description_width = 'initial'
    display(_lora_field)

IntText(value=256, description='r', style=DescriptionStyle(description_width='initial'))

IntText(value=128, description='lora alpha', style=DescriptionStyle(description_width='initial'))

FloatText(value=0.0, description='lora dropout', style=DescriptionStyle(description_width='initial'))

IntText(value=3407, description='random state', style=DescriptionStyle(description_width='initial'))

### Training parameters

In [5]:
# Input widgets for the training run. Each widget is created first; the shared
# label styling and the display call are applied in one loop at the end so the
# on-screen order matches the declaration order.

# --- Model and data sources ---
model_id = widgets.Text(
    value="unsloth/mistral-7b-bnb-4bit",
    description='Model ID',
    disabled=False
)
train_dataset_name = widgets.Text(
    value="yahma/alpaca-cleaned",
    description='Train dataset',
    disabled=False
)

# --- Schedule and batching ---
num_epochs = widgets.IntText(
    value=1,
    description='Number epochs',
    disabled=False
)
per_device_train_batch_size = widgets.IntText(
    value=4,
    description='Per device train batch size',
    disabled=False
)
gradient_accumulation_steps = widgets.IntText(
    value=4,
    description='Gradient accumulation steps',
    disabled=False
)
dataset_num_proc = widgets.IntText(
    value=4,
    description='Dataset num proc',
    disabled=False
)
logging_steps = widgets.IntText(
    value=1,
    description='Logging steps',
    disabled=False
)
optim = widgets.Text(
    value="adamw_torch_fused",
    description='Optimizer',
    disabled=False
)
warmup_steps = widgets.IntText(
    value=5,
    description='Warmup steps',
    disabled=False
)
max_steps = widgets.IntText(
    value=30,
    description='Max steps',
    disabled=False
)
eval_steps = widgets.IntText(
    value=5,
    description='Eval steps',
    disabled=False
)
save_steps = widgets.IntText(
    value=5,
    description='Save steps',
    disabled=False
)

# --- Optimizer hyper-parameters ---
# FloatText holds a float, so give it float literals rather than strings that
# only work through implicit coercion.
learning_rate = widgets.FloatText(
    value=2e-4,
    description='Learning rate',
    disabled=False
)
weight_decay = widgets.FloatText(
    value=0.01,
    description='Weight decay',
    disabled=False
)
lr_scheduler_type = widgets.Text(
    value="linear",
    description='LR schedule type',
    disabled=False
)

# --- Sequence length and quantization ---
max_seq_length = widgets.IntText(
    value=2048,
    description='Max seq length',
    disabled=False
)
four_bit_bnb = widgets.Checkbox(
    value=True,
    description='Four bit bnb',
    disabled=False,
    indent=False
)

# Apply the shared label width and render every field, in declaration order.
for _training_field in (
    model_id,
    train_dataset_name,
    num_epochs,
    per_device_train_batch_size,
    gradient_accumulation_steps,
    dataset_num_proc,
    logging_steps,
    optim,
    warmup_steps,
    max_steps,
    eval_steps,
    save_steps,
    learning_rate,
    weight_decay,
    lr_scheduler_type,
    max_seq_length,
    four_bit_bnb,
):
    _training_field.style.description_width = 'initial'
    display(_training_field)


Text(value='unsloth/mistral-7b-bnb-4bit', description='Model ID', style=TextStyle(description_width='initial')…

Text(value='yahma/alpaca-cleaned', description='Train dataset', style=TextStyle(description_width='initial'))

IntText(value=1, description='Number epochs', style=DescriptionStyle(description_width='initial'))

IntText(value=4, description='Per device train batch size', style=DescriptionStyle(description_width='initial'…

IntText(value=4, description='Gradient accumulation steps', style=DescriptionStyle(description_width='initial'…

IntText(value=4, description='Dataset num proc', style=DescriptionStyle(description_width='initial'))

IntText(value=1, description='Logging steps', style=DescriptionStyle(description_width='initial'))

Text(value='adamw_torch_fused', description='Optimizer', style=TextStyle(description_width='initial'))

IntText(value=5, description='Warmup steps', style=DescriptionStyle(description_width='initial'))

IntText(value=30, description='Max steps', style=DescriptionStyle(description_width='initial'))

IntText(value=5, description='Eval steps', style=DescriptionStyle(description_width='initial'))

IntText(value=5, description='Save steps', style=DescriptionStyle(description_width='initial'))

FloatText(value=0.0002, description='Learning rate', style=DescriptionStyle(description_width='initial'))

FloatText(value=0.01, description='Weight decay', style=DescriptionStyle(description_width='initial'))

Text(value='linear', description='LR schedule type', style=TextStyle(description_width='initial'))

IntText(value=2048, description='Max seq length', style=DescriptionStyle(description_width='initial'))

Checkbox(value=True, description='Four bit bnb', indent=False, style=CheckboxStyle(description_width='initial'…

### Tracking training with Weights & Biases (wandb)

In [6]:
# Authenticate with Weights & Biases before any run is created.
wandb.login()

# The project name is derived from the new model id, e.g. "ft-mistral-7b-alpaca-sft".
wandb_project = "ft-" + new_model_id.value
# Test the model id itself: the "ft-" prefix makes wandb_project always
# non-empty, so checking its length could never skip the assignment.
if new_model_id.value.strip():
    os.environ["WANDB_PROJECT"] = wandb_project

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcorticalstack[0m ([33mcorticalstackteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


### Load dataset from the HuggingFace Hub

In [7]:
# Fetch the dataset named in the "Train dataset" widget from the Hub.
dataset = load_dataset(path=train_dataset_name.value)

In [8]:
# Hold out 10% of the "train" split for evaluation. The split is seeded with the
# notebook's random state so that reruns produce the same partition and the
# validation loss stays comparable between runs.
train_test_split = dataset["train"].train_test_split(
    test_size=0.1,
    seed=random_state.value,
)

# Extract the training and testing datasets
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

print(f"train dataset records: {len(train_dataset)}")
print(f"test dataset records: {len(test_dataset)}")
# Last expression of the cell: the notebook displays the first training record.
train_dataset[0]

train dataset records: 46584
test dataset records: 5176


{'instruction': 'Describe a fictional character that would be a perfect match for the following book title.',
 'output': 'The main character in The Adventures of a Lifelong Dreamer would be a person with an insatiable thirst for adventure and a boundless imagination. She is someone who never lost the childlike wonder and curiosity that drives her to explore new places and try new things. She has a free spirit, always reaching for the stars, and an infectious optimism that inspires those around her.\n\nShe is a Dreamer in the purest sense of the word, always envisioning grand, elaborate futures for herself, and never settling for anything less. While some may see her ambitions as naive or unrealistic, she knows that anything is possible as long as she keeps her sights set high.\n\nThroughout her journey, this character faces her fair share of setbacks and disappointments, but never loses her determination or her sense of wonder. Even in tough times, she can be found with her head in the

### Load base model

In [9]:

# Arguments for loading the base model, collected from the widgets above.
base_model_kwargs = dict(
    model_name=model_id.value,
    max_seq_length=max_seq_length.value,
    # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    dtype=None,
    # Use 4bit quantization to reduce memory usage. Can be False
    load_in_4bit=four_bit_bnb.value,
    token=HF_TOKEN,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

==((====))==  Unsloth: Fast Mistral patching release 2024.2
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.691 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.1.2.post301. CUDA = 8.6. CUDA Toolkit = 11.8.
\        /    Bfloat16 = TRUE. Xformers = 0.0.21. FA = True.
 "-____-"     Apache 2 free license: http://github.com/unslothai/unsloth


You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` attribute will be overwritten with the one you passed to `from_pretrained`.


### Set prompt

In [10]:
# Alpaca-style prompt template. The three positional `{}` slots are filled via
# str.format in this order: instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [11]:
# End-of-sequence marker taken from the tokenizer; appended to every example.
EOS_TOKEN = tokenizer.eos_token
def format_prompts(examples):
    """Render a batch of records into full training prompts.

    `examples` is a batch mapping with "instruction", "input" and "output"
    columns. Returns a dict with a single "text" column holding one formatted
    prompt per record, each terminated by EOS_TOKEN.
    """
    records = zip(examples["instruction"], examples["input"], examples["output"])
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    rendered = [alpaca_prompt.format(*record) + EOS_TOKEN for record in records]
    return {"text": rendered}

In [12]:
# Add the rendered "text" column to both splits (train first, then test).
train_dataset_in_prompt_format, test_dataset_in_prompt_format = (
    split.map(format_prompts, batched=True)
    for split in (train_dataset, test_dataset)
)
# Show one fully formatted training example as a sanity check.
print(train_dataset_in_prompt_format[0]['text'])

Map:   0%|          | 0/46584 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Describe a fictional character that would be a perfect match for the following book title.

### Input:
The Adventures of a Lifelong Dreamer

### Response:
The main character in The Adventures of a Lifelong Dreamer would be a person with an insatiable thirst for adventure and a boundless imagination. She is someone who never lost the childlike wonder and curiosity that drives her to explore new places and try new things. She has a free spirit, always reaching for the stars, and an infectious optimism that inspires those around her.

She is a Dreamer in the purest sense of the word, always envisioning grand, elaborate futures for herself, and never settling for anything less. While some may see her ambitions as naive or unrealistic, she knows that anything is possible as long as she keeps her sights set high.

Thro

### Do model patching and add fast LoRA weights

In [13]:
# Projection layers that receive LoRA adapters: attention (q/k/v/o) and MLP.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with LoRA adapters configured from the widgets.
model = FastLanguageModel.get_peft_model(
    model,
    r=r.value,
    target_modules=lora_target_modules,
    lora_alpha=lora_alpha.value,
    lora_dropout=lora_dropout.value,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    use_gradient_checkpointing=True,
    random_state=random_state.value,
    max_seq_length=max_seq_length.value,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

Unsloth 2024.2 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


#### Construct the model trainer
- Will train the model with TRL (Transformer Reinforcement Learning), with the SFT (Supervised Fine Tuning) trainer
- Use the text column of the dataset for training

In [14]:
# Checkpoints and logs live under a folder named after the new model.
run_root = "./" + new_model_id.value
output_dir = run_root + "/output"
logging_dir = run_root + "/logs"

# Use bf16 when the GPU supports it, otherwise fall back to fp16.
bf16_available = torch.cuda.is_bf16_supported()

# Training hyper-parameters, all taken from the widgets above.
training_args = TrainingArguments(
    output_dir=output_dir,
    logging_dir=logging_dir,
    run_name=f"{wandb_project}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    report_to="wandb",
    seed=random_state.value,
    # Schedule
    num_train_epochs=num_epochs.value,
    max_steps=max_steps.value,
    warmup_steps=warmup_steps.value,
    lr_scheduler_type=lr_scheduler_type.value,
    # Batching
    per_device_train_batch_size=per_device_train_batch_size.value,
    gradient_accumulation_steps=gradient_accumulation_steps.value,
    # Optimizer
    optim=optim.value,
    learning_rate=learning_rate.value,
    weight_decay=weight_decay.value,
    # Precision
    fp16=not bf16_available,
    bf16=bf16_available,
    # Logging, evaluation and checkpoint cadence
    logging_steps=logging_steps.value,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=eval_steps.value,
    save_steps=save_steps.value,
)

# Supervised fine-tuning on the "text" column of the formatted datasets.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset_in_prompt_format,
    eval_dataset=test_dataset_in_prompt_format,
    dataset_text_field="text",
    max_seq_length=max_seq_length.value,
    dataset_num_proc=dataset_num_proc.value,
    args=training_args,
)

Map (num_proc=4):   0%|          | 0/46584 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5176 [00:00<?, ? examples/s]

#### Show current memory stats

In [15]:
# Bytes-to-GB divisor (1024 * 1024 * 1024).
_bytes_per_gb = 1024 * 1024 * 1024

# Baseline before training: device 0 properties and the peak memory reserved so far.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / _bytes_per_gb, 3)
max_memory = round(gpu_stats.total_memory / _bytes_per_gb, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3090. Max memory = 23.691 GB.
6.898 GB of memory reserved.


#### Train the model

In [16]:
# Run the fine-tuning loop. The result is kept because its `metrics` mapping
# ('train_runtime') is read by the post-training report.
trainer_stats = trainer.train()

Step,Training Loss,Validation Loss
5,0.7053,0.873414


KeyboardInterrupt: 

### Show memory stats after training

In [None]:
# Peak reserved memory after training, in GB, and the share added since the
# pre-training baseline (start_gpu_memory).
_peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(_peak_reserved_bytes / (1024 * 1024 * 1024), 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a percentage of the device's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Training time, in seconds and minutes.
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")

# Memory report.
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

### Create HF model card

#### Create the jinja template

In [None]:
# Jinja source for the model card (README.md). The leading block between the
# `---` markers is the card's YAML metadata.
#
# Every placeholder names a keyword passed to `jinja_template.render(...)`, and
# those keywords already carry plain values, so placeholders must not use
# `.value` (an attribute lookup on an int or str renders as empty text). The
# former MAX_SEQ_LENGTH / DATASET_NAME / SPLIT placeholders were never supplied
# to render() and produced blank lines, so they are removed; the dataset is
# named in the intro sentence, and learning rate and max sequence length are
# listed under "Training arguments".
template_text = """
---
license: {{ license }}
---

# {{ new_model_id }}

{{ new_model_id }} is an SFT fine-tuned version of {{ model_id }} using the {{ train_dataset_name }} dataset.

## Fine-tuning configuration
### LoRA
- r: {{ r }}
- LoRA alpha: {{ lora_alpha }}
- LoRA dropout: {{ lora_dropout }}

### Training arguments
- Epochs: {{ num_epochs }}
- Batch size: {{ per_device_train_batch_size }}
- Gradient accumulation steps: {{ gradient_accumulation_steps }}
- Optimizer: {{ optim }}
- Max steps: {{ max_steps }}
- Learning rate: {{ learning_rate }}
- Weight decay: {{ weight_decay }}
- Learning rate scheduler type: {{ lr_scheduler_type }}
- Max seq length: {{ max_seq_length }}
- 4-bit bnb: {{ four_bit_bnb }}

Trained with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.

[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
"""

# Create a Jinja template object. strip() removes the leading newline so the
# YAML metadata block starts on the first line of the rendered card.
jinja_template = Template(template_text.strip())

In [None]:
# Map each template placeholder to the widget that holds its value.
card_fields = {
    "license": license,
    "new_model_id": new_model_id,
    "model_id": model_id,
    "train_dataset_name": train_dataset_name,
    "r": r,
    "lora_alpha": lora_alpha,
    "lora_dropout": lora_dropout,
    "num_epochs": num_epochs,
    "per_device_train_batch_size": per_device_train_batch_size,
    "gradient_accumulation_steps": gradient_accumulation_steps,
    "optim": optim,
    "max_steps": max_steps,
    "learning_rate": learning_rate,
    "weight_decay": weight_decay,
    "lr_scheduler_type": lr_scheduler_type,
    "max_seq_length": max_seq_length,
    "four_bit_bnb": four_bit_bnb,
}

# Fill the template with the current widget values.
content = jinja_template.render(
    **{placeholder: widget.value for placeholder, widget in card_fields.items()}
)

# Save the model card
card = ModelCard(content)
card.save('README.md')

In [None]:
# Target repository on the Hub: "<hub user>/<new model id>". It is built once so
# that create_repo and upload_folder are guaranteed to address the same repo.
# (The upload previously interpolated the widget object itself and an undefined
# MODEL_NAME, which raised NameError.)
repo_id = f"{username.value}/{new_model_id.value}"

api = HfApi(token=HF_TOKEN)
# exist_ok=True makes re-running this cell safe when the repo already exists.
api.create_repo(
    repo_id=repo_id,
    repo_type="model",
    exist_ok=True,
)
# NOTE(review): nothing visible in this notebook writes a "merge" folder, and
# README.md is saved to the working directory, not into it. Confirm the folder
# exists and holds the intended artifacts before uploading.
api.upload_folder(
    repo_id=repo_id,
    folder_path="merge",
)