### Import dependancies

In [1]:
from unsloth import FastLanguageModel
import torch
from transformers import TrainingArguments
from trl import SFTTrainer
from datasets import load_dataset
import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Layout
import os
import wandb
from dotenv import load_dotenv
from datetime import datetime


### Set variables like HuggingFace token

In [2]:
load_dotenv()  # take environment variables from .env.
HF_TOKEN = os.environ.get("HF_TOKEN")

### Huggingface Hub

In [3]:
username = widgets.Text(
    value="CorticalStack",
    description='HF hub user',
    disabled=False
)
username.style.description_width = 'initial'
display(username)

new_model_id = widgets.Text(
    value="mistral-7b-alpaca",
    description='New model id',
    disabled=False
)
new_model_id.style.description_width = 'initial'
display(new_model_id)

Text(value='CorticalStack', description='HF hub user', style=TextStyle(description_width='initial'))

Text(value='mistral-7b-alpaca', description='New model id', style=TextStyle(description_width='initial'))

### LoRA parameters

In [4]:
r = widgets.IntText(
    value=64,
    description='r',
    disabled=False
)
r.style.description_width = 'initial'
display(r)

lora_alpha = widgets.IntText(
    value=64,
    description='lora alpha',
    disabled=False
)
lora_alpha.style.description_width = 'initial'
display(lora_alpha)

lora_dropout = widgets.IntText(
    value=0,
    description='lora dropout',
    disabled=False
)
lora_dropout.style.description_width = 'initial'
display(lora_dropout)

random_state = widgets.IntText(
    value=3407,
    description='random state',
    disabled=False
)
random_state.style.description_width = 'initial'
display(random_state)

IntText(value=64, description='r', style=DescriptionStyle(description_width='initial'))

IntText(value=64, description='lora alpha', style=DescriptionStyle(description_width='initial'))

IntText(value=0, description='lora dropout', style=DescriptionStyle(description_width='initial'))

IntText(value=3407, description='random state', style=DescriptionStyle(description_width='initial'))

### Training parameters

In [5]:
model_id = widgets.Text(
    value="unsloth/mistral-7b-bnb-4bit",
    description='Model ID',
    disabled=False
)
model_id.style.description_width = 'initial'
display(model_id)

train_dataset = widgets.Text(
    value="yahma/alpaca-cleaned",
    description='Train dataset',
    disabled=False
)
train_dataset.style.description_width = 'initial'
display(train_dataset)

per_device_train_batch_size = widgets.IntText(
    value=4,
    description='Per device train batch size',
    disabled=False
)
per_device_train_batch_size.style.description_width = 'initial'
display(per_device_train_batch_size)

gradient_accumulation_steps = widgets.IntText(
    value=4,
    description='Gradient accumulation steps',
    disabled=False
)
gradient_accumulation_steps.style.description_width = 'initial'
display(gradient_accumulation_steps)

dataset_num_proc = widgets.IntText(
    value=4,
    description='Dataset num proc',
    disabled=False
)
dataset_num_proc.style.description_width = 'initial'
display(dataset_num_proc)

logging_steps = widgets.IntText(
    value=1,
    description='Logging steps',
    disabled=False
)
logging_steps.style.description_width = 'initial'
display(logging_steps)

seed = widgets.IntText(
    value=42,
    description='Seed',
    disabled=False
)
seed.style.description_width = 'initial'
display(seed)

optim = widgets.Text(
    value="adamw_8bit",
    description='Optimizer',
    disabled=False
)
optim.style.description_width = 'initial'
display(optim)

warmup_steps = widgets.IntText(
    value=5,
    description='Warmup steps',
    disabled=False
)
warmup_steps.style.description_width = 'initial'
display(warmup_steps)

max_steps = widgets.IntText(
    value=60,
    description='Max steps',
    disabled=False
)
max_steps.style.description_width = 'initial'
display(max_steps)

eval_steps = widgets.IntText(
    value=5,
    description='Eval steps',
    disabled=False
)
eval_steps.style.description_width = 'initial'
display(eval_steps)

save_steps = widgets.IntText(
    value=5,
    description='Save steps',
    disabled=False
)
save_steps.style.description_width = 'initial'
display(save_steps)

learning_rate = widgets.FloatText(
    value="2e-4",
    description='Learning rate',
    disabled=False
)
learning_rate.style.description_width = 'initial'
display(learning_rate)

weight_decay = widgets.FloatText(
    value="0.01",
    description='Weight decay',
    disabled=False
)
weight_decay.style.description_width = 'initial'
display(weight_decay)

lr_scheduler_type = widgets.Text(
    value="linear",
    description='LR schedule type',
    disabled=False
)
lr_scheduler_type.style.description_width = 'initial'
display(lr_scheduler_type)

# max_length = widgets.IntText(
#     value=1024,
#     description='Max length',
#     disabled=False
# )
# max_length.style.description_width = 'initial'
# display(max_length)

# max_prompt_length = widgets.IntText(
#     value=512,
#     description='Max prompt length',
#     disabled=False
# )
# max_prompt_length.style.description_width = 'initial'
# display(max_prompt_length)

max_seq_length = widgets.IntText(
    value=2048,  #1024,
    description='Max seq length',
    disabled=False
)
max_seq_length.style.description_width = 'initial'
display(max_seq_length)

four_bit_bnb = widgets.Checkbox(
    value=True,
    description='Four bit bnb',
    disabled=False,
    indent=False
)
four_bit_bnb.style.description_width = 'initial'
display(four_bit_bnb)


Text(value='unsloth/mistral-7b-bnb-4bit', description='Model ID', style=TextStyle(description_width='initial')…

Text(value='yahma/alpaca-cleaned', description='Train dataset', style=TextStyle(description_width='initial'))

IntText(value=4, description='Per device train batch size', style=DescriptionStyle(description_width='initial'…

IntText(value=4, description='Gradient accumulation steps', style=DescriptionStyle(description_width='initial'…

IntText(value=4, description='Dataset num proc', style=DescriptionStyle(description_width='initial'))

IntText(value=1, description='Logging steps', style=DescriptionStyle(description_width='initial'))

IntText(value=42, description='Seed', style=DescriptionStyle(description_width='initial'))

Text(value='adamw_8bit', description='Optimizer', style=TextStyle(description_width='initial'))

IntText(value=5, description='Warmup steps', style=DescriptionStyle(description_width='initial'))

IntText(value=60, description='Max steps', style=DescriptionStyle(description_width='initial'))

IntText(value=5, description='Eval steps', style=DescriptionStyle(description_width='initial'))

IntText(value=5, description='Save steps', style=DescriptionStyle(description_width='initial'))

FloatText(value=0.0002, description='Learning rate', style=DescriptionStyle(description_width='initial'))

FloatText(value=0.01, description='Weight decay', style=DescriptionStyle(description_width='initial'))

Text(value='linear', description='LR schedule type', style=TextStyle(description_width='initial'))

IntText(value=2048, description='Max seq length', style=DescriptionStyle(description_width='initial'))

Checkbox(value=True, description='Four bit bnb', indent=False, style=CheckboxStyle(description_width='initial'…

### Tracking training with weights & biases (wandb)

In [6]:
wandb.login()

wandb_project = "ft-" + new_model_id.value
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mcorticalstack[0m ([33mcorticalstackteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


### Load dataset from the HuggingFace Hub

In [7]:
dataset = load_dataset(train_dataset.value)

In [8]:
# Split the dataset into training and testing sets
train_test_split = dataset["train"].train_test_split(test_size=0.1)

# Extract the training and testing datasets
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

print(f"train dataset records: {len(train_dataset)}")
print(f"test dataset records: {len(test_dataset)}")
train_dataset[0]

train dataset records: 46584
test dataset records: 5176


{'instruction': 'Generate an essay about the importance of physical education in schools.',
 'output': "Physical education is an integral part of the modern school curriculum, and its importance cannot be overemphasized. Over the years, physical education has been seen as a dispensable subject, with its true purposes and benefits often ignored. But as the world embraces physical fitness, the role of physical education in schools has become even more impactful, particularly in shaping the habits and behaviors that promote a healthy lifestyle.\n\nOne of the most obvious reasons why physical education should be an essential part of the school curriculum is the positive impact it has on students' health. With the alarming rates of childhood obesity and related diseases, it is particularly necessary for children to engage in physical activities that promote healthy body weight and cardiovascular fitness. Physical education classes provide students with the opportunity to participate in vari

### Load base model

In [9]:

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id.value,
    max_seq_length = max_seq_length.value,
    dtype = None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = four_bit_bnb.value, # Use 4bit quantization to reduce memory usage. Can be False
    token=HF_TOKEN
)

==((====))==  Unsloth: Fast Mistral patching release 2024.2
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.691 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.1.2.post301. CUDA = 8.6. CUDA Toolkit = 11.8.
\        /    Bfloat16 = TRUE. Xformers = 0.0.21. FA = True.
 "-____-"     Apache 2 free license: http://github.com/unslothai/unsloth


You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` attribute will be overwritten with the one you passed to `from_pretrained`.


### Set prompt

In [10]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [11]:
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def format_prompts(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

In [12]:
train_dataset_in_prompt_format = train_dataset.map(format_prompts, batched = True,)
test_dataset_in_prompt_format = test_dataset.map(format_prompts, batched = True,)
print(train_dataset_in_prompt_format[0]['text'])

Map:   0%|          | 0/46584 [00:00<?, ? examples/s]

Map:   0%|          | 0/5176 [00:00<?, ? examples/s]

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Generate an essay about the importance of physical education in schools.

### Input:


### Response:
Physical education is an integral part of the modern school curriculum, and its importance cannot be overemphasized. Over the years, physical education has been seen as a dispensable subject, with its true purposes and benefits often ignored. But as the world embraces physical fitness, the role of physical education in schools has become even more impactful, particularly in shaping the habits and behaviors that promote a healthy lifestyle.

One of the most obvious reasons why physical education should be an essential part of the school curriculum is the positive impact it has on students' health. With the alarming rates of childhood obesity and related diseases, it is particularly necessary for children to engage 

### Do model patching and add fast LoRA weights

In [13]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  #r.value,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,  #32,  #lora_alpha.value,
    lora_dropout = 0,  #lora_dropout.value, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,  #random_state.value,
    #max_seq_length = max_seq_length.value,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.2 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


#### Construct the model trainer
- Will train the model with TRL (Transformer Reinforcement Learning), with the SFT (Supervised Fine Tuning) trainer
- Use the text column of the dataset for training

In [14]:
output_dir = "./" + new_model_id.value + "/output"
logging_dir =  "./" + new_model_id.value + "/logs"

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset_in_prompt_format,
    eval_dataset = test_dataset_in_prompt_format,
    max_seq_length = max_seq_length.value,
    dataset_num_proc = 2,  #dataset_num_proc.value,
    dataset_text_field = "text",
    args = TrainingArguments(
        per_device_train_batch_size = 2,  #per_device_train_batch_size.value,
        gradient_accumulation_steps = 4,  #gradient_accumulation_steps.value,
        warmup_steps = warmup_steps.value,
        max_steps = max_steps.value,
        learning_rate = 2e-4,  #learning_rate.value,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,  #logging_steps.value,
        logging_dir = logging_dir,   
        optim = "adamw_8bit",  #optim.value,
        weight_decay = 0.01,  ##weight_decay.value,
        lr_scheduler_type = "linear",  #lr_scheduler_type.value,
        seed = 3407,  #seed.value,
        do_eval=True,
        evaluation_strategy="steps",
        eval_steps = eval_steps.value,  
        save_steps = save_steps.value,                
        report_to = "wandb", 
        run_name = f"{wandb_project}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
        output_dir = output_dir,
    ),
)

Map (num_proc=2):   0%|          | 0/46584 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/5176 [00:00<?, ? examples/s]

#### Train the model

In [15]:
trainer.train()

Step,Training Loss,Validation Loss
5,0.9054,0.982301


Checkpoint destination directory ./mistral-7b-alpaca/output/checkpoint-5 already exists and is non-empty.Saving will proceed but saved results may be invalid.


KeyboardInterrupt: 