#### Load custom .jsonl collection of city information extraction prompts and responses

In [1]:
from datasets import load_dataset
dataset = load_dataset("teknium/openhermes")


In [2]:
# Split the dataset into training and testing sets
train_test_split = dataset["train"].train_test_split(test_size=0.1)

# Extract the training and testing datasets
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

In [3]:
print(f"train dataset records: {len(train_dataset)}")
print(f"test dataset records: {len(test_dataset)}")

train dataset records: 218547
test dataset records: 24284


In [4]:
train_dataset[0]

{'instruction': 'What type of data structure is best suited for a database table?',
 'output': 'The best data structure for a database table is a relational database. Relational databases allow for fast retrieval of stored information and utilize Structured Query Language (SQL) to create relationships between database tables.',
 'input': ''}

#### Load the base pre-trained model and tokenizer
- Unsloth has it's own from_pretrained method.
- "load_in_4bit" indicates that the model will be quantized with bitsandbytes NormalFloat4 data type. This is the standard data type for QLoRA fine-tuning

In [5]:
import os
gpu_id = 0
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
MAX_SEQ_LENGTH = 2048
LOAD_IN_4BIT = True # Use 4bit quantization to reduce memory usage. Can be False.

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = "mistralai/Mistral-7B-v0.1"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_id, pad_token = "[PAD]")
tokenizer.padding_side = 'right' # to prevent warnings

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Transform our prompt/response dataset into Alpaca prompt template format 

In [7]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Will add EOS_TOKEN, otherwise your generation will go on forever!
def format_prompts(samples):
    instructions = samples["instruction"]
    inputs       = samples["input"]
    outputs      = samples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN # Input is empty as no completion history
        texts.append(text)
    return { "text" : texts, }
pass

train_dataset_in_prompt_format = train_dataset.map(format_prompts, batched = True,)
test_dataset_in_prompt_format = test_dataset.map(format_prompts, batched = True,)
print(train_dataset_in_prompt_format[0]['text'])


Map:   0%|          | 0/218547 [00:00<?, ? examples/s]

Map:   0%|          | 0/24284 [00:00<?, ? examples/s]

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
What type of data structure is best suited for a database table?

### Input:


### Response:
The best data structure for a database table is a relational database. Relational databases allow for fast retrieval of stored information and utilize Structured Query Language (SQL) to create relationships between database tables.</s>


#### Do model patching and add fast LoRA weights
r and lora_aplha are the most important parameters in LoRA configuration.

** r is the rank of the LoRA matrices:
- A higher r-value means more trainable parameters, allowing for more expressivity. But, on the negative side, there is a compute tradeoff, and may also lead to overfitting.
- A lower r-value means less trainable parameters, it can reduce overfitting at the cost of expressiveness.


** lora_aplha is a scaling factor for LoRA weights:
- Higher alpha will put more emphasis on LoRA weights.
- Lower alpha will put reduced emphasis on LoRA weights, hence model will be more dependent on its original weights.


** Important tips:
- Golden rule: lora_aplha = 2*r, i.e., if r=128 and lora_aplha should be 256
- Both r and lora_aplha should be in 2**x value, a good range for selection will be [8, 16, 32, 64, 128, 256, 512]
- If your fine-tuning data is very different from the pre-training data of your model, I recommend selecting r and lora_aplha from the higher values from the above range and vice versa.

In [8]:

from peft import LoraConfig

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=32,
        lora_dropout=0,
        r=16,
        bias="none",
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",]
)


#### Define training arguments
Train for MAX_STEPS with a total batch size of 24 (per_device_train_batch_size*gradient_accumulation_steps)

In [9]:
MAX_STEPS=100
from transformers import TrainingArguments
training_arguments = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="steps",
        do_eval=True,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=6,
        per_device_eval_batch_size=4,
        log_level="debug",
        save_steps=10,
        logging_steps=10, 
        learning_rate=2e-4,
        eval_steps=50,
        optim='adamw_8bit',
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        weight_decay=0.1,
        max_steps=MAX_STEPS,
        warmup_ratio=0.01,
        lr_scheduler_type="linear",
)

In [11]:
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=3,          # batch size per device during training
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=True,                       # push model to hub
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


#### Construct the model trainer
- Will train the model with TRL (Transformer Reinforcement Learning), with the SFT (Supervised Fine Tuning) trainer
- Use the text column of the dataset for training

In [12]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model = model,
    train_dataset = train_dataset_in_prompt_format,
    eval_dataset = test_dataset_in_prompt_format,
    dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LENGTH,
    tokenizer = tokenizer,
    args = training_arguments,
)


Map:   0%|          | 0/218547 [00:00<?, ? examples/s]

ValueError: You cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft for more details

#### Show current memory stats

In [None]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

#### Train the model

In [None]:
trainer_stats = trainer.train()

#### Show final memory and time stats

In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

#### Inference
Infer from the model using the earlier defined Alpca prompt format, leaving response blank

In [None]:
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence for the next 10 numbers.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # response, leaving blank for generation!
    )
]*1, return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

#### Inference with text streamer


In [None]:
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence for the next 10 numbers.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # response, leaving blank for generation!
    )
]*1, return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

#### Saving only the LoRA adapters and NOT the full model
Using Huggingface's push_to_hub for an online save or save_pretrained for a local save.

In [None]:
if False: model.save_pretrained("mistral_lora_model") # local saving
if False: model.push_to_hub("your_name/lora_model", token = "...") # Huggingface hub Online saving

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
HF_TOKEN = os.getenv('HF_TOKEN')

#### Example saving as transformers architecture for VLLM
Note unsloth push_to_hub_merged is same as HF push_to_hub, with additional perf features

Be aware if I save here, then it merges into n-bit then clears the LoRAs, so will get a NoneType error if later saving the save_pretrained_gguf format

In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

#### GGUF / llama.cpp Conversion
Unsloth provides native GGUF/llama.cpp save. Clones llama.cpp and default save it to q8_0. Other quants include q4_k_m. Use save_pretrained_gguf for local saving and push_to_hub_gguf for uploading to HF.

In [None]:
### GGUF / llama.cpp Conversion
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model_q4_k_m_gguf", tokenizer, quantization_method = "q4_k_m")
if True: model.push_to_hub_gguf("CorticalStack/OpenHermes-Mistral-7B-GGUF", tokenizer, quantization_method = "q4_k_m", token = HF_TOKEN)