In [None]:
import warnings
warnings.filterwarnings("ignore")

from pprint import pprint

import os

from unsloth import FastLanguageModel
import torch

from multiprocessing import cpu_count
num_proc = cpu_count()

import yaml

from data_processor import SplittedJsonIoDataset
from customs import customize_tokenizer

from unsloth import UnslothTrainer, UnslothTrainingArguments

from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments, DataCollatorForSeq2Seq, DataCollatorForLanguageModeling
from unsloth import is_bfloat16_supported

from unsloth.chat_templates import train_on_responses_only

from unsloth import unsloth_train

from utils import save_log_history

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 07-24 08:43:52 [__init__.py:244] Automatically detected platform cuda.


In [2]:
# Release unused cached GPU memory held by PyTorch before the model is loaded.
torch.cuda.empty_cache()

In [3]:
# Read the run configuration; every later cell pulls its settings from `config`.
with open("config.yaml") as config_file:
    config = yaml.safe_load(config_file)

# Load the base model and tokenizer with the keyword arguments listed in config.yaml.
loading_args = config["model_loading_args"]
model, tokenizer = FastLanguageModel.from_pretrained(**loading_args)

# Project hook that may adjust both the model and the tokenizer
# (it prints the chat-template example shown in this cell's output).
model, tokenizer = customize_tokenizer(model, tokenizer, config)

==((====))==  Unsloth 2025.6.8: Fast Llama patching. Transformers: 4.53.0. vLLM: 0.9.1.
   \\   /|    NVIDIA H100 PCIe. Num GPUs = 1. Max memory: 79.19 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 9.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

Tokenizer has a built-in chat template.

It follows an example of a formatted instruction using chat template. If instruction_part and
    response_part have been defined in config.yaml, please verify their correctness.

CHAT TEMPLATE

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>assistant<|end_header_id|>

SYSTEM MESSAGE PLACEHOLDER<|eot_id|><|start_header_id|>user<|end_header_id|>

USER INPUT MESSAGE PLACEHOLDER<|eot

In [4]:
# Sanity check: show the maximum sequence length exposed by the loaded model object.
print(f"Model's context window: {model.max_seq_length}")

Model's context window: 131072


In [5]:
# Build the dataset used for training; later cells read its "train" and "eval" entries.
dataset_builder = SplittedJsonIoDataset(tokenizer, config)
dataset = dataset_builder.create()

Filter:   0%|          | 0/1564 [00:00<?, ? examples/s]

Filter:   0%|          | 0/207 [00:00<?, ? examples/s]

LoRA Config

In [25]:
# Show the LoRA settings read from config.yaml; they are forwarded verbatim to get_peft_model.
pprint(config["lora_parameters"])

{'bias': 'none',
 'loftq_config': 'None',
 'lora_alpha': 8,
 'lora_dropout': 0,
 'r': 8,
 'target_modules': ['q_proj',
                    'k_proj',
                    'v_proj',
                    'o_proj',
                    'gate_proj',
                    'up_proj',
                    'down_proj',
                    'lm_head',
                    'embed_tokens'],
 'use_gradient_checkpointing': 'unsloth',
 'use_rslora': True}


In [26]:
# Add LoRA weights
# Work on a copy so the loaded config is not mutated when this cell is re-run.
lora_parameters = dict(config["lora_parameters"])

# YAML parses a bare `None` as the *string* "None" (use `null` or `~` for a real null),
# so the adapter config would receive a truthy string instead of "no LoftQ config".
# Normalise it to Python None before forwarding.
if lora_parameters.get("loftq_config") == "None":
    lora_parameters["loftq_config"] = None

model = FastLanguageModel.get_peft_model(
    model=model,
    **lora_parameters
)

Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2025.6.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


In [27]:
# Select data collator
training_type = config["fine_tuning_args"]["training_type"]

if training_type == "text_completion":
    # Flag read by a later cell to decide whether to wrap the trainer
    # with train_on_responses_only.
    _train_on_responses_only_bool = True
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
elif training_type == "continued_pre_training":
    _train_on_responses_only_bool = False
    # mlm=False selects causal (next-token) language modelling rather than masked LM.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
else:
    # ValueError is still an Exception subclass, and the message now names the
    # offending value so a config typo can be spotted immediately.
    raise ValueError(
        f"Wrong Training Type {training_type!r}. Check config.yaml "
        "(expected 'text_completion' or 'continued_pre_training')"
    )

Training Arguments

In [29]:
# Show the training arguments read from config.yaml; they are unpacked into UnslothTrainingArguments.
pprint(config["training_arguments"])

{'embedding_learning_rate': 5e-06,
 'eval_accumulation_steps': 16,
 'eval_strategy': 'steps',
 'fp16_full_eval': True,
 'gradient_accumulation_steps': 16,
 'learning_rate': 5e-05,
 'load_best_model_at_end': True,
 'logging_steps': 1,
 'logging_strategy': 'steps',
 'lr_scheduler_type': 'cosine',
 'metric_for_best_model': 'loss',
 'num_train_epochs': 5,
 'optim': 'adamw_8bit',
 'output_dir': 'Llama-3.1-8B-Instruct-CTI-4BIT-ALL-MODULES',
 'per_device_eval_batch_size': 1,
 'per_device_train_batch_size': 1,
 'report_to': 'tensorboard',
 'save_strategy': 'best',
 'save_total_limit': 1,
 'seed': 1234,
 'warmup_ratio': 0.1,
 'weight_decay': 0.01}


In [30]:
# Initiate trainer
# Pick the mixed-precision mode once: bf16 when the GPU supports it, fp16 otherwise.
bf16_available = is_bfloat16_supported()

# NOTE(review): the printed config sets eval_strategy='steps' with logging_steps=1 and no
# eval_steps, and the training log shows a validation loss at every step. Confirm that
# evaluating on every optimizer step is intended, since it adds a full eval pass per step.
training_args = UnslothTrainingArguments(
    bf16 = bf16_available,
    fp16 = not bf16_available,
    **config["training_arguments"]
)

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset["train"],
    eval_dataset = dataset["eval"],
    data_collator = data_collator,
    dataset_text_field = "text",
    # Used only when packing=True for creating a ConstantLengthDataset.
    max_seq_length = config["model_loading_args"]["max_seq_length"],
    packing = config["sft_trainer_arguments"]["apply_packing"],
    dataset_num_proc = num_proc,
    args = training_args,
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/1546 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"]:   0%|          | 0/206 [00:00<?, ? examples/s]

In [31]:
# Wrap the trainer so that training uses only the assistant part of each example.
# The two marker strings come from config.yaml and delimit the instruction and response parts.
if _train_on_responses_only_bool:
    response_only_kwargs = {
        "instruction_part": config["instruction_part"],
        "response_part": config["response_part"],
    }
    trainer = train_on_responses_only(trainer, **response_only_kwargs)

Map (num_proc=30):   0%|          | 0/1546 [00:00<?, ? examples/s]

Map (num_proc=30):   0%|          | 0/206 [00:00<?, ? examples/s]

In [32]:
# Attach early stopping only when a patience value is configured.
# .get() lets config.yaml omit the key entirely instead of raising KeyError;
# a missing key, null or 0 all mean "no early stopping".
early_stopping_patience = config.get("early_stopping_patience")
if early_stopping_patience:
    from transformers import EarlyStoppingCallback
    early_stopping_callback = EarlyStoppingCallback(early_stopping_patience = early_stopping_patience)
    trainer.add_callback(early_stopping_callback)

In [None]:
# Start training through Unsloth's training wrapper and keep whatever it returns in trainer_stats.
trainer_stats = unsloth_train(trainer)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,546 | Num Epochs = 5 | Total steps = 485
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 16 x 1) = 16
 "-____-"     Trainable parameters = 1,071,644,672/8,000,000,000 (13.40% trained)


Unsloth: Setting lr = 5.00e-06 instead of 5.00e-05 for embed_tokens.
Unsloth: Setting lr = 5.00e-06 instead of 5.00e-05 for lm_head.
Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
1,0.3171,0.393374
2,0.1711,0.393353
3,0.7294,0.393321
4,0.1523,0.393247
5,0.9025,0.392893
6,0.4993,0.392486
7,0.7285,0.391692
8,0.6227,0.390827
9,0.7698,0.389365
10,0.4281,0.387127


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [None]:
# Persist the trainer's log history via the project helper.
# NOTE(review): presumably writes to ./log_history, which the next cell copies — confirm in utils.py.
save_log_history(trainer)

In [None]:
!sudo mkdir /mnt/data/training-outputs/Llama-3.1-8B-Instruct-CTI-4BIT-ALL-MODULES
!sudo cp -r Llama-3.1-8B-Instruct-CTI-4BIT-ALL-MODULES /mnt/data/training-outputs/Llama-3.1-8B-Instruct-CTI-4BIT-ALL-MODULES
!sudo cp -r log_history /mnt/data/training-outputs/Llama-3.1-8B-Instruct-CTI-4BIT-ALL-MODULES
!ls /mnt/data/training-outputs/Llama-3.1-8B-Instruct-CTI-4BIT-ALL-MODULES

In [None]:
from transformers import TextStreamer

# Switch the fine-tuned model to Unsloth's inference mode.
FastLanguageModel.for_inference(model)
# Streamer shared by inference(): emits generated text as it is produced,
# leaving out the prompt and special tokens.
text_streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

def format_input_prompt(system_message, user_input):
    """Build the two-turn message list handed to the tokenizer's chat template.

    Args:
        system_message: Text placed in the first turn.
        user_input: Text placed in the second turn, under the "user" role.

    Returns:
        A list of two ``{"role": ..., "content": ...}`` dicts: first turn, then user turn.
    """
    # NOTE(review): the first turn carries role "assistant", not "system". This looks
    # deliberate (the template example printed when the tokenizer is customized puts the
    # system placeholder under an assistant header) — confirm against the training data
    # format before changing it, or inference prompts will stop matching training prompts.
    first_turn = dict(role="assistant", content=system_message)
    user_turn = dict(role="user", content=user_input)
    return [first_turn, user_turn]

def format_validation_example_for_inference(example):
    """Extract the user turn from an already chat-formatted example string.

    Args:
        example: Formatted text containing a user header followed by an assistant turn.

    Returns:
        The text between the first user header and the next
        end-of-turn + assistant header (surrounding whitespace is kept as is).

    Raises:
        IndexError: If ``example`` contains no user header.
    """
    user_header = "<|start_header_id|>user<|end_header_id|>"
    assistant_turn_start = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"

    # Everything after the first user header (up to a second one, if any).
    after_user_header = example.split(user_header)[1]
    # Keep only what precedes the assistant's reply.
    user_text, _, _ = after_user_header.partition(assistant_turn_start)
    return user_text

def inference(model, system_message, user_input, max_new_tokens=None, **kwargs):
    """Generate a reply for one system/user message pair and stream it as it is produced.

    Relies on the notebook-level ``tokenizer`` and ``text_streamer`` objects.

    Args:
        model: Model whose ``generate`` method is called.
        system_message: First-turn text, formatted by ``format_input_prompt``.
        user_input: User-turn text, formatted by ``format_input_prompt``.
        max_new_tokens: Cap on generated tokens. When falsy (``None`` or 0), the cap is
            whatever is left of ``model.config.max_position_embeddings`` after the prompt.
        **kwargs: Extra options forwarded unchanged to ``model.generate``.

    Returns:
        None. The generated text is emitted through ``text_streamer``.

    Raises:
        ValueError: If no explicit cap is given and the prompt already fills the
            model's context window.
    """
    input_ids = tokenizer.apply_chat_template(
        format_input_prompt(system_message, user_input),
        add_generation_prompt=True,
        return_tensors = "pt").to("cuda")
    if not max_new_tokens:
        prompt_length = input_ids.shape[-1]
        max_new_tokens = model.config.max_position_embeddings - prompt_length
        # Fail early with a readable message instead of passing a non-positive
        # budget down to generate().
        if max_new_tokens <= 0:
            raise ValueError(
                f"Prompt is {prompt_length} tokens long, which leaves no room for generation "
                f"within the model's {model.config.max_position_embeddings}-token context window."
            )
    model.generate(input_ids, streamer = text_streamer, max_new_tokens=max_new_tokens, **kwargs)

In [None]:
# Smoke test: empty first-turn message, a generic greeting, generation capped at 100 new tokens.
system_message = ""
user_input = "Hello! How are you?"
inference(model, system_message, user_input, max_new_tokens=100)

In [None]:
# Sample a completion for one held-out validation example (index 134 of the eval split),
# using the system message from config.yaml.
system_message = config["system_message"]
user_input = format_validation_example_for_inference(dataset["eval"]["text"][134])

# Sampling options forwarded to model.generate through inference(**kwargs).
sampling_kwargs = dict(
    temperature=0.7,
    top_p=0.6,
    repetition_penalty=1.1,
    no_repeat_ngram_size=3,
    do_sample=True,
)

# max_new_tokens=None lets inference() use all the context left after the prompt.
inference(model, system_message, user_input, max_new_tokens=None, **sampling_kwargs)