In [2]:
from unsloth import FastLanguageModel
import torch
import gc
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


  import trl.experimental.openenv.utils as openenv_utils


## Fine Tuning

In [None]:
# --- 1. Memory budget (tuned for GPUs with 8GB of VRAM or less) --------------
# Longest sequence, in tokens, the model is configured for; capped at 2048 so
# an 8GB card does not run out of memory.
max_seq_length = 2048
# None asks the loader to pick the weight dtype from the detected hardware.
dtype = None
# 4-bit quantization; required to fit this model on an 8GB card.
load_in_4bit = True

# --- 2. Base model ------------------------------------------------------------
print("Loading Model...")
# Returns the model together with its tokenizer. The checkpoint is Unsloth's
# pre-quantized 4-bit LLaMA-3 8B.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# --- 3. LoRA adapters (the part that actually gets trained) -------------------
# Low-Rank Adaptation attaches small trainable matrices to the listed modules
# and leaves the original model weights untouched.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # adapter rank: 16 is the standard choice, 8 saves more VRAM
    target_modules=[
        # attention projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # MLP projections inside each transformer block
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,  # LoRA scaling parameter; set equal to r here
    lora_dropout=0,  # 0 is the value Unsloth's optimized path is built for
    bias="none",  # no bias terms are adapted
    use_gradient_checkpointing="unsloth",  # memory saver; required on 8GB VRAM
    random_state=3407,  # seed for reproducible adapter initialization
    use_rslora=False,  # Rank-Stabilized LoRA disabled
    loftq_config=None,  # LoftQ disabled
)

# --- 4. Training data ---------------------------------------------------------
# Read the JSONL file into a HuggingFace dataset, exposed as the "train" split.
dataset = load_dataset(
    "json",
    data_files="../data/royal_dataset.jsonl",
    split="train",
)

# --- 5. Prompt template -------------------------------------------------------
# Alpaca instruction-following layout; the three slots are filled, in order,
# with the instruction, the input and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def formatting_prompts_func(examples):
    """
    Render a batch of dataset records into Alpaca-style training strings.

    Reads the module-level ``alpaca_prompt`` template and ``tokenizer`` at call
    time, so both must be defined before ``dataset.map`` invokes this function.

    Args:
        examples: Mapping of column name to a list of values. Must provide the
            'instruction', 'input' and 'output' columns (the layout handed over
            by ``dataset.map(..., batched=True)``).

    Returns:
        Dictionary with a single 'text' key holding one formatted string per
        record, each terminated with ``tokenizer.eos_token``.
    """
    # The EOS marker is identical for every record, so look it up once.
    eos_token = tokenizer.eos_token

    # The middle loop variable is deliberately not called `input`, so the
    # builtin of that name is not shadowed.
    texts = [
        alpaca_prompt.format(instruction, context, output) + eos_token
        for instruction, context, output in zip(
            examples["instruction"], examples["input"], examples["output"]
        )
    ]

    return {"text": texts}

# Add a "text" column holding the fully rendered prompt for every record.
# batched=True hands the formatter a dict of column lists per call.
dataset = dataset.map(formatting_prompts_func, batched=True)

# 6. The Trainer
print("Starting Training...")

# Query the hardware once so the two precision flags below cannot disagree.
bf16_supported = torch.cuda.is_bf16_supported()

# Where the adapter and tokenizer are written, relative to the notebook's
# working directory. Kept in one place so the save calls and the final message
# always agree.
adapter_dir = "lora_model"

# SFTTrainer (Supervised Fine-Tuning Trainer) drives the training loop.
trainer = SFTTrainer(
    model=model,  # the LoRA-wrapped model
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",  # column produced by formatting_prompts_func
    max_seq_length=max_seq_length,
    dataset_num_proc=2,  # worker processes for dataset preprocessing
    packing=False,  # do not pack several samples into one sequence
    args=TrainingArguments(
        per_device_train_batch_size=2,  # keep LOW (1-2) on 8GB VRAM
        gradient_accumulation_steps=4,  # 2 x 4 = 8 samples per optimizer step
        warmup_steps=5,  # learning-rate warmup steps
        max_steps=60,  # short run, intended as a quick "Hello World" test
        learning_rate=2e-4,
        fp16=not bf16_supported,  # fall back to fp16 when bf16 is unavailable
        bf16=bf16_supported,
        logging_steps=1,  # log the loss at every step
        optim="adamw_8bit",  # 8-bit AdamW to save VRAM
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",  # trainer checkpoints and logs
    ),
)

# Start the fine-tuning process
trainer.train()

print("Training finished! Saving model...")
# Persist the LoRA adapter weights/config and the tokenizer side by side.
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)
# Report the relative location actually written; the previous message printed
# "/lora_model", which reads as a path under the filesystem root.
print(f"Model saved to ./{adapter_dir}")

## Inference

In [2]:
# --- 1. Inference settings ----------------------------------------------------
# The adapter is read from the "lora_model" folder written by the training
# cell; Unsloth resolves the base model from the adapter configuration.
max_seq_length = 1024  # reduced sequence length for faster inference
dtype = None  # auto-detect the data type from the hardware
load_in_4bit = True  # stay in 4-bit for memory efficiency

print("Loading your Royal Model...")
# Load the fine-tuned model (base weights + LoRA adapter) and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="lora_model",  # local folder holding the saved adapter + config
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="cuda",  # keep the model on the GPU (no CPU offloading)
)

# --- 2. Native inference mode (2x faster) -------------------------------------
# Turns off training-only features on the loaded model.
FastLanguageModel.for_inference(model)

# --- 3. Prompt template -------------------------------------------------------
# Same wording as the training template, but it stops right after
# "### Response:" so the model writes the answer itself.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
"""

Loading your Royal Model...
==((====))==  Unsloth 2025.12.1: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA GeForce RTX 2080 SUPER. Num GPUs = 1. Max memory: 8.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
==((====))==  Unsloth 2025.12.1: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA GeForce RTX 2080 SUPER. Num GPUs = 1. Max memory: 8.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.12.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [22]:
if __name__ == "__main__":
    # Single-shot smoke test of the fine-tuned model.
    # NOTE: the banner advertises an 'exit' command, but this cell sends one
    # hard-coded prompt and never reads from stdin.
    print("\n--- The Royal Court is in Session ---")
    print("(Type 'exit' to abdicate the throne)\n")

    user_input = "what will you do for me?"

    # str.format ignores surplus positional arguments, so this call works with
    # both the two-slot inference template and the three-slot training one.
    inputs = tokenizer(
        [alpaca_prompt.format(user_input, "", "")],
        return_tensors="pt",
    ).to("cuda")
    # Number of prompt tokens; everything generate() returns past this index
    # is newly generated.
    prompt_token_count = inputs.input_ids.shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        use_cache=True,
        # Ask for sampling explicitly so `temperature` takes effect whatever
        # the checkpoint's default generation config says.
        do_sample=True,
        temperature=0.7,
    )

    # DEBUG: show exactly what went in and what came out.
    full_response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    prompt_text = tokenizer.batch_decode(inputs.input_ids, skip_special_tokens=True)[0]

    print("="*50)
    print("PROMPT TEXT:")
    print(repr(prompt_text))  # repr exposes exact whitespace/newlines
    print("="*50)
    print("FULL RESPONSE:")
    print(repr(full_response))
    print("="*50)

    # Take the completion from the token ids rather than by slicing the
    # decoded string by character count: a decode round-trip is not guaranteed
    # to reproduce the prompt text length exactly.
    generated_text = tokenizer.decode(
        outputs[0][prompt_token_count:], skip_special_tokens=True
    ).strip()

    print(f"User: {user_input}")
    print(f"Assistant: {generated_text}")

    # Cleanup: release the tensors, then hand cached GPU memory back.
    del inputs, outputs
    gc.collect()
    torch.cuda.empty_cache()


--- The Royal Court is in Session ---
(Type 'exit' to abdicate the throne)

PROMPT TEXT:
'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nwhat will you do for me?\n\n### Input:\n\n\n### Response:\n'
FULL RESPONSE:
'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nwhat will you do for me?\n\n### Input:\n\n\n### Response:\nWe shall grant thee royal favors befitting thy station.'
User: what will you do for me?
Assistant: We shall grant thee royal favors befitting thy station.
PROMPT TEXT:
'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nwhat will you do for me?\n\n### Input:\n\n\n### Response:\n'
FULL R

## Retraining with additional training data and increased lora_alpha

In [5]:
## memory cleanup

# Drop this kernel's references to any previously loaded model first.
# gc.collect() and torch.cuda.empty_cache() can only give back memory that
# nothing still points to; while `model` / `trainer` from an earlier cell stay
# bound, the old weights remain resident during the next load.
for _stale_name in ("trainer", "model", "tokenizer"):
    globals().pop(_stale_name, None)  # no-op when the name was never defined
del _stale_name

# Cleanup
gc.collect()
torch.cuda.empty_cache()

# --- 1. Memory budget (tuned for GPUs with 8GB of VRAM or less) --------------
max_seq_length = 2048  # token cap per sequence; keeps an 8GB card from OOM
dtype = None  # None = auto-detect the weight dtype from the hardware
load_in_4bit = True  # 4-bit quantization; required on an 8GB card

# --- 2. Base model ------------------------------------------------------------
print("Loading Model...")
# Load Unsloth's pre-quantized 4-bit LLaMA-3 8B checkpoint and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# --- 3. LoRA adapters ---------------------------------------------------------
# Low-Rank Adaptation adds trainable parameters without modifying the original
# model weights.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # adapter rank: 16 is the standard choice, 8 saves more VRAM
    target_modules=[
        # attention projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # MLP projections inside each transformer block
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=32,  # LoRA scaling parameter; raised to twice r for this run
    lora_dropout=0,  # 0 is the value Unsloth's optimized path is built for
    bias="none",  # no bias terms are adapted
    use_gradient_checkpointing="unsloth",  # memory saver; required on 8GB VRAM
    random_state=3407,  # seed for reproducible adapter initialization
    use_rslora=False,  # Rank-Stabilized LoRA disabled
    loftq_config=None,  # LoftQ disabled
)

# --- 4. Training data ---------------------------------------------------------
# Read the JSONL file into a HuggingFace dataset, exposed as the "train" split.
dataset = load_dataset(
    "json",
    data_files="../data/royal_dataset.jsonl",
    split="train",
)

# --- 5. Prompt template -------------------------------------------------------
# Alpaca instruction-following layout; the three slots are filled, in order,
# with the instruction, the input and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def formatting_prompts_func(examples):
    """
    Render a batch of dataset records into Alpaca-style training strings.

    Reads the module-level ``alpaca_prompt`` template and ``tokenizer`` at call
    time, so both must be defined before ``dataset.map`` invokes this function.

    Args:
        examples: Mapping of column name to a list of values. Must provide the
            'instruction', 'input' and 'output' columns (the layout handed over
            by ``dataset.map(..., batched=True)``).

    Returns:
        Dictionary with a single 'text' key holding one formatted string per
        record, each terminated with ``tokenizer.eos_token``.
    """
    # The EOS marker is identical for every record, so look it up once.
    eos_token = tokenizer.eos_token

    # The middle loop variable is deliberately not called `input`, so the
    # builtin of that name is not shadowed.
    texts = [
        alpaca_prompt.format(instruction, context, output) + eos_token
        for instruction, context, output in zip(
            examples["instruction"], examples["input"], examples["output"]
        )
    ]

    return {"text": texts}

# Add a "text" column holding the fully rendered prompt for every record.
# batched=True hands the formatter a dict of column lists per call.
dataset = dataset.map(formatting_prompts_func, batched=True)

# 6. The Trainer
print("Starting Training...")

# Query the hardware once so the two precision flags below cannot disagree.
bf16_supported = torch.cuda.is_bf16_supported()

# Where the adapter and tokenizer are written, relative to the notebook's
# working directory. Kept in one place so the save calls and the final message
# always agree.
adapter_dir = "lora_model_v3"

# SFTTrainer (Supervised Fine-Tuning Trainer) drives the training loop.
trainer = SFTTrainer(
    model=model,  # the LoRA-wrapped model
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",  # column produced by formatting_prompts_func
    max_seq_length=max_seq_length,
    dataset_num_proc=2,  # worker processes for dataset preprocessing
    packing=False,  # do not pack several samples into one sequence
    args=TrainingArguments(
        per_device_train_batch_size=2,  # keep LOW (1-2) on 8GB VRAM
        gradient_accumulation_steps=4,  # 2 x 4 = 8 samples per optimizer step
        warmup_steps=5,  # learning-rate warmup steps
        max_steps=60,  # total optimizer steps for this run
        learning_rate=2e-4,
        fp16=not bf16_supported,  # fall back to fp16 when bf16 is unavailable
        bf16=bf16_supported,
        logging_steps=1,  # log the loss at every step
        optim="adamw_8bit",  # 8-bit AdamW to save VRAM
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",  # trainer checkpoints and logs
    ),
)

# Start the fine-tuning process
trainer.train()

print("Training finished! Saving model...")
# Persist the LoRA adapter weights/config and the tokenizer side by side.
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)
# Report the relative location actually written; the previous message printed
# "/lora_model_v3", which reads as a path under the filesystem root.
print(f"Model saved to ./{adapter_dir}")

Loading Model...
==((====))==  Unsloth 2025.12.1: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA GeForce RTX 2080 SUPER. Num GPUs = 1. Max memory: 8.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Starting Training...


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100 | Num Epochs = 5 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Step,Training Loss
1,3.1562
2,2.6863
3,2.9873
4,2.8774
5,2.5626
6,2.2032
7,2.038
8,1.9903
9,1.7459
10,1.6636


Training finished! Saving model...
Model saved to /lora_model_v3


## Training With New Data

In [3]:
## memory cleanup

# Drop this kernel's references to any previously loaded model first.
# gc.collect() and torch.cuda.empty_cache() can only give back memory that
# nothing still points to; while `model` / `trainer` from an earlier cell stay
# bound, the old weights remain resident during the next load.
for _stale_name in ("trainer", "model", "tokenizer"):
    globals().pop(_stale_name, None)  # no-op when the name was never defined
del _stale_name

# Cleanup
gc.collect()
torch.cuda.empty_cache()

# --- 1. Memory budget (tuned for GPUs with 8GB of VRAM or less) --------------
# Token cap per sequence; 2048 keeps an 8GB card from running out of memory.
max_seq_length = 2048
# Weight dtype; None means "detect automatically from the hardware".
dtype = None
# 4-bit quantization switch; required on an 8GB card.
load_in_4bit = True

# --- 2. Base model ------------------------------------------------------------
print("Loading Model...")
# from_pretrained returns the model and its tokenizer; the checkpoint is
# Unsloth's pre-quantized 4-bit LLaMA-3 8B.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# --- 3. LoRA adapters ---------------------------------------------------------
# Low-Rank Adaptation adds trainable parameters without modifying the original
# model weights.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # adapter rank: 16 is the standard choice, 8 saves more VRAM
    target_modules=[
        # attention projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # MLP projections inside each transformer block
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,  # LoRA scaling parameter; set equal to r here
    lora_dropout=0,  # 0 is the value Unsloth's optimized path is built for
    bias="none",  # no bias terms are adapted
    use_gradient_checkpointing="unsloth",  # memory saver; required on 8GB VRAM
    random_state=3407,  # seed for reproducible adapter initialization
    use_rslora=False,  # Rank-Stabilized LoRA disabled
    loftq_config=None,  # LoftQ disabled
)

# --- 4. Training data ---------------------------------------------------------
# Read the JSONL file into a HuggingFace dataset, exposed as the "train" split.
dataset = load_dataset(
    "json",
    data_files="../data/royal_dataset.jsonl",
    split="train",
)

# --- 5. Prompt template -------------------------------------------------------
# Alpaca instruction-following layout; the three slots are filled, in order,
# with the instruction, the input and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def formatting_prompts_func(examples):
    """
    Render a batch of dataset records into Alpaca-style training strings.

    Reads the module-level ``alpaca_prompt`` template and ``tokenizer`` at call
    time, so both must be defined before ``dataset.map`` invokes this function.

    Args:
        examples: Mapping of column name to a list of values. Must provide the
            'instruction', 'input' and 'output' columns (the layout handed over
            by ``dataset.map(..., batched=True)``).

    Returns:
        Dictionary with a single 'text' key holding one formatted string per
        record, each terminated with ``tokenizer.eos_token``.
    """
    # The EOS marker is identical for every record, so look it up once.
    eos_token = tokenizer.eos_token

    # The middle loop variable is deliberately not called `input`, so the
    # builtin of that name is not shadowed.
    texts = [
        alpaca_prompt.format(instruction, context, output) + eos_token
        for instruction, context, output in zip(
            examples["instruction"], examples["input"], examples["output"]
        )
    ]

    return {"text": texts}

# Add a "text" column holding the fully rendered prompt for every record.
# batched=True hands the formatter a dict of column lists per call.
dataset = dataset.map(formatting_prompts_func, batched=True)

# 6. The Trainer
print("Starting Training...")

# Query the hardware once so the two precision flags below cannot disagree.
bf16_supported = torch.cuda.is_bf16_supported()

# Where the adapter and tokenizer are written, relative to the notebook's
# working directory. Kept in one place so the save calls and the final message
# always agree.
adapter_dir = "lora_model_v4"

# SFTTrainer (Supervised Fine-Tuning Trainer) drives the training loop.
trainer = SFTTrainer(
    model=model,  # the LoRA-wrapped model
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",  # column produced by formatting_prompts_func
    max_seq_length=max_seq_length,
    dataset_num_proc=2,  # worker processes for dataset preprocessing
    packing=False,  # do not pack several samples into one sequence
    args=TrainingArguments(
        per_device_train_batch_size=2,  # keep LOW (1-2) on 8GB VRAM
        gradient_accumulation_steps=4,  # 2 x 4 = 8 samples per optimizer step
        warmup_steps=5,  # learning-rate warmup steps
        max_steps=80,  # total optimizer steps for this run
        learning_rate=2e-4,
        fp16=not bf16_supported,  # fall back to fp16 when bf16 is unavailable
        bf16=bf16_supported,
        logging_steps=1,  # log the loss at every step
        optim="adamw_8bit",  # 8-bit AdamW to save VRAM
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",  # trainer checkpoints and logs
    ),
)

# Start the fine-tuning process
trainer.train()

print("Training finished! Saving model...")
# Persist the LoRA adapter weights/config and the tokenizer side by side.
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)
# Report the relative location actually written; the previous message printed
# "/lora_model_v4", which reads as a path under the filesystem root.
print(f"Model saved to ./{adapter_dir}")

Loading Model...
==((====))==  Unsloth 2025.12.4: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA GeForce RTX 2080 SUPER. Num GPUs = 1. Max memory: 8.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.12.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Starting Training...


The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 157 | Num Epochs = 4 | Total steps = 80
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.585
2,2.9262
3,2.7702
4,2.4832
5,2.3066
6,2.4634
7,2.1707
8,2.017
9,2.0345
10,1.8309


Training finished! Saving model...
Model saved to /lora_model_v4
