In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 4096
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit", # Choose Quantized llama-3-8b-instructed model by unsloth to save GRAM
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.988 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [2]:
# sample of dataprocessing
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "system", "content": "You are a chatbot for people who want to kill time"},
    {"role": "human", "content": "Hello, nice to meet you"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
print(tokenizer.batch_decode(outputs))

["<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a chatbot for people who want to kill time<|eot_id|><|start_header_id|>human<|end_header_id|>\n\nHello, nice to meet you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello there! I'm happy to help you kill some time. What's on your mind? Do you want to chat about something in particular, or just shoot the breeze? I'm all ears (or rather, all text).<|eot_id|>"]


In [2]:
# Add lora adapter to the model
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
# Load the training and evaluation dataset (the eval dataset is same as origin llama3-8b-instruct)
import pickle
with open('data/llama3trainingdataset.pkl', 'rb') as f:
    training_dataset = pickle.load(f)

In [4]:
training_dataset = [{'conversations': i} for i in training_dataset]

In [5]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

training_dataset = Dataset.from_list(training_dataset)

In [6]:
train_test_split = training_dataset.train_test_split(test_size=0.166666, shuffle=False) # dataset already shuffled

train_dataset = train_test_split['train'] # 50000 instance
valid_dataset = train_test_split['test']  # 10000 instance
print(train_dataset)
print(valid_dataset)

Dataset({
    features: ['conversations'],
    num_rows: 50000
})
Dataset({
    features: ['conversations'],
    num_rows: 10000
})


In [7]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3"
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }

train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
valid_dataset = valid_dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [8]:
# inspect the format of three datasets, all should looks the same
print(train_dataset[0])
print('---------------')
print(valid_dataset[0])

{'conversations': [{'content': "You are a personal judge of video game, your role is to judge which game is preferred by the user based on user's feedbacks of two games. Simply reply with prefered game's name, no need for explanation", 'role': 'system'}, {'content': 'I played two games counterstrike: source, and fallout: a post nuclear role playing game. After playing, I gave reviews for both games as follow:\ncounterstrike: source: good gmod texture pack\nfallout: a post nuclear role playing game: this games mom is a critical failure', 'role': 'user'}, {'content': 'counterstrike: source', 'role': 'assistant'}], 'text': "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a personal judge of video game, your role is to judge which game is preferred by the user based on user's feedbacks of two games. Simply reply with prefered game's name, no need for explanation<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nI played two games counterstrike: source, and fallout

In [9]:
import re

def normalize_title(title):
    title = title.replace('.', '')
    title = title.replace(',', '')
    title = title.replace('/', '')
    title = title.replace(' ', '')
    title = title.replace("'", '')
    title = title.replace("-", '')
    # Remove non-ascii characters
    title = title.encode("ascii", errors="ignore").decode()
    # Remove extra spaces
    title = re.sub(' +', ' ', title)
    # Convert to lowercase
    title = title.lower()
    # Strip leading and trailing spaces
    title = title.strip()
    return title

In [None]:
#test
correct_counter = 0
for i in range(1000):
    msg = valid_dataset[i]['conversations'][:2]
    #print(msg)
    inputs = tokenizer.apply_chat_template(
        msg,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")
    outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
    outputs = tokenizer.batch_decode(outputs)

    answer = outputs[0].split('\n\n')[-1][:-10]
    correct_answer = valid_dataset[i]['conversations'][2]['content']
    if normalize_title(answer) == normalize_title(correct_answer):
        correct_counter += 1
    else:
        print(answer + ' | ' + correct_answer)

print(correct_counter)

In [10]:
# define trainer with parameters
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 2,
        warmup_steps = 0,
        max_steps = 1,
        learning_rate = 1e-6,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        lr_scheduler_type = "linear",
        output_dir = "outputs",
        save_total_limit=5,  # Save only the top 5 models
        load_best_model_at_end=True,  # Load the best model at the end of training
        metric_for_best_model="loss",  # Use loss to determine the best model
        greater_is_better=False,  # Lower loss is better
        evaluation_strategy="steps",  # Set evaluation strategy to steps
        save_strategy="steps",  # Set save strategy to steps
        eval_steps=1,  # Evaluate every 10 steps (set this as needed)
    ),
)



Map (num_proc=2):   0%|          | 0/50000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [11]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.988 GB.
5.594 GB of memory reserved.


In [12]:
validation_losses = []
for i in range(300):
    trainer.train()
    history = trainer.state.log_history
    validation_loss = [entry['eval_loss'] for entry in history if 'eval_loss' in entry]
    validation_losses = validation_losses + validation_loss
    model.save_pretrained(f"ll3_fitting/epoch_{i}")  

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5913,3.658901


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.585,3.658101


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5953,3.657193


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5852,3.658032


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5829,3.657767


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5859,3.657382


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5869,3.654958


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5837,3.654811


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5866,3.653854


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5858,3.655473


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5879,3.655042


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5872,3.653487


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5826,3.654498


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5827,3.654832


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.581,3.652223


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.581,3.650686


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5797,3.652634


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5819,3.650303


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5792,3.650528


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.578,3.649884


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5791,3.648704


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5768,3.646705


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5752,3.646613


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5707,3.646568


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5715,3.645169


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5724,3.645541


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5715,3.641561


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5695,3.643057


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.573,3.640719


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5677,3.640394


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.565,3.638243


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5625,3.637109


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5655,3.636538


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.563,3.636764


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5564,3.633563


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5587,3.632715


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5602,3.631845


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5585,3.629661


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.556,3.631284


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5476,3.629045


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5501,3.626738


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5495,3.627833


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5447,3.626632


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5453,3.62499


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5426,3.623147


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5413,3.621482


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5403,3.620283


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5387,3.617701


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5368,3.615731


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5356,3.615599


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5335,3.613336


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5269,3.612691


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5249,3.611392


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5288,3.609545


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5242,3.60849


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5236,3.606488


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5222,3.602868


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5171,3.602089


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5164,3.601017


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5112,3.600533


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5117,3.596961


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5078,3.594079


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5052,3.594208


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.504,3.591692


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.5009,3.590077


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4988,3.586941


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4981,3.586498


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4922,3.583984


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4954,3.581947


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4878,3.579405


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4828,3.577484


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4805,3.576122


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4821,3.574117


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4768,3.571921


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4735,3.570228


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4699,3.567589


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4667,3.563624


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4628,3.562105


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4637,3.558931


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4585,3.556084


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4577,3.554929


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4534,3.55411


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4474,3.548748


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4472,3.546354


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4358,3.543534


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4378,3.541274


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4316,3.539014


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4335,3.535721


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4266,3.533147


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4208,3.532531


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4223,3.528386


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4173,3.524852


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4107,3.521794


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4068,3.518736


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.4044,3.514349


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3963,3.511066


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3958,3.508211


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3861,3.504899


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3824,3.504341


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3801,3.497919


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3731,3.495833


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3706,3.492047


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3624,3.489222


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3602,3.484169


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3568,3.480553


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3505,3.477959


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3475,3.47488


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3445,3.471929


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3399,3.467859


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3349,3.463108


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3297,3.459455


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3229,3.457088


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3219,3.453187


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3147,3.450232


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.312,3.446791


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.3075,3.442104


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.297,3.436658


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2964,3.433699


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2895,3.431493


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2857,3.42778


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2808,3.423913


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2745,3.420667


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2716,3.417398


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2654,3.412614


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2544,3.40853


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2506,3.405705


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2457,3.399623


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2404,3.396915


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.238,3.392335


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.231,3.388402


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2286,3.384936


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2235,3.379807


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2102,3.376038


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.208,3.373022


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2011,3.366601


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.2001,3.362884


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1903,3.358562


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1868,3.355276


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1791,3.351003


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1753,3.345932


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1656,3.341518


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1607,3.33657


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1552,3.333498


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1499,3.328945


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1429,3.323422


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1364,3.319811


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1298,3.313887


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1236,3.310773


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1204,3.306707


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1155,3.302247


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1043,3.296214


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.1007,3.291276


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.0939,3.288627


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.0869,3.284405


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.0814,3.279026


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.0755,3.275301


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.069,3.268887


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.058,3.265949


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.0571,3.260614


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.045,3.256675


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.0424,3.251827


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.0375,3.24683


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.0285,3.242052


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.0236,3.238253


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.0159,3.233284


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.0051,3.228873


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,3.0017,3.224996


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.9945,3.220009


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.991,3.21462


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.9823,3.210168


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.9762,3.206963


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.9677,3.201641


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.9658,3.195136


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.9537,3.191586


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.9447,3.185589


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.9399,3.180915


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.9351,3.1756


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.9285,3.171547


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.9174,3.16459


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.9098,3.160763


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.9067,3.15518


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.8959,3.150748


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.8939,3.146554


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.884,3.140698


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.8791,3.136628


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.869,3.131261


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.8642,3.126379


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.8545,3.121413


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.8483,3.114533


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.8388,3.109746


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.8354,3.105824


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.8263,3.100243


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.8178,3.094182


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.8111,3.089261


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.7992,3.084332


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.7927,3.078915


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.7869,3.073073


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.7765,3.068772


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.7734,3.062825


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.7637,3.057851


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.7554,3.053393


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.7499,3.047498


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.742,3.040422


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.7335,3.037082


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.7258,3.031831


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.7194,3.025636


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.7091,3.019871


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.7,3.014392


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.6983,3.007542


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.6867,3.002918


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.6758,2.997708


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.669,2.992217


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.6625,2.986389


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.6568,2.980401


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.6439,2.975145


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.6372,2.968769


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.627,2.963587


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.6205,2.9584


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.6089,2.95166


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.6023,2.944556


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.5955,2.939229


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.5863,2.93412


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.5768,2.92685


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.5699,2.920903


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.5575,2.914809


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.5522,2.908083


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.5448,2.903981


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.5354,2.896194


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.5241,2.889858


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.5166,2.883178


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.5043,2.878035


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.4982,2.871704


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.4883,2.866429


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.4784,2.859331


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.4682,2.853926


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.4604,2.845859


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.4514,2.839839


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.4391,2.833369


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.4361,2.827271


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.423,2.819966


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.4112,2.814005


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.403,2.808125


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.3946,2.801239


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.3879,2.795043


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.3777,2.788764


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.367,2.781996


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.3589,2.776118


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.3479,2.769773


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.34,2.761567


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.3291,2.756494


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.3207,2.750155


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.312,2.744487


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.3027,2.737061


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.2926,2.730846


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.2827,2.724679


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.2728,2.717635


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.265,2.710783


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.2576,2.703574


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.2484,2.698755


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.2367,2.691004


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.2231,2.684557


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.2178,2.679374


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.2058,2.672389


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.1985,2.666123


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.19,2.660716


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.1825,2.653941


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.1729,2.649162


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.1626,2.64267


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.1536,2.636865


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.148,2.629504


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.1388,2.623951


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.1276,2.618392


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.1197,2.611722


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.1105,2.606039


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.1025,2.599594


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.0896,2.593648


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.0836,2.587778


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.0706,2.582256


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.0631,2.575761


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.054,2.569991


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.0441,2.563774


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.0363,2.555775


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.0274,2.550876


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.0152,2.544155


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,2.0061,2.538583


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,1.9964,2.530854


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,1.9851,2.525048


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,1.9789,2.518142


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,1.9679,2.513149


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,1.9604,2.505706


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,1.9501,2.499305


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,1.9384,2.492326


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,1.9287,2.486949


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,1.9169,2.47684


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,1.9083,2.473056


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,1.9012,2.464434


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,1.8862,2.458437


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,1.8758,2.451802


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,1.8705,2.443872


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 50,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 2
\        /    Total batch size = 2 | Total steps = 1
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
1,1.8554,2.438692


In [13]:
validation_losses

[3.6589014530181885,
 3.6581010818481445,
 3.657193422317505,
 3.658031940460205,
 3.6577672958374023,
 3.6573824882507324,
 3.654958486557007,
 3.654810905456543,
 3.6538538932800293,
 3.655472755432129,
 3.6550421714782715,
 3.65348744392395,
 3.654498338699341,
 3.654832124710083,
 3.6522231101989746,
 3.650686264038086,
 3.652634382247925,
 3.6503026485443115,
 3.6505284309387207,
 3.6498844623565674,
 3.6487035751342773,
 3.646705389022827,
 3.6466128826141357,
 3.6465682983398438,
 3.645169258117676,
 3.645540952682495,
 3.641561269760132,
 3.643056631088257,
 3.640719413757324,
 3.6403937339782715,
 3.6382434368133545,
 3.637108564376831,
 3.63653826713562,
 3.6367640495300293,
 3.633563280105591,
 3.6327147483825684,
 3.631845235824585,
 3.6296608448028564,
 3.6312835216522217,
 3.629044532775879,
 3.6267378330230713,
 3.627833127975464,
 3.6266324520111084,
 3.6249897480010986,
 3.6231467723846436,
 3.6214821338653564,
 3.6202831268310547,
 3.6177008152008057,
 3.6157312393188