In [1]:
%%capture
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth

# Training

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = False

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-04-25 22:16:14.588575: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745619374.611190    2637 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745619374.618079    2637 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.1.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [2]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.3.19 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [4]:
from datasets import load_dataset, concatenate_datasets
dataset = load_dataset("ekwek/497final")

In [5]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset['train'],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    args = TrainingArguments(
        per_device_train_batch_size = 128,
        gradient_accumulation_steps = 1,
        
        warmup_steps = 5,
        num_train_epochs = 10,

        learning_rate = 1e-3,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
    ),
)
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(trainer, 
                                  instruction_part = '<|start_header_id|>assistant<|end_header_id|>\n\n',
                                  response_part = '<|start_header_id|>assistant<|end_header_id|>\n\n'
                                 )

In [6]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
2.834 GB of memory reserved.


In [8]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 10 | Total steps = 400
O^O/ \_/ \    Batch size per device = 256 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (256 x 1 x 1) = 256
 "-____-"     Trainable parameters = 11,272,192/1,247,086,592 (0.90% trained)


Step,Training Loss
1,2.207
2,2.1578
3,1.1642
4,0.4111
5,0.1719
6,0.3499
7,0.1414
8,0.1049
9,0.1292
10,0.1182


Unsloth: Will smartly offload gradients to save VRAM!


In [9]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

2828.0105 seconds used for training.
47.13 minutes used for training.
Peak reserved memory = 5.445 GB.
Peak reserved memory for training = 2.611 GB.
Peak reserved memory % of max memory = 36.938 %.
Peak reserved memory for training % of max memory = 17.713 %.


# Inference

In [10]:
inputs = []
for i in range(100):
    text = dataset['test'][i]['text']
    prompt = text[:text.index('assistant<|end_header_id|>\n\n')+28]
    inputs.append(prompt)

In [11]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(inputs, return_tensors = "pt", padding=True).to("cuda")

outputs = model.generate(input_ids = inputs.input_ids, attention_mask = inputs.attention_mask,
                         max_new_tokens = 128, use_cache = True)
answers = tokenizer.batch_decode(outputs)

In [12]:
responses = []
prompts = []
for i in range(100):
    text = answers[i]
    response = text[text.index('assistant<|end_header_id|>\n\n')+28:]
    response = response[:response.index('<|eot_id|>')]
    prompt = text[text.index('user<|end_header_id|>\n\n')+23:]
    prompt = prompt[:prompt.index('<|eot_id|>')]
    responses.append(response)
    prompts.append(prompt)

In [13]:
total = 0
for i in range(len(responses)):
    prompt = prompts[i].split()
    response = responses[i].split()
    word = prompt[-1][:-1]
    character = prompt[2][0]
    charCount = word.count(character)
    pred = int(response[2])
    print(charCount,pred, ' '.join(prompt), ' '.join(response))
    if charCount==pred: total += 1
print(total)

1 1 How many y's are in catholicly? catholicly has 1 y's.
1 1 How many u's are in indulines? indulines has 1 u's.
2 2 How many i's are in fibrosis? fibrosis has 2 i's.
2 2 How many c's are in cassicus? cassicus has 2 c's.
1 1 How many i's are in prerestrain? prerestrain has 1 i's.
2 2 How many s's are in misrehearsed? misrehearsed has 2 s's.
3 3 How many t's are in flattest? flattest has 3 t's.
1 1 How many l's are in solidism? solidism has 1 l's.
3 3 How many b's are in blobby? blobby has 3 b's.
2 2 How many i's are in aminolytic? aminolytic has 2 i's.
1 1 How many g's are in guillemet? Guillemet has 1 g's.
2 2 How many e's are in serried? serried has 2 e's.
1 1 How many l's are in calydon? calydon has 1 l's.
1 1 How many r's are in lyraway? lyraway has 1 r's.
1 1 How many i's are in pokie? pokie has 1 i's.
3 2 How many r's are in oversubscriber? oversubscriber has 2 r's.
1 1 How many c's are in anemologic? anemologic has 1 c's.
1 1 How many r's are in sternofacialis? sternofacialis h

# Can it solve the Strawberry Problem?

In [22]:
FastLanguageModel.for_inference(model)
inputs = tokenizer([
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 25 Apr 2025\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHow many r's are in strawberry?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
], return_tensors = "pt", padding=True).to("cuda")

outputs = model.generate(input_ids = inputs.input_ids, attention_mask = inputs.attention_mask,
                         max_new_tokens = 128, use_cache = True)
answers = tokenizer.batch_decode(outputs)
print(answers[0][-31:-10])
# Got the strawberry problem right!

strawberry has 3 r's.
