In [28]:
%%capture
# Environment setup: Unsloth (nightly) + training dependencies + llama.cpp.
#
# NOTE: %%capture hides ALL output of this cell, including failed installs.
# If a later cell raises "ModuleNotFoundError: No module named 'unsloth'",
# remove the %%capture line, re-run this cell and inspect the pip output.
#
# %pip (rather than !pip) installs into the environment of the running
# kernel, so the packages are importable from the following cells.
# Unsloth and unsloth-zoo are installed with --no-deps; their runtime
# dependencies are installed explicitly below.
%pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@nightly
%pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth-zoo.git

# Install dependencies (peft provides the LoRA adapters and is imported
# directly by the merge cell further down)
%pip install transformers accelerate datasets bitsandbytes trl xformers peft
%pip install sentencepiece

# llama.cpp tools for GGUF export
!apt-get install -y git-lfs
# Clone only when the checkout is missing so this cell can be re-run safely
# (a plain `git clone` fails if the target directory already exists).
![ -d llama.cpp ] || git clone https://github.com/ggerganov/llama.cpp.git

In [27]:
from unsloth import FastLanguageModel
import torch

# ---- Base model settings (these names are reused by later cells) ----
model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"  # Llama-3.2-1B-Instruct
max_seq_length = 2048   # supports long context automatically with RoPE scaling
dtype = None            # auto-detect (float16 on T4)
load_in_4bit = True     # 4-bit quantized base model

# Load the quantized base model together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# ---- LoRA adapter settings (trainable parameters) ----
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_settings = dict(
    r=16,
    target_modules=attention_projections + mlp_projections,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Wrap the base model with the adapters described above
model = FastLanguageModel.get_peft_model(model, **lora_settings)

print("Model + LoRA loaded successfully!")


ModuleNotFoundError: No module named 'unsloth'

In [None]:
from datasets import load_dataset
from unsloth.chat_templates import get_chat_template, standardize_sharegpt

# Load the FineTome-100k train split and convert it from ShareGPT format
# to the HF conversations format in a single expression.
dataset = standardize_sharegpt(
    load_dataset("mlabonne/FineTome-100k", split="train")
)

# Switch the tokenizer to the Llama 3.1 chat template
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

def formatting_prompts_func(examples):
    """Render each conversation of a batch through the chat template.

    Args:
        examples: batch mapping whose "conversations" entry is an iterable
            of conversations.

    Returns:
        A dict with a single "text" key holding one rendered entry per
        conversation, in input order.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # Uses the module-level `tokenizer`; tokenize=False and
        # add_generation_prompt=False are forwarded unchanged.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Add a "text" field for training by running the formatter over batches
dataset = dataset.map(function=formatting_prompts_func, batched=True)

print("Dataset loaded and formatted successfully!")

# Preview: first 300 characters of the first training sample
first_sample_text = dataset[0]["text"]
print(first_sample_text[:300])


README.md:   0%|          | 0.00/982 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

Unsloth: Standardizing formats (num_proc=2):   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Dataset loaded and formatted successfully!
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

# ---- Building blocks for the SFT trainer ----
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise
bf16_available = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=60,  # Small num for lab demo (can change to epochs if desired)
    learning_rate=2e-4,
    fp16=not bf16_available,
    bf16=bf16_available,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",  # No wandb
)

# ---- Initialize SFT Trainer ----
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=seq2seq_collator,
    dataset_num_proc=2,
    packing=False,  # Packing disabled for clarity
    args=training_args,
)

# ---- Mask user/system parts: only train on the assistant's responses ----
user_header = "<|start_header_id|>user<|end_header_id|>\n\n"
assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"

trainer = train_on_responses_only(
    trainer,
    instruction_part=user_header,
    response_part=assistant_header,
)

print("Trainer initialized and masking applied successfully!")


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/100000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/100000 [00:00<?, ? examples/s]

Trainer initialized and masking applied successfully!


In [None]:
import torch

# Bytes per GB as used throughout this report (1024 ** 3)
BYTES_PER_GB = 1024 ** 3

# ---- GPU info before training ----
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GB, 3)

for line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"Starting reserved memory = {start_gpu_memory} GB.",
):
    print(line)

# ---- Start Training ----
print("\nStarting training...\n")
trainer_stats = trainer.train()

# ---- After Training ----
# Peak reserved memory overall, and the part added on top of the
# pre-training baseline measured above.
used_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 2)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 2)

train_seconds = trainer_stats.metrics['train_runtime']

for line in (
    "\n===== Training Complete =====",
    f"Training time: {train_seconds} seconds",
    f"Training time (minutes): {round(train_seconds/60, 2)} min",
    f"Peak reserved memory = {used_memory} GB",
    f"Peak reserved memory for LoRA = {used_memory_for_lora} GB",
    f"Percent of GPU used = {used_percentage}%",
    f"Percent used for LoRA = {lora_percentage}%",
):
    print(line)


The model is already on multiple devices. Skipping the move to device specified in `args`.


GPU = Tesla T4. Max memory = 14.741 GB.
Starting reserved memory = 1.203 GB.

Starting training...



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,0.8668
2,0.9396
3,1.1607
4,1.0338
5,0.8277
6,1.0698
7,0.7263
8,1.1266
9,1.0385
10,0.8836



===== Training Complete =====
Training time: 175.6715 seconds
Training time (minutes): 2.93 min
Peak reserved memory = 2.275 GB
Peak reserved memory for LoRA = 1.072 GB
Percent of GPU used = 15.43%
Percent used for LoRA = 7.27%


In [None]:
import os
from getpass import getpass

# 🔐 SECURITY: never hardcode an access token in a notebook.
# The token is read from the HF_TOKEN environment variable; when that is
# unset or empty, it is requested through a hidden prompt instead.
# Any token that was previously pasted into this cell must be treated as
# leaked and revoked at https://huggingface.co/settings/tokens.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

# Save the LoRA adapter and tokenizer locally
save_dir = "lora_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print("Saved LoRA model locally to:", save_dir)

# Push to Hugging Face Model Repo
repo_id = "mihailocvetkovic/SML-Lab-2-Model"

model.push_to_hub(repo_id, token=hf_token)
tokenizer.push_to_hub(repo_id, token=hf_token)

print(f"Uploaded LoRA model to Hugging Face repo: {repo_id}")


Saved LoRA model locally to: lora_model


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          | 23.3kB / 45.1MB            

Saved model to https://huggingface.co/mihailocvetkovic/SML-Lab-2-Model


README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mpzhff82ys/tokenizer.json:  89%|########9 | 15.3MB / 17.2MB            

Uploaded LoRA model to Hugging Face repo: mihailocvetkovic/SML-Lab-2-Model


In [None]:
from unsloth import FastLanguageModel
import torch

# Loader settings (same values as used for training)
max_seq_length, dtype, load_in_4bit = 2048, None, True

# Reload your fine-tuned LoRA model from HF repo
finetuned_repo = "mihailocvetkovic/SML-Lab-2-Model"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=finetuned_repo,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

print("Model reloaded successfully!")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Unsloth: Could not find Trainer class in trl.trainer.bco_trainer. Found: ['BCOTrainer', '_BCOTrainer']
==((====))==  Unsloth 2025.11.6: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.11.6 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


Model reloaded successfully!


In [1]:
# ============================================================
# 1. FIX ENVIRONMENT: reinstall bitsandbytes correctly
#    (%pip is used instead of !pip so that every command acts on
#    the environment of the running kernel)
# ============================================================
%pip uninstall -y bitsandbytes
%pip install -U bitsandbytes --no-cache-dir

# ============================================================
# 2. Install Unsloth BEFORE transformers (CRITICAL)
#    --no-deps: dependencies are installed explicitly in step 3
# ============================================================
%pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@nightly

# ============================================================
# 3. Install Hugging Face libraries
# ============================================================
%pip install transformers accelerate huggingface_hub
# NOTE(review): confirm that huggingface_hub actually publishes a "fast"
# extra; if it does not, pip only prints a warning and this line is a no-op.
%pip install "huggingface_hub[fast]"   # enables fast Rust downloader


Found existing installation: bitsandbytes 0.48.2
Uninstalling bitsandbytes-0.48.2:
  Successfully uninstalled bitsandbytes-0.48.2
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m305.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2
Collecting git+https://github.com/unslothai/unsloth.git@nightly
  Cloning https://github.com/unslothai/unsloth.git (to revision nightly) to /tmp/pip-req-build-tci_agr4
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-req-build-tci_agr4
  Running command git checkout -b nightly --track origin/nightly
  Switched to a new branch 'nightly'
  Branch 'nightly' set up to track remote branch 'nightly' from 'origin'.
  

In [3]:
import os

import unsloth
from unsloth import FastLanguageModel
from huggingface_hub import login, create_repo, upload_folder
from peft import PeftModel
import torch

# ---- LOGIN ----
# 🔐 SECURITY: never hardcode an access token in a notebook.
# The token is taken from the HF_TOKEN environment variable; when it is not
# set, login() receives None and asks for the token interactively.
# Any token that was previously pasted into this cell must be treated as
# leaked and revoked at https://huggingface.co/settings/tokens.
login(token=os.environ.get("HF_TOKEN"))

# ---- 1. Load base model (same as training) ----
base_model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = base_model_name,
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
    token = True,   # reuse the credentials stored by login() above
)

print("✅ Base model loaded.")

# ---- 2. Load LoRA adapter using PEFT ----
lora_repo = "mihailocvetkovic/SML-Lab-2-Model"

print("⏳ Loading LoRA adapter via PEFT...")
model = PeftModel.from_pretrained(model, lora_repo)
print("✅ LoRA adapter loaded.")

# ---- 3. Merge LoRA weights into base ----
# NOTE(review): the base model above is loaded with load_in_4bit=True, so
# the adapters are merged into quantized weights. This may cost precision
# compared with a 16-bit merge — TODO confirm whether a 16-bit merged
# export is needed (e.g. for the GGUF conversion).
print("⏳ Merging LoRA weights...")
model = model.merge_and_unload()
print("✅ Merge complete!")

# ---- 4. Save merged model ----
save_dir = "merged-llama3.2-1b"
model.save_pretrained(save_dir, safe_serialization=True)
tokenizer.save_pretrained(save_dir)

print("📁 Saved to:", save_dir)

# ---- 5. Upload to HF ----
repo_id = "mihailocvetkovic/SML-Lab-2-Model-Merged"
# exist_ok=True keeps this cell re-runnable when the repo already exists
create_repo(repo_id, exist_ok=True)

print("⏳ Uploading to HF...")
upload_folder(
    folder_path=save_dir,
    repo_id=repo_id,
    commit_message="Merged Llama 3.2 1B + LoRA",
)

print("🎉 SUCCESS! Merged model uploaded →", f"https://huggingface.co/{repo_id}")






==((====))==  Unsloth 2025.11.6: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
✅ Base model loaded.
⏳ Loading LoRA adapter via PEFT...


adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

✅ LoRA adapter loaded.
⏳ Merging LoRA weights...




✅ Merge complete!
📁 Saved to: merged-llama3.2-1b
⏳ Uploading to HF...


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...lama3.2-1b/tokenizer.json: 100%|##########| 17.2MB / 17.2MB            

  ...a3.2-1b/model.safetensors:   0%|          |  526kB / 1.03GB            

🎉 SUCCESS! Merged model uploaded → https://huggingface.co/mihailocvetkovic/SML-Lab-2-Model-Merged
