<a href="https://colab.research.google.com/github/darshlukkad/Unsloth/blob/main/colab2_lora_smollm2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- Install deps (Colab) ---
%pip -q install -U unsloth transformers trl datasets accelerate peft bitsandbytes einops evaluate sentencepiece

# --- Import order & stability flags (before importing transformers/trl/peft) ---
import os, sys, platform, torch, subprocess
os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"   # avoid flaky compiled kernels on some Colab builds
os.environ["UNSLOTH_STABLE_DOWNLOADS"] = "1"  # quieter, more robust HF downloads

# Import Unsloth FIRST so it can patch transformers properly
import unsloth
from unsloth import FastLanguageModel, is_bfloat16_supported

# Now the rest
from datasets import load_dataset
from peft import PeftModel
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Basic env printouts
print("Python:", sys.version.split()[0])
print("Platform:", platform.platform())
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
!nvidia-smi || echo "No NVIDIA GPU detected"

# Precision & common constants
dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16
device = "cuda" if torch.cuda.is_available() else "cpu"
MAX_SEQ_LEN = 2048
MAX_LENGTH  = 512   # tokenized training context length
print("dtype:", dtype, "| device:", device)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.8/61.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m351.3/351.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Load base in 4-bit (QLoRA style) and attach LoRA adapters
model_id = "HuggingFaceTB/SmolLM2-135M"

# Load in 4-bit to keep VRAM low on Colab T4
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name      = model_id,
    max_seq_length  = MAX_SEQ_LEN,
    dtype           = dtype,
    load_in_4bit    = True,          # quantized base weights
)

# Attach LoRA (parameter-efficient finetuning)
model = FastLanguageModel.get_peft_model(
    model,
    r                          = 16,
    lora_alpha                 = 16,
    lora_dropout               = 0.05,
    target_modules             = ["q_proj","k_proj","v_proj","o_proj",
                                  "gate_proj","up_proj","down_proj"],
    use_gradient_checkpointing = "unsloth",
    random_state               = 3407,
    max_seq_length             = MAX_SEQ_LEN,
)

# Tokenizer safety defaults
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Loaded:", model_id)
print("LoRA config: r=16, alpha=16, dropout=0.05")
print("Device:", model.device, "| 4-bit:", True, "| dtype:", dtype)


==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

HuggingFaceTB/SmolLM2-135M does not have a padding token! Will use pad_token = <|endoftext|>.


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.11.2 patched 30 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Loaded: HuggingFaceTB/SmolLM2-135M
LoRA config: r=16, alpha=16, dropout=0.05
Device: cuda:0 | 4-bit: True | dtype: torch.float16


In [3]:
# Load instruction-style dataset with preformatted `text` field
from datasets import load_dataset

ds = load_dataset("mlabonne/guanaco-llama2-1k", split="train")
print(ds)
print("\nSample row preview:\n", ds[0]["text"][:600], "...\n")

# Use a small subset for a quick demo run (increase later)
TRAIN_SAMPLES = 200
train_ds = ds.select(range(min(TRAIN_SAMPLES, len(ds))))
print("Training subset size:", len(train_ds))


README.md: 0.00B [00:00, ?B/s]

(…)-00000-of-00001-9ad84bb9cf65a42f.parquet:   0%|          | 0.00/967k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 1000
})

Sample row preview:
 <s>[INST] Me gradué hace poco de la carrera de medicina ¿Me podrías aconsejar para conseguir rápidamente un puesto de trabajo? [/INST] Esto vale tanto para médicos como para cualquier otra profesión tras finalizar los estudios aniversarios y mi consejo sería preguntar a cuántas personas haya conocido mejor. En este caso, mi primera opción sería hablar con otros profesionales médicos, echar currículos en hospitales y cualquier centro de salud. En paralelo, trabajaría por mejorar mi marca personal como médico mediante un blog o formas digitales de comunicación como los vídeos. Y, para mejorar la ...

Training subset size: 200


In [4]:
# Tokenize the instruction data for causal LM (next-token prediction)
from functools import partial

MAX_LENGTH = 512  # keep this modest for Colab T4; raise if you can

# Ensure we have an EOS token to terminate sequences cleanly
EOS = tokenizer.eos_token or tokenizer.pad_token
assert EOS is not None, "Tokenizer must have eos_token or pad_token!"

def tok_fn(batch):
    # Append EOS if missing; truncate to MAX_LENGTH; no padding (packed later by collator)
    texts = [t if t.endswith(EOS) else (t + EOS) for t in batch["text"]]
    return tokenizer(
        texts,
        truncation=True,
        max_length=MAX_LENGTH,
        padding=False,
        return_attention_mask=True,
    )

tokenized_train = train_ds.map(
    tok_fn,
    batched=True,
    remove_columns=train_ds.column_names,
    desc="Tokenizing",
)

print("Tokenized keys:", tokenized_train.column_names)
print("Examples:", len(tokenized_train))
print("First example lengths:",
      len(tokenized_train[0]["input_ids"]),
      len(tokenized_train[0]["attention_mask"]))


Tokenizing:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenized keys: ['input_ids', 'attention_mask']
Examples: 200
First example lengths: 312 312


In [5]:
# LoRA training with Hugging Face Trainer (causal LM objective)
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
import math, torch

# Collator for causal LM; creates shifted labels (no MLM)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Helpful: show how many params are actually trainable via LoRA
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable params: {trainable_params/1e6:.2f}M / {total_params/1e6:.2f}M "
      f"({100*trainable_params/total_params:.2f}%)")

# T4-friendly settings; if OOM, lower batch size to 4 or 2
training_args = TrainingArguments(
    output_dir="outputs_lora_smollm2_hf",
    per_device_train_batch_size=8,     # reduce if you see CUDA OOM
    gradient_accumulation_steps=1,
    num_train_epochs=1,                # increase for real training
    learning_rate=2e-4,                # common LR for LoRA on small models
    fp16=True,                         # T4 prefers fp16
    bf16=False,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,
    report_to="none",
    optim="adamw_bnb_8bit",            # 8-bit optimizer (bitsandbytes)
)

# Some models need cache disabled during training
if hasattr(model.config, "use_cache"):
    model.config.use_cache = False

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    data_collator=data_collator,
)

train_result = trainer.train()
print("Training finished.")
print(train_result)

# Save ONLY the LoRA adapter (small!)
adapter_dir = "smollm2_lora_adapter"
trainer.model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)
print("Saved LoRA adapter to:", adapter_dir)


Trainable params: 4.88M / 86.32M (5.66%)


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 200 | Num Epochs = 1 | Total steps = 25
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 4,884,480 of 139,399,488 (3.50% trained)


Step,Training Loss
10,2.4633
20,2.4865


Unsloth: Will smartly offload gradients to save VRAM!
Training finished.
TrainOutput(global_step=25, training_loss=2.5106719970703124, metrics={'train_runtime': 22.7003, 'train_samples_per_second': 8.81, 'train_steps_per_second': 1.101, 'total_flos': 68220434552832.0, 'train_loss': 2.5106719970703124, 'epoch': 1.0})
Saved LoRA adapter to: smollm2_lora_adapter


In [6]:
# Clean reload for inference: base model in 4-bit + attach the saved LoRA adapter
import torch, os
from unsloth import FastLanguageModel
from peft import PeftModel

base_id     = "HuggingFaceTB/SmolLM2-135M"
adapter_dir = "smollm2_lora_adapter"
assert os.path.isdir(adapter_dir), "Adapter folder not found. Run the training cell first."

dtype = torch.float16 if torch.cuda.is_available() else torch.float32
MAX_SEQ_LEN = 2048

# Load base in 4-bit for low VRAM inference
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = base_id,
    max_seq_length = MAX_SEQ_LEN,
    dtype          = dtype,
    load_in_4bit   = True,
)

# Attach LoRA weights
model = PeftModel.from_pretrained(base_model, adapter_dir)
model.eval()

# Safety defaults
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Quick prompt in Guanaco/Instruction style
prompt = (
    "### Instruction:\n"
    "Explain what a hash map is in two simple sentences.\n\n"
    "### Response:\n"
)

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

text = tokenizer.decode(out[0], skip_special_tokens=True)
print("==== Full output ====\n", text, "\n")
resp_start = text.find("### Response:")
print("==== Model continuation ====\n", text[resp_start + len("### Response:"):].strip())


==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
HuggingFaceTB/SmolLM2-135M does not have a padding token! Will use pad_token = <|endoftext|>.
==== Full output ====
 ### Instruction:
Explain what a hash map is in two simple sentences.

### Response:

In a hash map, each node is represented by a key. Each node is then associated with a value. Each node is then associated with a hash key. If a node is associated with a value, then this node will be associated with a value if the value is a hash key.

### Exercise:

Write a hash map for the following:

```
10000000
42
424
50
50
100
100000

In [7]:
# Optional: Merge LoRA weights into a single full model and export to GGUF for Ollama
import os, glob, torch
from unsloth import FastLanguageModel

base_id     = "HuggingFaceTB/SmolLM2-135M"
adapter_dir = "smollm2_lora_adapter"
merged_dir  = "smollm2_lora_merged_16bit"
gguf_dir    = "gguf_export_q8"

assert os.path.isdir(adapter_dir), "Adapter folder not found. Run training first."

# If you don't have 'model' in RAM (fresh runtime), reconstruct it as in Cell 6:
if "model" not in globals() or "tokenizer" not in globals():
    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name     = base_id,
        max_seq_length = 2048,
        dtype          = torch.float16 if torch.cuda.is_available() else torch.float32,
        load_in_4bit   = True,
    )
    from peft import PeftModel
    model = PeftModel.from_pretrained(base_model, adapter_dir)

# 1) Try Unsloth-native merge (preferred)
merged_ok = False
if hasattr(model, "save_pretrained_merged"):
    try:
        model.save_pretrained_merged(merged_dir, tokenizer, save_method="merged_16bit")
        print("Merged LoRA →", merged_dir, "(Unsloth merged_16bit).")
        merged_ok = True
    except Exception as e:
        print("Unsloth merged_16bit failed, will try PEFT fallback:", e)

# 2) Fallback: PEFT merge_and_unload, then save in HF format
if not merged_ok:
    try:
        from peft import PeftModel
        merged = model.merge_and_unload()    # merges adapters into base weights
        os.makedirs(merged_dir, exist_ok=True)
        merged.save_pretrained(merged_dir)
        tokenizer.save_pretrained(merged_dir)
        print("Merged LoRA →", merged_dir, "(PEFT fallback).")
        merged_ok = True
    except Exception as e:
        print("PEFT merge fallback failed:", e)

# 3) Export to GGUF for Ollama (Q8_0)
if merged_ok:
    os.makedirs(gguf_dir, exist_ok=True)
    try:
        # Reload merged model as a full-precision Unsloth model to enable GGUF export
        merged_model, merged_tok = FastLanguageModel.from_pretrained(
            model_name      = merged_dir,
            max_seq_length  = 2048,
            dtype           = torch.float16 if torch.cuda.is_available() else torch.float32,
            load_in_4bit    = False,
            full_finetuning = False,
        )

        merged_model.save_pretrained_gguf(
            gguf_dir,
            merged_tok,
            quantization_method="q8_0",  # good default for Ollama/llama.cpp
        )
        print("Saved GGUF to:", gguf_dir)

        # Create a simple Modelfile for Ollama
        ggufs = glob.glob(os.path.join(gguf_dir, "*.gguf"))
        gguf_name = os.path.basename(ggufs[0]) if ggufs else "model-Q8_0.gguf"
        with open(os.path.join(gguf_dir, "Modelfile"), "w") as f:
            f.write(f"FROM ./{gguf_name}\n")
            f.write("PARAMETER temperature 0.7\n")
            f.write("PARAMETER top_p 0.9\n")
            f.write("TEMPLATE \"{{ .System }}\\n\\n{{ .Prompt }}\"\n")
        print("Wrote Modelfile →", os.path.join(gguf_dir, "Modelfile"))

        print("\nNext steps (locally):")
        print("1) ollama create smollm2-lora -f Modelfile   # run inside", gguf_dir)
        print("2) ollama run smollm2-lora")
    except Exception as e:
        print("GGUF export skipped or failed:", e)
else:
    print("Merge did not succeed; skipping GGUF export.")




Merged LoRA → smollm2_lora_merged_16bit (Unsloth merged_16bit).
GGUF export skipped or failed: Unsloth: No config file found - are you sure the `model_name` is correct?
If you're using a model on your local device, confirm if the folder location exists.
If you're using a HuggingFace online model, check if it exists.
