<a href="https://colab.research.google.com/github/darshlukkad/Unsloth/blob/main/colab1_full_ft_smollm2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# --- Install core deps for Unsloth finetuning (Colab-friendly) ---
%pip -q install -U unsloth transformers trl datasets accelerate peft bitsandbytes einops evaluate sentencepiece

import torch, os, sys, platform, subprocess
print("Python:", sys.version)
print("Platform:", platform.platform())
print("Torch:", torch.__version__)
# Show GPU if available
try:
    import torch
    print("CUDA available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("GPU name:", torch.cuda.get_device_name(0))
except Exception as e:
    print("GPU check error:", e)

# Optional: view CUDA details (will print an error on CPU-only runtimes)
!nvidia-smi || echo "No NVIDIA GPU detected"


Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
Platform: Linux-6.6.105+-x86_64-with-glibc2.35
Torch: 2.8.0+cu126
CUDA available: True
GPU name: Tesla T4
Sun Nov  9 22:28:41 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P0             29W /   70W |     142MiB /  15360MiB |      0%      Default |
|                                         |                        

In [5]:
# Environment & core imports for full finetuning
import torch, os

from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments

dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16
device = "cuda" if torch.cuda.is_available() else "cpu"
print("dtype:", dtype, "| device:", device)
MAX_SEQ_LEN = 2048


dtype: torch.float16 | device: cuda


In [9]:
# --- Repair path for NameError: DEVICE_TYPE_TORCH ---
# 1) Set Unsloth flags BEFORE importing it
import os, sys, importlib, shutil
os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"          # disable compiled kernels
os.environ["UNSLOTH_STABLE_DOWNLOADS"] = "1"         # stabilize hf downloads
print("UNSLOTH_COMPILE_DISABLE =", os.environ["UNSLOTH_COMPILE_DISABLE"])

# 2) Force-reinstall Unsloth + Zoo (common fix from docs)
#    (Also keeps your existing torch/transformers, but refreshes Unsloth bits.)
%pip -q install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo

# 3) Purge previously-imported modules so Unsloth can patch cleanly
to_purge = tuple(["unsloth", "transformers", "trl", "peft"])
purged = [m for m in list(sys.modules) if m.startswith(to_purge)]
for m in purged:
    del sys.modules[m]
print("Purged modules:", len(purged))

# 4) Import Unsloth FIRST, then the rest (required for patching)
import unsloth
from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments
import torch

# 5) Recreate env vars & settings
dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16
device = "cuda" if torch.cuda.is_available() else "cpu"
MAX_SEQ_LEN = 2048
print("dtype:", dtype, "| device:", device)

# 6) Load model for FULL finetuning (no LoRA)
model_id = "HuggingFaceTB/SmolLM2-135M"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name      = model_id,
    max_seq_length  = MAX_SEQ_LEN,
    dtype           = dtype,
    load_in_4bit    = False,       # full precision for tiny model
    full_finetuning = True,        # full-parameter finetune
)

# padding defaults
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Loaded model:", model_id)
print("Device:", model.device)
!nvidia-smi || echo "No NVIDIA GPU detected"


UNSLOTH_COMPILE_DISABLE = 1
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.8/61.8 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m351.3/351.3 kB[0m [31m354.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m278.2/278.2 kB[0m [31m374.6 MB/s[0m eta [36m0:00:00[0m
[?25hPurged modules: 501
ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
dtype: torch.float16 | device: cuda
==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \   

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

HuggingFaceTB/SmolLM2-135M does not have a padding token! Will use pad_token = <|endoftext|>.
Loaded model: HuggingFaceTB/SmolLM2-135M
Device: cuda:0
Sun Nov  9 22:36:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   63C    P0             31W /   70W |     706MiB /  15360MiB |      0%      Default |
|                                         |                        |              

In [10]:
# Load instruction-style dataset with preformatted `text` field
from datasets import load_dataset

ds = load_dataset("mlabonne/guanaco-llama2-1k", split="train")
print(ds)
print("Sample row:\n", ds[0]["text"][:600], "...\n")

# Use a tiny subset for a quick smoke test; increase later for your full run
train_ds = ds.select(range(min(200, len(ds))))
len(train_ds), train_ds[0]["text"][:200]


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-9ad84bb9cf65a4(‚Ä¶):   0%|          | 0.00/967k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 1000
})
Sample row:
 <s>[INST] Me gradu√© hace poco de la carrera de medicina ¬øMe podr√≠as aconsejar para conseguir r√°pidamente un puesto de trabajo? [/INST] Esto vale tanto para m√©dicos como para cualquier otra profesi√≥n tras finalizar los estudios aniversarios y mi consejo ser√≠a preguntar a cu√°ntas personas haya conocido mejor. En este caso, mi primera opci√≥n ser√≠a hablar con otros profesionales m√©dicos, echar curr√≠culos en hospitales y cualquier centro de salud. En paralelo, trabajar√≠a por mejorar mi marca personal como m√©dico mediante un blog o formas digitales de comunicaci√≥n como los v√≠deos. Y, para mejorar la ...



(200,
 '<s>[INST] Me gradu√© hace poco de la carrera de medicina ¬øMe podr√≠as aconsejar para conseguir r√°pidamente un puesto de trabajo? [/INST] Esto vale tanto para m√©dicos como para cualquier otra profesi√≥n t')

In [13]:
# Full-parameter finetuning with plain HF Trainer (no LoRA)
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Tokenize the dataset (simple truncation to a fixed context length)
MAX_LENGTH = 512
def tok_fn(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding=False,
        return_attention_mask=True,
    )

tokenized_train = train_ds.map(tok_fn, batched=True, remove_columns=train_ds.column_names)
print(tokenized_train[0].keys())

# For causal LM (next-token prediction), MLM must be False
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Trainer args: memory-friendly 8-bit AdamW (bitsandbytes) on Colab T4
training_args = TrainingArguments(
    output_dir="outputs_fullft_smollm2_hf",
    per_device_train_batch_size=8,     # if OOM: try 4 or 2
    gradient_accumulation_steps=1,
    num_train_epochs=1,                 # increase for better results
    learning_rate=5e-4,                 # tiny models can tolerate higher LR
    fp16=True,                          # T4 prefers fp16
    bf16=False,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,
    report_to="none",
    optim="adamw_bnb_8bit",             # 8-bit optimizer (bitsandbytes)
)

# Some models prefer disabling cache during training
model.config.use_cache = False

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    data_collator=data_collator,
)

train_out = trainer.train()
print(train_out)

# Save final model + tokenizer
trainer.save_model("smollm2_fullft_model")
tokenizer.save_pretrained("smollm2_fullft_model")
print("Saved full-FT model to: smollm2_fullft_model")


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The model is already on multiple devices. Skipping the move to device specified in `args`.


dict_keys(['input_ids', 'attention_mask'])


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 200 | Num Epochs = 1 | Total steps = 25
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 134,515,008 of 134,515,008 (100.00% trained)
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 200 | Num Epochs = 1 | Total steps = 25
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 134,515,008 of 134,515,008 (100.00% trained)


Step,Training Loss
10,2.1892
20,2.2101


Unsloth: Will smartly offload gradients to save VRAM!
TrainOutput(global_step=25, training_loss=2.2229506301879884, metrics={'train_runtime': 21.3231, 'train_samples_per_second': 9.379, 'train_steps_per_second': 1.172, 'total_flos': 65215719005184.0, 'train_loss': 2.2229506301879884, 'epoch': 1.0})
Saved full-FT model to: smollm2_fullft_model


In [14]:
# Generate a quick response from the fine-tuned model
import os, torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

# If we saved a model dir (from the previous cell), load from disk for a clean test.
if os.path.isdir("smollm2_fullft_model"):
    tokenizer = AutoTokenizer.from_pretrained("smollm2_fullft_model")
    model = AutoModelForCausalLM.from_pretrained(
        "smollm2_fullft_model",
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map=None,
    ).to(device)
    print("Reloaded model from: smollm2_fullft_model")

# Speed up generation
if hasattr(model.config, "use_cache"):
    model.config.use_cache = True

model.eval()

# Guanaco uses an instruction style like below
prompt = (
    "### Instruction:\n"
    "Write a short, friendly greeting in 1‚Äì2 sentences.\n\n"
    "### Response:\n"
)

inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    gen_ids = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

text = tokenizer.decode(gen_ids[0], skip_special_tokens=True)

# Show full text and the model-only continuation (after '### Response:')
print("==== Full output ====\n", text, "\n")
resp_start = text.find("### Response:")
print("==== Model continuation ====\n", text[resp_start + len("### Response:"):].strip())


`torch_dtype` is deprecated! Use `dtype` instead!
`torch_dtype` is deprecated! Use `dtype` instead!


Reloaded model from: smollm2_fullft_model
==== Full output ====
 ### Instruction:
Write a short, friendly greeting in 1‚Äì2 sentences.

### Response:

Hello! I'm glad to hear from you! This is a great opportunity to learn more about the world of health care and make a positive impact on the lives of people who depend on it.

I am happy to answer any questions you may have and to provide you with a list of resources and online tools that can help you learn more about the field of health care.

Here are some resources and tools that might be helpful:

* Health Information Exchange: A digital platform that allows users to exchange information on health data and services.
* Patient Centered Care: A model of health care delivery that emphasizes the 

==== Model continuation ====
 Hello! I'm glad to hear from you! This is a great opportunity to learn more about the world of health care and make a positive impact on the lives of people who depend on it.

I am happy to answer any questions you

In [15]:
# Demonstrate Unsloth chat templates and generate with the fine-tuned model
from unsloth.chat_templates import get_chat_template
import torch

# If you restarted, reload from the saved directory
import os
if not ("model" in globals() and "tokenizer" in globals()):
    from transformers import AutoModelForCausalLM, AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained("smollm2_fullft_model")
    model = AutoModelForCausalLM.from_pretrained(
        "smollm2_fullft_model",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    ).to("cuda" if torch.cuda.is_available() else "cpu")

# Get two popular templates
tok_llama  = get_chat_template(tokenizer, chat_template="llama")
tok_alpaca = get_chat_template(tokenizer, chat_template="alpaca")

# A tiny conversation
messages = [
    {"role": "system", "content": "You are a concise, friendly assistant."},
    {"role": "user",   "content": "Write a cheerful greeting in one or two sentences."},
]

# Apply templates (string prompts)
prompt_llama  = tok_llama.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
prompt_alpaca = tok_alpaca.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

print("=== LLaMA-style prompt ===\n", prompt_llama[:400], "...\n")
print("=== Alpaca-style prompt ===\n", prompt_alpaca[:400], "...\n")

# Generate with both to hear formatting differences
def gen(prompt, max_new_tokens=120):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(out[0], skip_special_tokens=True)

print("\n=== Output (LLaMA template) ===\n", gen(prompt_llama))
print("\n=== Output (Alpaca template) ===\n", gen(prompt_alpaca))


Model does not have a padding token! Will use pad_token = <|endoftext|>.
Model does not have a padding token! Will use pad_token = <|endoftext|>.
=== LLaMA-style prompt ===
 <|endoftext|>You are a concise, friendly assistant.

### Instruction:
Write a cheerful greeting in one or two sentences.

### Response:
 ...

=== Alpaca-style prompt ===
 <|endoftext|>You are a concise, friendly assistant.

### Instruction:
Write a cheerful greeting in one or two sentences.

### Response:
 ...


=== Output (LLaMA template) ===
 You are a concise, friendly assistant.

### Instruction:
Write a cheerful greeting in one or two sentences.

### Response:
Dear [Name],

I am happy to hear from you and I hope you enjoy our chat.

Your name is [Your Name], and I am happy to meet you. I am looking forward to learning more about you and the world around you.

I hope you have a wonderful day!

Thank you!

Best regards,
[Your Name]

Thank you!

I hope you have a great day!

Best regards,
[Your Name]

Thank you!
