In [2]:
# ===============================================================
# 🧠 COLAB 1: Full Finetuning with a Small Model using Unsloth.ai
# Model: unsloth/smollm2-135m
# Task: Math Reasoning subset of MATH dataset
# ===============================================================

%%capture
!pip install unsloth datasets transformers accelerate bitsandbytes wandb huggingface_hub

In [3]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from huggingface_hub import login
import wandb

In [4]:
# ===============================================================
# 🎫 Step 3. Authenticate to Hugging Face and Weights & Biases (W&B)
# Replace tokens or input them interactively below.
# ===============================================================

hf_token = input("🔑 Enter your Hugging Face token: ")
wb_token = input("🔑 Enter your Weights & Biases token: ")

# Log in to Hugging Face and W&B
login(hf_token)
wandb.login(key=wb_token)

run = wandb.init(
    project="Full-Finetuning-SmolLM2-135M",
    job_type="training",
    anonymous="allow"
)

🔑 Enter your Hugging Face token: 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
🔑 Enter your Weights & Biases token: 6848c82c057c6fb8800814b6199788df5e30f653


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mchandinisaisri-uppuganti[0m ([33mchandinisaisri-uppuganti-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [5]:
# ===============================================================
# ⚙️ Step 4. Load the SmolLM2-135M model with full finetuning enabled
# ===============================================================

max_seq_length = 2048
dtype = None

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/smollm2-135m",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = False,       # ❌ no quantization → full training
    full_finetuning = True,     # ✅ train all weights
    token = hf_token,
)

print("✅ Model loaded successfully for full finetuning!")

==((====))==  Unsloth 2025.11.1: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using bfloat16 full finetuning which cuts memory usage by 50%.
To enable float32 training, use `float32_mixed_precision = True` during FastLanguageModel.from_pretrained


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/742 [00:00<?, ?B/s]

✅ Model loaded successfully for full finetuning!


In [8]:
# ===============================================================
# 📘 Step 5. Load a small instruction-tuning dataset (Alpaca)
# ===============================================================

from datasets import load_dataset

# Use Alpaca instead of lighteval/MATH
dataset = load_dataset("tatsu-lab/alpaca", split="train[:500]")

# Define prompt format
prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for inst, inp, out in zip(instructions, inputs, outputs):
        if inp.strip() == "":
            text = prompt_style.format(inst, "N/A", out) + EOS_TOKEN
        else:
            text = prompt_style.format(inst, inp, out) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)
print("✅ Sample formatted data:\n", dataset["text"][0][:400])

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-a09b74b3ef9c3b(…):   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

✅ Sample formatted data:
 Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.

### Instruction:
Give three tips for staying healthy.

### Input:
N/A

### Response:
1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. 
2. Exercise regularly to keep your body active and strong. 
3. Get enough s


In [9]:
# ===============================================================
# 🏋️ Step 6. Configure trainer and training arguments
# ===============================================================

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    args = TrainingArguments(
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 1,
        num_train_epochs = 3,
        warmup_steps = 5,
        learning_rate = 1e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 5,
        output_dir = "outputs",
        save_total_limit = 1,
        report_to = "wandb"
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/500 [00:00<?, ? examples/s]

In [10]:
# ===============================================================
# 🚀 Step 7. Train the model and monitor GPU usage
# ===============================================================

gpu = torch.cuda.get_device_properties(0)
print(f"Using GPU: {gpu.name} ({round(gpu.total_memory/1e9, 2)} GB VRAM)")

trainer_stats = trainer.train()

Using GPU: NVIDIA A100-SXM4-40GB (42.47 GB VRAM)


Step,Training Loss
5,2.5748
10,2.1928
15,1.6738
20,1.4019
25,1.3693
30,1.1449
35,1.3306
40,1.2823
45,1.1258
50,1.2354


Unsloth: Will smartly offload gradients to save VRAM!


In [11]:
# ===============================================================
# 📊 Step 8. Display runtime statistics
# ===============================================================
used_mem = round(torch.cuda.max_memory_reserved() / 1e9, 3)
print(f"⏱ Runtime: {round(trainer_stats.metrics['train_runtime']/60, 2)} minutes")
print(f"💾 Peak reserved GPU memory: {used_mem} GB")

⏱ Runtime: 0.99 minutes
💾 Peak reserved GPU memory: 1.908 GB


In [13]:
from IPython.display import Markdown

FastLanguageModel.for_inference(model)

test_prompt = prompt_style.format(
    "If the system of equations 3x + y = a and 2x + 5y = 2a has a solution when x = 2, compute a.",
    "",
    "" # Added an empty string for the response placeholder
)

inputs = tokenizer([test_prompt], return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=150)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)

Markdown(result.split("### Response:")[1])


a = 2

In [19]:
# ===============================================================
# 💾 Step 10. Save and/or push the fine-tuned model to Hugging Face
# ===============================================================

new_model_local = "SmolLM2-135M-Math"
new_model_online = "chandinisaisri/SmolLM2-135M-Math"  # change this to your HF username path

model.save_pretrained(new_model_local)
tokenizer.save_pretrained(new_model_local)

model.push_to_hub(new_model_online)
tokenizer.push_to_hub(new_model_online)

print("✅ Model fine-tuned and uploaded successfully!")

README.md:   0%|          | 0.00/567 [00:00<?, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...5M-Math/model.safetensors:   0%|          | 12.3kB /  269MB            

Saved model to https://huggingface.co/chandinisaisri/SmolLM2-135M-Math


README.md:   0%|          | 0.00/573 [00:00<?, ?B/s]

✅ Model fine-tuned and uploaded successfully!
