In [2]:
# 1. INSTALL FIRST (No imports yet!)
!pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo
!pip install --no-deps bitsandbytes accelerate peft trl triton cut_cross_entropy
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer

# 2. NOW IMPORT
import torch
import os

# 3. CHECK GPU
gpu_stats = torch.cuda.get_device_properties(0)
print(f"GPU = {gpu_stats.name}. Max memory = {round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)} GB.")

Collecting unsloth
  Downloading unsloth-2025.12.10-py3-none-any.whl.metadata (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m149.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo
  Downloading unsloth_zoo-2025.12.8-py3-none-any.whl.metadata (32 kB)
Collecting wheel>=0.42.0 (from unsloth)
  Downloading wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Collecting packaging (from unsloth)
  Downloading packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Collecting torch>=2.4.0 (from unsloth)
  Downloading torch-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting torchvision (from unsloth)
  Downloading torchvision-0.24.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (5.9 kB)
Collecting numpy (from unsloth)
  Downloading numpy-2.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6

GPU = Tesla T4. Max memory = 14.741 GB.


In [1]:
from unsloth import FastLanguageModel
import torch

# Loading options; `max_seq_length` is reused by the training cell further down.
max_seq_length = 2048  # Our analysis found a maximum of 1764, so 2048 is safe.
dtype = None  # None = auto-detect (resolves to Float16 on a Tesla T4)
load_in_4bit = True  # Needed to fit within limited VRAM

# 1. Load the pre-quantized Qwen 3 model
base_model_kwargs = dict(
    model_name="unsloth/Qwen3-0.6B-unsloth-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

# 2. Attach LoRA adapters to the MLP layers ONLY
mlp_projections = ["gate_proj", "up_proj", "down_proj"]  # The SwiGLU MLP layers
lora_kwargs = dict(
    r=16,  # Rank
    target_modules=mlp_projections,
    lora_alpha=16,
    lora_dropout=0,  # Optimized for speed
    bias="none",
    use_gradient_checkpointing="unsloth",  # VRAM-saving gradient checkpointing mode
    random_state=3407,
)
model = FastLanguageModel.get_peft_model(model, **lora_kwargs)

print("\n✅ Qwen 3 0.6B loaded with MLP-only LoRA adapters!")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.12.10: Fast Qwen3 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/576M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.12.10 patched 28 layers with 0 QKV layers, 0 O layers and 28 MLP layers.



✅ Qwen 3 0.6B loaded with MLP-only LoRA adapters!


In [2]:
from datasets import load_dataset
import requests

# 1. Download your specific SFT dataset from GitHub
dataset_url = "https://raw.githubusercontent.com/goyalayus/wordle/main/data/format_tuning_set_clean.jsonl"
dataset_path = "format_tuning_set_clean.jsonl"

# Fetch first and validate the HTTP status before touching the output file:
# a 404/500 must not be written to disk as if it were JSONL, and a failed
# request must not leave an empty file behind. The timeout keeps the cell
# from hanging indefinitely on a stalled connection.
response = requests.get(dataset_url, timeout=60)
response.raise_for_status()

with open(dataset_path, "wb") as f:
    f.write(response.content)

# 2. Load the dataset into Hugging Face format
dataset = load_dataset("json", data_files=dataset_path, split="train")

# 3. Apply the Chat Template
# Each example's 'messages' list is rendered into a single string for the model.
def formatting_prompts_func(examples):
    """Render every conversation in a batch through the tokenizer's chat template.

    Args:
        examples: Batch mapping whose "messages" entry holds one message list
            per example.

    Returns:
        A dict with a single "text" key containing the rendered strings, in
        the same order as the input conversations.
    """
    # tokenize=False: keep plain text, since the Trainer handles tokenization later.
    # add_generation_prompt=False: we train on the full completion.
    rendered = [
        tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
        for conversation in examples["messages"]
    ]
    return {"text": rendered}

# Run the formatter over the dataset in batches; it contributes the "text" column.
dataset = dataset.map(formatting_prompts_func, batched=True)

# 4. Preview the first formatted example to verify the chat tags
print("--- FORMATTED EXAMPLE ---")
preview = dataset[0]["text"][:500]  # first 500 characters only
print(f"{preview}...")
print("\n✅ Dataset loaded and formatted for Qwen 3!")

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

--- FORMATTED EXAMPLE ---
<|im_start|>system
You are an expert AI playing Wordle.
GOAL: Guess the secret 5-letter word in 6 tries.

GAME RULES:
1. You must input a valid 5-letter English word.
2. Feedback is given for each letter:
   - G (Green): The letter is in the word and in the CORRECT position.
   - Y (Yellow): The letter is in the word but in the WRONG position.
   - X (Gray): The letter is NOT in the word (or no extra copies exist).

LOGIC & STRATEGY:
- Eliminate Impossible Letters: Never use a letter marked 'X' ...

✅ Dataset loaded and formatted for Qwen 3!


In [3]:
!pip install wandb -q
import wandb

# This will prompt you for your API key
wandb.login()

wandb: Currently logged in as: ayush_g (ayush_g-iit-roorkee) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


True

In [4]:
from trl import SFTTrainer, SFTConfig
from unsloth import is_bfloat16_supported
import wandb

# Hyperparameters are defined once so the values logged to WandB cannot drift
# from the values actually handed to the trainer.
learning_rate = 2e-4
num_train_epochs = 2
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
use_bf16 = is_bfloat16_supported()  # fp16 is used as the fallback precision

# 1. Initialize the WandB project
wandb.init(
    project="wordle-sft-backbone",
    name="qwen3-0.6b-160-examples",
    config={
        "learning_rate": learning_rate,
        "epochs": num_train_epochs,
        # Global batch size = per-device batch size x gradient accumulation steps
        "batch_size": per_device_train_batch_size * gradient_accumulation_steps,
        "rank": 16,  # LoRA rank; keep in sync with `r` passed to get_peft_model
    }
)

# The WandB run is open from here on, so it must be closed on every exit path.
# A failure (or manual interrupt) while building the trainer or while training
# marks the run as failed instead of leaving it dangling.
try:
    # 2. Configuration
    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = dataset,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        args = SFTConfig(
            per_device_train_batch_size = per_device_train_batch_size,
            gradient_accumulation_steps = gradient_accumulation_steps,
            warmup_steps = 5,
            num_train_epochs = num_train_epochs,
            learning_rate = learning_rate,
            fp16 = not use_bf16,
            bf16 = use_bf16,
            logging_steps = 1,
            optim = "adamw_8bit",
            weight_decay = 0.01,
            lr_scheduler_type = "linear",
            seed = 3407,
            output_dir = "outputs",
            report_to = "wandb", # Send training metrics to the WandB run opened above
        ),
    )

    # 3. Start Training
    trainer.train()
except BaseException:
    # 4a. Close the WandB run as failed, then surface the original error
    wandb.finish(exit_code=1)
    raise
else:
    # 4b. Close the WandB run
    wandb.finish()

wandb: Detected [huggingface_hub.inference, openai] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/160 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


🦥 Unsloth: Padding-free auto-enabled, enabling faster training.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 160 | Num Epochs = 2 | Total steps = 40
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 5,505,024 of 601,554,944 (0.92% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.2604
2,2.577
3,2.272
4,2.1325
5,2.2961
6,2.2442
7,1.9084
8,1.9069
9,1.7216
10,1.8615


0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▆█▅▄▄▄▃▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁
train/learning_rate,▁▂▄▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁
train/loss,▇█▇▆▇▇▆▆▅▆▅▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁

0,1
total_flos,643827278807040.0
train/epoch,2.0
train/global_step,40.0
train/grad_norm,0.44073
train/learning_rate,1e-05
train/loss,0.5978
train_loss,1.2134
train_runtime,100.5768
train_samples_per_second,3.182
train_steps_per_second,0.398


In [5]:
# Persist the trained model and its tokenizer side by side in one folder.
lora_output_dir = "wordle_lora_model"
model.save_pretrained(lora_output_dir)
tokenizer.save_pretrained(lora_output_dir)

('wordle_lora_model/tokenizer_config.json',
 'wordle_lora_model/special_tokens_map.json',
 'wordle_lora_model/chat_template.jinja',
 'wordle_lora_model/vocab.json',
 'wordle_lora_model/merges.txt',
 'wordle_lora_model/added_tokens.json',
 'wordle_lora_model/tokenizer.json')

In [6]:
import shutil
from google.colab import files

# 1. Zip the folder
# Packs the contents of the 'wordle_lora_model' folder into 'wordle_lora_model.zip'.
shutil.make_archive(
    base_name="wordle_lora_model",
    format="zip",
    root_dir="wordle_lora_model",
)

# 2. Download to your local computer
files.download("wordle_lora_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>