<a href="https://colab.research.google.com/github/darshlukkad/Unsloth/blob/main/colab5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# --- Install deps (Colab) ---
%pip -q install -U unsloth transformers datasets accelerate peft bitsandbytes einops evaluate sentencepiece

# --- Stability flags BEFORE importing transformers/trl/peft ---
import os, sys, platform, torch, subprocess
os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"   # avoid flaky compiled kernels on some Colab builds
os.environ["UNSLOTH_STABLE_DOWNLOADS"] = "1"  # quieter, more robust HF downloads

# Import Unsloth FIRST so it can patch transformers properly
import unsloth
from unsloth import FastLanguageModel, is_bfloat16_supported

# Now the rest
from datasets import load_dataset
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Basic env printouts
print("Python:", sys.version.split()[0])
print("Platform:", platform.platform())
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
!nvidia-smi || echo "No NVIDIA GPU detected"

# Precision & common constants
dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16
device = "cuda" if torch.cuda.is_available() else "cpu"
MAX_SEQ_LEN = 2048     # model context
BLOCK_SIZE  = 512      # training sequence length
print("dtype:", dtype, "| device:", device)


Python: 3.12.12
Platform: Linux-6.6.105+-x86_64-with-glibc2.35
Torch: 2.8.0+cu126
CUDA available: True
GPU: Tesla T4
Mon Nov 10 07:06:26 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   77C    P0             34W /   70W |     432MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+----------------------

In [12]:
# Policy for continued pretraining: 4-bit base + LoRA (also adapt embeddings & lm_head)
import torch
from unsloth import FastLanguageModel

model_id = "HuggingFaceTB/SmolLM2-135M"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name      = model_id,
    max_seq_length  = MAX_SEQ_LEN,   # from Cell 1
    dtype           = dtype,          # from Cell 1
    load_in_4bit    = True,           # QLoRA style for low VRAM
)

# Include embeddings & lm_head so the model can better absorb new-language tokens
model = FastLanguageModel.get_peft_model(
    model,
    r                          = 16,
    lora_alpha                 = 16,
    lora_dropout               = 0.05,
    target_modules             = [
        "q_proj","k_proj","v_proj","o_proj",
        "gate_proj","up_proj","down_proj",
        "embed_tokens","lm_head"      # <- important for CPT
    ],
    use_gradient_checkpointing = "unsloth",
    random_state               = 3407,
    max_seq_length             = MAX_SEQ_LEN,
)

# Tokenizer safety defaults
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Show how many parameters will be trained via LoRA
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Loaded: {model_id}")
print(f"Device: {model.device} | 4-bit: True")
print(f"Trainable params: {trainable_params/1e6:.2f}M / {total_params/1e6:.2f}M "
      f"({100*trainable_params/total_params:.2f}%)")


==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
HuggingFaceTB/SmolLM2-135M does not have a padding token! Will use pad_token = <|endoftext|>.
Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM
Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM
Loaded: HuggingFaceTB/SmolLM2-135M
Device: cpu | 4-bit: True
Trainable params: 61.51M / 171.25M (35.92%)


In [15]:
# Robust Yoruba (yor) corpus loader from Tatoeba using iter_lines (no scripts/auth)
import os, re, random, requests
from datasets import load_dataset

def normalize(s: str) -> str:
    if s is None: return ""
    s = s.replace("\r", " ").replace("\t", " ")
    s = re.sub(r"\s+", " ", s).strip()
    return s

TATOEBA_URL = "https://downloads.tatoeba.org/exports/sentences.csv"
MAX_KEEP    = 100_000     # upper bound of lines to keep from stream
MIN_CHARS   = 20          # filter very short lines
FALLBACK_MIN = 1000       # if we get fewer than this, use a tiny fallback
corpus_path = "corpora/yoruba_tatoeba.txt"
os.makedirs("corpora", exist_ok=True)

yor_lines = []
try:
    with requests.get(TATOEBA_URL, stream=True, timeout=180) as r:
        r.raise_for_status()
        # Each line is "id<TAB>lang<TAB>text"
        for line in r.iter_lines(decode_unicode=True):
            if not line:
                continue
            parts = line.split("\t", 2)
            if len(parts) < 3:
                continue
            _id, lang, text = parts[0], parts[1], parts[2]
            if lang == "yor":
                t = normalize(text)
                if len(t) >= MIN_CHARS:
                    yor_lines.append(t)
                    if len(yor_lines) >= MAX_KEEP:
                        break
    print(f"Streamed Yoruba lines: {len(yor_lines)}")
except Exception as e:
    print("Tatoeba stream failed:", e)

# Minimal fallback so the notebook can proceed even if the stream is blocked
if len(yor_lines) < FALLBACK_MIN:
    seed = [
        "Báwo ni o ṣe wà?", "Mo wà dáadáa, ẹ ṣé.", "Orúkọ mi ni Ade.",
        "Ìfẹ́ ni ìmúṣẹ gbogbo ohun rere.", "A ń kọ́ èdè Yorùbá.",
        "Ìwe yìí dára gan-an.", "Ṣe o lè ràn mí lọ́wọ́?", "Ó ṣeun gan-an.",
        "Ọjọ́ mẹ́ta ni mo ní kí n pé níbí.", "Gbọ́dọ̀ kọ́ ẹ̀kọ́ lojoojúmọ́."
    ]
    mult = FALLBACK_MIN // len(seed) + 1
    yor_lines = (seed * mult)[:FALLBACK_MIN]
    print(f"Using fallback mini-corpus with {len(yor_lines)} lines.")

# Write corpus (one sentence per line)
with open(corpus_path, "w", encoding="utf-8") as f:
    for line in yor_lines:
        f.write(line + "\n")
print(f"Saved Yoruba corpus to {corpus_path}")

# Load as a plain text dataset and standardize to 'text'
raw = load_dataset("text", data_files=corpus_path, split="train")
raw = raw.rename_column("text", "raw_text")
ds = raw.map(lambda ex: {"text": normalize(ex["raw_text"])}, remove_columns=["raw_text"])

# Optional cap for quick demo; increase for better CPT
MAX_SAMPLES = 20_000
if len(ds) > MAX_SAMPLES:
    random.seed(42)
    idxs = list(range(len(ds)))
    random.shuffle(idxs)
    ds = ds.select(idxs[:MAX_SAMPLES])

print("Cleaned examples:", len(ds))
print("\nSample text:\n", ds[0]["text"][:200], " …")


Tatoeba stream failed: a bytes-like object is required, not 'str'
Using fallback mini-corpus with 1000 lines.
Saved Yoruba corpus to corpora/yoruba_tatoeba.txt


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Cleaned examples: 1000

Sample text:
 Báwo ni o ṣe wà?  …


In [16]:
# Tokenize the Yorùbá corpus and pack into contiguous BLOCK_SIZE chunks
from datasets import Dataset
import math

assert "ds" in globals(), "Dataset `ds` not found. Run the corpus cell first."
assert "BLOCK_SIZE" in globals(), "BLOCK_SIZE not defined. Check Cell 1."

# Ensure we have an EOS / PAD token id for clean boundaries between lines
eos_id = tokenizer.eos_token_id or tokenizer.pad_token_id
assert eos_id is not None, "Tokenizer needs an eos_token_id or pad_token_id."

def tokenize_fn(batch):
    # Fast batch tokenization; no truncation (we'll pack manually)
    toks = tokenizer(batch["text"], add_special_tokens=False)
    # Append EOS per sample to separate lines when packing
    toks["input_ids"] = [ids + [eos_id] for ids in toks["input_ids"]]
    return {"input_ids": toks["input_ids"]}

# Tokenize (batched) and keep only token ids
tok = ds.map(tokenize_fn, batched=True, remove_columns=ds.column_names, desc="Tokenizing")

# Manual packer to avoid giant concatenations
packed_input_ids = []
packed_attention = []
packed_labels    = []

buf = []
for ids in tok["input_ids"]:
    buf.extend(ids)
    while len(buf) >= BLOCK_SIZE:
        chunk = buf[:BLOCK_SIZE]
        packed_input_ids.append(chunk)
        packed_attention.append([1] * BLOCK_SIZE)
        packed_labels.append(chunk.copy())   # LM labels = input_ids
        buf = buf[BLOCK_SIZE:]

num_blocks = len(packed_input_ids)
if num_blocks == 0:
    raise RuntimeError("No packed blocks created. Try lowering BLOCK_SIZE or increasing dataset size.")

packed_ds = Dataset.from_dict(
    {"input_ids": packed_input_ids, "attention_mask": packed_attention, "labels": packed_labels}
)

# Quick report
total_tokens = num_blocks * BLOCK_SIZE
approx_epochs_1k_steps = (total_tokens / BLOCK_SIZE) / 1000.0
print(f"Packed {num_blocks} blocks of {BLOCK_SIZE} tokens each "
      f"(~{total_tokens:,} tokens).")
print("Example decode (first 200 chars):\n",
      tokenizer.decode(packed_input_ids[0], skip_special_tokens=True)[:200], "…")

# For training later
train_cpt = packed_ds
len(train_cpt), train_cpt[0]["input_ids"][:8]


Tokenizing:   0%|          | 0/1000 [00:00<?, ? examples/s]

Packed 38 blocks of 512 tokens each (~19,456 tokens).
Example decode (first 200 chars):
 Báwo ni o ṣe wà?Mo wà dáadáa, ẹ ṣé.Orúkọ mi ni Ade.Ìfẹ́ ni ìmúṣẹ gbogbo ohun rere.A ń kọ́ èdè Yorùbá.Ìwe yìí dára gan-an.Ṣe o lè ràn mí lọ́wọ́?Ó ṣeun gan-an.Ọjọ́ mẹ́ta ni mo ní kí n pé níbí.Gbọ́dọ̀ kọ …


(38, [50, 5415, 6466, 39112, 263, 216, 27840, 85])

In [17]:
# Continued pretraining on Yoruba blocks (causal LM objective, LoRA adapters)
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch, math

assert "train_cpt" in globals(), "Packed dataset `train_cpt` not found. Run Cell 4 first."

# Causal LM data collator (labels already prepared, but harmless to pass mlm=False)
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Some models need cache off during training
if hasattr(model.config, "use_cache"):
    model.config.use_cache = False

training_args = TrainingArguments(
    output_dir="outputs_cpt_yoruba_smollm2",
    per_device_train_batch_size=8,      # if OOM on T4, try 4 or 2
    gradient_accumulation_steps=1,
    num_train_epochs=1,                 # increase for better adaptation
    learning_rate=1e-4,                 # CPT often uses 5e-5 ~ 2e-4; tune as needed
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    report_to="none",
    fp16=True,                          # T4 prefers fp16
    bf16=False,
    optim="adamw_bnb_8bit",             # 8-bit optimizer (bitsandbytes)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_cpt,
    data_collator=collator,
)

train_out = trainer.train()
print("CPT training complete.")
print(train_out)

# Save ONLY the LoRA adapter (compact) + tokenizer for convenience
adapter_dir = "smollm2_cpt_yoruba_adapter"
trainer.model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)
print("Saved CPT LoRA adapter ->", adapter_dir)


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 2
   \\   /|    Num examples = 38 | Num Epochs = 1 | Total steps = 5
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 61,507,584 of 224,334,144 (27.42% trained)


Step,Training Loss


Unsloth: Will smartly offload gradients to save VRAM!
CPT training complete.
TrainOutput(global_step=5, training_loss=1.7551740646362304, metrics={'train_runtime': 11.5193, 'train_samples_per_second': 3.299, 'train_steps_per_second': 0.434, 'total_flos': 19577915965440.0, 'train_loss': 1.7551740646362304, 'epoch': 1.0})
Saved CPT LoRA adapter -> smollm2_cpt_yoruba_adapter


In [18]:
# Clean reload for inference: base (4-bit) + CPT LoRA adapter; generate Yorùbá text
import os, torch, textwrap
from unsloth import FastLanguageModel
from peft import PeftModel

base_id     = "HuggingFaceTB/SmolLM2-135M"
adapter_dir = "smollm2_cpt_yoruba_adapter"
assert os.path.isdir(adapter_dir), "Adapter folder not found. Run the CPT training cell first."

dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# 1) Reload base model in 4-bit for low-VRAM inference
base_model, tok = FastLanguageModel.from_pretrained(
    model_name     = base_id,
    max_seq_length = 2048,
    dtype          = dtype,
    load_in_4bit   = True,
)

# 2) Attach LoRA adapter (continued pretraining weights)
model = PeftModel.from_pretrained(base_model, adapter_dir)
model.eval()

# 3) Tokenizer safety defaults
if tok.pad_token is None:
    tok.pad_token = tok.eos_token
tok.padding_side = "right"

device = model.device

def generate(prompt, max_new_tokens=160, temperature=0.8, top_p=0.95):
    inputs = tok(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tok.eos_token_id,
        )
    return tok.decode(out[0], skip_special_tokens=True)

# 4) Yorùbá prompts to sanity-check adaptation
prompts = [
    # Short greeting
    "Kọ ìbáṣepọ̀ kíkan ní èdè Yorùbá fún olùkọ́ tuntun kan ní kilasì.",
    # Tiny story
    "Kọ ìtàn kékeré ní èdè Yorùbá nípa ọmọkùnrin kan tí ó kọ́ ẹ̀kọ́ bí a ṣe ń kọ orin bàtá.",
    # Informational paragraph
    "Ṣàlàyé ní kíkún ní èdè Yorùbá pé kí ni ìtọju ilera ọpọlọ, àti àwọn ìmòràn mẹ́ta fún ìdènà aapọn."
]

for i, p in enumerate(prompts, 1):
    print(f"\n=== Prompt {i} ===\n{p}\n")
    text = generate(p)
    # Show only the continuation (since we don't template chat here)
    print("=== Output ===\n" + textwrap.fill(text, width=100))


==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
HuggingFaceTB/SmolLM2-135M does not have a padding token! Will use pad_token = <|endoftext|>.

=== Prompt 1 ===
Kọ ìbáṣepọ̀ kíkan ní èdè Yorùbá fún olùkọ́ tuntun kan ní kilasì.

=== Output ===
Kọ ìbáṣepọ̀ kíkan ní èdè Yorùbá fún olùkọ́ tuntun kan ní kilasì.  KiẴ tím chuộng tím đời, chiứn kíp
bị tím.  KiẴ kíp kíp súa môsì.  KiẴ kíp kíp čík tím ẩọmọn kípọ̀ kípọ̀ môsì.  KiẴ kíp kíp kíp čík
kípọ̀ kípọ̀ kípọ̀ môsì.  KiẴ kíp kí

=== Prompt 2 ===
Kọ ìtàn kékeré ní èdè Yorùbá nípa ọmọkùnrin kan tí ó kọ́ ẹ̀kọ́ bí a ṣe ń kọ orin bàtá.

=== Output 