<a href="https://colab.research.google.com/github/darshlukkad/Unsloth/blob/main/colab3_dpo_pref.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Install core deps (Colab) ---
# NOTE(review): every package is installed unpinned with -U, so a fresh runtime may
# resolve to different releases than the ones that produced the saved outputs.
# Consider pinning versions once a known-good combination is identified.
%pip -q install -U unsloth trl transformers datasets accelerate peft bitsandbytes einops evaluate sentencepiece

# --- Stability flags BEFORE importing transformers/trl/peft ---
# The two environment variables are set before `import unsloth` below, so they are
# already present in os.environ when Unsloth is imported.
# NOTE(review): what each flag does is taken from the inline comments; confirm against
# the installed Unsloth release.
import os, sys, platform, torch
os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"   # avoid flaky compiled kernels on some Colab builds
os.environ["UNSLOTH_STABLE_DOWNLOADS"] = "1"  # quieter, more robust HF downloads

# Import Unsloth FIRST so it can patch transformers properly
# (order matters: datasets/trl/transformers are only imported further down).
import unsloth
from unsloth import FastLanguageModel, is_bfloat16_supported, PatchDPOTrainer

# Patch TRL's DPO for PEFT/LoRA-friendly reference handling
# Called before `from trl import DPOTrainer` so the import below happens after patching.
PatchDPOTrainer()

# Now import the rest
from datasets import load_dataset
from trl import DPOTrainer
from transformers import TrainingArguments

# Basic env printouts
print("Python:", sys.version.split()[0])
print("Platform:", platform.platform())
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
# Shell fallback message is printed when the nvidia-smi command exits non-zero.
!nvidia-smi || echo "No NVIDIA GPU detected"

# Precision & common constants
# dtype: bfloat16 when Unsloth reports support, otherwise float16.
dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16
device = "cuda" if torch.cuda.is_available() else "cpu"
# MAX_SEQ_LEN is the max_seq_length handed to the model loader in the next cell.
MAX_SEQ_LEN = 2048
# MAX_LENGTH is passed as DPOConfig.max_length (prompt + response) in the training cell,
# which also derives max_prompt_length from it.
MAX_LENGTH  = 512   # prompt/completion length used by DPOTrainer config later
print("dtype:", dtype, "| device:", device)


[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.8/61.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m351.3/351.3 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m564.7/564.7 kB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m506.8/506.8 kB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0

In [None]:
# Policy model: 4-bit base + LoRA adapters (PEFT)
import torch
from unsloth import FastLanguageModel, is_bfloat16_supported

model_id = "HuggingFaceTB/SmolLM2-135M"
MAX_SEQ_LEN = 2048
# Choose the compute dtype with Unsloth's own helper, the same way the setup cell does.
# torch.cuda.is_bf16_supported() can answer True through emulation on pre-Ampere GPUs
# in recent PyTorch releases, which would select bf16 on hardware (e.g. a T4) that
# has no native support for it.
dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16

# Load base in 4-bit (QLoRA style) to keep VRAM low
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name      = model_id,
    max_seq_length  = MAX_SEQ_LEN,
    dtype           = dtype,
    load_in_4bit    = True,
)

# Attach LoRA adapters as our trainable policy head.
# NOTE: Unsloth logs that only dropout = 0 gets its fast LoRA patching; 0.05 is kept
# here to preserve the original training recipe at the cost of that fast path.
model = FastLanguageModel.get_peft_model(
    model,
    r                          = 16,
    lora_alpha                 = 16,
    lora_dropout               = 0.05,
    target_modules             = ["q_proj","k_proj","v_proj","o_proj",
                                  "gate_proj","up_proj","down_proj"],
    use_gradient_checkpointing = "unsloth",
    random_state               = 3407,
    max_seq_length             = MAX_SEQ_LEN,
)

# Tokenizer safety defaults: make sure a pad token exists and pad on the right.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Loaded policy base:", model_id)
print("LoRA config: r=16, alpha=16, dropout=0.05")
print("Device:", model.device, "| dtype:", dtype, "| 4-bit:", True)


==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

HuggingFaceTB/SmolLM2-135M does not have a padding token! Will use pad_token = <|endoftext|>.


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.11.2 patched 30 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Loaded policy base: HuggingFaceTB/SmolLM2-135M
LoRA config: r=16, alpha=16, dropout=0.05
Device: cuda:0 | dtype: torch.float16 | 4-bit: True


In [None]:
# Build (prompt, chosen, rejected) from question + answers list using pm_score
from datasets import load_dataset
import random

# Load only the first 5000 training rows so the demo stays quick.
raw = load_dataset(
    path="HuggingFaceH4/stack-exchange-preferences",
    split="train[:5000]",
)

def _clean(s: str) -> str:
    if s is None:
        return ""
    s = str(s).replace("\r", " ").replace("\t", " ").strip()
    return " ".join(s.split())

def to_pairs(example):
    """Turn one question record into a single (prompt, chosen, rejected) triple.

    Reads ``example["question"]`` and ``example["answers"]`` (each answer is read
    through its ``"text"`` and ``"pm_score"`` keys). Answers with empty text or a
    missing score are ignored, and repeated answer texts are kept only once. The
    remaining answers are shuffled with the module-level ``random`` generator and the
    first pair whose scores differ is used: the higher score becomes ``chosen``.

    Returns a dict with keys ``prompt``, ``chosen`` and ``rejected``; all three are
    ``None`` when no pair with different scores exists.
    """
    question = _clean(example.get("question", ""))

    # One pass: keep (score, text) for usable answers, first occurrence of each text only.
    candidates = []
    seen_texts = set()
    for answer in example.get("answers", []):
        text = _clean(answer.get("text", ""))
        score = answer.get("pm_score", None)
        if not text or score is None or text in seen_texts:
            continue
        seen_texts.add(text)
        candidates.append((score, text))

    # Randomise the order so the selected pair is not always the first two answers.
    random.shuffle(candidates)

    for idx, (score_a, text_a) in enumerate(candidates):
        for score_b, text_b in candidates[idx + 1:]:
            if score_a == score_b:
                continue  # equal scores carry no preference signal
            # higher score = chosen
            chosen, rejected = (text_a, text_b) if score_a > score_b else (text_b, text_a)
            return {
                "prompt": f"Question:\n{question}\n\nAnswer:",
                "chosen": chosen,
                "rejected": rejected,
            }

    # no valid pair
    return {"prompt": None, "chosen": None, "rejected": None}

# Seed Python's RNG right before mapping: to_pairs() shuffles each answer list with
# random.shuffle, so without a fixed seed every run would build different preference
# pairs (and therefore a different training subset).
PAIR_SEED = 3407
random.seed(PAIR_SEED)

# Build one pair per question, then drop questions that produced no valid pair.
pairs = raw.map(to_pairs, remove_columns=raw.column_names)
pairs = pairs.filter(lambda ex: ex["prompt"] is not None)

# Keep a small subset for quick DPO training; increase for real runs
SUBSET = 1000
small_ds = pairs.select(range(min(SUBSET, len(pairs))))

print("Raw size:", len(raw))
print("Pairs built:", len(pairs))
print("Training subset size:", len(small_ds))
if len(small_ds) > 0:
    # Show the first 300 characters of each field of the first example as a sanity check.
    demo = small_ds[0]
    print("\nSample PROMPT:\n", demo["prompt"][:300], "...\n")
    print("Sample CHOSEN:\n", demo["chosen"][:300], "...\n")
    print("Sample REJECTED:\n", demo["rejected"][:300], "...\n")
else:
    print("No valid pairs found in this slice; try increasing the slice (e.g., train[:20000]).")


Resolving data files:   0%|          | 0/758 [00:00<?, ?it/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Raw size: 5000
Pairs built: 4808
Training subset size: 1000

Sample PROMPT:
 Question:
<p>I have been wanting to learn about 3D printing a long time so I really want this site to succeed but I have no previous experience with the subject. </p> <p>I was wondering how can I help the site at this early stage. I thought about asking about how to get started with 3D printing but  ...

Sample CHOSEN:
 <h1>Vote!</h1> <p>Private Betas love, love, <em>love</em> votes. Without votes, it's difficult to attain privileges, get rewards, and help push us out to public beta.</p> <h1>Ask Questions!</h1> <p>I know you said this:</p> <blockquote> <p>I thought about asking about how to get started with 3D prin ...

Sample REJECTED:
 <p>That's the goal of the site, learn, research and ask.</p> <p>While you learn, you can always perform other tasks such as:</p> <ul> <li>improve quality posts by proposing edits,</li> <li>be active in meta (propose new ideas or write your opinion which are always welcomed),</

In [None]:
# DPO training: use DPOConfig (not TrainingArguments) so Unsloth's patch sees padding_value, etc.
import torch
from trl import DPOTrainer, DPOConfig

# Turn off cache during training on some models
if hasattr(model.config, "use_cache"):
    model.config.use_cache = False

# Token id used to pad batches; fall back to EOS when the tokenizer has no pad token.
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

args = DPOConfig(
    output_dir="outputs_dpo_smollm2",
    per_device_train_batch_size=2,     # if OOM: set to 1 and raise grad_accum
    gradient_accumulation_steps=4,
    num_train_epochs=1,                # increase for real training
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    report_to="none",
    # Mixed-precision flags follow the dtype the model was loaded with.
    fp16=(dtype == torch.float16),
    bf16=(dtype == torch.bfloat16),
    optim="adamw_bnb_8bit",

    # --- DPO-specific settings ---
    # beta is a DPOConfig field; keeping it here (rather than as a DPOTrainer kwarg)
    # guarantees the value is actually the one used by the loss.
    beta=0.1,                                      # typical range: 0.1–0.5

    # --- Important for Unsloth's DPO patch / default collator ---
    max_length=MAX_LENGTH,                         # total length (prompt+response)
    max_prompt_length=min(256, MAX_LENGTH // 2),   # prompt cap
    padding_value=pad_id,                          # used by DPODataCollatorWithPadding
    label_pad_token_id=-100,                       # standard LM loss ignore index
    truncation_mode="keep_end",
)

trainer = DPOTrainer(
    model=model,
    ref_model=None,                 # PatchDPOTrainer will handle reference efficiently
    args=args,
    train_dataset=small_ds,         # built in Cell 3: (prompt, chosen, rejected)
    # `processing_class` is the current name for the tokenizer argument; the legacy
    # `tokenizer=` keyword is not passed alongside it to avoid a duplicate/unknown kwarg.
    processing_class=tokenizer,
)

train_out = trainer.train()
print("Training complete.")
print(train_out)

# Save ONLY the LoRA adapter (compact)
adapter_dir = "smollm2_dpo_adapter"
trainer.model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)
print("Saved DPO LoRA adapter ->", adapter_dir)


Extracting prompt in train dataset (num_proc=12):   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=12):   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=12):   0%|          | 0/1000 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1 | Total steps = 125
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 4,884,480 of 139,399,488 (3.50% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
10,0.6901,0.016908,0.010589,0.525,0.006318,-826.06897,-628.409546,9.897448,9.846713,0,0,0
20,0.6651,0.105974,0.046287,0.7125,0.059687,-797.069336,-490.965271,9.850774,9.826957,No Log,No Log,No Log
30,0.6726,0.232395,0.172771,0.575,0.059624,-810.577515,-711.990051,9.739737,9.483553,No Log,No Log,No Log
40,0.6421,0.315385,0.18224,0.6625,0.133145,-878.228149,-626.377991,9.572217,9.821025,No Log,No Log,No Log
50,0.6204,0.37556,0.1932,0.725,0.182361,-815.394897,-654.939941,9.582856,9.491736,No Log,No Log,No Log
60,0.6302,0.394063,0.22026,0.6875,0.173803,-779.254761,-610.480835,9.518569,9.376617,No Log,No Log,No Log
70,0.6132,0.537292,0.297393,0.65,0.239899,-875.32843,-700.653259,9.604898,9.532171,No Log,No Log,No Log
80,0.6113,0.494367,0.249861,0.6125,0.244505,-814.120239,-603.798401,9.527124,9.289549,No Log,No Log,No Log
90,0.6087,0.443536,0.211955,0.7,0.231581,-709.06781,-586.446167,9.566622,9.286982,No Log,No Log,No Log
100,0.6266,0.507469,0.283758,0.675,0.223711,-765.637878,-636.685425,9.4504,9.354359,No Log,No Log,No Log


Training complete.
TrainOutput(global_step=125, training_loss=0.6272734756469727, metrics={'train_runtime': 372.4231, 'train_samples_per_second': 2.685, 'train_steps_per_second': 0.336, 'total_flos': 0.0, 'train_loss': 0.6272734756469727, 'epoch': 1.0})
Saved DPO LoRA adapter -> smollm2_dpo_adapter


In [None]:
# Inference setup: reload the 4-bit base, attach the saved DPO LoRA adapter, switch to eval.
import os, random, torch
from unsloth import FastLanguageModel
from peft import PeftModel

base_id     = "HuggingFaceTB/SmolLM2-135M"
adapter_dir = "smollm2_dpo_adapter"
assert os.path.isdir(adapter_dir), "Adapter folder not found. Run the DPO training cell first."

# Half precision on GPU, full precision otherwise.
if torch.cuda.is_available():
    dtype = torch.float16
else:
    dtype = torch.float32

# Fresh tokenizer + 4-bit base model
policy_base, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = base_id,
    max_seq_length = 2048,
    dtype          = dtype,
    load_in_4bit   = True,
)

# Tokenizer safety defaults: right padding, and EOS as pad token when none is defined.
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Wrap the base with the trained LoRA adapter (this is the policy) and disable train mode.
model = PeftModel.from_pretrained(policy_base, adapter_dir)
model.eval()

def generate(prompt, max_new_tokens=200, temperature=0.7, top_p=0.9):
    """Sample a completion for ``prompt`` from the module-level ``model``.

    The prompt is tokenized with the module-level ``tokenizer`` and moved to the
    model's device; sampling uses the given ``temperature`` and ``top_p`` and stops
    after at most ``max_new_tokens`` new tokens. EOS is used as the pad token id.

    Returns the decoded first sequence returned by ``model.generate`` with special
    tokens removed.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    sampling_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
    )
    # No gradients are needed for sampling.
    with torch.no_grad():
        sequences = model.generate(**encoded, **sampling_kwargs)
    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Pick a random prompt from the DPO subset if available; otherwise use a fallback
if "small_ds" in globals() and len(small_ds) > 0:
    i = random.randint(0, len(small_ds) - 1)
    demo = small_ds[i]
    prompt = demo["prompt"]
    # Reference answers are truncated to 300 characters for display only.
    chosen = demo["chosen"][:300]
    rejected = demo["rejected"][:300]
else:
    prompt = "Question:\nWhat are the benefits of unit testing in software development?\n\nAnswer:"
    chosen = rejected = "(no ground truth available in this quick test)"

print("=== PROMPT ===\n", prompt[:600], "...\n")
print("=== (dataset) CHOSEN (truncated) ===\n", chosen, "\n")
print("=== (dataset) REJECTED (truncated) ===\n", rejected, "\n")

gen = generate(prompt, max_new_tokens=220)
print("=== POLICY OUTPUT ===\n", gen, "\n")

# Convenience: show only the model's continuation.
# generate() returns prompt + continuation, so strip the prompt prefix when the decoded
# text still starts with it. Splitting on the first 'Answer:' alone would cut inside the
# prompt whenever the question text itself contains 'Answer:'; it is kept only as a
# fallback for the case where decoding did not reproduce the prompt verbatim.
tag = "Answer:"
if gen.startswith(prompt):
    continuation = gen[len(prompt):].strip()
elif tag in gen:
    continuation = gen.split(tag, 1)[-1].strip()
else:
    continuation = None

if continuation is not None:
    print("=== POLICY CONTINUATION ===\n", continuation)


==((====))==  Unsloth 2025.11.2: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
HuggingFaceTB/SmolLM2-135M does not have a padding token! Will use pad_token = <|endoftext|>.
=== PROMPT ===
 Question:
<p>I don't want to ask off-topic and opinion questions here, but I would like to find a cadre of others dialing in their devices. Any ideas?</p>

Answer: ...

=== (dataset) CHOSEN (truncated) ===
 <p>I stumbled across this forum/group, <a href="https://forum.prusaprinters.org/forum/english-forum-original-prusa-i3-mmu2s-mmu2/" rel="nofollow noreferrer">Original Prusa i3 MMU2S &amp; MMU2</a>, amongst all of the other <a h

In [None]:
# Optional: Merge DPO LoRA weights into a full model and export to GGUF for Ollama
import os, glob, torch
from unsloth import FastLanguageModel

base_id      = "HuggingFaceTB/SmolLM2-135M"
adapter_dir  = "smollm2_dpo_adapter"
merged_dir   = "smollm2_dpo_merged_16bit"
gguf_dir     = "gguf_export_dpo_q8"

assert os.path.isdir(adapter_dir), "Adapter folder not found. Run the DPO training cell first."

# If 'model' and 'tokenizer' aren't in RAM (fresh runtime), reconstruct them:
if "model" not in globals() or "tokenizer" not in globals():
    policy_base, tokenizer = FastLanguageModel.from_pretrained(
        model_name     = base_id,
        max_seq_length = 2048,
        dtype          = torch.float16 if torch.cuda.is_available() else torch.float32,
        load_in_4bit   = True,
    )
    from peft import PeftModel
    model = PeftModel.from_pretrained(policy_base, adapter_dir)


def _merged_checkpoint_ready(path):
    """Return True when `path` contains a config.json, i.e. looks like a loadable model folder."""
    return os.path.isfile(os.path.join(path, "config.json"))


# 1) Try Unsloth-native merge (preferred)
# A previous run printed the success message here and then failed to reload merged_dir
# with "No config file found", so success is only declared once config.json exists.
merged_ok = False
if hasattr(model, "save_pretrained_merged"):
    try:
        model.save_pretrained_merged(merged_dir, tokenizer, save_method="merged_16bit")
        if _merged_checkpoint_ready(merged_dir):
            print("Merged LoRA →", merged_dir, "(Unsloth merged_16bit).")
            merged_ok = True
        else:
            print("Unsloth merged_16bit wrote no config.json to", merged_dir,
                  "- will try PEFT fallback.")
    except Exception as e:
        print("Unsloth merged_16bit failed, will try PEFT fallback:", e)

# 2) Fallback: PEFT merge_and_unload
# NOTE(review): the model was loaded in 4-bit, so this fallback merges into quantized
# weights; confirm the resulting checkpoint is acceptable for GGUF conversion.
if not merged_ok:
    try:
        merged = model.merge_and_unload()
        os.makedirs(merged_dir, exist_ok=True)
        merged.save_pretrained(merged_dir)
        tokenizer.save_pretrained(merged_dir)
        if _merged_checkpoint_ready(merged_dir):
            print("Merged LoRA →", merged_dir, "(PEFT fallback).")
            merged_ok = True
        else:
            print("PEFT merge fallback wrote no config.json to", merged_dir)
    except Exception as e:
        print("PEFT merge fallback failed:", e)

# 3) Export to GGUF for Ollama (Q8_0)
if merged_ok:
    os.makedirs(gguf_dir, exist_ok=True)
    try:
        # Reload the merged checkpoint in 16-bit (no 4-bit quantization) for conversion.
        merged_model, merged_tok = FastLanguageModel.from_pretrained(
            model_name      = merged_dir,
            max_seq_length  = 2048,
            dtype           = torch.float16 if torch.cuda.is_available() else torch.float32,
            load_in_4bit    = False,
            full_finetuning = False,
        )

        merged_model.save_pretrained_gguf(
            gguf_dir,
            merged_tok,
            quantization_method="q8_0",
        )
        print("Saved GGUF to:", gguf_dir)

        # Write a simple Ollama Modelfile
        ggufs = glob.glob(os.path.join(gguf_dir, "*.gguf"))
        if not ggufs:
            # Keep the original default name, but make it visible that it is a guess.
            print("Warning: no .gguf file found in", gguf_dir,
                  "- the Modelfile will reference the default name; adjust it if needed.")
        gguf_name = os.path.basename(ggufs[0]) if ggufs else "model-Q8_0.gguf"
        with open(os.path.join(gguf_dir, "Modelfile"), "w") as f:
            f.write(f"FROM ./{gguf_name}\n")
            f.write("PARAMETER temperature 0.7\n")
            f.write("PARAMETER top_p 0.9\n")
            f.write("TEMPLATE \"{{ .System }}\\n\\n{{ .Prompt }}\"\n")
        print("Wrote Modelfile →", os.path.join(gguf_dir, "Modelfile"))

        print("\nNext steps (locally):")
        print(f"cd {gguf_dir}")
        print("ollama create smollm2-dpo -f Modelfile")
        print("ollama run smollm2-dpo")
    except Exception as e:
        # Best-effort export: report the failure instead of aborting the notebook.
        print("GGUF export skipped or failed:", e)
else:
    print("Merge did not succeed; skipping GGUF export.")




Merged LoRA → smollm2_dpo_merged_16bit (Unsloth merged_16bit).
GGUF export skipped or failed: Unsloth: No config file found - are you sure the `model_name` is correct?
If you're using a model on your local device, confirm if the folder location exists.
If you're using a HuggingFace online model, check if it exists.
