# Welcome to the Hypnos Agent Model Training Notebook 🚀

<img src="https://raw.githubusercontent.com/dmitrykazhdan/HYPNOS/refs/heads/main/assets/hypnos_icon.png" alt="Icon" width="100"/>  

## Setup 🧰

- Ensure you are using a GPU Runtime, such as a T4 GPU
- A High-RAM runtime is highly recommended
- For Unsloth-related issues, consult their:
  - [Documentation](https://docs.unsloth.ai/)
  - [Discord](https://discord.gg/unsloth)

## Installation 🔧

- Let's go through the core package installation steps
- ❗We also specify a few GOTCHAs with package compatibility

In [None]:
# Install Unsloth, the library that provides the FastModel API used throughout this notebook
!pip install unsloth

In [None]:
# Pin trl to 0.19.1: as of 30.07.2025 a change pushed to trl broke Unsloth functionality.
# --no-deps / --force-reinstall swap only the trl package itself and leave its
# dependencies as already installed.
!pip install --no-deps --force-reinstall trl==0.19.1

In [None]:
!pip install --no-deps --upgrade timm # Needed for Gemma 3N

In [None]:
# Mount Google Drive at /content/gdrive so the dataset and saved checkpoints
# can be read from / written to Drive by the cells below.
from google.colab import drive
drive.mount('/content/gdrive')

## Supervised Finetuning (SFT) ⚡

- ❗ Unsloth occasionally has issues with freeing memory. It is recommended to do a "Restart Session" or otherwise cleanup GPU resources between model runs
- ⏳ This section may take 1-2hrs to run
- ❗Recommendation: make sure it runs end-to-end with `SUBSET=5` to ensure all packages/systems are set up correctly. Then run the full pass with `SUBSET=0`

In [None]:
#!/usr/bin/env python3
# ---------------------------------------------------
import unsloth
from unsloth import FastModel, is_bfloat16_supported
from trl import SFTTrainer, SFTConfig, DPOTrainer, DPOConfig
from datasets import Dataset
import json, torch, gc, os, shutil
from datetime import datetime

# See: https://github.com/huggingface/transformers/issues/39427
# Workaround for the issue above: raise TorchDynamo's cache size limit and tell
# Dynamo to suppress its own errors rather than raise them
# (presumably falling back to eager execution — confirm against the PyTorch docs).
torch._dynamo.config.cache_size_limit = 128
torch._dynamo.config.suppress_errors  = True


In [None]:
# ─────────── CONFIG ──────────────────────────────────────────────────────────
# Base checkpoint that the LoRA adapters are trained on top of.
BASE_REPO  = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"
# Set to a gdrive of your choice (placeholder — must be edited before running)
drive_root = "..."
# Training data: a JSON list of records with "question" and "answer" keys.
data_json  = f'{drive_root}/data/sleep-train-enriched.json'

# Set training configuration parameters
EPOCHS_SFT = 3
LORA_R, LORA_ALPHA, LORA_DROPOUT = 16, 32, 0.05
LR = 1e-4
SEED = 3407
SUBSET     = 0              # >0 debugs with a subset; =0 uses full dataset


def now():
    """Return the current local time as a ``YYYYMMDD_HHMMSS`` string.

    Used to give every saved checkpoint folder a unique, sortable suffix.
    """
    return datetime.now().strftime("%Y%m%d_%H%M%S")


def cleanup():
    """Release Python garbage and cached CUDA memory.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are freed *before* the CUDA caching allocator is emptied;
    emptying the cache first would leave those blocks cached.

    Returns:
        A ``(None, collected)`` tuple, where ``collected`` is the value
        returned by ``gc.collect()``. The tuple shape is kept for backward
        compatibility with the previous lambda-based helper.
    """
    collected = gc.collect()
    torch.cuda.empty_cache()
    return (None, collected)


In [None]:

# ─────────── LOAD BASE (4‑bit) ───────────────────────────────────────────────
# Load the base checkpoint and its tokenizer/processor through Unsloth.
# NOTE(review): the heading and the log message say "4-bit", but the call passes
# load_in_4bit=False — confirm which precision is actually intended here.
print("🤖 Loading base 4‑bit repo …")
model, tok = FastModel.from_pretrained(
    BASE_REPO, load_in_4bit=False, full_finetuning=False, max_seq_length=2048
)
# Reuse the end-of-sequence token as the padding token.
tok.pad_token = tok.eos_token
print("✅ Base ready")

# ─────────── ADD LoRA ────────────────────────────────────────────────────────
# Wrap the base model with LoRA adapters using the rank / alpha / dropout from
# the CONFIG cell; the finetune_* flags enable adapters on the language layers,
# attention modules and MLP modules. SEED makes the adapter init reproducible.
model = FastModel.get_peft_model(
    model,
    r=LORA_R, lora_alpha=LORA_ALPHA, lora_dropout=LORA_DROPOUT,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    bias="none",
    random_state=SEED,
)
print("🔧 LoRA adapters injected")

# ─────────── DATASET ─────────────────────────────────────────────────────────
# Parse the training JSON inside a context manager so the file handle is closed
# as soon as parsing finishes instead of being left open for the whole session.
with open(data_json, encoding="utf-8") as fh:
    raw = Dataset.from_list(json.load(fh))
def to_chat(ex):
    """Render one question/answer record as a single chat-formatted string.

    Args:
        ex: Mapping with a ``'question'`` and an ``'answer'`` entry.

    Returns:
        A dict with one key, ``"text"``: ``<bos>`` followed by a user turn
        holding the question and a model turn holding the answer, each wrapped
        in ``<start_of_turn>`` / ``<end_of_turn>`` markers.
    """
    user_turn = f"<start_of_turn>user\n{ex['question']}<end_of_turn>\n"
    model_turn = f"<start_of_turn>model\n{ex['answer']}<end_of_turn>"
    rendered = f"<bos>{user_turn}{model_turn}"
    return {"text": rendered}
# Add the rendered "text" column to every record.
data = raw.map(to_chat)

# Debug mode: keep only the first SUBSET rows, capped at the dataset size.
if SUBSET:
    subset_size = min(SUBSET, len(data))
    data = data.select(range(subset_size))

# Hold out 20% of the rows for validation; the seed keeps the split reproducible.
splits = data.train_test_split(0.2, seed=SEED)
train_ds, val_ds = splits["train"], splits["test"]
print(f"📚 Train / val = {len(train_ds)} / {len(val_ds)}")

# ─────────── SFT ─────────────────────────────────────────────────────────────
print("\n🎯 Supervised fine‑tune …")

# Mixed precision: use bf16 where supported, otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

# 2 sequences per device x 8 accumulation steps = 16 sequences per optimizer step.
# NOTE(review): report_to="wandb" presumably needs a Weights & Biases login —
# confirm this is wanted (the DPO section below uses "none").
cfg = SFTConfig(
    seed                        = SEED,
    dataset_text_field          = "text",
    num_train_epochs            = EPOCHS_SFT,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 8,
    learning_rate               = LR,
    warmup_steps                = 10,
    optim                       = "adamw_8bit",
    bf16                        = use_bf16,
    fp16                        = not use_bf16,
    logging_steps               = 10,
    report_to                   = "wandb",
)

# The trainer is deliberately not bound to a name, so nothing keeps it alive
# once training returns.
SFTTrainer(
    model=model,
    tokenizer=tok,
    args=cfg,
    train_dataset=train_ds,
    eval_dataset=val_ds,
).train()

In [None]:
# ─────────── SAVE ADAPTERS ──────────────────────────────────────────────────
# Timestamped folder name so repeated runs never overwrite each other.
stamp = now()
adapter_dir = f"hypnos_adapters_{stamp}"

# Write the model and tokenizer files into the local folder.
model.save_pretrained(adapter_dir)
tok.save_pretrained(adapter_dir)

# Mirror the whole folder under drive_root so it outlives the Colab session.
drive_copy = f"{drive_root}/{adapter_dir}"
shutil.copytree(adapter_dir, drive_copy)
print("💾  LoRA‑only checkpoint saved:", adapter_dir)

print("\n🎉 Training complete")
cleanup()

## Reinforcement Learning with AI Feedback (RLAIF) 🤖

- Here we use Direct Preference Optimization (DPO) as the RLAIF approach, via Unsloth's patched `DPOTrainer`

In [None]:
import torch
import json
from unsloth import FastModel, PatchDPOTrainer, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template
from datasets import Dataset
from trl import DPOTrainer, DPOConfig
from transformers import AutoTokenizer

# Common GOTCHA with recursion limit issue in Unsloth:
# raise TorchDynamo's cache size limit before any model code runs.
torch._dynamo.config.cache_size_limit = 128
# Patch Unsloth's DPOTrainer (must run before the trainer is constructed below)
PatchDPOTrainer()

# Base checkpoint — the same repo that was used for the SFT run.
base_model_id = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"

# Folder with the LoRA adapters saved by the SFT step ← replace with your own.
adapter_dir = "/content/hypnos_adapters_20250802_081519"

# Drive root (placeholder, set to your own folder) and the DPO JSON below it.
drive_root = "/content/gdrive/..."
DPO_DATASET = drive_root + "/data/sleep-train-dpo-enriched.json"

# 0 = use every preference pair; >0 = test on a subset of that many rows.
max_samples = 0


# 1. Load base model + tokenizer (same base repo as the original SFT run)
model, tokenizer = FastModel.from_pretrained(
    base_model_id,
    max_seq_length=2048,
    load_in_4bit=False,
    full_finetuning=False,
)

# 2. Inject the LoRA structure into the model (important!). The values repeat
#    the SFT settings (r=16, alpha=32, dropout=0.05, seed 3407) so the saved
#    adapter tensors line up with the modules created here.
lora_settings = dict(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    random_state=3407,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
)
model = FastModel.get_peft_model(model, **lora_settings)

# 3. Load the previously fine-tuned LoRA weights (SFT checkpoint) into the
#    "default" adapter created in step 2.
model.load_adapter(adapter_dir, adapter_name="default")


In [None]:
from datasets import DatasetDict

# === Load and validate your preference data ===
with open(DPO_DATASET) as dpo_file:
    preference_data = json.load(dpo_file)

# ✅ Rename "question" → "prompt" (in place, on every record)
for record in preference_data:
    question_text = record.pop("question")
    record["prompt"] = question_text

print(f"📊 Created {len(preference_data)} preference pairs")

# Guard clause: stop here if the file held no records at all.
if not preference_data:
    raise ValueError("❌ No valid preference pairs found!")

print(f"📋 Sample preference pair:\n{json.dumps(preference_data[0], indent=2)}")

# === Create Hugging Face dataset ===
# Turn the list of preference records into a datasets.Dataset.
preference_dataset = Dataset.from_list(preference_data)
print(f"✅ Preference dataset created with {len(preference_dataset)} examples")

# === Get correct chat template for your tokenizer (adjust template if needed) ===
# Rebinds `tokenizer` to the object returned by Unsloth's get_chat_template,
# configured with the "gemma-3" template.
tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")

# === Apply DPO formatting ===
print("🔄 Formatting preference dataset for DPO...")

def apply_dpo_template(example):
    """Render one preference record through the tokenizer's chat template.

    Args:
        example: Mapping with ``"prompt"``, ``"chosen"`` and ``"rejected"``
            entries.

    Returns:
        A dict with the same three keys. If any field is not a non-blank
        string, all three values are empty strings. Otherwise ``"prompt"`` is
        the templated user turn, and ``"chosen"`` / ``"rejected"`` are the
        templated user turn followed by the respective assistant reply.

    NOTE(review): rows that fail validation are blanked, not removed — the
    caller has to drop them. Also, ``"chosen"`` / ``"rejected"`` contain the
    user turn again; confirm that is what the DPO trainer expects when an
    explicit ``"prompt"`` column is present.
    """
    required = ("prompt", "chosen", "rejected")

    # Validate all required fields; stop at the first one that is unusable.
    for field in required:
        value = example[field]
        if not (isinstance(value, str) and value.strip()):
            return {name: "" for name in required}

    def render(reply=None):
        # Build a fresh message list per call: the user turn, plus the
        # assistant reply when one is given.
        messages = [{"role": "user", "content": example["prompt"]}]
        if reply is not None:
            messages.append({"role": "assistant", "content": reply})
        return tokenizer.apply_chat_template(messages, tokenize=False)

    # Format using chat template
    return {
        "prompt": render(),
        "chosen": render(example["chosen"]),
        "rejected": render(example["rejected"]),
    }

# Apply formatting (no rename step needed)
preference_dataset = preference_dataset.map(
    apply_dpo_template,
    remove_columns=preference_dataset.column_names,
    desc="🧠 Applying chat template formatting"
)

# apply_dpo_template returns empty strings for rows that fail validation.
# Drop those rows here so DPO never trains or evaluates on empty pairs.
rows_before_filter = len(preference_dataset)
preference_dataset = preference_dataset.filter(
    lambda row: bool(row["prompt"] and row["chosen"] and row["rejected"])
)
rows_dropped = rows_before_filter - len(preference_dataset)
if rows_dropped:
    print(f"⚠️ Dropped {rows_dropped} invalid preference pairs")

# Fail early with a clear message instead of an IndexError on the sample below.
if len(preference_dataset) == 0:
    raise ValueError("❌ No valid preference pairs left after formatting!")

# Optional sanity check
print("✅ Final formatted sample:")
print(preference_dataset[0])


# Optional debug subset. The size is capped at the dataset length so that
# select() cannot be asked for out-of-range indices (same guard as the SFT cell).
if max_samples > 0:
    preference_dataset = preference_dataset.select(
        range(min(max_samples, len(preference_dataset)))
    )

# 80/20 train/eval split with a fixed seed for reproducibility.
split_dataset = preference_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

In [None]:
# ─────────── DPO CONFIG ─────────────────────────────────────────────────────
# All DPO hyper-parameters live on DPOConfig: beta, the maximum sequence length
# and the evaluation schedule are config fields, not trainer arguments.
config = DPOConfig(
    beta                         = 0.01,  # DPO trade-off factor
    max_length                   = 2048,
    per_device_train_batch_size  = 2,
    per_device_eval_batch_size   = 2,
    gradient_accumulation_steps  = 8,
    eval_strategy                = "steps",   # or "epoch"; without it eval_steps is ignored
    eval_steps                   = 10,
    logging_steps                = 5,
    num_train_epochs             = 1,
    learning_rate                = 1e-5,
    lr_scheduler_type            = "cosine",
    warmup_ratio                 = 0.05,
    optim                        = "adamw_8bit",
    seed                         = 3407,
    bf16                         = is_bfloat16_supported(),
    fp16                         = not is_bfloat16_supported(),
    report_to                    = "none"
)

trainer = DPOTrainer(
    model           = model,
    # None lets the trainer derive the reference policy itself
    # (NOTE(review): for PEFT models this is presumably the model with adapters
    # disabled, i.e. the base model rather than the SFT model — confirm).
    ref_model       = None,
    args            = config,
    # NOTE(review): left as None as in the original setup — confirm whether the
    # chat-templated tokenizer should be passed here instead.
    tokenizer       = None,
    # Train on the training split only; passing the full preference_dataset
    # would leak the held-out eval rows into training.
    train_dataset   = train_dataset,
    eval_dataset    = eval_dataset,
)

trainer.train()

In [None]:
# ─────────── SAVE DPO LoRA ───────────────────────────────────────────────────
# This cell is self-contained on purpose: the SFT cells that import `shutil`
# and define the `now()` / `cleanup()` helpers are not re-run when the session
# is restarted between SFT and DPO, so nothing from them is relied on here.
import gc
import shutil
from datetime import datetime

# Timestamped folder name so repeated runs never overwrite each other.
stamp       = datetime.now().strftime("%Y%m%d_%H%M%S")
dpo_dir     = f"dpo_hypnos_adapters_{stamp}"

# Save locally first, then mirror the folder under drive_root.
model.save_pretrained(dpo_dir)
tokenizer.save_pretrained(dpo_dir)
shutil.copytree(dpo_dir, f"{drive_root}/{dpo_dir}")
print("💾  DPO-finetuned LoRA checkpoint saved:", dpo_dir)

# Collect garbage first, then hand the freed blocks back from the CUDA cache.
gc.collect()
torch.cuda.empty_cache()

## Inference 🏎
- Lightweight inference testing to ensure the models were trained / saved / loaded correctly

In [None]:
from unsloth import FastModel
import torch

# Paths
# BASE_REPO is the base checkpoint; ADAPTER_DIR is the saved DPO adapter folder
# (replace with the folder produced by your own run).
BASE_REPO   = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"
ADAPTER_DIR = "/content/dpo_hypnos_adapters_20250802_114929"

# Load base + adapter
model, tok = FastModel.from_pretrained(BASE_REPO, load_in_4bit=False)
model.load_adapter(ADAPTER_DIR)
# Reuse the end-of-sequence token as the padding token.
tok.pad_token = tok.eos_token

# Sample prompt: one user turn, followed by an opened model turn for the
# model to complete.
prompt = "<bos><start_of_turn>user\nHow can I improve my sleep naturally?<end_of_turn>\n<start_of_turn>model\n"

# Tokenize input and move the tensors to the model's device
inputs = tok(text=prompt, return_tensors="pt").to(model.device)

# Sampling settings for generation
sampling_options = dict(
    max_new_tokens=150,
    do_sample=True,
    top_p=0.9,
    temperature=0.7,
    repetition_penalty=1.1,
    eos_token_id=tok.eos_token_id,
)

# Generate (no gradients are needed for inference)
with torch.no_grad():
    outputs = model.generate(**inputs, **sampling_options)

# Decode the first returned sequence, dropping special tokens
response = tok.decode(outputs[0], skip_special_tokens=True)
print("\n📤 Response:\n", response)


## Exporting & Quantization 🗳

### Setup

In [None]:
# Fetch the llama.cpp sources; the cells below expect the checkout at /content/llama.cpp
!git clone https://github.com/ggerganov/llama.cpp

We now need to build llama.cpp to enable its quantization features.

Note: for a faster, targeted build you can try `cmake --build . --target quantize --config Release`.

⚠ This does not always work across llama.cpp versions.

In [None]:
%%shell
# [1] Update and install dependencies
# apt-get is used instead of apt because apt's CLI is not meant for scripts.
sudo apt-get update
sudo apt-get install -y build-essential cmake

# [2] Build llama.cpp
cd /content/llama.cpp

# -p keeps this cell re-runnable: a plain mkdir fails once build/ exists.
mkdir -p build
cd build
cmake ..
# Build with one job per available core; the binaries land in build/bin.
cmake --build . --config Release -j "$(nproc)"

In [None]:
# Ensure quantization block is installed
import os

path = "/content/llama.cpp/build/bin/llama-quantize"


def quantizer_available(binary_path):
    """Return True when *binary_path* is an existing, executable regular file.

    A directory with that name, or a file without execute permission, does not
    count: the quantization step needs to run this path as a program.
    """
    return os.path.isfile(binary_path) and os.access(binary_path, os.X_OK)


# Report the result only; this cell is a check and does not abort the notebook.
if quantizer_available(path):
    print("✅ Quantization module successfully located...")
else:
    print(f"❌ llama-quantize binary not found or not executable: {path}")


### Exporting to GGUF & Quantizing

In [None]:
# Base checkpoint the adapters were trained on.
BASE_REPO = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"

# Local folder name for the base model; it is only written and converted
# when SAVE_BASE_MODEL is True.
BASE_MODEL_PATH = "gemma-3n-base-model"
SAVE_BASE_MODEL = False

# Adapter folders produced by the training sections — replace with your own.
ADAPTER_DIR = "/content/hypnos_adapters_20250802_081519"          # Your SFT Adapter
DPO_ADAPTER_DIR = "/content/dpo_hypnos_adapters_20250802_114929"  # Your DPO Adapter

# GGUF output paths: one unquantized and one quantized file per adapter,
# written next to the adapter folder.
unquantized_sft_gguf = ADAPTER_DIR + "_unquantized.gguf"
quantized_sft_gguf = ADAPTER_DIR + "_quantized.gguf"

unquantized_dpo_gguf = DPO_ADAPTER_DIR + "_unquantized.gguf"
quantized_dpo_gguf = DPO_ADAPTER_DIR + "_quantized.gguf"

# Quantization type passed to llama-quantize.
quantization = "q4_k_m"


def load_base_model():
    """Load the base checkpoint named by ``BASE_REPO`` through Unsloth.

    The model is requested with ``load_in_4bit=False`` and
    ``full_finetuning=False``, automatic dtype detection (``dtype=None``) and
    a maximum sequence length of 1024.

    Returns:
        The ``(model, tokenizer)`` pair produced by
        ``FastModel.from_pretrained``.
    """
    base_model, base_tokenizer = FastModel.from_pretrained(
        model_name=BASE_REPO,
        max_seq_length=1024,
        dtype=None,  # None lets the loader detect the dtype
        load_in_4bit=False,
        full_finetuning=False,
    )
    return base_model, base_tokenizer


In [None]:
from unsloth import FastModel
import torch

# Optional: write an unmodified copy of the base model and tokenizer to
# BASE_MODEL_PATH so it can be converted to GGUF in the next cell.
if SAVE_BASE_MODEL:
  model, tokenizer = load_base_model()
  model.save_pretrained(BASE_MODEL_PATH)  # Local saving
  tokenizer.save_pretrained(BASE_MODEL_PATH)


In [None]:
# Optional: convert the saved base model folder to a GGUF file with llama.cpp's
# converter. Both paths are rooted at /content, i.e. this assumes the previous
# cell wrote BASE_MODEL_PATH while the working directory was /content.
if SAVE_BASE_MODEL:
  input_model_path = f"/content/{BASE_MODEL_PATH}"
  output_model_path = f"/content/{BASE_MODEL_PATH}.gguf"
  !python /content/llama.cpp/convert_hf_to_gguf.py {input_model_path} --outfile {output_model_path}

In [None]:
from unsloth import FastModel
import torch
import subprocess

import gc  # local import: used below to release each model between iterations

# Merge each adapter (SFT, then DPO) into a fresh copy of the base model and
# save the result as a full Hugging Face model folder "<adapter_dir>_merged".
for tuned_model_dir in [ADAPTER_DIR, DPO_ADAPTER_DIR]:
    model, tokenizer = load_base_model()

    # The LoRA layout created here must mirror the one used for training
    # (language layers + attention + MLP modules). With a narrower layout,
    # e.g. attention projections only, the saved adapter tensors for the
    # missing modules have nowhere to load into and would not be merged.
    model = FastModel.get_peft_model(
        model,
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        finetune_language_layers=True,
        finetune_attention_modules=True,
        finetune_mlp_modules=True,
        bias="none",
        random_state=3407,
    )

    # Load the LoRA adapter weights with adapter name
    model.load_adapter(tuned_model_dir, adapter_name="default")

    # Step 1: Save as full model in proper HF format
    full_model_folder = f"{tuned_model_dir}_merged"

    print(f"💾 Saving full model to {full_model_folder}...")
    model.save_pretrained_merged(full_model_folder, tokenizer)

    # Release this model before the next one is loaded so two copies are never
    # resident at once; later cells only use the saved folders.
    del model, tokenizer
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Step 2: Convert to unquantized GGUF using llama.cpp
print(f" Converting to unquantized GGUF...")

# Folders written by the merge step ("<adapter_dir>_merged").
sft_full_folder = f"{ADAPTER_DIR}_merged"
dpo_full_folder = f"{DPO_ADAPTER_DIR}_merged"

# Run llama.cpp's HF → GGUF converter once per merged model.
!python /content/llama.cpp/convert_hf_to_gguf.py {sft_full_folder} --outfile {unquantized_sft_gguf}
!python /content/llama.cpp/convert_hf_to_gguf.py {dpo_full_folder} --outfile {unquantized_dpo_gguf}

In [None]:
# Run quantization on the extracted model
# Using raw bash here vs. subprocess as this might take a while and log output is crucial
# Arguments: <input gguf> <output gguf> <quantization type>
! "/content/llama.cpp/build/bin/llama-quantize" {unquantized_sft_gguf} {quantized_sft_gguf} {quantization}
! "/content/llama.cpp/build/bin/llama-quantize" {unquantized_dpo_gguf} {quantized_dpo_gguf} {quantization}

Save the files permanently on the mounted drive

In [None]:
# Specify local + mounted paths
import os
drive_dir = "/content/gdrive/MyDrive/Files/Personal/Hypnos-Project/content/models"
BASE_MODEL_FILENAME = f"{BASE_MODEL_PATH}.gguf"
SRC_MODEL_FILENAMES = [unquantized_sft_gguf, quantized_sft_gguf, unquantized_dpo_gguf, quantized_dpo_gguf]
MODEL_FILENAMES = [str(os.path.basename(i)) for i in SRC_MODEL_FILENAMES]
print("Base model filename: " + BASE_MODEL_FILENAME)
print("Output model filename: " + str(MODEL_FILENAMES))

BASE_MODEL_GGUF_PATH_OUTPUT = f"{drive_dir}/{BASE_MODEL_FILENAME}"
DST_MODEL_FILENAMES = [f"{drive_dir}/{i}" for i in MODEL_FILENAMES]

# Copy models over to mounted dir
! cp {BASE_MODEL_FILENAME} {BASE_MODEL_GGUF_PATH_OUTPUT}

for (src, dest) in zip(SRC_MODEL_FILENAMES, DST_MODEL_FILENAMES):
  print(f"Copying model {src} over to {dest}...")
  ! cp {src} {dest}

## You're all done! 🎉 🎉

You should now have the SFT and DPO GGUF models (plus the base model, if `SAVE_BASE_MODEL` was enabled) exported to your GDrive. 👏

Proceed to the Evaluation notebook to test out the extracted gguf... 🏃🔧