# GyroGem Chat Training (Gemma 3 + Unsloth + LoRA)

This notebook fine-tunes **two** Gemma 3 chat models sequentially using **Unsloth + TRL SFTTrainer** on the GyroGem chat dataset.

Models trained:
1. `unsloth/gemma-3-270m-it-unsloth-bnb-4bit`
2. `unsloth/gemma-3-1b-it-bnb-4bit`

Dataset path (in repo):
`superintelligence/secret_lab_ignore/GyroGem/training/data/gyrogem_chat_qa_dataset.jsonl`

Kaggle notes:
- Enable **GPU** (prefer **P100**)
- Enable **Internet**
- This notebook does **not** export the dataset; it just reads it from the cloned repo.

Gemma 3 precision note (Unsloth):
- On GPUs without bfloat16 tensor cores (T4, P100, V100, etc.), Gemma 3 can overflow in float16.
- We train with **fp16=False** and **bf16=False** (fp32 trainer args) plus gradient clipping for stability.


In [ ]:
import os, re, gc
import torch

# Process-wide settings used by the rest of the notebook: hf_transfer downloads
# on, Weights & Biases off, expandable CUDA allocator segments.
os.environ.update(
    {
        "HF_HUB_ENABLE_HF_TRANSFER": "1",
        "WANDB_DISABLED": "true",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)

# Report the runtime so a missing or unexpected accelerator is obvious up front.
cuda_ok = torch.cuda.is_available()
print("Torch:", torch.__version__)
print("CUDA available:", cuda_ok)
if not cuda_ok:
    print("NOTE: CUDA is not available. In Kaggle, set Accelerator=GPU (prefer P100) before training.")
else:
    print("GPU:", torch.cuda.get_device_name(0))
    props = torch.cuda.get_device_properties(0)
    vram_gb = props.total_memory / 1024**3  # bytes -> GiB
    print("VRAM (GB):", round(vram_gb, 2))


## 1) Install dependencies (Unsloth + TRL)

We pin versions known to be compatible with Unsloth 2026.x to avoid dependency conflicts during installation.


In [ ]:
%%capture
import subprocess, re, torch

def run(cmd):
    """Execute ``cmd`` through the shell.

    Raises ``subprocess.CalledProcessError`` when the command exits with a
    non-zero status; returns nothing.
    """
    subprocess.run(cmd, shell=True, check=True)

# Bring pip itself up to date before installing anything else.
run("pip -q install --upgrade pip")

# xformers version mapping like in Unsloth notebooks: keyed by the installed
# torch "major.minor"; unknown torch versions fall back to 0.0.34.
XFORMERS_BY_TORCH = {"2.10": "0.0.34", "2.9": "0.0.33.post1", "2.8": "0.0.32.post2"}
v = re.match(r"[\d]+\.[\d]+", str(torch.__version__)).group(0)
xformers = "xformers==" + XFORMERS_BY_TORCH.get(v, "0.0.34")

# The commands run strictly in this order; each one is a separate pip call.
for pip_cmd in (
    # Core deps: match Unsloth's constraints
    'pip -q install "datasets==4.3.0"',
    'pip -q install "transformers==4.56.2"',
    'pip -q install --no-deps "trl==0.22.2"',
    'pip -q install "peft>=0.18.0,<0.19.0"',
    'pip -q install "accelerate" "bitsandbytes" "sentencepiece" "protobuf" "huggingface_hub>=0.34.0" hf_transfer',
    # Extra deps Unsloth complains about if missing
    'pip -q install "tyro" "msgspec" "cut_cross_entropy" "torchao>=0.13.0"',
    # Finally, install Unsloth + zoo with dependencies
    f'pip -q install {xformers} unsloth unsloth_zoo',
):
    run(pip_cmd)


## 2) Hugging Face login (manual, toggle when you run)

To avoid Kaggle auto-run breaking (no stdin), this cell does **not** log in by default.

When you open the notebook yourself:
1. Set `DO_HF_LOGIN = True`
2. Run this cell
3. Paste your `hf_...` token when asked


In [ ]:
from huggingface_hub import login
from getpass import getpass

DO_HF_LOGIN = False  # <- set to True and re-run this cell when you run interactively

if not DO_HF_LOGIN:
    # Default path: never prompt, so a non-interactive run does not block on stdin.
    print("HF login skipped. When running interactively, set DO_HF_LOGIN=True and re-run this cell.")
else:
    # Prompt without echoing the token, log in, then expose the token through
    # the HF_TOKEN environment variable for the later cells that read it.
    HF_TOKEN = getpass("HF token: ")
    login(token=HF_TOKEN)
    os.environ["HF_TOKEN"] = HF_TOKEN
    print("HF login complete.")


## 3) Clone repo and load dataset

Repo: `https://github.com/gyrogovernance/superintelligence.git`

Dataset: `GyroGem/training/data/gyrogem_chat_qa_dataset.jsonl` inside `secret_lab_ignore`.


In [ ]:
# Clone the repository that contains the dataset into the current working directory.
!git clone https://github.com/gyrogovernance/superintelligence.git


In [ ]:
%cd superintelligence/secret_lab_ignore
# The working directory at this point is treated as the repo root for all
# dataset paths below.
repo_root = os.getcwd()
print("Repo root:", repo_root)

# Build the dataset location from its path components and fail fast if it is absent.
dataset_parts = ("GyroGem", "training", "data", "gyrogem_chat_qa_dataset.jsonl")
DATA_PATH = os.path.join(repo_root, *dataset_parts)
print("Dataset path:", DATA_PATH)
assert os.path.exists(DATA_PATH), f"Missing dataset file at {DATA_PATH}"
print("Dataset found.")


In [ ]:
from datasets import load_dataset

# Load the JSONL file as a single "train" split.
raw = load_dataset("json", data_files=DATA_PATH, split="train")
print("Rows:", len(raw))
print("Columns:", raw.column_names)

# Schema spot-check on the first record only: a "conversations" list with at
# least two turns, whose first turn carries "role" and "content".
ex = raw[0]
assert "conversations" in ex
turns = ex["conversations"]
assert isinstance(turns, list) and len(turns) >= 2
first_turn = turns[0]
assert "role" in first_turn and "content" in first_turn
print("Example id:", ex.get("id", "(no id)"))
print("Turns:", len(turns))


## 4) Render dataset once with Gemma 3 chat template

We apply the Gemma 3 chat template to `conversations` and store the result in a `text` field.
This rendered dataset is reused for both model sizes.


In [ ]:
from transformers import AutoTokenizer
from unsloth.chat_templates import get_chat_template

RENDER_TOKENIZER_MODEL = "unsloth/gemma-3-1b-it-bnb-4bit"
# Set only if you logged in earlier; when it is None the download may still
# succeed from the HF cache.
render_token = os.environ.get("HF_TOKEN")
# Load the tokenizer and attach the "gemma3" chat template in one step.
tok = get_chat_template(
    AutoTokenizer.from_pretrained(RENDER_TOKENIZER_MODEL, token=render_token),
    chat_template="gemma3",
)

def render_chat(examples):
    """Render a batch of conversations to chat-template text.

    ``examples["conversations"]`` is iterated one conversation at a time and
    each is passed through ``tok.apply_chat_template`` (no tokenization, no
    generation prompt). A leading ``"<bos>"`` is removed from every rendered
    string — presumably so a later tokenization pass does not end up with two
    BOS tokens (TODO confirm).

    Returns a dict with a single ``"text"`` list, one entry per conversation.
    """
    rendered = (
        tok.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        for convo in examples["conversations"]
    )
    return {"text": [chat_text.removeprefix("<bos>") for chat_text in rendered]}

# Add the rendered "text" column, processing the rows in batches.
dataset = raw.map(render_chat, batched=True)
print("Rendered columns:", dataset.column_names)
# Show only the first 600 characters of the first rendered example.
preview = dataset[0]["text"][:600]
print("Rendered preview:\n", preview)


## 5) Basic length check

We keep `MAX_SEQ_LENGTH` conservative for Kaggle GPUs (2048). Longer examples will be truncated by the trainer.


In [ ]:
# Token budget per sequence; shared by the length check below, model loading,
# and the trainer configuration.
MAX_SEQ_LENGTH = 2048

def token_len(example):
    """Return ``{"n_tokens": N}`` where N is the number of input ids ``tok`` produces for ``example["text"]``."""
    encoded = tok(example["text"])
    return {"n_tokens": len(encoded.input_ids)}

lens = dataset.map(token_len)

# Sort once and read both statistics from the sorted list.
sorted_lens = sorted(lens["n_tokens"])
if not sorted_lens:
    raise ValueError("Cannot compute token-length statistics for an empty dataset.")
mx = sorted_lens[-1]

# Nearest-rank 95th percentile: 1-based rank ceil(0.95 * n), computed in integer
# math. Using int(0.95 * n) directly as an index lands one rank too high
# whenever 0.95 * n is a whole number (e.g. n = 20 would report the maximum).
p95_rank = (95 * len(sorted_lens) + 99) // 100
p95 = sorted_lens[p95_rank - 1]

print("Max tokens:", mx)
print("P95 tokens:", p95)
if mx > MAX_SEQ_LENGTH:
    print("NOTE: Some samples exceed MAX_SEQ_LENGTH; trainer will truncate them.")


## 6) Training function (Unsloth + LoRA + responses-only)

We:
- Load model (4-bit) via Unsloth
- Add LoRA
- Mask user/system turns (train on responses only)
- Train with fp32 trainer args (no fp16/bf16) + gradient clipping


In [ ]:
from unsloth import FastModel
from unsloth.chat_templates import get_chat_template as unsloth_get_chat_template, train_on_responses_only
from trl import SFTTrainer, SFTConfig

def train_one(
    model_name: str,
    output_dir: str,
    lora_r: int,
    lora_alpha: int,
    learning_rate: float = 2e-5,
    num_train_epochs: float = 1.0,
    eval_frac: float = 0.05,
) -> str:
    """Fine-tune one chat model with LoRA and save the resulting adapters.

    Loads ``model_name`` in 4-bit via ``FastModel``, attaches LoRA adapters,
    trains with ``SFTTrainer`` on the ``text`` column of the module-level
    ``dataset`` (loss restricted to model turns), then writes the adapters and
    tokenizer to ``output_dir``. Also reads the module-level ``MAX_SEQ_LENGTH``
    and the ``HF_TOKEN`` environment variable.

    Args:
        model_name: Model identifier passed to ``FastModel.from_pretrained``.
        output_dir: Directory for trainer checkpoints and the final
            adapters/tokenizer; created if missing.
        lora_r: LoRA rank.
        lora_alpha: LoRA alpha.
        learning_rate: Learning rate handed to the trainer config.
        num_train_epochs: Number of training epochs.
        eval_frac: Fraction of the shuffled dataset held out for evaluation.
            ``0`` (or ``None``) trains on every row and disables evaluation.

    Returns:
        ``output_dir``, unchanged.
    """
    print("\n==============================")
    print("Training:", model_name)
    print("Output:", output_dir)
    print("LoRA r:", lora_r, "alpha:", lora_alpha)
    print("==============================\n")

    hf_token = os.environ.get("HF_TOKEN")

    model, tokenizer = FastModel.from_pretrained(
        model_name=model_name,
        max_seq_length=MAX_SEQ_LENGTH,
        load_in_4bit=True,
        load_in_8bit=False,
        full_finetuning=False,
        token=hf_token,
    )
    tokenizer = unsloth_get_chat_template(tokenizer, chat_template="gemma3")

    model = FastModel.get_peft_model(
        model,
        r=lora_r,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_alpha=lora_alpha,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )

    # Fixed seeds keep the shuffle and the train/eval split reproducible.
    ds = dataset.shuffle(seed=3407)
    if eval_frac and eval_frac > 0:
        splits = ds.train_test_split(test_size=eval_frac, seed=3407)
        train_ds, eval_ds = splits["train"], splits["test"]
    else:
        # train_test_split rejects an empty test split, so skip evaluation entirely.
        train_ds, eval_ds = ds, None

    per_device_train_batch_size = 2
    gradient_accumulation_steps = 4

    # Both mixed-precision flags stay off: per the notebook's precision note,
    # Gemma 3 can overflow in float16 on GPUs without bfloat16 tensor cores.
    bf16 = False
    fp16 = False

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        args=SFTConfig(
            dataset_text_field="text",
            max_seq_length=MAX_SEQ_LENGTH,
            packing=True,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            num_train_epochs=num_train_epochs,
            learning_rate=learning_rate,
            warmup_steps=5,
            logging_steps=10,
            # `evaluation_strategy` is not accepted by the transformers version
            # this notebook pins; the argument is named `eval_strategy`.
            eval_strategy="steps" if eval_ds is not None else "no",
            eval_steps=50,
            save_strategy="steps",
            save_steps=50,
            optim="adamw_8bit",
            weight_decay=0.001,
            lr_scheduler_type="linear",
            max_grad_norm=1.0,  # gradient clipping for stability
            seed=3407,
            output_dir=output_dir,
            report_to="none",
            bf16=bf16,
            fp16=fp16,
        ),
    )

    # Restrict the loss to model turns; user turns are masked out.
    trainer = train_on_responses_only(
        trainer,
        instruction_part="<start_of_turn>user\n",
        response_part="<start_of_turn>model\n",
    )

    trainer.train()

    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Saved LoRA adapters to:", output_dir)

    # Drop references and release cached CUDA memory so another model can be
    # loaded afterwards in the same session.
    del trainer, model, tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return output_dir


## 7) Train model A: Gemma 3 270m


In [ ]:
MODEL_270M = "unsloth/gemma-3-270m-it-unsloth-bnb-4bit"
OUT_270M = "/kaggle/working/gyrogem_gemma3_270m_lora"

# Model A: rank-32 LoRA (alpha 32), one epoch at lr 2e-5.
train_one(MODEL_270M, OUT_270M, lora_r=32, lora_alpha=32, learning_rate=2e-5, num_train_epochs=1.0)


## 8) Train model B: Gemma 3 1B


In [ ]:
MODEL_1B = "unsloth/gemma-3-1b-it-bnb-4bit"
OUT_1B = "/kaggle/working/gyrogem_gemma3_1b_lora"

# Model B: rank-64 LoRA (alpha 64), one epoch at lr 2e-5.
train_one(MODEL_1B, OUT_1B, lora_r=64, lora_alpha=64, learning_rate=2e-5, num_train_epochs=1.0)


## 9) Optional inference sanity checks (after training)

Uses Gemma team’s recommended inference settings:
- temperature=1.0, top_k=64, top_p=0.95, repetition_penalty=1.0


In [ ]:
from transformers import TextStreamer
from peft import PeftModel
from unsloth import FastModel
from unsloth.chat_templates import get_chat_template as unsloth_get_chat_template

def quick_infer(base_model_name, adapter_dir, prompt):
    """Generate one sampled reply to ``prompt`` with a base model plus LoRA adapters.

    Loads ``base_model_name`` in 4-bit via ``FastModel``, applies the adapters
    stored in ``adapter_dir``, formats ``prompt`` as a single user turn with the
    "gemma3" chat template, and streams up to 220 new tokens through a
    ``TextStreamer`` (prompt text skipped). Reads the module-level
    ``MAX_SEQ_LENGTH`` and the ``HF_TOKEN`` environment variable.

    Args:
        base_model_name: Model identifier passed to ``FastModel.from_pretrained``.
        adapter_dir: Directory containing the saved LoRA adapters.
        prompt: User message to answer.

    Returns:
        None; the generated text is only streamed.
    """
    hf_token = os.environ.get("HF_TOKEN")
    model, tokenizer = FastModel.from_pretrained(
        model_name=base_model_name,
        max_seq_length=MAX_SEQ_LENGTH,
        load_in_4bit=True,
        load_in_8bit=False,
        full_finetuning=False,
        token=hf_token,
    )
    tokenizer = unsloth_get_chat_template(tokenizer, chat_template="gemma3")

    model = PeftModel.from_pretrained(model, adapter_dir)

    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Strip a template-inserted "<bos>" before tokenizing — presumably because
    # the tokenizer call below adds its own BOS (TODO confirm).
    if text.startswith("<bos>"):
        text = text[len("<bos>"):]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = tokenizer(text, return_tensors="pt").to(device)

    print("\n---", base_model_name, "with", adapter_dir, "---")
    _ = model.generate(
        **inputs,
        max_new_tokens=220,
        # Request sampling explicitly so temperature/top_k/top_p are honoured
        # regardless of the generation defaults shipped with the checkpoint.
        do_sample=True,
        temperature=1.0,
        top_k=64,
        top_p=0.95,
        repetition_penalty=1.0,
        streamer=TextStreamer(tokenizer, skip_prompt=True),
    )

    # Drop references and release cached CUDA memory before any further model load.
    del model, tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Sample question for the optional sanity checks.
prompt = "What is ✋ The Human Mark (THM) in simple terms?"
# The calls are left commented out so this cell does nothing heavy by default.
# Uncomment them after training if you want to inspect behaviour:
# quick_infer(MODEL_270M, OUT_270M, prompt)
# quick_infer(MODEL_1B, OUT_1B, prompt)
