# LoRA Fine-tuning

This notebook fine-tunes an instruction-tuned base model (Qwen2.5-3B-Instruct) with LoRA to generate short Python tutoring hints from MBPP-style data.

## Prerequisites
1. Run `python -m data.preprocess` locally to create the `processed/` folder.  
2. Zip the `processed/` folder and upload it as a private Kaggle Dataset (e.g., “Python Tutor Agent Dataset”).  
   On Kaggle, it will mount at:

   ```
   /kaggle/input/python-tutor-agent-dataset/processed
   ```

## Quick Start (Kaggle)
1. Create a new Kaggle Notebook, click Add data, and select your private dataset (e.g., `python-tutor-agent-dataset`).
2. Settings  
   - Accelerator: GPU (T4/V100/A100)  
   - Internet: On (recommended for first run to fetch base model)  
   - Session persistence: choose “Files only” or “Both” if you want `/kaggle/working/outputs` to survive restarts
3. Edit the configuration cell (under “Configure dataset, base model, and LoRA hyperparameters”):
   - `DATASET_NAME = "python-tutor-agent-dataset"`
   - `BASE_MODEL = "Qwen/Qwen2.5-3B-Instruct"`
   - `DATA_ROOT = f"/kaggle/input/{DATASET_NAME}/processed"`
4. Run all cells (top to bottom)
   - Outputs (adapter + logs) are written to:  
   `/kaggle/working/outputs/<base>-hints`  
   - TensorBoard logs are under:  
   `/kaggle/working/outputs/<base>-hints/tb`

## Install libraries

In [None]:
# Install TRL pinned to 0.9.6 without letting pip resolve its dependencies,
# then install the explicitly pinned companion libraries in a second step.
%pip install -q --no-deps "trl==0.9.6"
%pip install -q --upgrade "transformers==4.45.2" "accelerate==1.2.1" "peft==0.13.2" "huggingface_hub==0.35.3" "tyro>=0.5.11"

## Import libraries

In [None]:
import json
import math
import os
import platform
import random
import shutil
import sys
import tempfile
from datetime import datetime, timezone
from pathlib import Path
from random import sample
from typing import Dict, List

import numpy as np
import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig
from torch.utils.tensorboard import SummaryWriter

## Configure dataset, base model, and LoRA hyperparameters

In [None]:
# Kaggle dataset slug and Hugging Face model id used throughout the notebook.
DATASET_NAME = "python-tutor-agent-dataset"
BASE_MODEL = "Qwen/Qwen2.5-3B-Instruct"

# Optimisation hyperparameters consumed by the training configuration.
BATCH_SIZE = 1
GRAD_ACCUM = 8
EPOCHS = 5
LR = 1e-4
MAX_LEN = 2048
WARMUP_RATIO = 0.03
WEIGHT_DECAY = 0.0

# Shape of the LoRA adapter.
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

# Single seed shared by every RNG seeded below.
SEED = 42

# Folder holding the preprocessed files inside the mounted Kaggle dataset.
DATA_ROOT = "/kaggle/input/" + DATASET_NAME + "/processed"

# Mixed-precision choice: bf16 only when a CUDA device reports a compute
# capability major version of at least 8; any probing error means "no bf16".
try:
    _cuda_ok = torch.cuda.is_available()
    cap = torch.cuda.get_device_capability(0) if _cuda_ok else (0, 0)
    USE_BF16 = _cuda_ok and cap[0] >= 8
except Exception:
    USE_BF16 = False

print("USE_BF16:", USE_BF16)

# Seed Python, NumPy and PyTorch (CPU, then every CUDA device when present).
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("DATA_ROOT:", DATA_ROOT)
print("BASE_MODEL:", BASE_MODEL)

## Define tutoring prompts and enforce output format

In [None]:
# Completion marker: the prompt template must end with exactly this text.
RESPONSE_PREFIX = "Hint: "

# Everything shown before the completion marker. The three placeholders
# ({problem}, {learner_code}, {failed_tests_json}) are filled via str.format.
_PROMPT_BODY = """Problem:
{problem}

Learner code:
{learner_code}

Failed unit tests:
```json
{failed_tests_json}
```

Write one short, actionable hint (1–2 sentences). No code. Do not quote the tests verbatim.
"""

HINT_PROMPT = _PROMPT_BODY + RESPONSE_PREFIX

# Guard against template edits that drop or alter the trailing marker.
assert HINT_PROMPT.endswith(RESPONSE_PREFIX), "HINT_PROMPT must end with 'Hint: '"

## Data helpers

In [None]:
def _read_jsonl(path: Path) -> List[Dict]:
    """Parse a UTF-8 JSON Lines file into a list of decoded objects.

    Lines are stripped of surrounding whitespace first; lines that are empty
    after stripping are skipped. Malformed JSON propagates the decoder's error.
    """
    with path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(entry) for entry in stripped_lines if entry]

def _read_text(path: Path) -> str:
    """Return the entire contents of ``path`` decoded as UTF-8."""
    with Path(path).open("r", encoding="utf-8") as handle:
        return handle.read()

def _resolve_failed_tests_json(data_root_path: Path, failed_tests_path: str) -> str:
    """Read the failed-tests file referenced by a dataset row.

    ``failed_tests_path`` is joined onto ``data_root_path`` and resolved to an
    absolute path; the file's text is returned as-is (it is not parsed).
    """
    resolved = data_root_path.joinpath(failed_tests_path).resolve()
    return _read_text(resolved)

def _build_prompt(problem: str, learner_code: str, failed_tests_json: str) -> str:
    """Fill the ``HINT_PROMPT`` template with one example's fields."""
    fields = {
        "problem": problem,
        "learner_code": learner_code,
        "failed_tests_json": failed_tests_json,
    }
    return HINT_PROMPT.format(**fields)

def make_sft_dataset(jsonl_path: Path, data_root_path: Path) -> Dataset:
    """Build a single-column ``text`` dataset of prompt + hint strings.

    Every row of ``jsonl_path`` must provide ``problem``, ``learner_code`` and
    ``failed_tests_path`` (read relative to ``data_root_path``). ``hint`` is
    optional: a missing or empty value contributes an empty completion.
    """

    def _to_text(row: Dict) -> str:
        # Look the fields up in a fixed order: problem, code, then tests file.
        problem = row["problem"]
        learner_code = row["learner_code"]
        tests_json = _resolve_failed_tests_json(data_root_path, row["failed_tests_path"])
        prompt = _build_prompt(problem, learner_code, tests_json)
        hint = row.get("hint") or ""
        return prompt + hint.strip()

    return Dataset.from_list([{"text": _to_text(row)} for row in _read_jsonl(jsonl_path)])

## LoRA helpers

In [None]:
def pick_lora_targets():
    """Return a fresh list of module names that receive LoRA adapters."""
    attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
    mlp_projections = ["gate_proj", "up_proj", "down_proj"]
    return attention_projections + mlp_projections

def wrap_with_lora(model, target_modules, r, alpha, dropout, base_model_name_or_path=""):
    """Wrap ``model`` in a PEFT LoRA adapter for causal language modelling.

    ``r``, ``alpha`` and ``dropout`` map to the ``LoraConfig`` fields ``r``,
    ``lora_alpha`` and ``lora_dropout``; bias handling is fixed to ``"none"``.
    An empty ``base_model_name_or_path`` is forwarded as ``None``.
    """
    settings = dict(
        r=r,
        lora_alpha=alpha,
        lora_dropout=dropout,
        bias="none",
        target_modules=target_modules,
        task_type="CAUSAL_LM",
        base_model_name_or_path=base_model_name_or_path if base_model_name_or_path else None,
    )
    return get_peft_model(model, LoraConfig(**settings))

## Build datasets

In [None]:
# Both split files live directly under the processed-data root.
data_root_path = Path(DATA_ROOT)
train_file = data_root_path / "lora_train.jsonl"
val_file = data_root_path / "lora_val.jsonl"

# Build the train split first, then the validation split.
train_ds, val_ds = (
    make_sft_dataset(split_file, data_root_path)
    for split_file in (train_file, val_file)
)
print(f"Rows -> train: {len(train_ds)} eval: {len(val_ds)}")

## Load model/tokenizer

In [None]:
def load_model_and_tokenizer(model_id: str, bf16: bool, max_len: int = 2048):
    """Load a causal LM and its slow tokenizer, configured for fine-tuning.

    Args:
        model_id: Identifier passed to both ``from_pretrained`` calls.
        bf16: Load weights as ``torch.bfloat16`` when true, else ``torch.float16``.
        max_len: Value assigned to the tokenizer's ``model_max_length``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    dtype = torch.float16
    if bf16:
        dtype = torch.bfloat16

    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=dtype, device_map="auto", trust_remote_code=True,
    )
    tok = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)

    # Drop any chat template; prompts in this notebook are plain formatted strings.
    try:
        tok.chat_template = None
    except Exception:
        pass

    # Fall back to EOS for padding only when the tokenizer defines no pad token.
    lacks_pad = tok.pad_token is None
    if lacks_pad and tok.eos_token is not None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "right"
    tok.model_max_length = max_len

    # Disable the KV cache flag and mirror the tokenizer's special-token ids.
    model_cfg = model.config
    model_cfg.use_cache = False
    model_cfg.pad_token_id = tok.pad_token_id
    model_cfg.eos_token_id = tok.eos_token_id

    print(f"Loaded {model_id} dtype={dtype} pad={tok.pad_token_id} eos={tok.eos_token_id}")
    return model, tok

# Load once at notebook scope; `model` and `tok` are reused by the tokenization, LoRA and training cells below.
model, tok = load_model_and_tokenizer(BASE_MODEL, USE_BF16, max_len=MAX_LEN)

## Tokenize datasets

In [None]:
def _tok_map(batch):
    """Tokenize a batch of ``text`` strings, ending each one with the EOS token.

    The training texts are "prompt + hint" with nothing after the hint, so
    without a terminator the model receives no signal for when to stop.
    NOTE(review): this assumes the tokenizer does not append EOS on its own
    (believed true for Qwen2 tokenizers) -- confirm for other base models.

    Args:
        batch: Mapping with a ``"text"`` list, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch (unpadded, truncated to ``MAX_LEN``).
        Texts longer than ``MAX_LEN`` tokens are cut, which also drops the EOS.
    """
    eos = tok.eos_token or ""
    texts = [
        # Skip texts that already end with EOS so it is never doubled.
        text if (not eos or text.endswith(eos)) else text + eos
        for text in batch["text"]
    ]
    return tok(
        texts,
        truncation=True,
        max_length=MAX_LEN,
        padding=False,
    )

# Tokenize both splits in batches, dropping the raw "text" column afterwards.
_map_kwargs = {"batched": True, "remove_columns": ["text"]}
train_tok = train_ds.map(_tok_map, **_map_kwargs)
val_tok = val_ds.map(_tok_map, **_map_kwargs)

## Apply LoRA adapter and enable input grads

In [None]:
# Replace the base model with its LoRA-wrapped counterpart.
_lora_kwargs = {
    "target_modules": pick_lora_targets(),
    "r": LORA_R,
    "alpha": LORA_ALPHA,
    "dropout": LORA_DROPOUT,
    "base_model_name_or_path": BASE_MODEL,
}
model = wrap_with_lora(model, **_lora_kwargs)
print(f"LoRA wrapped with r={LORA_R}, alpha={LORA_ALPHA}, dropout={LORA_DROPOUT}")

# Best effort: report the failure and continue if the call is unavailable.
try:
    model.enable_input_require_grads()
except Exception as e:
    print("enable_input_require_grads() not available:", e)

# Count all parameters and the trainable subset in a single pass.
total = 0
trainable = 0
for _param in model.parameters():
    _count = _param.numel()
    total += _count
    if _param.requires_grad:
        trainable += _count
print(f"Trainable params: {trainable/1e6:.2f}M / {total/1e6:.2f}M ({100*trainable/total:.2f}%)")

## TensorBoard (save logs for download)

In [None]:
# Output folder is named after the lower-cased last segment of the model id.
_model_slug = BASE_MODEL.split("/")[-1].lower()
outdir = Path("/kaggle/working") / "outputs" / f"{_model_slug}-hints"
tb_dir = outdir / "tb"
tb_dir.mkdir(parents=True, exist_ok=True)

# Write one marker scalar so an event file exists before training starts.
writer = SummaryWriter(str(tb_dir))
writer.add_scalar("boot/started", 1, 0)
writer.flush()
writer.close()

event_files = [event_path.name for event_path in tb_dir.glob("events.*")]
_event_summary = event_files if event_files else "none yet (will be created during training)"
print(f"TensorBoard logdir: {tb_dir}")
print(f"Event files: {_event_summary}")

## Train

In [None]:
# Training configuration. Values come from the hyperparameter cell; the
# run name carries a UTC timestamp so repeated runs are distinguishable.
cfg = SFTConfig(
    output_dir=str(outdir),
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=max(1, BATCH_SIZE // 2),
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,

    logging_steps=25,
    logging_dir=str(tb_dir),
    logging_first_step=True,
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
    # timestamp formats to the same string.
    run_name=f"run-{datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')}",

    # Evaluation & checkpointing: evaluate and save on the same step cadence
    # so the best checkpoint (lowest eval_loss) can be restored at the end.
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Precision & memory: exactly one of bf16/fp16 is enabled.
    bf16=USE_BF16,
    fp16=(not USE_BF16),
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},

    # Data / trainer plumbing
    seed=SEED,
    remove_unused_columns=True,
    report_to=["tensorboard"],
    max_seq_length=MAX_LEN,
    packing=False,

    # Niceties
    save_safetensors=True,
    dataloader_num_workers=2,
)

# The datasets passed here are already tokenized.
trainer = SFTTrainer(
    model=model,
    tokenizer=tok,
    args=cfg,
    train_dataset=train_tok,
    eval_dataset=val_tok,
)

# Stop once eval_loss has failed to improve for two consecutive evaluations.
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=2))

trainer.train()

## Validation metrics (loss & perplexity)

In [None]:
# Evaluate on the tokenized validation split and derive perplexity from the loss.
val_metrics = trainer.evaluate(eval_dataset=val_tok)
val_loss = float(val_metrics.get("eval_loss", float("nan")))
if math.isfinite(val_loss):
    val_ppl = math.exp(val_loss)
else:
    # A missing or non-finite loss yields NaN perplexity.
    val_ppl = float("nan")
val_metrics["eval_ppl"] = val_ppl
print(json.dumps(val_metrics, indent=2))

## Qualitative samples

In [None]:
# Greedy-decode a hint for a few random validation prompts as a sanity check.
model.eval()

# Never request more samples than the validation split holds
# (random.sample raises ValueError otherwise).
_num_samples = min(5, len(val_ds))

for rec in sample(list(val_ds), _num_samples):
    # Rebuild the prompt by cutting the reference hint after the last marker.
    prompt = rec["text"].rsplit(RESPONSE_PREFIX, 1)[0] + RESPONSE_PREFIX
    toks = tok(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**toks, max_new_tokens=64, do_sample=False)
    # Decode only the newly generated tokens. Splitting the full decoded text
    # on the marker would break if the prompt itself contained "Hint: ".
    prompt_len = toks["input_ids"].shape[1]
    completion = tok.decode(out[0][prompt_len:], skip_special_tokens=True)
    print(completion.strip(), "\n---")

## Save adapter & manifest

In [None]:
# Persist the adapter, the tokenizer files and the trainer state.
trainer.model.save_pretrained(outdir)
tok.save_pretrained(outdir)
trainer.save_state()

# Files the manifest points at; some may not exist on disk.
paths = {
    label: outdir / filename
    for label, filename in (
        ("adapter_config", "adapter_config.json"),
        ("hf_model_config", "config.json"),
        ("tokenizer_config", "tokenizer_config.json"),
        ("trainer_state", "trainer_state.json"),
    )
}

def _safe_json(p: Path):
    """Best-effort JSON load: return the parsed content of ``p``, or ``None``.

    Any failure (missing file, unreadable file, invalid JSON, ...) yields
    ``None`` instead of raising.
    """
    try:
        raw = p.read_text(encoding="utf-8")
        parsed = json.loads(raw)
    except Exception:
        parsed = None
    return parsed

# Prefer the tokenized validation split; fall back to the raw-text split if
# the tokenization cell was not run in this session.
_eval_ds = val_tok if "val_tok" in globals() else val_ds

# Re-evaluate only when the metrics cell was skipped. This stays best-effort
# (val_metrics becomes None on failure), but the failure is now reported.
if "val_metrics" not in globals():
    try:
        _vm = trainer.evaluate(eval_dataset=_eval_ds)
        if _vm.get("eval_loss") is not None:
            _vm["eval_ppl"] = float(math.exp(float(_vm["eval_loss"])))
        val_metrics = _vm
    except Exception as exc:
        print(f"Warning: evaluation failed; val_metrics set to None ({exc!r})")
        val_metrics = None

best_ckpt = getattr(trainer.state, "best_model_checkpoint", None)
run_name  = getattr(trainer.args, "run_name", None)
# NOTE: this rebinds tb_dir from a Path to the string form of the log directory.
tb_dir    = str(getattr(trainer.args, "logging_dir", outdir / "tb"))

# Parameter counts for the manifest; zeros are recorded if counting fails.
try:
    n_total = int(sum(p.numel() for p in trainer.model.parameters()))
    n_train = int(sum(p.numel() for p in trainer.model.parameters() if p.requires_grad))
    pct_train = 100.0 * n_train / max(1, n_total)
except Exception as exc:
    print(f"Warning: could not count parameters; recording zeros ({exc!r})")
    n_total = n_train = 0
    pct_train = 0.0

state = trainer.state
train_summary = {
    "global_step": int(getattr(state, "global_step", 0) or 0),
    "epoch": float(getattr(state, "epoch", 0.0) or 0.0),
    "train_samples": int(len(train_ds)),
    "eval_samples": int(len(val_ds)),
}

# Dump the trainer's log history as JSON Lines (one record per line). A write
# failure must not block the manifest, so it is reported rather than raised.
log_history_path = outdir / "log_history.jsonl"
try:
    with log_history_path.open("w", encoding="utf-8") as f:
        for rec in (state.log_history or []):
            f.write(json.dumps(rec) + "\n")
except Exception as exc:
    print(f"Warning: could not write {log_history_path} ({exc!r})")

# Manifest: one JSON document describing the run (configuration, data,
# metrics, environment and snapshots of the saved config files).
manifest = {
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # time and drop tzinfo so isoformat() + "Z" keeps the exact same format.
    "timestamp_utc": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
    "base_model": BASE_MODEL,
    "qlora": False,
    "precision": {
        "use_bf16": bool(USE_BF16),
        # dtype/device are read from the first parameter of the trained model.
        "torch_dtype": str(next(trainer.model.parameters()).dtype),
        "device": str(next(trainer.model.parameters()).device),
    },
    "parameters": {
        "total": n_total,
        "trainable": n_train,
        "trainable_pct": round(pct_train, 4),
    },
    "data": {
        "train_file": str(train_file),
        "eval_file":  str(val_file),
        "data_root_path": str(data_root_path),
        "num_train_rows": int(len(train_ds)),
        "num_eval_rows":  int(len(val_ds)),
        "max_seq_length": MAX_LEN,
    },
    # These values are restated from the notebook constants, not read back
    # from trainer.args -- keep them in sync with the training cell.
    "training_arguments": {
        "output_dir": str(outdir),
        "per_device_train_batch_size": BATCH_SIZE,
        "per_device_eval_batch_size": max(1, BATCH_SIZE // 2),
        "gradient_accumulation_steps": GRAD_ACCUM,
        "num_train_epochs": EPOCHS,
        "learning_rate": LR,
        "lr_scheduler_type": "cosine",
        "warmup_ratio": WARMUP_RATIO,
        "weight_decay": WEIGHT_DECAY,
        "bf16": bool(USE_BF16),
        "fp16": (not USE_BF16),
        "gradient_checkpointing": True,
        "seed": SEED,
        "response_template": RESPONSE_PREFIX,
        "run_name": run_name,
        "tensorboard_logdir": tb_dir,
        "best_model_checkpoint": best_ckpt,
    },
    "trainer_state": train_summary,
    "lora_config": {
        "r": LORA_R,
        "alpha": LORA_ALPHA,
        "dropout": LORA_DROPOUT,
        "target_modules": pick_lora_targets(),
        "base_model_name_or_path": BASE_MODEL,
    },
    "val_metrics": val_metrics,
    "artifacts": {
        "output_dir": str(outdir),
        **{k: str(v) for k, v in paths.items()},
        "log_history": str(log_history_path),
    },
    "env": {
        "python": platform.python_version(),
        "torch": torch.__version__,
        "transformers": __import__("transformers").__version__,
        "peft": __import__("peft").__version__,
        "trl": __import__("trl").__version__,
        "cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"),
        "platform": sys.platform,
    },
    # Each snapshot is None when the file is missing or unparsable.
    "snapshots": {
        "adapter_config": _safe_json(paths["adapter_config"]),
        "hf_model_config": _safe_json(paths["hf_model_config"]),
        "tokenizer_config": _safe_json(paths["tokenizer_config"]),
        "trainer_state": _safe_json(paths["trainer_state"]),
    },
}

(outdir / "manifest.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8")
print(f"Saved LoRA adapter and manifest to {outdir}")

## Zip outputs for download

In [None]:
# Adapter and tokenizer files that go into the downloadable archive.
essential_files = [
    "adapter_config.json",
    "adapter_model.safetensors",
    "added_tokens.json",
    "merges.txt",
    "special_tokens_map.json",
    "tokenizer_config.json",
    "vocab.json",
]

with tempfile.TemporaryDirectory() as temp_dir:
    temp_path = Path(temp_dir)

    # Stage the files that exist in a scratch folder; report the ones that do not.
    for file_name in essential_files:
        src_file = outdir / file_name
        if not src_file.exists():
            print(f"Missing: {file_name}")
            continue
        shutil.copy2(src_file, temp_path / file_name)
        print(f"Copied: {file_name}")

    # make_archive takes the archive path without its extension.
    zip_path = Path("/kaggle/working/qwen2.5-3b-instruct-fine-tuned.zip")
    _archive_base = zip_path.with_suffix("")
    shutil.make_archive(str(_archive_base), "zip", str(temp_path))

# Show the archive's size, then list its contents.
!ls -lh /kaggle/working/qwen2.5-3b-instruct-fine-tuned.zip
!unzip -l /kaggle/working/qwen2.5-3b-instruct-fine-tuned.zip