### Local SFT (8-bit + LoRA) — Hugging Face Transformers

This notebook mirrors `notebooks_sl/tinker_finetune.ipynb`, but trains **locally** using:
- `transformers` + `peft`
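- **8-bit quantization** via `bitsandbytes` (CUDA only; on Apple MPS the model is loaded in fp16 and on CPU in fp32, both without quantization)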
- LoRA adapters (saved under `adapters/`)

It consumes the same conversation JSONL format:

```json
{"messages": [{"role":"system","content":"..."}, {"role":"user","content":"..."}, {"role":"assistant","content":"..."}]}
```

Expected inputs in this repo:
- `data/finetuning/training_data_spam.json` (instruction-tuning JSON array)
- `data/finetuning/tinker_conversations_spam.jsonl` (conversation JSONL)

Outputs:
- LoRA adapter: `adapters/local_lora_8bit_spam/`
- Trainer output directory (intermediate checkpoints and training logs/state): `logs/local_lora_8bit_spam/`


In [None]:
# Config — match notebooks_sl/tinker_finetune.ipynb defaults

from __future__ import annotations

from pathlib import Path

# Locate the project root, i.e. the directory that holds data/finetuning.
# First guess: the parent directory when launched from notebooks_sl/, otherwise the launch directory.
_launch_dir = Path.cwd()
_launch_dir_resolved = _launch_dir.resolve()
if _launch_dir.name == "notebooks_sl":
    AR_FINETUNE_ROOT = _launch_dir_resolved.parent
else:
    AR_FINETUNE_ROOT = _launch_dir_resolved

# If the first guess has no data/finetuning, take the nearest directory (starting at the
# launch directory itself, then walking upward) that does. When none does, the first
# guess is kept unchanged.
if not (AR_FINETUNE_ROOT / "data" / "finetuning").exists():
    AR_FINETUNE_ROOT = next(
        (
            candidate
            for candidate in (_launch_dir_resolved, *_launch_dir_resolved.parents)
            if (candidate / "data" / "finetuning").exists()
        ),
        AR_FINETUNE_ROOT,
    )

# Base model: Hub id plus the location of an optional local copy.
MODEL_NAME = "meta-llama/Llama-3.1-8B"
LOCAL_MODEL_DIR = AR_FINETUNE_ROOT / "models" / "llama-3.1-8b"

# Input data: instruction-tuning JSON array and the conversation JSONL derived from it.
_FINETUNING_DATA_DIR = AR_FINETUNE_ROOT / "data" / "finetuning"
IN_PATH = _FINETUNING_DATA_DIR / "training_data_spam.json"
CONVERSATIONS_JSONL = _FINETUNING_DATA_DIR / "tinker_conversations_spam.jsonl"

# Output locations share one run name.
_RUN_NAME = "local_lora_8bit_spam"
OUTPUT_ADAPTER_DIR = AR_FINETUNE_ROOT / "adapters" / _RUN_NAME
LOG_DIR = AR_FINETUNE_ROOT / "logs" / _RUN_NAME

# Training hyperparams (keep close to Tinker defaults; adjust for your GPU)
MAX_LENGTH = 8192
LEARNING_RATE = 2e-4
NUM_EPOCHS = 1

# Local-only knobs (Tinker had batch_size=16 with remote GPU):
# 1 sequence per step x 16 accumulation steps.
PER_DEVICE_TRAIN_BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 16

# LoRA hyperparams
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

# Set to an integer (e.g. 256) to train on a small subset and validate end-to-end quickly.
LIMIT_TRAIN_EXAMPLES: int | None = None

# Echo the resolved locations so a wrong root is noticed early.
for _label, _location in (
    ("ar_finetune root:", AR_FINETUNE_ROOT),
    ("local model dir:", LOCAL_MODEL_DIR),
    ("input:", IN_PATH),
    ("conversations:", CONVERSATIONS_JSONL),
    ("output adapter:", OUTPUT_ADAPTER_DIR),
):
    print(_label, _location)


In [None]:
# Environment check

import os

# Helpful default to reduce fragmentation on long-context runs.
# Set it BEFORE importing torch and before any CUDA query: the CUDA caching allocator
# reads this variable when it initialises, so setting it afterwards may have no effect.
# setdefault keeps any value the user already exported.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch  # noqa: E402  (deliberately imported after the allocator config is set)

# Report the torch build and, when a CUDA device is present, its name and bf16 support.
print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("gpu:", torch.cuda.get_device_name(0))
    print("bf16 supported:", torch.cuda.is_bf16_supported())


In [None]:
# Step 1 — Ensure conversation JSONL exists (same format as Tinker)

import json


def ensure_conversations_jsonl(
    *,
    in_path: Path,
    out_path: Path,
    overwrite: bool = False,
) -> None:
    """Create the conversation JSONL at ``out_path`` from the JSON array at ``in_path``.

    Each input record is read through its ``instruction``, ``input`` and ``output``
    keys (missing or null values become ``""``; values are stripped) and written as
    one line ``{"messages": [system, user, assistant]}``.

    Args:
        in_path: JSON file whose top-level value is a list of records.
        out_path: Destination JSONL file; parent directories are created as needed.
        overwrite: When False (default) an existing ``out_path`` is left untouched
            and ``in_path`` is not read at all.

    Raises:
        TypeError: If the top-level JSON value is not a list.
    """
    if out_path.exists() and not overwrite:
        print("Using existing:", out_path)
        return

    data = json.loads(in_path.read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise TypeError(f"Expected a JSON array at {in_path}, got {type(data).__name__}")

    # Convert everything in memory first: if a record is malformed we fail before
    # touching the disk, so no truncated file is left behind to be silently reused
    # by the next run (which would hit the "Using existing" shortcut above).
    role_to_field = (("system", "instruction"), ("user", "input"), ("assistant", "output"))
    lines = []
    for ex in data:
        messages = [
            {"role": role, "content": (ex.get(field) or "").strip()}
            for role, field in role_to_field
        ]
        lines.append(json.dumps({"messages": messages}, ensure_ascii=False) + "\n")

    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        f.writelines(lines)

    print("Wrote:", out_path)
    print("Head:")
    # Preview from memory: no second (unclosed) file handle, and an empty input
    # array no longer raises IndexError.
    print(lines[0].rstrip("\n")[:500] if lines else "(no examples)")


# Build the conversation JSONL from IN_PATH only when it is missing; with
# overwrite=False an existing CONVERSATIONS_JSONL is reused as-is.
ensure_conversations_jsonl(in_path=IN_PATH, out_path=CONVERSATIONS_JSONL, overwrite=False)


In [None]:
# Step 2 — Load the dataset (conversation JSONL)

from datasets import load_dataset

# Read the conversation JSONL as a single "train" split.
_jsonl_files = {"train": str(CONVERSATIONS_JSONL)}
raw = load_dataset("json", data_files=_jsonl_files, split="train")

# Dataset summary, then the keys of the first record and of its first message.
print(raw)
_first_record = raw[0]
print(_first_record.keys())
print(_first_record["messages"][0].keys())


In [None]:
# Step 3 — Tokenizer + assistant-only label masking

from typing import Any, Dict, List, Tuple

from transformers import AutoTokenizer


# Prefer the local model directory when it exists; otherwise load by model name.
_tokenizer_source = str(LOCAL_MODEL_DIR) if LOCAL_MODEL_DIR.exists() else MODEL_NAME
tok = AutoTokenizer.from_pretrained(_tokenizer_source, use_fast=True)

# Fallback chat template for tokenizers that ship without one. It follows the common
# Meta Llama 3 instruct layout:
#   <|begin_of_text|><|start_header_id|>role<|end_header_id|>\n\ncontent<|eot_id|> ...
# BOS is emitted once before the first message; a trailing assistant header is added
# only when add_generation_prompt is true.
_LLAMA3_STYLE_CHAT_TEMPLATE = "".join(
    [
        "{%- for message in messages -%}",
        "{%- if loop.first -%}{{ bos_token }}{%- endif -%}",
        "<|start_header_id|>{{ message['role'] }}<|end_header_id|>\n\n",
        "{{ message['content'] }}<|eot_id|>",
        "{%- endfor -%}",
        "{%- if add_generation_prompt -%}",
        "<|start_header_id|>assistant<|end_header_id|>\n\n",
        "{%- endif -%}",
    ]
)
if getattr(tok, "chat_template", None) in (None, ""):
    tok.chat_template = _LLAMA3_STYLE_CHAT_TEMPLATE

# Batching needs a pad token; reuse EOS when the tokenizer defines none.
if tok.pad_token_id is None:
    tok.pad_token = tok.eos_token

for _label, _value in (
    ("has chat_template:", bool(getattr(tok, "chat_template", None))),
    ("pad_token_id:", tok.pad_token_id),
    ("eos_token_id:", tok.eos_token_id),
):
    print(_label, _value)


def encode_chat_with_assistant_labels(
    messages: List[Dict[str, str]],
    *,
    max_length: int,
) -> Dict[str, List[int]]:
    """Tokenize a conversation and supervise only the assistant turns.

    How it works:
    - The whole conversation is rendered and tokenized once with the tokenizer's
      chat template (module-level ``tok``).
    - Every growing prefix ``messages[:1]``, ``messages[:2]``, ... is tokenized again;
      the length difference between consecutive prefixes is taken as the token span
      of the newly added message.
    - Spans of messages whose ``role`` is ``"assistant"`` keep their token ids as
      labels; all other positions get ``-100``.

    Notes:
    - The supervised span is everything the template appends for an assistant
      message, i.e. it includes any role header / end-of-turn tokens the template
      emits for that turn, not just the message text.
    - Truncation to ``max_length`` happens last, so a long prompt can leave an
      example with no supervised tokens at all.
    - Cost grows with the number of messages because each prefix is re-tokenized;
      this favours clarity over speed.

    Args:
        messages: Conversation as a list of ``{"role": ..., "content": ...}`` dicts.
        max_length: Maximum number of tokens kept.

    Returns:
        Dict with equally long ``input_ids``, ``attention_mask`` (all ones) and
        ``labels`` lists.
    """

    def _render(conversation: List[Dict[str, str]]) -> List[int]:
        # Token ids of the templated conversation, without a trailing generation prompt.
        return tok.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=False,
        )

    full_ids: List[int] = _render(messages)

    # One flag per token of the full rendering: True means "compute loss here".
    supervised = [False] * len(full_ids)

    # The empty prefix is never rendered (some tokenizers reject an empty
    # conversation); it simply counts as 0 tokens.
    seen: List[Dict[str, str]] = []
    boundary = 0
    for message in messages:
        seen = [*seen, message]
        next_boundary = len(_render(seen))

        if message.get("role") == "assistant":
            for position in range(boundary, next_boundary):
                supervised[position] = True

        boundary = next_boundary

    # Truncate ids and flags together so they stay aligned.
    input_ids = full_ids[:max_length]
    kept_flags = supervised[:max_length]

    labels = [
        token_id if keep else -100
        for token_id, keep in zip(input_ids, kept_flags)
    ]

    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
        "labels": labels,
    }


# Quick sanity check: encode the first conversation (capped at 512 tokens) and report
# the length of each feature plus how many positions carry a real label.
ex0 = raw[0]["messages"]
enc0 = encode_chat_with_assistant_labels(ex0, max_length=512)

_feature_lengths = {}
for _feature_name, _feature_values in enc0.items():
    _feature_lengths[_feature_name] = len(_feature_values)
print(_feature_lengths)

_trainable_labels = [label for label in enc0["labels"] if label != -100]
print("trainable tokens:", len(_trainable_labels))


In [None]:
# Step 4 — Build tokenized train dataset

import random


def to_features(ex: Dict[str, Any]) -> Dict[str, Any]:
    """Encode one raw record's ``messages`` into input_ids / attention_mask / labels."""
    return encode_chat_with_assistant_labels(ex["messages"], max_length=MAX_LENGTH)


def _has_supervised_tokens(ex: Dict[str, Any]) -> bool:
    """Return True when at least one label differs from the ignore index (-100)."""
    return any(label != -100 for label in ex["labels"])


train_ds = raw
if LIMIT_TRAIN_EXAMPLES is not None:
    # Deterministic sample for quick iteration (fixed seed, then take the first N).
    idxs = list(range(len(train_ds)))
    random.Random(0).shuffle(idxs)
    train_ds = train_ds.select(idxs[:LIMIT_TRAIN_EXAMPLES])

train_ds = train_ds.map(
    to_features,
    remove_columns=train_ds.column_names,
    desc="Tokenizing + building assistant-only labels",
)

# Truncation to MAX_LENGTH happens after the label mask is built, so an example whose
# assistant turn falls entirely past the cut-off keeps only -100 labels. Such an example
# contributes no gradient and, with one sequence per step, can yield a NaN loss, so drop it.
_n_encoded = len(train_ds)
train_ds = train_ds.filter(
    _has_supervised_tokens,
    desc="Dropping examples without trainable tokens",
)
print("dropped examples without trainable tokens:", _n_encoded - len(train_ds))

if len(train_ds) == 0:
    raise ValueError(
        "No training examples with assistant tokens remain; "
        "check the input data and MAX_LENGTH."
    )

print(train_ds)
print(train_ds[0].keys())


In [None]:
# Step 5 — Load base model in 8-bit and attach LoRA

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Backend selection: CUDA first, then Apple MPS, otherwise CPU.
USE_CUDA = torch.cuda.is_available()
USE_MPS = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()

# Prefer the local model directory when it exists; otherwise load by model name.
_base_model_source = str(LOCAL_MODEL_DIR) if LOCAL_MODEL_DIR.exists() else MODEL_NAME

if USE_CUDA:
    print("Using CUDA (8-bit bitsandbytes + LoRA)")
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)

    # Pin the whole model to GPU 0 to keep `Trainer` happy (avoid sharding/device_map auto).
    # If you truly need sharding, switch to an `accelerate` training script instead of `Trainer`.
    device_map = {"": 0}

    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

    _load_kwargs = {
        "quantization_config": bnb_config,
        "device_map": device_map,
        "torch_dtype": dtype,
    }
elif USE_MPS:
    # bitsandbytes 8-bit quantization requires NVIDIA CUDA.
    # On Apple Silicon, we run LoRA in fp16 on MPS.
    print("Using MPS (fp16 + LoRA; no bitsandbytes)")
    bnb_config = None
    device_map = None
    _load_kwargs = {"torch_dtype": torch.float16}
else:
    print("Using CPU (fp32 + LoRA; slow, no bitsandbytes)")
    bnb_config = None
    device_map = None
    _load_kwargs = {"torch_dtype": torch.float32}

base_model = AutoModelForCausalLM.from_pretrained(_base_model_source, **_load_kwargs)

# Backend-specific post-processing of the freshly loaded model.
if USE_CUDA:
    base_model = prepare_model_for_kbit_training(base_model)
elif USE_MPS:
    base_model.to(torch.device("mps"))

# Turn off the KV cache and enable gradient checkpointing for training.
base_model.config.use_cache = False
base_model.gradient_checkpointing_enable()

# Llama-style target modules: attention projections followed by the MLP projections.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

# Wrap the base model with LoRA adapters and report how many parameters will train.
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()


In [None]:
# Step 6 — Trainer setup (simple SFT)

from dataclasses import dataclass
from typing import Any, Dict, List

import torch
from transformers import Trainer, TrainingArguments


@dataclass
class DataCollatorForCausalLMAssistantOnly:
    """Right-pad pre-tokenized features to the longest sequence in the batch.

    ``input_ids`` are padded with ``pad_token_id``, ``attention_mask`` with 0 and
    ``labels`` with -100. All three are returned as ``torch.long`` tensors.
    """

    # Token id used to fill the padded positions of ``input_ids``.
    pad_token_id: int

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # Batch width = longest input in this batch.
        target_len = max(len(item["input_ids"]) for item in features)

        # Column name -> value used for padding; insertion order fixes the output key order.
        fill_values = {
            "input_ids": self.pad_token_id,
            "attention_mask": 0,
            "labels": -100,
        }

        batch: Dict[str, torch.Tensor] = {}
        for column, fill in fill_values.items():
            # The amount of padding is always derived from the row's input_ids length.
            padded_rows = [
                item[column] + [fill] * (target_len - len(item["input_ids"]))
                for item in features
            ]
            batch[column] = torch.tensor(padded_rows, dtype=torch.long)
        return batch


collator = DataCollatorForCausalLMAssistantOnly(pad_token_id=tok.pad_token_id)

LOG_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_ADAPTER_DIR.mkdir(parents=True, exist_ok=True)

_cuda_available = torch.cuda.is_available()
_cuda_bf16 = bool(_cuda_available and torch.cuda.is_bf16_supported())

# The paged 8-bit optimizer comes from bitsandbytes, so only pick it on CUDA.
optim_name = "paged_adamw_8bit" if _cuda_available else "adamw_torch"

args = TrainingArguments(
    output_dir=str(LOG_DIR),  # checkpoints and trainer state are written here
    overwrite_output_dir=True,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="linear",
    warmup_ratio=0.03,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    report_to=[],
    optim=optim_name,
    # Mixed precision only on CUDA: bf16 when supported, fp16 otherwise.
    # On MPS the model is already loaded in fp16; fp16 AMP may be rejected there by
    # TrainingArguments, so it is left off (CPU runs in fp32).
    bf16=_cuda_bf16,
    fp16=bool(_cuda_available and not _cuda_bf16),
    # `use_mps_device` is deprecated; it is no longer passed and the Trainer is
    # left to select the device itself.
    # Features are pre-tokenized and collation is cheap, so worker processes buy nothing;
    # they would also have to pickle the notebook-defined collator, which fails under
    # the "spawn" start method (the default on macOS and Windows).
    dataloader_num_workers=0,
    # The dataset only holds input_ids / attention_mask / labels; keep them all.
    remove_unused_columns=False,
)

print("Trainer optim:", optim_name)
print("training device:", args.device)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    data_collator=collator,
)

print("Ready to train")


In [None]:
# Step 7 — Train + save adapter

# Run the training loop and show the summary object that `Trainer.train()` returns.
train_result = trainer.train()
print(train_result)

# Persist the LoRA adapter (weights + config) and the tokenizer into the same directory.
_adapter_dir = str(OUTPUT_ADAPTER_DIR)
for _artifact in (model, tok):
    _artifact.save_pretrained(_adapter_dir)

print("Saved adapter to:", OUTPUT_ADAPTER_DIR)


In [None]:
# Step 8 — Quick local inference smoke test

from peft import PeftModel

# Reload a fresh base model and attach the saved adapter for clean inference.
# - CUDA: 8-bit bitsandbytes (reuses `bnb_config` from the training setup)
# - MPS: fp16
# - CPU: fp32

_infer_source = str(LOCAL_MODEL_DIR) if LOCAL_MODEL_DIR.exists() else MODEL_NAME
_infer_on_cuda = torch.cuda.is_available()
_infer_on_mps = (
    not _infer_on_cuda
    and hasattr(torch.backends, "mps")
    and torch.backends.mps.is_available()
)

if _infer_on_cuda:
    _infer_load_kwargs = {
        "quantization_config": bnb_config,
        "device_map": {"": 0},
        "torch_dtype": torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    }
elif _infer_on_mps:
    _infer_load_kwargs = {"torch_dtype": torch.float16}
else:
    _infer_load_kwargs = {"torch_dtype": torch.float32}

infer_base = AutoModelForCausalLM.from_pretrained(_infer_source, **_infer_load_kwargs)
if _infer_on_mps:
    infer_base = infer_base.to(torch.device("mps"))

# Load the adapter weights saved after training on top of the base model; eval mode for inference.
infer_model = PeftModel.from_pretrained(infer_base, str(OUTPUT_ADAPTER_DIR))
infer_model.eval()

# Smoke-test prompt: a system instruction plus one user email.
_system_turn = {
    "role": "system",
    "content": "You are an email security analyst. Return exactly one line: is_spam: 0/1/-1",
}
_user_turn = {
    "role": "user",
    "content": "Subject: Reset your password\nBody: Click here to reset your password immediately...",
}
messages = [_system_turn, _user_turn]

# add_generation_prompt=True requests the template's generation prompt so the model answers next.
prompt_ids = tok.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)

# When using `device_map="auto"`, models can be sharded and `model.device` may be unset.
# Put inputs on the embedding device (or fall back to first parameter device).
def _infer_input_device(m) -> torch.device:
    """Return the device on which input tensors for ``m`` should be created.

    When ``m`` exposes a non-empty ``hf_device_map`` dict, the entry of the first
    known embedding key is used; if none of those keys is present, the first value
    of the map is used. Without such a map, the device of the model's first
    parameter is returned.
    """
    placement = getattr(m, "hf_device_map", None)
    if not isinstance(placement, dict) or not placement:
        return next(m.parameters()).device

    # Candidate names of the token-embedding module, checked in this order.
    embedding_keys = (
        "model.embed_tokens",
        "model.model.embed_tokens",
        "transformer.wte",
        "embed_tokens",
    )
    present = [name for name in embedding_keys if name in placement]
    if present:
        target = placement[present[0]]
    else:
        target = next(iter(placement.values()))
    return torch.device(target)

# Build the prompt tensor on the device that hosts the model's inputs.
input_device = _infer_input_device(infer_model)
input_ids = torch.tensor([prompt_ids], device=input_device)
# Single unpadded prompt: every position is real. Passing the mask explicitly matters
# because the pad token may be the same as EOS, in which case it cannot be inferred.
attention_mask = torch.ones_like(input_ids)

# Stop on the tokenizer's EOS and also on "<|eot_id|>": the Llama-3-style chat format
# closes every turn with "<|eot_id|>", which may differ from tok.eos_token. Without it
# the model keeps generating past its answer until max_new_tokens is reached.
stop_token_ids = []
if isinstance(tok.eos_token_id, int):
    stop_token_ids.append(tok.eos_token_id)
_eot_id = tok.convert_tokens_to_ids("<|eot_id|>")
if (
    isinstance(_eot_id, int)
    and _eot_id != tok.unk_token_id  # token unknown to this tokenizer -> skip it
    and _eot_id not in stop_token_ids
):
    stop_token_ids.append(_eot_id)

with torch.no_grad():
    out = infer_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=64,
        do_sample=False,  # greedy decoding for a reproducible smoke test
        temperature=1.0,
        eos_token_id=stop_token_ids or None,
        pad_token_id=tok.pad_token_id,
    )

# Decode only the newly generated tokens (everything after the prompt).
gen = tok.decode(out[0][len(prompt_ids):], skip_special_tokens=True)
print(gen.strip())
