### Build MLX dataset (train/valid JSONL)

This notebook converts an instruction-tuning JSON array (like `data/finetuning/training_data.json` with `instruction`/`input`/`output`) into the **MLX-LM** local dataset format:

- `data/mlx_lora_ar/train.jsonl`
- `data/mlx_lora_ar/valid.jsonl`

**Important:** many chat templates (including common Mistral ones) require roles to alternate `user/assistant/user/assistant/...` and may not support a separate `system` role.

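So we encode each example as **two messages**, merging `instruction` and `input` into a single `user` turn (if one of them is empty, the other is used on its own):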

```json
{"messages": [{"role": "user", "content": "<instruction>\n\n<input>"}, {"role": "assistant", "content": "<output>"}]}
```

MLX-LM expects a directory containing **`train.jsonl`** and **`valid.jsonl`**.


In [8]:
# Config

from __future__ import annotations

import json
import random
from pathlib import Path
from typing import Any

def _locate_repo_root() -> Path:
    """Guess the repository root from the current working directory.

    The first guess is the parent of the working directory when that directory
    is named ``notebooks_sl``, otherwise the working directory itself. If the
    guess has no ``data/finetuning`` folder, the working directory and each of
    its ancestors are checked in turn and the first one containing
    ``data/finetuning`` is returned. When nothing matches, the first guess is
    returned unchanged.
    """
    here = Path.cwd()
    here_resolved = here.resolve()
    marker = Path("data") / "finetuning"

    first_guess = here_resolved.parent if here.name == "notebooks_sl" else here_resolved
    if (first_guess / marker).exists():
        return first_guess

    for candidate in (here_resolved, *here_resolved.parents):
        if (candidate / marker).exists():
            return candidate
    return first_guess


# Works whether the kernel was started from the repo root or from notebooks_sl/.
REPO_ROOT = _locate_repo_root()

# Source file: a JSON array of instruction-tuning examples.
IN_PATH = REPO_ROOT / "data" / "finetuning" / "training_data.json"

# Destination: the MLX dataset directory holding the train/valid JSONL files.
MLX_DATA_DIR = REPO_ROOT / "data" / "mlx_lora_ar"
TRAIN_JSONL = MLX_DATA_DIR / "train.jsonl"
VALID_JSONL = MLX_DATA_DIR / "valid.jsonl"

# Share of examples held out for validation, and the seed that fixes the split.
VALID_FRACTION = 0.1
SEED = 42

# Output switches. Both are enabled, so running the notebook writes the JSONL
# files and replaces any that already exist. Set WRITE_OUTPUTS to False for a
# dry run, or OVERWRITE_OUTPUTS to False to refuse clobbering existing files.
WRITE_OUTPUTS = True
OVERWRITE_OUTPUTS = True

# Echo the resolved locations so a wrong repo root is obvious before writing.
for label, location in (
    ("repo root:", REPO_ROOT),
    ("in:", IN_PATH),
    ("out dir:", MLX_DATA_DIR),
    ("train:", TRAIN_JSONL),
    ("valid:", VALID_JSONL),
):
    print(label, location)


repo root: /Users/ext-elias.melas/Documents/Gitcode/tomoro_finetune_cookbook
in: /Users/ext-elias.melas/Documents/Gitcode/tomoro_finetune_cookbook/data/finetuning/training_data.json
out dir: /Users/ext-elias.melas/Documents/Gitcode/tomoro_finetune_cookbook/data/mlx_lora_ar
train: /Users/ext-elias.melas/Documents/Gitcode/tomoro_finetune_cookbook/data/mlx_lora_ar/train.jsonl
valid: /Users/ext-elias.melas/Documents/Gitcode/tomoro_finetune_cookbook/data/mlx_lora_ar/valid.jsonl


In [9]:
# Build MLX train/valid JSONL


def _to_mlx_messages(ex: dict[str, Any]) -> dict[str, Any]:
    instruction = (ex.get("instruction") or "").strip()
    user_input = (ex.get("input") or "").strip()
    output = (ex.get("output") or "").strip()

    # Many tokenizers require strict user/assistant alternation.
    # Merge `instruction` into the user message to avoid a separate `system` role.
    if instruction and user_input:
        user_content = f"{instruction}\n\n{user_input}"
    else:
        user_content = instruction or user_input

    return {
        "messages": [
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": output},
        ]
    }


def _write_jsonl(path: Path, rows: list[dict[str, Any]], *, overwrite: bool) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    if path.exists() and not overwrite:
        raise FileExistsError(f"Refusing to overwrite existing file: {path}")

    with path.open("w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")


# Load the instruction-tuning examples; the file must hold a JSON array.
raw = json.loads(IN_PATH.read_text(encoding="utf-8"))
if not isinstance(raw, list):
    raise TypeError(f"Expected a JSON array at {IN_PATH}, got {type(raw).__name__}")

# Convert + drop totally-empty examples (both the user and assistant text blank).
examples: list[dict[str, Any]] = []
for i, ex in enumerate(raw):
    if not isinstance(ex, dict):
        # Fail with the offending position rather than an opaque AttributeError.
        raise TypeError(
            f"Expected a JSON object at index {i} of {IN_PATH}, got {type(ex).__name__}"
        )
    mlx_ex = _to_mlx_messages(ex)
    msgs = mlx_ex["messages"]
    if all(not (m.get("content") or "").strip() for m in msgs):
        continue
    examples.append(mlx_ex)

print("loaded examples:", len(raw))
print("kept examples:", len(examples))

# Both train.jsonl and valid.jsonl need at least one row each, so a split is
# impossible with fewer than two examples. Stop here with a clear message
# instead of hitting an IndexError further down.
if len(examples) < 2:
    raise ValueError(
        f"Need at least 2 non-empty examples to build a train/valid split, "
        f"got {len(examples)} from {IN_PATH}"
    )

# Deterministic shuffle + split: the seeded shuffle only decides WHICH examples
# go to validation; rows keep their original relative order within each file.
rng = random.Random(SEED)
idxs = list(range(len(examples)))
rng.shuffle(idxs)

# At least one validation row, but always leave at least one row for training
# (guards against a VALID_FRACTION that rounds up to the whole dataset).
n_valid = min(max(1, int(round(len(examples) * VALID_FRACTION))), len(examples) - 1)
valid_idxs = set(idxs[:n_valid])

train_rows = [examples[i] for i in range(len(examples)) if i not in valid_idxs]
valid_rows = [examples[i] for i in range(len(examples)) if i in valid_idxs]

# Every kept example must land in exactly one of the two splits.
assert len(train_rows) + len(valid_rows) == len(examples)

print("train:", len(train_rows))
print("valid:", len(valid_rows))

# Quick sanity check (truncated so a long prompt does not flood the output).
print("train head:", json.dumps(train_rows[0], ensure_ascii=False)[:400])

if WRITE_OUTPUTS:
    _write_jsonl(TRAIN_JSONL, train_rows, overwrite=OVERWRITE_OUTPUTS)
    _write_jsonl(VALID_JSONL, valid_rows, overwrite=OVERWRITE_OUTPUTS)
    print("Wrote:", TRAIN_JSONL)
    print("Wrote:", VALID_JSONL)
else:
    print("WRITE_OUTPUTS is False; not writing any files.")
    print("Set WRITE_OUTPUTS=True to write train/valid JSONL.")


loaded examples: 2492
kept examples: 2492
train: 2243
valid: 249
train head: {"messages": [{"role": "user", "content": "You are a support engineer in a mobile gaming company. Your task is to classify if a chat log belongs to a phishing attempt or not.\n\nIf the chat log is a phishing attempt, return \"PHISHING\" for category, the reason for the phishing attempt, the reason keyword and the source.\nIf the chat log is not a phishing attempt, return \"GENUINE\" for category, 
Wrote: /Users/ext-elias.melas/Documents/Gitcode/tomoro_finetune_cookbook/data/mlx_lora_ar/train.jsonl
Wrote: /Users/ext-elias.melas/Documents/Gitcode/tomoro_finetune_cookbook/data/mlx_lora_ar/valid.jsonl
