In [13]:
# Imports + environment

from __future__ import annotations

import json
import sys
from datetime import datetime
from pathlib import Path

from dotenv import load_dotenv

load_dotenv()
print("cwd:", Path.cwd())


cwd: /Users/ext-elias.melas/Documents/Gitcode/tomoro_finetune_cookbook/notebooks_sl


In [14]:
import json
import os
from typing import Dict, List, Optional, Tuple, Union

import matplotlib.pyplot as plt
import mlx.optimizers as optim
from mlx.utils import tree_flatten
from mlx_lm import generate, load
from mlx_lm.tuner import TrainingArgs, datasets, linear_to_lora_layers, train
from transformers import PreTrainedTokenizer

In [15]:
# load model

# model_path = "mlx-community/Mistral-7B-Instruct-v0.3-4bit"
model_path = '../models/llama-3.1-8b/'
model, tokenizer = load(model_path)

In [16]:
# Ensure tokenizer has a chat template (needed for apply_chat_template)
# We do NOT change your selected model.

# Some local tokenizers (including many llama/mistral conversions) don't ship a chat_template.
# Provide a sensible default only if missing.

def _ensure_chat_template(tok) -> None:
    # If this is a wrapper, also try to set on the underlying tokenizer.
    candidates = [tok]
    inner = getattr(tok, "tokenizer", None) or getattr(tok, "_tokenizer", None)
    if inner is not None and inner is not tok:
        candidates.append(inner)

    for t in candidates:
        if getattr(t, "chat_template", None) not in (None, ""):
            continue

        # Llama-3-style template (works if tokenizer has the special tokens).
        t.chat_template = (
            "{%- for message in messages -%}"
            "{%- if loop.first -%}{{ bos_token }}{%- endif -%}"
            "<|start_header_id|>{{ message['role'] }}<|end_header_id|>\n\n"
            "{{ message['content'] }}<|eot_id|>"
            "{%- endfor -%}"
            "{%- if add_generation_prompt -%}"
            "<|start_header_id|>assistant<|end_header_id|>\n\n"
            "{%- endif -%}"
        )

        # Ensure pad token exists for batching
        if getattr(t, "pad_token_id", None) is None and getattr(t, "eos_token", None) is not None:
            t.pad_token = t.eos_token

    # Print status from the outer object
    ct = getattr(tok, "chat_template", None) or getattr(inner, "chat_template", None)
    print("has chat_template:", bool(ct))


_ensure_chat_template(tokenizer)


has chat_template: True


In [17]:
adapter_path = "../adapters/local_lora_llama_3.1_8b"
os.makedirs(adapter_path, exist_ok=True)
adapter_config_path = os.path.join(adapter_path, "adapter_config.json")
adapter_file_path = os.path.join(adapter_path, "adapters.safetensors")

In [18]:
# these can be in a yaml file instead

lora_config = {
    "num_layers": 8,
    "lora_parameters": {
        "rank": 8,
        "scale": 20.0,
        "dropout": 0.0,
    },
}

# which we save into adapter_path
with open(adapter_config_path, "w") as f:
    json.dump(lora_config, f, indent=4)


In [19]:
# we can also set our training params
training_args = TrainingArgs(
    adapter_file=adapter_file_path,
    iters=200,
    steps_per_eval=50,
)

- In the LoRA framework, most of the model’s original parameters remain unchanged during fine-tuning. 
- The model.freeze() command is used to set these parameters to a non-trainable state so that their weights aren’t updated during backpropagation. This way, only the newly introduced low-rank adaptation matrices (LoRA parameters) are optimized, reducing computational overhead and memory usage while preserving the original model’s knowledge.




- The linear_to_lora_layers function converts or wraps some of the model’s linear layers into LoRA layers. Essentially, it replaces (or augments) selected linear layers with their LoRA counterparts, which include the additional low-rank matrices that will be trained. 
- The configuration parameters (like the number of layers and specific LoRA parameters) determine which layers are modified and how the LoRA adapters are set up.

In [20]:
# we should also verify that only a small subset of parameters are set for training
# and activate training mode while freezing the main model params

model.freeze()
linear_to_lora_layers(model, lora_config["num_layers"], lora_config["lora_parameters"])
num_train_params = sum(v.size for _, v in tree_flatten(model.trainable_parameters()))
print(f"Number of trainable parameters: {num_train_params}")
model.train()

Number of trainable parameters: 5242880


Model(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers.0): TransformerBlock(
      (self_attn): Attention(
        (q_proj): Linear(input_dims=4096, output_dims=4096, bias=False)
        (k_proj): Linear(input_dims=4096, output_dims=1024, bias=False)
        (v_proj): Linear(input_dims=4096, output_dims=1024, bias=False)
        (o_proj): Linear(input_dims=4096, output_dims=4096, bias=False)
        (rope): Llama3RoPE()
      )
      (mlp): MLP(
        (gate_proj): Linear(input_dims=4096, output_dims=14336, bias=False)
        (down_proj): Linear(input_dims=14336, output_dims=4096, bias=False)
        (up_proj): Linear(input_dims=4096, output_dims=14336, bias=False)
      )
      (input_layernorm): RMSNorm(4096, eps=1e-05)
      (post_attention_layernorm): RMSNorm(4096, eps=1e-05)
    )
    (layers.1): TransformerBlock(
      (self_attn): Attention(
        (q_proj): Linear(input_dims=4096, output_dims=4096, bias=False)
        (k_proj): Linear(input_dim

In [21]:
# class to follow the training progress
class Metrics:
    def __init__(self) -> None:
        self.train_losses: List[Tuple[int, float]] = []
        self.val_losses: List[Tuple[int, float]] = []

    def on_train_loss_report(self, info: Dict[str, Union[float, int]]) -> None:
        self.train_losses.append((info["iteration"], info["train_loss"]))

    def on_val_loss_report(self, info: Dict[str, Union[float, int]]) -> None:
        self.val_losses.append((info["iteration"], info["val_loss"]))

metrics = Metrics()

In [22]:
# Load MLX-LM local dataset (train.jsonl + valid.jsonl)

import types

# Repo root discovery (so paths work regardless of cwd)
REPO_ROOT = Path.cwd().resolve().parent if (Path.cwd().name == "notebooks_sl") else Path.cwd().resolve()
if not (REPO_ROOT / "data").exists():
    for p in [Path.cwd().resolve(), *Path.cwd().resolve().parents]:
        if (p / "data").exists():
            REPO_ROOT = p
            break

MLX_DATA_DIR = REPO_ROOT / "data" / "mlx_lora_ar"
train_path = MLX_DATA_DIR / "train.jsonl"
valid_path = MLX_DATA_DIR / "valid.jsonl"

print("dataset dir:", MLX_DATA_DIR)
print("train:", train_path)
print("valid:", valid_path)

if not train_path.exists() or not valid_path.exists():
    raise FileNotFoundError(
        "Missing train/valid JSONL. Run notebooks_sl/build_dataset_mlx.ipynb to generate: "
        f"{train_path} and {valid_path}"
    )

# IMPORTANT: do NOT unwrap to the underlying transformers tokenizer here.
# The MLX TokenizerWrapper returned by `mlx_lm.load()` can implement chat templating
# even when the HF tokenizer has no `chat_template` set.
base_tokenizer = tokenizer

# Config controls which keys to read and whether to mask the prompt.
config = types.SimpleNamespace(
    chat_feature="messages",
    mask_prompt=True,
)

train_dataset, valid_dataset, _test_dataset = datasets.load_local_dataset(
    MLX_DATA_DIR,
    base_tokenizer,
    config,
)

# trainer.iterate_batches expects dataset[idx] to return (tokens, offset).
train_dataset = datasets.CacheDataset(train_dataset)
valid_dataset = datasets.CacheDataset(valid_dataset)

print("train examples:", len(train_dataset))
print("valid examples:", len(valid_dataset))

# Force tokenization of the first element as a sanity check
first_tokens, first_offset = train_dataset[0]
print("first example tokens:", len(first_tokens), "offset:", first_offset)


dataset dir: /Users/ext-elias.melas/Documents/Gitcode/tomoro_finetune_cookbook/data/mlx_lora_ar
train: /Users/ext-elias.melas/Documents/Gitcode/tomoro_finetune_cookbook/data/mlx_lora_ar/train.jsonl
valid: /Users/ext-elias.melas/Documents/Gitcode/tomoro_finetune_cookbook/data/mlx_lora_ar/valid.jsonl
train examples: 2243
valid examples: 249
first example tokens: 1129 offset: 832


In [23]:
# Kick off training using the loaded train/valid datasets
# NOTE: different mlx-lm versions expose different `train()` signatures.
# We keep your preferred kwargs, but filter out any unsupported ones.

import inspect

train_set = train_dataset
val_set = valid_dataset

requested_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "args": training_args,
    "optimizer": optim.Adam(learning_rate=1e-5),
    "train_dataset": train_set,
    "val_dataset": val_set,
    "training_callback": metrics,
}

sig = inspect.signature(train)
accepted = set(sig.parameters.keys())
kwargs = {k: v for k, v in requested_kwargs.items() if k in accepted}
dropped = [k for k in requested_kwargs.keys() if k not in accepted]

print("train() accepts:", sorted(accepted))
print("passing:", sorted(kwargs.keys()))
if dropped:
    print("dropping unsupported kwargs:", dropped)

train(**kwargs)


train() accepts: ['args', 'iterate_batches', 'loss', 'model', 'optimizer', 'train_dataset', 'training_callback', 'val_dataset']
passing: ['args', 'model', 'optimizer', 'train_dataset', 'training_callback', 'val_dataset']
dropping unsupported kwargs: ['tokenizer']
Starting training..., iters: 200


Calculating loss...:   0%|          | 0/25 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Calculating loss...:  52%|█████▏    | 13/25 [03:43<04:12, 21.06s/it]



Calculating loss...:  88%|████████▊ | 22/25 [06:05<00:45, 15.04s/it]



Calculating loss...: 100%|██████████| 25/25 [06:54<00:00, 16.60s/it]

Iter 1: Val loss 2.858, Val took 414.937s





: 