In [1]:
import os
import sys
sys.path.insert(0, 'examples-lora') 

from pathlib import Path

import lora
import mlx.optimizers as optim
import mlx.core as mx
import mlx.nn as nn
from mlx.utils import tree_flatten

import utils as lora_utils
from models import LoRALinear



None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


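From the command line, the script is invoked as shown below. The next cell sets the same options programmatically, but with different values for this notebook run: batch size 2 instead of 1, 4 LoRA layers instead of 8, and 250 iterations instead of 1000.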

python lora.py \
--train \
--model mistralai/Mistral-7B-Instruct-v0.2 \
--data ~/dev/python-tuning-llm/mistral-tuning-mlx/data/training \
--batch-size 1 \
--lora-layers 8 \
--max-length 2048 \
--iters 1000


In [2]:
# Parse an empty argument list so every option starts at whatever default the
# script's parser defines, then override only the options this run needs.
args = lora.build_parser().parse_args(args=[])

# Notebook-side counterpart of the command-line flags: attribute name -> value.
notebook_overrides = {
    "train": True,
    "model": 'mistralai/Mistral-7B-Instruct-v0.2',
    # Training data is looked up relative to the current working directory.
    "data": os.path.join(os.getcwd(), 'data', 'training'),
    "batch_size": 2,
    "lora_layers": 4,
    "iters": 250,
    "max_length": 2048,
}
for option_name, option_value in notebook_overrides.items():
    setattr(args, option_name, option_value)

In [3]:
# Load the base model and its tokenizer for the identifier stored in
# args.model. lora_utils.load returns three values; the third is not needed in
# this notebook and is discarded.
# NOTE(review): the recorded output shows files being fetched, so this
# presumably downloads the weights on first use — confirm before running offline.
print("Loading pretrained model")
model, tokenizer, _ = lora_utils.load(args.model)

Loading pretrained model


Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

In [4]:

# Fine-tuning driver for this notebook: wrap the attention projections of the
# last `args.lora_layers` transformer blocks in LoRA adapters, load the
# datasets, optionally train, and finally reload the adapter weights from
# `args.adapter_file`.
#
# Raises:
#     ValueError: if `args.lora_layers` is outside [0, number of blocks], or if
#         the adapter file does not exist once training has (or has not) run.

# Validate the requested layer count up front. A value larger than the number
# of blocks would make the slice start below negative, which Python interprets
# relative to the END of the list and would silently select the wrong blocks.
transformer_blocks = model.model.layers
num_blocks = len(transformer_blocks)
if not 0 <= args.lora_layers <= num_blocks:
    raise ValueError(
        f"lora_layers must be between 0 and {num_blocks}, got {args.lora_layers}."
    )

# Freeze all layers other than LORA linears
model.freeze()
# The LoRALinear wrappers are created after the freeze call above.
for layer in transformer_blocks[num_blocks - args.lora_layers :]:
    layer.self_attn.q_proj = LoRALinear.from_linear(layer.self_attn.q_proj)
    layer.self_attn.v_proj = LoRALinear.from_linear(layer.self_attn.v_proj)
    # Only blocks that expose a `block_sparse_moe` attribute get their gate wrapped.
    if hasattr(layer, "block_sparse_moe"):
        layer.block_sparse_moe.gate = LoRALinear.from_linear(
            layer.block_sparse_moe.gate
        )

# Report parameter counts in millions (element counts summed over all arrays).
total_millions = sum(v.size for _, v in tree_flatten(model.parameters())) / 10**6
print(f"Total parameters {total_millions:.3f}M")
trainable_millions = (
    sum(v.size for _, v in tree_flatten(model.trainable_parameters())) / 10**6
)
print(f"Trainable parameters {trainable_millions:.3f}M")

print("Loading datasets")
train_set, valid_set, test_set = lora.load(args)

# Resume training the given adapters.
if args.resume_adapter_file is not None:
    print(f"Loading pretrained adapters from {args.resume_adapter_file}")
    # strict=False: the file is only expected to hold a subset of the weights.
    model.load_weights(args.resume_adapter_file, strict=False)

if args.train:
    print("Training")
    opt = optim.Adam(learning_rate=args.learning_rate)

    # Train model
    lora.train(model, train_set, valid_set, opt, lora.loss, tokenizer, args)

    # Save adapter weights (only the trainable parameters are written).
    mx.savez(args.adapter_file, **dict(tree_flatten(model.trainable_parameters())))

# Load the LoRA adapter weights which we assume should exist by this point
if not Path(args.adapter_file).is_file():
    raise ValueError(
        f"Adapter file {args.adapter_file} missing. "
        "Use --train to learn and save the adapters.npz."
    )
model.load_weights(args.adapter_file, strict=False)



Total parameters 7242.158M
Trainable parameters 0.426M
Loading datasets
Training
Iter 1: Val loss 3.530, Val took 673.947s
Iter 10: Train loss 3.902, It/sec 0.031, Tokens/sec 27.990
Iter 20: Train loss 3.367, It/sec 0.027, Tokens/sec 24.042
Iter 30: Train loss 2.922, It/sec 0.027, Tokens/sec 25.033
Iter 40: Train loss 2.584, It/sec 0.028, Tokens/sec 41.828
Iter 50: Train loss 2.693, It/sec 0.026, Tokens/sec 29.330
Iter 60: Train loss 2.588, It/sec 0.026, Tokens/sec 35.721
Iter 70: Train loss 2.379, It/sec 0.026, Tokens/sec 22.432
Iter 80: Train loss 2.437, It/sec 0.027, Tokens/sec 23.232
Iter 90: Train loss 2.559, It/sec 0.029, Tokens/sec 24.081
Iter 100: Train loss 2.480, It/sec 0.029, Tokens/sec 23.327
Iter 100: Saved adapter weights to adapters.npz.
Iter 110: Train loss 2.453, It/sec 0.028, Tokens/sec 20.542
Iter 120: Train loss 2.312, It/sec 0.029, Tokens/sec 38.124
Iter 130: Train loss 2.411, It/sec 0.028, Tokens/sec 35.122
Iter 140: Train loss 2.264, It/sec 0.014, Tokens/sec 17.7

Model(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers.0): TransformerBlock(
      (self_attn): Attention(
        (q_proj): Linear(input_dims=4096, output_dims=4096, bias=False)
        (k_proj): Linear(input_dims=4096, output_dims=1024, bias=False)
        (v_proj): Linear(input_dims=4096, output_dims=1024, bias=False)
        (o_proj): Linear(input_dims=4096, output_dims=4096, bias=False)
        (rope): RoPE(128, traditional=False)
      )
      (mlp): MLP(
        (gate_proj): Linear(input_dims=4096, output_dims=14336, bias=False)
        (down_proj): Linear(input_dims=14336, output_dims=4096, bias=False)
        (up_proj): Linear(input_dims=4096, output_dims=14336, bias=False)
      )
      (input_layernorm): RMSNorm(4096, eps=1e-05)
      (post_attention_layernorm): RMSNorm(4096, eps=1e-05)
    )
    (layers.1): TransformerBlock(
      (self_attn): Attention(
        (q_proj): Linear(input_dims=4096, output_dims=4096, bias=False)
        (k_proj): L

In [None]:
# Generate a sample completion using the adapters stored in args.adapter_file.
prompt="Escribe una entrada corta de diario por ROBELLO CHEBELO MAMELO cuando tenía 25 años"
args.adapter_file="examples-lora/adapters1.npz"
args.max_tokens=2048

# Re-pointing args.adapter_file does not by itself change the weights held by
# `model`: nothing visible in this notebook reloads them, so without an explicit
# load the generation would use whatever the previous cell left in memory.
# NOTE(review): presumably lora.generate does not read args.adapter_file itself — confirm.
if not Path(args.adapter_file).is_file():
    raise ValueError(f"Adapter file {args.adapter_file} missing.")
# strict=False mirrors how the adapter file is loaded in the training cell.
model.load_weights(args.adapter_file, strict=False)

lora.generate(model, prompt, tokenizer, args)
