# Finetuning with mlx-lm
### Notebook to inspect a model, build a LoRA config YAML, and convert datasets

In [2]:
# Set defaults
# Fill in your account name; it is substituted into root_path below.
user = ""
# f-string so that {user} is actually interpolated (it was a plain string before,
# which left the literal text "{user}" in every derived path).
root_path = f"/Users/{user}/mlx"
model_path = f"{root_path}/Storage/my_model"
data_path = f"{root_path}/traindata"

# model format, i.e. the shard file name: model-00001-of-00005.safetensors
model_format = "model-00001-of-00005.safetensors"

# Full path to the shard named above
model = f"{model_path}/{model_format}"

### Inspection

In [3]:
import json
import math

import mlx.core as mx


def prod(shape):
    """Multiply together every entry of ``shape`` and return the product.

    An empty ``shape`` yields 1 (the multiplicative identity).
    """
    running = 1
    for size in shape:
        running *= size
    return running


def _layer_index(tensor_name):
    """Return the integer that follows a ``layers`` component in a dotted tensor name.

    For example ``model.layers.12.mlp.down_proj.weight`` gives ``12``.
    Returns ``None`` when the name has no ``layers.<number>`` component.
    """
    parts = tensor_name.split(".")
    for position, part in enumerate(parts[:-1]):
        if part == "layers" and parts[position + 1].isdecimal():
            return int(parts[position + 1])
    return None


def inspect_model(model_files):
    """Print the safetensors header of each file, then totals across all files.

    Only the JSON header of each file is read; tensor data is never loaded.

    Args:
        model_files: iterable of paths to ``.safetensors`` files (typically the
            shards of one checkpoint).

    Prints:
        Every header entry of every file, followed by
        * ``Total layers``: the number of distinct ``layers.<N>`` indices seen
          in tensor names across all files (a layer whose tensors are spread
          over several shards is counted once), and
        * ``Total parameters``: the sum over all tensors of the product of the
          ``shape`` entries, i.e. the number of stored elements.
          NOTE(review): for packed/quantized tensors this is the stored element
          count, which presumably differs from the logical parameter count.

    Raises:
        OSError: if a file cannot be opened.
        json.JSONDecodeError: if a header is not valid JSON.
    """
    layer_indices = set()
    total_params = 0

    for model_file in model_files:
        with open(model_file, "rb") as f:
            # The file starts with an 8-byte little-endian header length,
            # followed by that many bytes of JSON.
            header_len = int.from_bytes(f.read(8), "little")
            header = json.loads(f.read(header_len))

        for name, info in header.items():
            # Entries without a "shape" (e.g. __metadata__) are not tensors.
            if not (isinstance(info, dict) and "shape" in info):
                continue

            # Plain Python ints: arbitrary precision, so the running total
            # cannot overflow the way a fixed-width array dtype can.
            total_params += math.prod(info["shape"])

            index = _layer_index(name)
            if index is not None:
                layer_indices.add(index)

        print("Header information:")
        for k, v in header.items():
            print(f"{k}: {v}")
        print()

    print("Total across all files:")
    print(f"Total layers: {len(layer_indices)}")
    print(f"Total parameters: {total_params}")


# Usage example: build the paths of shards 1..5 of the checkpoint under
# model_path (zero-padded to five digits) and inspect them together.
model_files = [
    f"{model_path}/model-{shard:05d}-of-00005.safetensors" for shard in range(1, 6)
]

inspect_model(model_files)

Header information:
__metadata__: {'format': 'mlx'}
model.embed_tokens.weight: {'data_offsets': [4815310848, 5208526848], 'dtype': 'F16', 'shape': [32000, 6144]}
model.layers.0.input_layernorm.weight: {'data_offsets': [4271456256, 4271468544], 'dtype': 'F16', 'shape': [6144]}
model.layers.0.mlp.down_proj.biases: {'data_offsets': [4269772800, 4271345664], 'dtype': 'F16', 'shape': [6144, 128]}
model.layers.0.mlp.down_proj.scales: {'data_offsets': [4492357632, 4493930496], 'dtype': 'F16', 'shape': [6144, 128]}
model.layers.0.mlp.down_proj.weight: {'data_offsets': [4391694336, 4492357632], 'dtype': 'U32', 'shape': [6144, 4096]}
model.layers.0.mlp.gate_proj.biases: {'data_offsets': [2494402560, 2495975424], 'dtype': 'F16', 'shape': [16384, 48]}
model.layers.0.mlp.gate_proj.scales: {'data_offsets': [4008370176, 4009943040], 'dtype': 'F16', 'shape': [16384, 48]}
model.layers.0.mlp.gate_proj.weight: {'data_offsets': [3267244032, 3367907328], 'dtype': 'U32', 'shape': [16384, 1536]}
model.layers

## Convert a system/user/assistant chat dataset to the chat format mlx-lm requires, then split it into train, test, and valid JSONL files

In [1]:
import json
import os
import random

# Fixed seed so the shuffle (and therefore the train/valid/test split) is the
# same on every run.
SPLIT_SEED = 42
OUTPUT_DIR = "./traindata"


def write_jsonl(path, items):
    """Write ``items`` to ``path`` as JSON Lines: one JSON object per line."""
    with open(path, "w", encoding="utf-8") as file:
        for item in items:
            file.write(json.dumps(item) + "\n")


# Read the system.jsonl file (one JSON object per line; blank lines are skipped
# so a trailing empty line does not crash json.loads)
with open("./data/system.jsonl", "r", encoding="utf-8") as file:
    data = [json.loads(line) for line in file if line.strip()]

# Convert the format
converted_data = []
for item in data:
    # The first conversation entry is always emitted with the "system" role,
    # regardless of its "from" field.
    # NOTE(review): assumes every record starts with the system prompt - confirm.
    messages = [{"role": "system", "content": item["conversations"][0]["value"]}]
    for message in item["conversations"][1:]:
        # "human" becomes "user"; every other sender becomes "assistant".
        role = "user" if message["from"] == "human" else "assistant"
        messages.append({"role": role, "content": message["value"]})
    converted_item = {"messages": messages}
    converted_data.append(converted_item)

# Shuffle the data with a private, seeded generator (leaves the global
# random state untouched)
random.Random(SPLIT_SEED).shuffle(converted_data)

# Calculate the split sizes: 80% train, 10% valid, remainder test
total_size = len(converted_data)
train_size = int(total_size * 0.8)
valid_size = int(total_size * 0.1)
test_size = total_size - train_size - valid_size

# Split the data into train, test, and valid sets
train_data = converted_data[:train_size]
valid_data = converted_data[train_size : train_size + valid_size]
test_data = converted_data[train_size + valid_size :]

# Make sure the output directory exists before writing into it
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Save the three splits as train.jsonl, test.jsonl, and valid.jsonl
write_jsonl(f"{OUTPUT_DIR}/train.jsonl", train_data)
write_jsonl(f"{OUTPUT_DIR}/test.jsonl", test_data)
write_jsonl(f"{OUTPUT_DIR}/valid.jsonl", valid_data)