# Transform MLX Models to GGUF Models with llama.cpp

In [9]:
import os
import subprocess
import sys

If you have not yet created the conda environment, the following cell creates it, but only if it does not already exist. Make sure to use this conda environment afterwards for the fine-tuning!

In [11]:
# Define environment details
conda_env_name = "llama_cpp"
python_version = "3.12"

# Step: create the Conda environment if it does not exist.
try:
    print(f"Checking if the Conda environment '{conda_env_name}' exists...")

    # check=True: a failing `conda env list` must not be mistaken for
    # "environment missing" (its stdout would simply be empty).
    env_list = subprocess.run(
        ["conda", "env", "list"], capture_output=True, text=True, check=True
    )

    # Collect the exact environment names from the first column of the listing.
    # A plain substring search over the whole output would also match other
    # environments whose name merely contains `conda_env_name`, or any
    # environment path that contains it, and would then wrongly skip creation.
    existing_envs = set()
    for line in env_list.stdout.splitlines():
        fields = line.split()
        if not fields or fields[0].startswith("#"):
            continue  # skip blank lines and the header comments
        existing_envs.add(fields[0])

    if conda_env_name not in existing_envs:
        print(f"Environment '{conda_env_name}' not found. Creating it with Python {python_version}...")
        subprocess.run(
            [
                "conda", "create", "--name", conda_env_name,
                f"python={python_version}", "jupyter", "-y"
            ],
            check=True
        )
        print(f"Conda environment '{conda_env_name}' created successfully.")
    else:
        print(f"Conda environment '{conda_env_name}' already exists.")
except Exception as e:
    # Report the problem in the cell output, then re-raise so the notebook stops here.
    print(f"Error setting up Conda environment: {e}")
    raise


Checking if the Conda environment 'llama_cpp' exists...
Conda environment 'llama_cpp' already exists.


If you have not yet cloned llama.cpp and installed its requirements, make sure to do so!

In [6]:
# Get the current script directory
script_dir = os.path.abspath('.')

# Path to the directory that holds the llama.cpp checkout
llama_dir = os.path.abspath('../llama_cpp')

# Make sure the target directory exists so os.chdir() below cannot fail
# with FileNotFoundError on a fresh checkout of this project.
os.makedirs(llama_dir, exist_ok=True)

try:
    # Change to the llama.cpp parent directory
    os.chdir(llama_dir)
    print(f"Changed directory to {os.getcwd()}")

    # Clone and install only when the checkout is missing, so that re-running
    # the cell (Restart & Run All) is cheap and does not fail on an existing clone.
    if not os.path.isdir("llama.cpp"):
        # Clone the repository
        subprocess.run(["git", "clone", "https://github.com/ggerganov/llama.cpp.git"], check=True)

        # Install requirements with the kernel's own interpreter so the packages
        # land in the environment this notebook is actually running in.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-r", "llama.cpp/requirements.txt"],
            check=True
        )

finally:
    # Change back to the original script directory, even if a step above failed
    os.chdir(script_dir)
    print(f"Changed back to {os.getcwd()}")

Changed directory to /Users/Mael/DigDemLab/Archive/digdemlab_finetuning_llama_ollama/llama_cpp
Changed back to /Users/Mael/DigDemLab/Archive/digdemlab_finetuning_llama_ollama/src


In [7]:
# Absolute path of the current working directory; for a notebook this is the
# directory the kernel was started in.
# (In a plain .py file, os.path.dirname(os.path.abspath(__file__)) would be used instead.)
script_dir = os.path.abspath(os.curdir)

# Save directory, expressed relative to the current directory: <cwd>/../data/testing
save_dir = os.path.join(script_dir, os.pardir, 'data', 'testing')

# Show the working directory the relative paths are resolved against.
print(os.getcwd())

/Users/Mael/DigDemLab/Archive/digdemlab_finetuning_llama_ollama/src


In [3]:
# Print the command-line help of the convert_hf_to_gguf.py script from the
# llama.cpp checkout to see which options (e.g. --outfile, --outtype) it accepts.
!python ../llama_cpp/llama.cpp/convert_hf_to_gguf.py -h

usage: convert_hf_to_gguf.py [-h] [--vocab-only] [--outfile OUTFILE]
                             [--outtype {f32,f16,bf16,q8_0,tq1_0,tq2_0,auto}]
                             [--bigendian] [--use-temp-file] [--no-lazy]
                             [--model-name MODEL_NAME] [--verbose]
                             [--split-max-tensors SPLIT_MAX_TENSORS]
                             [--split-max-size SPLIT_MAX_SIZE] [--dry-run]
                             [--no-tensor-first-split] [--metadata METADATA]
                             model

Convert a huggingface model to a GGML compatible file

positional arguments:
  model                 directory containing model file

options:
  -h, --help            show this help message and exit
  --vocab-only          extract only the vocab
  --outfile OUTFILE     path to write to; default: based on input. {ftype}
                        will be replaced by the outtype.
  --outtype {f32,f16,bf16,q8_0,tq1_0,tq2_0,auto}
                        outpu

In [7]:
# Convert the model directory ../adapters/testing/lora_fused_model into a GGUF
# file written to ../model/output_file.gguf, using the q8_0 output type.
!python ../llama_cpp/llama.cpp/convert_hf_to_gguf.py ../adapters/testing/lora_fused_model --outfile ../model/output_file.gguf --outtype q8_0

INFO:hf-to-gguf:Loading model: lora_fused_model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00004.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> Q8_0, shape = {4096, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.bfloat16 --> Q8_0, shape = {14336, 4096}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.bfloat16 --> Q8_0, shape = {4096, 14336}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.bfloat16 --> Q8_0, shape = {4096, 14336}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.bfloat16 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.bfloat16 --> Q8_0, s

In [9]:
!llama-gguf --mode w "../adapters/testing/lora_fused_model" --outtype f16

gguf_ex_write: wrote file '--mode;
