In [None]:
# Cell 1: Configuration and Package Installation
"""
This cell sets up the basic configuration and installs required packages.
MODEL_ID: The Hugging Face model ID to convert
USERNAME: Your Hugging Face username
HF_TOKEN: Your Hugging Face API token
"""
MODEL_ID = "model-id-here" # the mdodel you want to download and convert 
USERNAME = "your-username"  #your hugging face username 
HF_TOKEN = "your-token-here" #your token here
MODEL_NAME = MODEL_ID.split('/')[-1]

# Install required packages silently
!pip install huggingface_hub --quiet --progress-bar off
!pip install --upgrade numpy==1.23.5 transformers --quiet --progress-bar off

import os
import shutil
import fnmatch
from huggingface_hub import create_repo, HfApi, ModelCard, snapshot_download

In [None]:
# Cell 2: Initialize API and Download Model
"""
Create the Hugging Face API client and fetch a snapshot of the base model
into a local folder named after the model. Files matching the ignore
patterns below are not downloaded, which saves space and time.
"""
api = HfApi()
hf_token = HF_TOKEN

model_path = snapshot_download(
    repo_id=MODEL_ID,
    local_dir=MODEL_NAME,
    ignore_patterns=[
        "*.msgpack",
        "*.h5",
        "*.ot",
        "*.onnx",
    ],
    token=hf_token,
)
print(f"Model downloaded to: {model_path}")

In [3]:
# Cell 3: Define Upload Function
def upload_quant(base_model_id, quantised_model_name, quantisation_type, save_folder, allow_patterns=None, bpw=None):
    """
    Create a model card (if necessary) and upload the quantised model to Hugging Face.

    Relies on the notebook-level names USERNAME, hf_token and api.

    Parameters:
    base_model_id: The ID of the base model
    quantised_model_name: The name for the quantised model
    quantisation_type: The type of quantisation (e.g., 'gguf', 'gptq', 'awq')
    save_folder: The folder where the quantised model is saved
    allow_patterns: A list of file patterns to upload
    bpw: Bits per weight (used for EXL2 quantisation)

    Raises:
    ValueError: If quantisation_type is 'exl2' and bpw is not given.
    """
    if quantisation_type == 'exl2':
        # The repo name embeds the bits-per-weight value, so it is mandatory here.
        # Fail early with a clear message instead of a format-spec TypeError.
        if bpw is None:
            raise ValueError("bpw must be provided when quantisation_type is 'exl2'")
        repo_name = f"{quantised_model_name}-{bpw:.1f}bpw-exl2"
    else:
        repo_name = quantised_model_name
    repo_id = f"{USERNAME}/{repo_name}"

    try:
        # Only used as an existence probe; the loaded card itself is not needed.
        # The token is passed so private target repos can be read as well.
        ModelCard.load(repo_id, token=hf_token)
        print(f"Model card already exists for {repo_id}. Skipping model card creation.")
    except Exception:
        # Best effort: any failure to load the target card is treated as
        # "no card yet" and a new one is derived from the base model's card.
        card = ModelCard.load(base_model_id, token=hf_token)
        tags = list(card.data.tags or [])
        for tag in ("autoquant", quantisation_type):
            # Avoid duplicate tags when the base card already carries them.
            if tag not in tags:
                tags.append(tag)
        card.data.tags = tags
        card.save(f'{save_folder}/README.md')
        print(f"Created new model card for {repo_id}")

    create_repo(
        repo_id=repo_id,
        repo_type="model",
        exist_ok=True,
        token=hf_token
    )

    api.upload_folder(
        folder_path=save_folder,
        repo_id=repo_id,
        allow_patterns=allow_patterns,
        token=hf_token
    )

    print(f"Uploaded quantised model to {repo_id}")

In [None]:
# Cell 4: Setup Quantization Parameters and Install llama.cpp
"""
Set up quantization formats and build llama.cpp with CUDA support
"""
QUANTISATION_FORMAT = "q2_k, q3_k_m, q4_k_m, q5_k_m, q6_k, q8_0"
QUANTISATION_METHODS = QUANTISATION_FORMAT.replace(" ", "").split(",")
gguf_repo_id = f"{USERNAME}/{MODEL_NAME}-GGUF"

# Install and build llama.cpp with CUDA support
if not os.path.exists("llama.cpp"):
    !git clone https://github.com/ggerganov/llama.cpp
    os.chdir("llama.cpp")
    !cmake -B build -DGGML_CUDA=ON
    !cmake --build build --config Release
    !pip install -r requirements.txt
    os.chdir("..")
else:
    os.chdir("llama.cpp")
    # Update build
    !cmake -B build -DGGML_CUDA=ON
    !cmake --build build --config Release
    os.chdir("..")

In [None]:
# Cell 5: Convert Model to BF16
"""
Convert the model to BF16 format as base for quantization
"""
out = f"{MODEL_NAME}/{MODEL_NAME.lower()}.bf16.gguf"
if os.path.exists(out):
    print(f"File {out} already exists. Skipping conversion.")
else:
    if os.getcwd().endswith("llama.cpp"):
        os.chdir("..")
    !python llama.cpp/convert_hf_to_gguf.py {MODEL_NAME} --outfile {out} --outtype bf16

In [None]:
# Cell 6: Quantize Model
"""
Create different quantized versions of the model
"""
for method in QUANTISATION_METHODS:
    qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf"
    if not os.getcwd().endswith("llama.cpp"):
        os.chdir("llama.cpp")
    !./build/bin/llama-quantize ../{out} ../{qtype} {method}

In [None]:
# Cell 7: Upload Quantized Versions
"""
Push every GGUF file (and the model card) from the model folder to the
Hugging Face repository named "<MODEL_NAME>-GGUF".
"""
# The model folder path is relative to the parent of llama.cpp, so step out
# of llama.cpp first if the kernel is currently inside it.
if os.getcwd().endswith("llama.cpp"):
    os.chdir("..")

upload_quant(
    save_folder=MODEL_NAME,
    allow_patterns=["*.gguf", "*.md"],
    quantisation_type="gguf",
    quantised_model_name=f"{MODEL_NAME}-GGUF",
    base_model_id=MODEL_ID,
)

print("All tasks completed.")