<a href="https://colab.research.google.com/github/ecubeproject/Awesome-LLMOps/blob/main/%E2%9A%A1_AutoQuant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title # ⚡ AutoQuant

# @markdown > 🗣️ [Large Language Model Course](https://github.com/mlabonne/llm-course)

# @markdown ❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne).

# @markdown **Usage:** Download the model by **running this cell** and then run the cells corresponding to your quantization methods of interest.

# @markdown To quantize a 7B or 8B model, GGUF only needs a T4 GPU, while the other methods require an L4 or A100 GPU.

# @markdown ---

# @markdown ## 🤗 Download model (required)
# @markdown `MODEL_ID` is the ID of the model to quantize on the Hugging Face hub.
MODEL_ID = "mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated" # @param {type:"string"}

# @markdown `USERNAME` is your username on Hugging Face.
USERNAME = "mlabonne" # @param {type:"string"}

# @markdown `HF_TOKEN` corresponds to the name of the secret that stores your [Hugging Face access token](https://huggingface.co/settings/tokens) in Colab.
HF_TOKEN = "HF_TOKEN" # @param {type:"string"}

# Last path component of MODEL_ID, i.e. the repository name without the owner
# prefix. The download step uses it as the local folder name.
MODEL_NAME = MODEL_ID.split('/')[-1]

# Install the Hub client, then pin numpy to 1.23.5 and upgrade transformers
!pip install -qqq huggingface_hub --progress-bar off
!pip install -qqq -U numpy==1.23.5 transformers --progress-bar off

from huggingface_hub import create_repo, HfApi, ModelCard, snapshot_download
from google.colab import userdata, runtime
import shutil
import fnmatch
import os

# HF_TOKEN holds the *name* of a Colab secret; resolve it to the actual token
hf_token = userdata.get(HF_TOKEN)
api = HfApi()

# Fetch the checkpoint from the Hub into a local folder named after the model,
# skipping the file types matched by ignore_patterns
model_path = snapshot_download(
    MODEL_ID,
    local_dir=MODEL_NAME,
    ignore_patterns=["*.msgpack", "*.h5", "*.ot", "*.onnx"],
    token=hf_token,
)
print(f"Model downloaded to: {model_path}")

def upload_quant(base_model_id, quantized_model_name, quantization_type, save_folder, allow_patterns=None, bpw=None):
    """
    Create a model card (if necessary), upload the quantized model to Hugging Face.

    The target repository lives under the notebook-level ``USERNAME`` and is
    created if it does not exist. When no model card can be loaded from the
    target repository, the base model's card is loaded, tagged with
    ``autoquant`` and the quantization type, and written to
    ``{save_folder}/README.md`` so that ``upload_folder`` can pick it up
    (subject to ``allow_patterns``).

    :param base_model_id: The ID of the base model
    :param quantized_model_name: The name for the quantized model
    :param quantization_type: The type of quantization (e.g., 'gguf', 'gptq', 'awq', 'hqq', 'exl2')
    :param save_folder: The folder where the quantized model is saved
    :param allow_patterns: List of file patterns to upload (default is None, which uploads all files)
    :param bpw: Bits per weight (used for EXL2 quantization; required when quantization_type is 'exl2')
    :raises ValueError: If quantization_type is 'exl2' and bpw is not provided
    """
    # Initialize Hugging Face API
    api = HfApi()

    # Define the repository ID for the quantized model
    if quantization_type == 'exl2':
        # Fail early with a clear message instead of a TypeError from the
        # ':.1f' format spec when bpw is None
        if bpw is None:
            raise ValueError("bpw is required when quantization_type is 'exl2'")
        repo_id = f"{USERNAME}/{quantized_model_name}-{bpw:.1f}bpw-exl2"
    else:
        repo_id = f"{USERNAME}/{quantized_model_name}"

    # Try to load existing model card. The token is passed explicitly so that
    # cards of private repositories can be read as well.
    try:
        ModelCard.load(repo_id, token=hf_token)
        print(f"Model card already exists for {repo_id}. Skipping model card creation.")
    except Exception:
        # Any failure to load is treated as "no card yet" (best effort):
        # derive a new card from the base model's card
        card = ModelCard.load(base_model_id, token=hf_token)
        if card.data.tags is None:
            card.data.tags = []
        # Avoid duplicate tags when the base card already carries them
        for tag in ("autoquant", quantization_type):
            if tag not in card.data.tags:
                card.data.tags.append(tag)
        card.save(f'{save_folder}/README.md')
        print(f"Created new model card for {repo_id}")

    # Create or update the repository
    create_repo(
        repo_id=repo_id,
        repo_type="model",
        exist_ok=True,
        token=hf_token
    )

    # Upload the model
    api.upload_folder(
        folder_path=save_folder,
        repo_id=repo_id,
        allow_patterns=allow_patterns,
        token=hf_token
    )

    print(f"Uploaded quantized model to {repo_id}")

In [None]:
# @title ## 🧩 GGUF

# @markdown Recommended methods: `q2_k`, `q3_k_m`, `q4_k_m`, `q5_k_m`, `q6_k`, `q8_0`

# @markdown Learn more about GGUF and quantization methods in [this article](https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html).

QUANTIZATION_FORMAT = "q2_k, q3_k_m, q4_k_m, q5_k_m, q6_k, q8_0" # @param {type:"string"}
QUANTIZATION_METHODS = QUANTIZATION_FORMAT.replace(" ", "").split(",")
gguf_repo_id = f"{USERNAME}/{MODEL_NAME}-GGUF"

# # Install llama.cpp
if not os.path.exists("llama.cpp"):
    !git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp && make
    !pip install -r llama.cpp/requirements.txt

# Convert to BF16
out = f"{MODEL_NAME}/{MODEL_NAME.lower()}.bf16.gguf"
if os.path.exists(out):
    print(f"File {out} already exists. Skipping conversion.")
else:
    !python llama.cpp/convert_hf_to_gguf.py {MODEL_NAME} --outtype bf16 --outfile {out}

# Quantize the model for each method in the QUANTIZATION_METHODS list
for method in QUANTIZATION_METHODS:
    qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf"
    !./llama.cpp/llama-quantize {out} {qtype} {method}

    # Upload quant
    upload_quant(
        base_model_id=MODEL_ID,
        quantized_model_name=f"{MODEL_NAME}-GGUF",
        quantization_type="gguf",
        save_folder=MODEL_NAME,
        allow_patterns=["*.gguf", "*.md"]
    )

In [None]:
# @title ## 🧠 GPTQ

# @markdown Learn more about the GPTQ algorithm in [this article](https://mlabonne.github.io/blog/posts/4_bit_Quantization_with_GPTQ.html).

!pip install auto-gptq optimum accelerate

from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

BITS = 4 # @param {type:"integer"}
GROUP_SIZE = 128 # @param {type:"integer"}
DAMP_PERCENT = 0.1 # @param {type:"number"}

# Quantize model
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
quantization_config = GPTQConfig(bits=BITS, dataset="c4", tokenizer=tokenizer, damp_percent=DAMP_PERCENT)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", quantization_config=quantization_config, low_cpu_mem_usage=True)

# Save model and tokenizer
save_folder = MODEL_ID + "-GPTQ"
model.save_pretrained(save_folder, use_safetensors=True)
tokenizer.save_pretrained(save_folder)

# Upload quant
upload_model(
    base_model_id=MODEL_ID,
    quantized_model_name=f"{MODEL_NAME}-GPTQ",
    quantization_type="gptq",
    save_folder=save_folder
)

In [None]:
# @title ## 🦙 ExLlamaV2

# @markdown Learn more about ExLlamaV2 in [this article](https://mlabonne.github.io/blog/posts/ExLlamaV2_The_Fastest_Library_to_Run%C2%A0LLMs.html).

BPW = 4.5 # @param {type:"number"}

# Install ExLLamaV2
!git clone https://github.com/turboderp/exllamav2
!pip install -e exllamav2
!cp -r {MODEL_NAME} base_model
!rm base_model/*.bin

# Download dataset
!wget https://huggingface.co/datasets/wikitext/resolve/9a9e482b5987f9d25b3a9b2883fc6cc9fd8071b3/wikitext-103-v1/wikitext-test.parquet

# Quantize model
save_folder = MODEL_ID + "-EXL2"
!mkdir {save_folder}
!python exllamav2/convert.py \
    -i base_model \
    -o {save_folder} \
    -c wikitext-test.parquet \
    -b {BPW}

# Copy files
!rm -rf quant/out_tensor
!rsync -av --exclude='*.safetensors' --exclude='.*' base_model {save_folder}

# Upload quant
upload_quant(
    base_model_id=MODEL_ID,
    quantized_model_name=MODEL_NAME,
    quantization_type="exl2",
    save_folder=save_folder,
    bpw=BPW
)

In [None]:
# @title ## ⚖️ AWQ

# @markdown See the [AutoAWQ repository](https://github.com/casper-hansen/AutoAWQ) for more information.

# Install AutoAWQ
!pip install -qqq -U https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.4/autoawq-0.2.4+cu118-cp310-cp310-linux_x86_64.whl
!pip install zstandard

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

BITS = 4 # @param {type: "integer"}
GROUP_SIZE = 128 # @param {type: "integer"}
VERSION = "GEMM" # @param {type: "string"}
ZERO_POINT = True # @param {type: "boolean"}

quant_config = {
    "w_bit": BITS,
    "q_group_size": GROUP_SIZE,
    "version": VERSION,
    "zero_point": ZERO_POINT
}
save_folder = MODEL_NAME + "-AWQ"

# Quantize model
model = AutoAWQForCausalLM.from_pretrained(MODEL_NAME, safetensors=True, low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model.quantize(tokenizer, quant_config=quant_config)

# Save model and tokenizer
model.save_quantized(save_folder)
tokenizer.save_pretrained(save_folder)

# Upload quant
upload_quant(
    base_model_id=MODEL_ID,
    quantized_model_name=f"{MODEL_NAME}-AWQ",
    quantization_type="awq",
    save_folder=save_folder
)

In [None]:
# @title ## 🐘 HQQ

# @markdown See the official [HQQ repository](https://github.com/mobiusml/hqq) for more information.

# NOTE: no visible cell installs hqq, so the imports below fail on a fresh
# runtime. Uncomment the following lines once to set it up.
# !git clone https://github.com/mobiusml/hqq.git
# !pip install -e hqq
# !python hqq/kernels/setup_cuda.py install
# !pip install flash-attn --no-build-isolation
# !pip install transformers --upgrade
# !num_threads=8; OMP_NUM_THREADS=$num_threads CUDA_VISIBLE_DEVICES=0

import torch
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
# Explicit import instead of `import *`: only BaseQuantizeConfig is used here
from hqq.core.quantize import BaseQuantizeConfig

BITS = 2 # @param {type:"integer"}
GROUP_SIZE = 128 # @param {type:"integer"}

# Quant config
quant_config = BaseQuantizeConfig(
    nbits=BITS,
    group_size=GROUP_SIZE
)

# Quantize model
model = HQQModelForCausalLM.from_pretrained(
    MODEL_ID,
    cache_dir=".",
    attn_implementation="flash_attention_2"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model.quantize_model(quant_config=quant_config, device='cuda')

# Save model and tokenizer. MODEL_NAME (no owner prefix) keeps the output in a
# single flat folder, the same way the AWQ cell names its folder.
save_folder = MODEL_NAME + "-HQQ"
model.save_quantized(save_folder)
tokenizer.save_pretrained(save_folder)

# Upload quant
upload_quant(
    base_model_id=MODEL_ID,
    quantized_model_name=f"{MODEL_NAME}-{BITS}bit-HQQ",
    quantization_type="hqq",
    save_folder=save_folder
)