### Import dependencies

In [1]:
import os
from dotenv import load_dotenv
import ipywidgets as widgets
from IPython.display import display
from huggingface_hub import HfApi, create_repo

### Set variables such as the Hugging Face token

In [2]:
# Pull key/value pairs from a local .env file into the process environment.
load_dotenv()
# Hugging Face access token; None when HF_TOKEN is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

### Specify source model from HuggingFace

In [3]:
# Editable text box holding the Hugging Face repo ID of the model to quantize.
# Later cells read the current text through `model_id.value`.
model_id = widgets.Text(
    description='Model ID',
    value='CorticalStack/travel-mistral-7B-16b-base',
    disabled=False,
)
display(model_id)

Text(value='CorticalStack/travel-mistral-7B-16b-base', description='Model ID')

### Download the source model

In [4]:
model_name = str(model_id.value).split('/')
if not os.path.isdir(model_name[1]):
    !git clone https://huggingface.co/{model_id.value}

### Download and build llama.cpp to do the quantization

In [5]:
# Install llama.cpp
if not os.path.isdir("llama.cpp"):
    !git clone https://github.com/ggerganov/llama.cpp
    !cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make
    !pip install -r llama.cpp/requirements.txt

### Set target model name prefix

In [7]:

# Text box for the file-name prefix of every quantized artifact
# (used as "<prefix>.<METHOD>.gguf" by the quantization cell).
model_name_prefix = widgets.Text(
    value='travel-mistral-7B',
    description='Model name prefix',
    # Let the label take its natural width so the full text stays visible.
    style={'description_width': 'initial'},
    disabled=False
)
display(model_name_prefix)

Text(value='travel-mistral-7B', description='Model name preix')

### Perform the GGUF quantization

In [13]:
QUANTIZATION_FORMAT = "q4_k_m" # List of multiple formats seperated by comma
QUANTIZATION_METHODS = QUANTIZATION_FORMAT.replace(" ", "").split(",")

# Convert to fp16
fp16 = f"{model_name_prefix.value}.FP16.bin"
!python llama.cpp/convert.py {model_name[1]} --outtype f16 --outfile {fp16}

# Quantize the model for each method in the QUANTIZATION_METHODS list
for method in QUANTIZATION_METHODS:
    quant_target_name = f"{model_name_prefix.value}.{method.upper()}.gguf"
    !./llama.cpp/quantize {fp16} {quant_target_name} {method}



Loading model file travel-mistral-7B-16b-base/model-00001-of-00003.safetensors
Loading model file travel-mistral-7B-16b-base/model-00001-of-00003.safetensors
Loading model file travel-mistral-7B-16b-base/model-00002-of-00003.safetensors
Loading model file travel-mistral-7B-16b-base/model-00003-of-00003.safetensors
params = Params(n_vocab=32000, n_embd=4096, n_layer=32, n_ctx=32768, n_ff=14336, n_head=32, n_head_kv=8, n_experts=None, n_experts_used=None, f_norm_eps=1e-05, rope_scaling_type=None, f_rope_freq_base=10000.0, f_rope_scale=None, n_orig_ctx=None, rope_finetuned=None, ftype=<GGMLFileType.MostlyF16: 1>, path_model=PosixPath('travel-mistral-7B-16b-base'))
Found vocab files: {'tokenizer.model': PosixPath('travel-mistral-7B-16b-base/tokenizer.model'), 'vocab.json': None, 'tokenizer.json': PosixPath('travel-mistral-7B-16b-base/tokenizer.json')}
Loading vocab file 'travel-mistral-7B-16b-base/tokenizer.model', type 'spm'
Vocab info: <SentencePieceVocab with 32000 base tokens and 0 add

### Push to HuggingFace Hub

In [14]:

api = HfApi()

# Upload every GGUF file produced by the quantization cell, not only the one
# left in the loop variable. Each file goes to its own model repository named
# after the file.
for method in QUANTIZATION_METHODS:
    quant_target_name = f"{model_name_prefix.value}.{method.upper()}.gguf"

    # Use a dedicated name for the target repo: rebinding `model_id` would
    # replace the source-model widget and break re-running the download cell.
    repo_id = f"CorticalStack/{quant_target_name}"

    api.create_repo(repo_id, exist_ok=True, repo_type="model", token=HF_TOKEN)
    # Pass the token explicitly so the upload authenticates the same way as
    # the repository creation instead of relying on a cached login.
    commit_info = api.upload_file(
        path_or_fileobj=quant_target_name,
        path_in_repo=quant_target_name,
        repo_id=repo_id,
        token=HF_TOKEN,
    )
    display(commit_info)

travel-mistral-7B.Q4_K_M.gguf:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CorticalStack/travel-mistral-7B.Q4_K_M.gguf/commit/bb0c76ecffae389fd4b19dccba4e56611dd99955', commit_message='Upload travel-mistral-7B.Q4_K_M.gguf with huggingface_hub', commit_description='', oid='bb0c76ecffae389fd4b19dccba4e56611dd99955', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# @title # 🏛️ AWQ
# @markdown ### ✨ Quantization parameters

Q_GROUP_SIZE = 128 # @param {type:"integer"}
ZERO_POINT = True # @param {text:"boolean"}
W_BIT = 4 # @param {type:"integer"}
VERSION = "GEMM" # @param {type:"string"}
SAFETENSORS = True # @param {text:"boolean"}

# Install AutoAWQ
!git clone https://github.com/casper-hansen/AutoAWQ
%cd AutoAWQ
!pip install -e .
!pip install git+https://github.com/huggingface/transformers
!pip install zstandard

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer


quant_path = MODEL_NAME + "-awq"
quant_config = { "zero_point": ZERO_POINT, "q_group_size": Q_GROUP_SIZE, "w_bit": W_BIT, "version": VERSION }

# Load model
PATH = "/content/" + MODEL_NAME
model = AutoAWQForCausalLM.from_pretrained(PATH, safetensors=SAFETENSORS)
tokenizer = AutoTokenizer.from_pretrained(PATH, trust_remote_code=True)

# Quantize
model.quantize(tokenizer, quant_config=quant_config)

# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

# Defined in the secrets tab in Google Colab
hf_token = userdata.get(token)
api = HfApi()

# Create empty repo
create_repo(
    repo_id = f"{username}/{MODEL_NAME}-AWQ",
    repo_type="model",
    exist_ok=True,
    token=hf_token
)

# Upload awq files
api.upload_folder(
    folder_path=quant_path,
    repo_id=f"{username}/{MODEL_NAME}-AWQ",
    token=hf_token
)

In [None]:
# @title # 🔬 EXL2
# @markdown ### ✨ Quantization parameters

BPW = 5.0 # @param {type:"number"}

# Install ExLLamaV2
!git clone https://github.com/turboderp/exllamav2
!pip install -e exllamav2

!mv {MODEL_NAME} base_model
!rm base_mode/*.bin

# Download dataset
!wget https://huggingface.co/datasets/wikitext/resolve/9a9e482b5987f9d25b3a9b2883fc6cc9fd8071b3/wikitext-103-v1/wikitext-test.parquet

# Quantize model
!mkdir quant
!python exllamav2/convert.py \
    -i base_model \
    -o quant \
    -c wikitext-test.parquet \
    -b {BPW}

# Copy files
!rm -rf quant/out_tensor
!rsync -av --exclude='*.safetensors' --exclude='.*' ./base_model/ ./quant/

# Defined in the secrets tab in Google Colab
hf_token = userdata.get(token)
api = HfApi()

# Create empty repo
create_repo(
    repo_id = f"{username}/{MODEL_NAME}-{BPW:.1f}bpw-exl2",
    repo_type="model",
    exist_ok=True,
    token=hf_token
)

# Upload exl2 files
api.upload_folder(
    folder_path=quant,
    repo_id=f"{username}/{MODEL_NAME}-{BPW:.1f}bpw-exl2",
    token=hf_token
)

In [None]:
# @title # 📝 GPTQ
# @markdown ### ✨ Quantization parameters

BITS = 4 # @param {type:"integer"}
GROUP_SIZE = 128 # @param {type:"integer"}
DAMP_PERCENT = 0.01 # @param {type:"number"}

!BUILD_CUDA_EXT=0 pip install -q auto-gptq transformers
import random
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset
import torch
from transformers import AutoTokenizer
out_dir = MODEL_ID + "-GPTQ"

# Load quantize config, model and tokenizer
quantize_config = BaseQuantizeConfig(
    bits=BITS,
    group_size=GROUP_SIZE,
    damp_percent=DAMP_PERCENT,
    desc_act=False,
)
PATH = "/content/" + MODEL_NAME
model = AutoGPTQForCausalLM.from_pretrained(PATH, quantize_config)
tokenizer = AutoTokenizer.from_pretrained(PATH)

# Load data and tokenize examples
n_samples = 1024
data = load_dataset("allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split=f"train[:{n_samples*5}]")
tokenized_data = tokenizer("\n\n".join(data['text']), return_tensors='pt')

# Format tokenized examples
examples_ids = []
for _ in range(n_samples):
    i = random.randint(0, tokenized_data.input_ids.shape[1] - tokenizer.model_max_length - 1)
    j = i + tokenizer.model_max_length
    input_ids = tokenized_data.input_ids[:, i:j]
    attention_mask = torch.ones_like(input_ids)
    examples_ids.append({'input_ids': input_ids, 'attention_mask': attention_mask})

# Quantize with GPTQ
model.quantize(
    examples_ids,
    batch_size=1,
    use_triton=True,
)

# Save model and tokenizer
model.save_quantized(out_dir, use_safetensors=True)
tokenizer.save_pretrained(out_dir)

# Defined in the secrets tab in Google Colab
hf_token = userdata.get(token)
api = HfApi()

# Create empty repo
create_repo(
    repo_id = f"{username}/{MODEL_NAME}-GPTQ",
    repo_type="model",
    exist_ok=True,
    token=hf_token
)

# Upload gptq files
api.upload_folder(
    folder_path=out_dir,
    repo_id=f"{username}/{MODEL_NAME}-GPTQ",
    token=hf_token
)
