### Import dependencies

In [1]:
import os
import subprocess

import ipywidgets as widgets
from dotenv import load_dotenv
from huggingface_hub import HfApi, create_repo
from IPython.display import display

### Set variables like HuggingFace token

In [2]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Hugging Face access token; None when HF_TOKEN is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

### Specify HuggingFace Parameters

In [3]:
# Source repo to quantise and the destination repo for the quantised copy.
# Both are editable text fields; later cells read them through `.value`.
model_id = widgets.Text(
    description='Model ID',
    value='CorticalStack/mistral-7b-alpaca-sft',
    disabled=False,
)
new_model_id = widgets.Text(
    description='New model ID',
    value='CorticalStack/mistral-7b-alpaca-gptq',
    disabled=False,
)

# Apply the same label-width style to each field and render them in order.
for _field in (model_id, new_model_id):
    _field.style.description_width = 'initial'
    display(_field)

Text(value='CorticalStack/mistral-7b-alpaca-sft', description='Model ID', style=TextStyle(description_width='i…

Text(value='CorticalStack/mistral-7b-alpaca-gptq', description='New model ID', style=TextStyle(description_wid…

### GPTQ quant parameters
The `GPTQ dataset id` field below accepts the calibration dataset ids used in the original GPTQ paper: wikitext2, c4, c4-new, ptb, ptb-new

In [5]:
# Quantisation settings, editable in the notebook UI; the quantisation cell
# reads each one through `.value`.
seq_length = widgets.IntText(description='Sequence length', value=2048, disabled=False)
bits = widgets.IntText(description='Bits', value=4, disabled=False)
group_size = widgets.IntText(description='Group size', value=128, disabled=False)
gptq_dataset_id = widgets.Text(description='GPTQ dataset id', value='wikitext2', disabled=False)

# Apply the same label-width style to every field and render them in
# declaration order.
for _field in (seq_length, bits, group_size, gptq_dataset_id):
    _field.style.description_width = 'initial'
    display(_field)

IntText(value=2048, description='Sequence length', style=DescriptionStyle(description_width='initial'))

IntText(value=4, description='Bits', style=DescriptionStyle(description_width='initial'))

IntText(value=128, description='Group size', style=DescriptionStyle(description_width='initial'))

Text(value='wikitext2', description='GPTQ dataset id', style=TextStyle(description_width='initial'))

### Download the source model

In [6]:
# Split "<namespace>/<repo>" into its parts. Later cells use model_name[1]
# as the local checkout directory, so the list shape is kept as-is.
model_name = str(model_id.value).split('/')
if len(model_name) != 2 or not all(model_name):
    # Fail with a clear message instead of an IndexError on model_name[1].
    raise ValueError(
        f"Model ID must look like '<namespace>/<repo>', got {model_id.value!r}"
    )

# Clone only when the checkout directory is not already present.
if not os.path.isdir(model_name[1]):
    # An argument list is used instead of a `!git clone ...` shell line so the
    # widget text is never parsed by a shell; check=True raises if git exits
    # with a non-zero status instead of letting later cells run without a model.
    subprocess.run(
        ["git", "clone", f"https://huggingface.co/{model_id.value}"],
        check=True,
    )

### Do the GPTQ quantisation

In [7]:
import json

import torch
from optimum.gptq import GPTQQuantizer
from transformers import AutoModelForCausalLM, AutoTokenizer

# Resolve both local directories up front so a malformed "New model ID"
# fails here rather than after the quantisation has already run.
source_dir = model_name[1]
new_model_name = str(new_model_id.value).split('/')
output_dir = new_model_name[1]

# Configure the quantizer from the widget values chosen above.
quantizer = GPTQQuantizer(
    bits=bits.value,
    group_size=group_size.value,
    dataset=gptq_dataset_id.value,
    model_seqlen=seq_length.value,
)
quantizer.quant_method = "gptq"

# Slow tokenizer is used for quantisation (bug with fast tokenizer).
tokenizer = AutoTokenizer.from_pretrained(source_dir, use_fast=False)

# Note: the CUDA device is set explicitly and low_cpu_mem_usage=True was removed.
# The model is loaded in fp16 on purpose.
model = AutoModelForCausalLM.from_pretrained(
    source_dir,
    device_map="cuda:0",
    torch_dtype=torch.float16,
)

# Quantize the model
quantized_model = quantizer.quantize_model(model, tokenizer)

# Save the quantized model to disk
quantized_model.save_pretrained(output_dir, safe_serialization=True)

# Load a fresh, fast tokenizer and save it next to the model weights.
# The load and the save are separate statements so that no variable ends up
# bound to save_pretrained()'s return value instead of a tokenizer.
fast_tokenizer = AutoTokenizer.from_pretrained(source_dir)
fast_tokenizer.save_pretrained(output_dir)

# Save quantize_config.json for TGI, with exllama marked as enabled.
quantizer.disable_exllama = False
with open(os.path.join(output_dir, "quantize_config.json"), "w", encoding="utf-8") as f:
    json.dump(quantizer.to_dict(), f, indent=2)

CUDA extension not installed.
CUDA extension not installed.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2763962 > 32768). Running this sequence through the model will result in indexing errors


Quantizing model.layers blocks :   0%|          | 0/32 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Using Exllamav2 backend will reorder the weights offline, thus you will not be able to save the model with the right weights.Setting `disable_exllama=True`. You should only use Exllamav2 backend for inference. 


### Create HF model card

In [8]:
from huggingface_hub import ModelCard
from jinja2 import Template

In [9]:
# License identifier written into the model card front matter.
# NOTE: this name shadows the interactive `license` builtin; it is kept
# because the render cell reads `license.value`.
license = widgets.Text(description='License', value="apache-2.0", disabled=False)
license.style.description_width = 'initial'
display(license)

Text(value='apache-2.0', description='License', style=TextStyle(description_width='initial'))

#### Create the jinja template

In [10]:
# Model card source: YAML front matter followed by a markdown body.
# Placeholders in double braces are filled in by the render cell.
template_text = """
---
license: {{ license }}
---

# {{ new_model_id }}

{{ new_model_id }} is a GPTQ quantised version of [{{ model_id }}]({{ model_url }}).

GPTQ models are currently supported on Linux (NVidia/AMD) and Windows (NVidia only). MacOS users: please use GGUF models.

These GPTQ models are known to work in the following inference servers/webuis.
- [text-generation-webui](https://github.com/oobabooga/text-generation-webui)
- [KoboldAI United](https://github.com/henk717/koboldai)
- [LoLLMS Web UI](https://github.com/ParisNeo/lollms-webui)
- [Hugging Face Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference)
"""

# Create a Jinja template object. strip() removes the leading newline so the
# text starts directly with the `---` front-matter delimiter.
jinja_template = Template(template_text.strip())

In [11]:
# Fill the template
content = jinja_template.render(
          license = license.value,
          new_model_id = new_model_id.value,
          model_id = model_id.value,
          model_url = "https://huggingface.co/" + model_id.value,
          )

# Save the model card
card = ModelCard(content)
card.save(f"{new_model_name[1]}/README.md")

### Push to HuggingFace Hub

In [12]:
api = HfApi()

# Make sure the destination model repo exists; exist_ok=True makes this a
# no-op when it is already there.
api.create_repo(
    repo_id=new_model_id.value,
    token=HF_TOKEN,
    repo_type="model",
    exist_ok=True,
)

# Upload the whole local output folder. Kept as the last bare expression so
# the returned value is shown as the cell output.
api.upload_folder(
    repo_id=new_model_id.value,
    folder_path=new_model_name[1],
    token=HF_TOKEN,
)

model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/CorticalStack/mistral-7b-alpaca-gptq/commit/c3e9653de734aa3d705bcf951dc2884b37a9d8d9', commit_message='Upload folder using huggingface_hub', commit_description='', oid='c3e9653de734aa3d705bcf951dc2884b37a9d8d9', pr_url=None, pr_revision=None, pr_num=None)