### Import dependencies

In [1]:
import os
import subprocess

import ipywidgets as widgets
from dotenv import load_dotenv
from huggingface_hub import HfApi, create_repo
from IPython.display import display

### Load environment variables (Hugging Face token)

In [2]:
# Pull variables from a local .env file into the process environment,
# then read the Hub token from it (None when HF_TOKEN is not set).
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

### Specify HuggingFace Parameters

In [3]:
# Text inputs for the two Hub repo IDs: the model to quantize and the repo
# that will receive the quantized result. Both labels use
# description_width='initial', passed at construction time.
model_id = widgets.Text(
    value='CorticalStack/mistral-7b-alpaca-sft',
    description='Model ID',
    disabled=False,
    style={'description_width': 'initial'},
)
new_model_id = widgets.Text(
    value='CorticalStack/mistral-7b-alpaca-awq',
    description='New model ID',
    disabled=False,
    style={'description_width': 'initial'},
)

# Render both inputs, source first.
display(model_id, new_model_id)

Text(value='CorticalStack/mistral-7b-alpaca-sft', description='Model ID', style=TextStyle(description_width='i…

Text(value='CorticalStack/mistral-7b-alpaca-awq', description='New model ID', style=TextStyle(description_widt…

### AWQ quantization parameters

In [4]:
# Inputs for the four AWQ settings that are later collected into the
# quantization config. Every label uses description_width='initial'.
q_group_size = widgets.IntText(
    value=128,
    description='Q group size',
    disabled=False,
    style={'description_width': 'initial'},
)
w_bit = widgets.IntText(
    value=4,
    description='W bit',
    disabled=False,
    style={'description_width': 'initial'},
)
version = widgets.Text(
    value="GEMM",
    description='Version',
    disabled=False,
    style={'description_width': 'initial'},
)
zero_point = widgets.Checkbox(
    value=True,
    description='Zero point',
    disabled=False,
    indent=False,
    style={'description_width': 'initial'},
)

# Render the controls in the same order they are defined.
display(q_group_size, w_bit, version, zero_point)

IntText(value=128, description='Q group size', style=DescriptionStyle(description_width='initial'))

IntText(value=4, description='W bit', style=DescriptionStyle(description_width='initial'))

Text(value='GEMM', description='Version', style=TextStyle(description_width='initial'))

Checkbox(value=True, description='Zero point', indent=False, style=CheckboxStyle(description_width='initial'))

### Download the source model

In [5]:
# Fetch the source checkpoint with git unless a local copy already exists.
# `model_name` is the repo ID split on '/'; model_name[1] is the directory
# that `git clone` creates and that the quantization cell loads from.
source_repo_id = str(model_id.value).strip()  # stray whitespace would break the directory check
model_name = source_repo_id.split('/')
if len(model_name) < 2 or not model_name[1]:
    # Fail with a readable message instead of a bare IndexError on model_name[1].
    raise ValueError(
        f"Model ID must look like 'namespace/name', got {model_id.value!r}"
    )

if not os.path.isdir(model_name[1]):
    # Argument-list form: the widget text is never interpreted by a shell, and
    # check=True raises CalledProcessError so a failed clone stops the notebook
    # here rather than surfacing later as a confusing load error.
    subprocess.run(
        ["git", "clone", "https://huggingface.co/" + source_repo_id],
        check=True,
    )

### Perform the AWQ quantization

In [6]:
SAFETENSORS = True # @param {type:"boolean"}

# Install AutoAWQ
# !git clone https://github.com/casper-hansen/AutoAWQ
# %cd AutoAWQ
# !pip install -e .
# !pip install git+https://github.com/huggingface/transformers
# !pip install zstandard

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Resolve the output directory before the slow steps: a malformed
# "New model ID" would otherwise only fail at save time, after the
# quantization run has already completed.
new_model_name = str(new_model_id.value).split('/')
if len(new_model_name) < 2 or not new_model_name[1]:
    raise ValueError(
        f"New model ID must look like 'namespace/name', got {new_model_id.value!r}"
    )

# AWQ settings, read from the parameter widgets.
quant_config = {
    "zero_point": zero_point.value,
    "q_group_size": q_group_size.value,
    "w_bit": w_bit.value,
    "version": version.value,
}

# Load model from the local directory produced by the download cell
PATH = model_name[1]
model = AutoAWQForCausalLM.from_pretrained(PATH, safetensors=SAFETENSORS)
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repo to run while loading the tokenizer; only use it with repos you trust.
tokenizer = AutoTokenizer.from_pretrained(PATH, trust_remote_code=True)

# Quantize
model.quantize(tokenizer, quant_config=quant_config)

# Save quantized model and tokenizer side by side in the output directory
model.save_quantized(new_model_name[1])
tokenizer.save_pretrained(new_model_name[1])

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (77697 > 8192). Running this sequence through the model will result in indexing errors
AWQ: 100%|██████████| 32/32 [14:01<00:00, 26.31s/it]


('pastiche-crown-clown-7b-dare-awq/tokenizer_config.json',
 'pastiche-crown-clown-7b-dare-awq/special_tokens_map.json',
 'pastiche-crown-clown-7b-dare-awq/tokenizer.model',
 'pastiche-crown-clown-7b-dare-awq/added_tokens.json',
 'pastiche-crown-clown-7b-dare-awq/tokenizer.json')

### Create HF model card

In [7]:
from huggingface_hub import ModelCard
from jinja2 import Template

In [8]:
# Text input for the license identifier written into the model card.
# NOTE(review): the name `license` shadows Python's interactive `license`
# helper; it is kept because the render cell reads `license.value`.
license = widgets.Text(
    value="apache-2.0",
    description='License',
    disabled=False,
    style={'description_width': 'initial'},
)
display(license)

Text(value='apache-2.0', description='License', style=TextStyle(description_width='initial'))

#### Create the jinja template

In [9]:
# Model-card template in Jinja syntax: a `---`-delimited header carrying the
# license, followed by a description of the quantized model and the AWQ
# settings used. The {{ ... }} placeholders are filled in at render time.
template_text = """
---
license: {{ license }}
---

# {{ new_model_id }}

{{ new_model_id }} is an AWQ quantised version of [{{ model_id }}]({{ model_url }}).

### About AWQ
AWQ is an efficient, accurate and blazing-fast low-bit weight quantization method, currently supporting 4-bit quantization. Compared to GPTQ, it offers faster Transformers-based inference with equivalent or better quality compared to the most commonly used GPTQ settings.

AWQ models are currently supported on Linux and Windows, with NVidia GPUs only. macOS users: please use GGUF models instead.

It is supported by:

- [Text Generation Webui](https://github.com/oobabooga/text-generation-webui) - using Loader: AutoAWQ
- [vLLM](https://github.com/vllm-project/vllm) - version 0.2.2 or later for support for all model types.
- [Hugging Face Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference)
- [Transformers](https://huggingface.co/docs/transformers) version 4.35.0 and later, from any code or client that supports Transformers
- [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) - for use from Python code

### AWQ configuration
- Zero point: {{ zero_point }}
- Q group size: {{q_group_size}}
- W bit: {{ w_bit}}
- Version: {{ version }}
"""

# Create a Jinja template object. strip() drops the newline right after the
# opening quotes (and the trailing one), so the text starts at the `---` line.
jinja_template = Template(template_text.strip())

In [10]:
# Values substituted into the model-card template, keyed by placeholder name.
card_fields = {
    "license": license.value,
    "new_model_id": new_model_id.value,
    "model_id": model_id.value,
    "model_url": "https://huggingface.co/" + model_id.value,
    "zero_point": zero_point.value,
    "q_group_size": q_group_size.value,
    "w_bit": w_bit.value,
    "version": version.value,
}
content = jinja_template.render(**card_fields)

# Wrap the rendered text in a ModelCard and write it out as README.md
# in the output model directory.
card = ModelCard(content)
card.save(f"{new_model_name[1]}/README.md")

### Push to HuggingFace Hub

In [11]:
# Publish the output folder: make sure the target model repo exists
# (exist_ok=True makes this a no-op if it already does), then upload the folder.
target_repo_id = new_model_id.value
api = HfApi()

api.create_repo(
    repo_id=target_repo_id,
    token=HF_TOKEN,
    repo_type="model",
    exist_ok=True,
)
api.upload_folder(
    repo_id=target_repo_id,
    folder_path=new_model_name[1],
    token=HF_TOKEN,
)

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.15G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/CorticalStack/pastiche-crown-clown-7b-dare-awq/commit/4c0b22b9d69909b7806985ad40a5a7c3b046b877', commit_message='Upload folder using huggingface_hub', commit_description='', oid='4c0b22b9d69909b7806985ad40a5a7c3b046b877', pr_url=None, pr_revision=None, pr_num=None)