### Import dependencies

In [1]:
import os
from dotenv import load_dotenv
import ipywidgets as widgets
from IPython.display import display
from huggingface_hub import HfApi, create_repo

### Load settings such as the HuggingFace token from the environment

In [2]:
# Pull the variables defined in .env into the process environment.
load_dotenv()

# Hub access token; stays None when HF_TOKEN is not defined.
HF_TOKEN = os.getenv("HF_TOKEN")

### Specify source model from HuggingFace

In [3]:
# Editable text box holding the ID of the source model to quantize.
model_id = widgets.Text(
    description="Model ID",
    value="CorticalStack/travel-mistral-7B-16b-base",
    disabled=False,
)
display(model_id)

Text(value='CorticalStack/travel-mistral-7B-16b-base', description='Model ID')

### Download the source model

In [4]:
model_name = str(model_id.value).split('/')
if not os.path.isdir(model_name[1]):
    !git clone https://huggingface.co/{model_id.value}

### Set target model name prefix

In [5]:

# Editable text box for the name prefix of the target (quantized) model.
model_name_prefix = widgets.Text(
    value='travel-mistral-7B',
    description='Model name prefix',  # label typo fixed (was 'preix')
    disabled=False
)
display(model_name_prefix)

Text(value='travel-mistral-7B', description='Model name preix')

### Perform the AWQ quantization

In [6]:
# Quantization parameters (the "@param" markers are Colab form annotations)
Q_GROUP_SIZE = 128 # @param {type:"integer"}
ZERO_POINT = True # @param {type:"boolean"}
W_BIT = 4 # @param {type:"integer"}
VERSION = "GEMM" # @param {type:"string"}
SAFETENSORS = True # @param {type:"boolean"}

# Install AutoAWQ (uncomment when the packages are not installed yet)
# !git clone https://github.com/casper-hansen/AutoAWQ
# %cd AutoAWQ
# !pip install -e .
# !pip install git+https://github.com/huggingface/transformers
# !pip install zstandard

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer


# Output directory for the quantized model: "<prefix>-AWQ"
quant_path = model_name_prefix.value + "-AWQ"
quant_config = { "zero_point": ZERO_POINT, "q_group_size": Q_GROUP_SIZE, "w_bit": W_BIT, "version": VERSION }

# Load model and tokenizer from the local checkout of the source model
PATH = model_name[1]
model = AutoAWQForCausalLM.from_pretrained(PATH, safetensors=SAFETENSORS)
# NOTE(review): trust_remote_code=True allows Python code shipped with the
# checkout to be executed while loading the tokenizer -- confirm the source
# repository is trusted.
tokenizer = AutoTokenizer.from_pretrained(PATH, trust_remote_code=True)

# Quantize
model.quantize(tokenizer, quant_config=quant_config)

# Save quantized model and its tokenizer side by side in quant_path
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (77697 > 32768). Running this sequence through the model will result in indexing errors
AWQ: 100%|██████████| 32/32 [13:35<00:00, 25.47s/it]


('travel-mistral-7B-AWQ/tokenizer_config.json',
 'travel-mistral-7B-AWQ/special_tokens_map.json',
 'travel-mistral-7B-AWQ/tokenizer.json')

### Push to HuggingFace Hub

In [10]:
api = HfApi()

# Destination repository for the quantized model. The same ID is used for both
# creating the repo and uploading to it, so the two calls cannot drift apart.
target_repo_id = f"CorticalStack/{quant_path}"

# Create the destination repo first; exist_ok=True makes this safe to re-run.
# (Previously this call received `model_id`, the widget for the *source* model,
# rather than the destination repo ID string.)
api.create_repo(target_repo_id, exist_ok=True, repo_type="model", token=HF_TOKEN)

# Upload the whole quantized model directory to the destination repo.
api.upload_folder(
    folder_path=quant_path,
    repo_id=target_repo_id,
    token=HF_TOKEN
)

model.safetensors:   0%|          | 0.00/4.15G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CorticalStack/travel-mistral-7B-AWQ/commit/49accf235f8cc00930c66a12e7aa81fddebc27f8', commit_message='Upload folder using huggingface_hub', commit_description='', oid='49accf235f8cc00930c66a12e7aa81fddebc27f8', pr_url=None, pr_revision=None, pr_num=None)