### Import dependencies

In [1]:
import os
import subprocess

import ipywidgets as widgets
from dotenv import load_dotenv
from huggingface_hub import HfApi, create_repo
from IPython.display import display

### Set variables such as the Hugging Face token

In [2]:
# Load key/value pairs from a local .env file into the process environment.
load_dotenv()

# Hugging Face access token; None when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

### Specify HuggingFace Parameters

In [4]:
# Hub repository ID of the source model to quantize.
model_id = widgets.Text(
    description='Model ID',
    value='CorticalStack/mistral-7b-alpaca-sft',
    disabled=False,
    # 'initial' keeps the description label from being truncated.
    style={'description_width': 'initial'},
)

# Hub-style ID used to name the quantized output.
new_model_id = widgets.Text(
    description='New model ID',
    value='CorticalStack/mistral-7b-alpaca',
    disabled=False,
    style={'description_width': 'initial'},
)

# Render both inputs, source model first.
display(model_id, new_model_id)

Text(value='CorticalStack/mistral-7b-alpaca-sft', description='Model ID', style=TextStyle(description_width='i…

Text(value='CorticalStack/mistral-7b-alpaca', description='New model ID', style=TextStyle(description_width='i…

### EXL2 quant parameters

In [5]:
# Target bits per weight for the EXL2 quantization.
bpw = widgets.FloatText(
    description='BPW',
    value=6.5,
    disabled=False,
    # 'initial' keeps the description label from being truncated.
    style={'description_width': 'initial'},
)
display(bpw)

FloatText(value=6.5, description='BPW', style=DescriptionStyle(description_width='initial'))

### Download the source model

In [6]:
model_name = str(model_id.value).split('/')
if not os.path.isdir(model_name[1]):
    !git clone https://huggingface.co/{model_id.value}

### Perform the EXL2 quantization

In [7]:
# Download dataset
!wget https://huggingface.co/datasets/wikitext/resolve/9a9e482b5987f9d25b3a9b2883fc6cc9fd8071b3/wikitext-103-v1/wikitext-test.parquet

model_name = str(model_id.value).split('/')
new_model_name = str(new_model_id.value).split('/')
command = f"mkdir {new_model_name[1]}-{bpw.value}bpw-exl2"
os.system(command)

quant = f"{new_model_name[1]}-{bpw.value}bpw-exl2"
command = f"python exllamav2/convert.py -i {model_name[1]} -o {quant} -c wikitext-test.parquet -b {bpw.value}"
os.system(command)

--2024-02-15 23:04:34--  https://huggingface.co/datasets/wikitext/resolve/9a9e482b5987f9d25b3a9b2883fc6cc9fd8071b3/wikitext-103-v1/wikitext-test.parquet
Resolving huggingface.co (huggingface.co)... 18.165.183.98, 18.165.183.94, 18.165.183.117, ...
Connecting to huggingface.co (huggingface.co)|18.165.183.98|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 721735 (705K)
Saving to: ‘wikitext-test.parquet.33’


2024-02-15 23:04:35 (1.92 MB/s) - ‘wikitext-test.parquet.33’ saved [721735/721735]

 -- Beginning new job
 -- Input: mistral-7b-alpaca-sft
 -- Output: mistral-7b-alpaca-6.5bpw-exl2
 -- Calibration dataset: wikitext-test.parquet, 100 / 16 rows, 2048 tokens per sample
 -- Target bits per weight: 6.5 (decoder), 6 (head)
 -- Max shard size: 8192 MB
 -- Tokenizing samples (measurement)...
 -- First 50 tokens of dataset:
    ' = Robert Boulter = \n  Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the televisio

0

In [8]:
# Remove the out_tensor directory from the quantized output folder.
os.system(f"rm -rf {quant}/out_tensor")

# Copy the remaining files of the source checkout (everything except *.safetensors
# and dot-files) into the quantized output folder.
command = f"rsync -av --exclude='*.safetensors' --exclude='.*' ./{model_name[1]}/ ./{quant}/"
os.system(command)

sending incremental file list
./
README.md
config.json
generation_config.json
model.safetensors.index.json
special_tokens_map.json
tokenizer.json
tokenizer.model
tokenizer_config.json

sent 2,316,995 bytes  received 171 bytes  4,634,332.00 bytes/sec
total size is 2,315,800  speedup is 1.00


0

### Create HF model card

In [9]:
from huggingface_hub import ModelCard
from jinja2 import Template

In [10]:
# License identifier for the model card.
# NOTE: this name shadows the interactive `license` builtin; it is kept because
# the rendering cell reads `license.value`.
license = widgets.Text(
    description='License',
    value="apache-2.0",
    disabled=False,
    # 'initial' keeps the description label from being truncated.
    style={'description_width': 'initial'},
)
display(license)

Text(value='apache-2.0', description='License', style=TextStyle(description_width='initial'))

#### Create the jinja template

In [11]:
# Markdown source of the model card. The block between the two "---" lines is the
# card's metadata header (license); each {{ ... }} placeholder is substituted when
# the template is rendered.
template_text = """
---
license: {{ license }}
---

# {{ new_model_id }}

An EXL2 {{ bpw }}bpw quantised version of [{{ model_id }}]({{ model_url }}).

An incomplete list of clients and libraries that are known to support EXL2:
* [text-generation-webui](https://github.com/oobabooga/text-generation-webui), the most widely used web UI, with many features and powerful extensions. Supports GPU acceleration.
* [exllamav2](https://github.com/turboderp/exllamav2), an inference library for running local LLMs on modern consumer GPUs.
"""

# Create a Jinja template object. strip() removes the newline right after the
# opening quotes so the text starts directly with "---".
jinja_template = Template(template_text.strip())

In [12]:
# Fill the template
content = jinja_template.render(
          license = license.value,
          new_model_id = new_model_id.value,
          bpw = bpw.value,
          model_id = model_id.value,
          model_url = "https://huggingface.co/" + model_id.value,
          )

# Save the model card
card = ModelCard(content)
card.save(f"{quant}/README.md")

In [13]:
# Target repository. The namespace comes from the "New model ID" widget (first part
# of "<owner>/<repo>") instead of a hard-coded organisation, so changing the widget
# actually changes where the model is published. With the default widget value this
# resolves to the same repo as before.
repo_id = f"{new_model_name[0]}/{quant}"

# Create empty repo (exist_ok=True: no error if it already exists)
api = HfApi()
create_repo(
    repo_id=repo_id,
    repo_type="model",
    exist_ok=True,
    token=HF_TOKEN
)

# Upload exl2 files
# NOTE(review): this uploads every file in the folder, including cal_data.safetensors
# and hidden_states.safetensors, which look like converter working files - confirm
# whether they should be excluded (e.g. via upload_folder's ignore_patterns).
api.upload_folder(
    folder_path=quant,
    repo_id=repo_id,
    token=HF_TOKEN
)

cal_data.safetensors:   0%|          | 0.00/1.64M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

hidden_states.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

output.safetensors:   0%|          | 0.00/6.04G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CorticalStack/mistral-7b-alpaca-6.5bpw-exl2/commit/490c5284101128057b30ac1cb50e75c7bf2b5722', commit_message='Upload folder using huggingface_hub', commit_description='', oid='490c5284101128057b30ac1cb50e75c7bf2b5722', pr_url=None, pr_revision=None, pr_num=None)