### Import dependencies

In [1]:
import os
from dotenv import load_dotenv
import ipywidgets as widgets
from IPython.display import display
from huggingface_hub import HfApi, create_repo

### Set variables like HuggingFace token

In [2]:
# Populate os.environ from a local .env file (if one exists) before reading settings.
load_dotenv()

# Hugging Face access token; None when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

### Specify source model from HuggingFace

In [31]:
# Editable text box holding the Hub id of the source model, in "<namespace>/<name>" form.
# Later cells read the current entry through `model_id.value`.
model_id = widgets.Text(
    description="Model ID",
    value="CorticalStack/travel-mistral-7B-16b-base",
    disabled=False,
)
display(model_id)

Text(value='CorticalStack/travel-mistral-7B-16b-base', description='Model ID')

### Download the source model

In [32]:
model_name = str(model_id.value).split('/')
if not os.path.isdir(model_name[1]):
    !git clone https://huggingface.co/{model_id.value}

### Set target model name prefix

In [5]:

# Text box for the prefix used to build the names of the quantized output
# (read later through `model_name_prefix.value`).
model_name_prefix = widgets.Text(
    value='travel-mistral-7B',
    description='Model name prefix',  # label typo fixed (was 'preix')
    disabled=False
)
display(model_name_prefix)

Text(value='travel-mistral-7B', description='Model name preix')

### Push to HuggingFace Hub

In [14]:

# Push a single quantized model file to its own model repository on the Hub.
# NOTE(review): `quant_target_name` is not defined in any cell shown in this notebook;
# it must be set before this cell runs (the recorded output suggests a value like
# 'travel-mistral-7B.Q4_K_M.gguf') -- confirm and add the defining cell.
api = HfApi()

# Keep the repo id under its own name: `model_id` is the Text widget for the source
# model, and rebinding it to a str breaks `model_id.value` when the download cell is re-run.
gguf_repo_id = f"CorticalStack/{quant_target_name}"
api.create_repo(gguf_repo_id, exist_ok=True, repo_type="model", token=HF_TOKEN)
api.upload_file(
    path_or_fileobj=f"{quant_target_name}",
    path_in_repo=f"{quant_target_name}",
    repo_id=gguf_repo_id,
    token=HF_TOKEN,  # same credential as create_repo, so the upload does not rely on a cached login
)

travel-mistral-7B.Q4_K_M.gguf:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CorticalStack/travel-mistral-7B.Q4_K_M.gguf/commit/bb0c76ecffae389fd4b19dccba4e56611dd99955', commit_message='Upload travel-mistral-7B.Q4_K_M.gguf with huggingface_hub', commit_description='', oid='bb0c76ecffae389fd4b19dccba4e56611dd99955', pr_url=None, pr_revision=None, pr_num=None)

### Perform the EXL2 quantization

In [34]:
# Download dataset
!wget https://huggingface.co/datasets/wikitext/resolve/9a9e482b5987f9d25b3a9b2883fc6cc9fd8071b3/wikitext-103-v1/wikitext-test.parquet

BPW = 5.0
quant = f"{model_name_prefix.value}-{BPW}bpw-exl2"

# Quantize model
command = f"python exllamav2/convert.py -i {model_name[1]} -o {quant} -c wikitext-test.parquet -b {BPW}"
os.system(command)

--2024-02-11 20:29:51--  https://huggingface.co/datasets/wikitext/resolve/9a9e482b5987f9d25b3a9b2883fc6cc9fd8071b3/wikitext-103-v1/wikitext-test.parquet
Resolving huggingface.co (huggingface.co)... 108.138.189.57, 108.138.189.96, 108.138.189.70, ...
Connecting to huggingface.co (huggingface.co)|108.138.189.57|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 721735 (705K)
Saving to: ‘wikitext-test.parquet.26’


2024-02-11 20:29:51 (3.30 MB/s) - ‘wikitext-test.parquet.26’ saved [721735/721735]

 -- Beginning new job
 -- Input: travel-mistral-7B-16b-base
 -- Output: travel-mistral-7B-EXL2
 -- Calibration dataset: wikitext-test.parquet, 100 / 16 rows, 2048 tokens per sample
 -- Target bits per weight: 5.0 (decoder), 6 (head)
 -- Max shard size: 8192 MB
 -- Tokenizing samples (measurement)...
 -- First 50 tokens of dataset:
    ' = Robert Boulter = \n  Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the televisi

0

In [37]:
# Remove the out_tensor directory from the quantization output folder.
os.system(f"rm -rf {quant}/out_tensor")

# Mirror the source model's remaining files into the output folder, leaving out
# *.safetensors files and dot-entries. The exit status of rsync is the cell's value.
os.system(
    f"rsync -av --exclude='*.safetensors' --exclude='.*' ./{model_name[1]}/ ./{quant}/"
)

sending incremental file list
./
README.md
config.json
generation_config.json
model.safetensors.index.json
special_tokens_map.json
tokenizer.json
tokenizer.model
tokenizer_config.json

sent 2,316,888 bytes  received 171 bytes  4,634,118.00 bytes/sec
total size is 2,315,709  speedup is 1.00


0

In [39]:
# Target repository for the EXL2 quantization, named after the output folder.
exl2_repo_id = f"CorticalStack/{quant}"

# Make sure the model repo exists (no error if it was created earlier).
api = HfApi()
create_repo(repo_id=exl2_repo_id, repo_type="model", exist_ok=True, token=HF_TOKEN)

# Upload the contents of the local quantization folder to that repo.
api.upload_folder(folder_path=quant, repo_id=exl2_repo_id, token=HF_TOKEN)

hidden_states.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

output.safetensors:   0%|          | 0.00/4.73G [00:00<?, ?B/s]

cal_data.safetensors:   0%|          | 0.00/1.64M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/CorticalStack/travel-mistral-7B-5.0bpw-exl2/commit/c3fbf6addcb0cd78706faeb08e49e0b5031b38aa', commit_message='Upload folder using huggingface_hub', commit_description='', oid='c3fbf6addcb0cd78706faeb08e49e0b5031b38aa', pr_url=None, pr_revision=None, pr_num=None)