### Import dependencies

In [1]:
import os
from dotenv import load_dotenv
import ipywidgets as widgets
from IPython.display import display
from huggingface_hub import HfApi, create_repo

### Load environment variables (Hugging Face token)

In [2]:
# Merge any variables declared in a local .env file into the process
# environment, then look up the Hugging Face access token.
# HF_TOKEN is None when the variable is not set.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

### Specify HuggingFace Parameters

In [3]:
# Source repository to quantise, in "<owner>/<name>" form.
model_id = widgets.Text(
    description='Model ID',
    value='CorticalStack/mistral-7b-alpaca-sft',
    disabled=False,
)

# Base repository ID for the quantised output; later cells append "-gguf".
new_model_id = widgets.Text(
    description='New model ID',
    value='CorticalStack/mistral-7b-alpaca',
    disabled=False,
)

# Let each label take its natural width so it is not truncated, then render
# the fields in the same order as they were declared.
for _id_widget in (model_id, new_model_id):
    _id_widget.style.description_width = 'initial'
    display(_id_widget)

Text(value='CorticalStack/mistral-7b-alpaca-sft', description='Model ID', style=TextStyle(description_width='i…

Text(value='CorticalStack/mistral-7b-alpaca', description='New model ID', style=TextStyle(description_width='i…

### GGUF quant parameters

In [4]:
# Quantisation type handed to llama.cpp's quantize tool; its upper-cased form
# is also used in the output file name.
quant_format = widgets.Text(description='GGUF quant format', value='q4_k_m', disabled=False)
quant_format.style.description_width = 'initial'  # show the full label
display(quant_format)

Text(value='q4_k_m', description='GGUF quant format', style=TextStyle(description_width='initial'))

### Download the source model

In [5]:
# Split "<owner>/<name>" into its parts; element [1] is used as the local
# directory name to look for.
# NOTE(review): an ID without a "/" would raise IndexError here — confirm that
# only namespaced model IDs are expected.
model_name = str(model_id.value).split('/')
# Skip the clone when a directory with the repository name already exists in
# the working directory, so re-running the cell does not download again.
if not os.path.isdir(model_name[1]):
    !git clone https://huggingface.co/{model_id.value}

### Download and build llama.cpp to do the quantization

In [6]:
# Install llama.cpp
if not os.path.isdir("llama.cpp"):
    !git clone https://github.com/ggerganov/llama.cpp
    !cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make
    !pip install -r llama.cpp/requirements.txt

### Perform the GGUF quantization

In [8]:
# Step 1: convert the downloaded checkpoint to an unquantised f16 file.
# Both IDs are split on "/" and element [1] (the part after the owner) is used
# for local directory and file names.
model_name = str(model_id.value).split('/')
new_model_name = str(new_model_id.value).split('/')
# Intermediate f16 output of convert.py; it is the input to quantize below.
fp16 = f"{model_name[1]}.FP16.bin"
!python llama.cpp/convert.py {model_name[1]} --outtype f16 --outfile {fp16}

# Step 2: quantise the f16 file. The target file name carries the upper-cased
# format (e.g. "<name>.Q4_K_M.gguf"), while the quantize tool receives the
# format exactly as typed in the widget.
quant_target_name = f"{new_model_name[1]}.{quant_format.value.upper()}.gguf"
!./llama.cpp/quantize {fp16} {quant_target_name} {quant_format.value}

Loading model file mistral-7b-alpaca-sft/model-00001-of-00003.safetensors
Loading model file mistral-7b-alpaca-sft/model-00001-of-00003.safetensors
Loading model file mistral-7b-alpaca-sft/model-00002-of-00003.safetensors
Loading model file mistral-7b-alpaca-sft/model-00003-of-00003.safetensors
params = Params(n_vocab=32000, n_embd=4096, n_layer=32, n_ctx=32768, n_ff=14336, n_head=32, n_head_kv=8, n_experts=None, n_experts_used=None, f_norm_eps=1e-05, rope_scaling_type=None, f_rope_freq_base=10000.0, f_rope_scale=None, n_orig_ctx=None, rope_finetuned=None, ftype=<GGMLFileType.MostlyF16: 1>, path_model=PosixPath('mistral-7b-alpaca-sft'))
Found vocab files: {'tokenizer.model': PosixPath('mistral-7b-alpaca-sft/tokenizer.model'), 'vocab.json': None, 'tokenizer.json': PosixPath('mistral-7b-alpaca-sft/tokenizer.json')}
Loading vocab file 'mistral-7b-alpaca-sft/tokenizer.model', type 'spm'
Vocab info: <SentencePieceVocab with 32000 base tokens and 0 added tokens>
Special vocab info: <SpecialV

In [11]:
!mkdir "{new_model_name[1]}-gguf"
!mv "{new_model_name[1]}.{quant_format.value.upper()}.gguf" "{new_model_name[1]}-gguf/"

### Create HF model card

In [12]:
from huggingface_hub import ModelCard
from jinja2 import Template

In [13]:
# License identifier written into the model card's YAML front matter.
# NOTE: the name shadows Python's interactive `license` builtin; it is kept
# because the card-rendering cell reads `license.value`.
license = widgets.Text(description='License', value="apache-2.0", disabled=False)
license.style.description_width = 'initial'  # show the full label
display(license)

Text(value='apache-2.0', description='License', style=TextStyle(description_width='initial'))

#### Create the Jinja template

In [14]:
from jinja2 import Environment

# Model card body. Placeholders in double braces are filled at render time:
# license, new_model_id, model_id and model_url are passed to render(), while
# quant_format is bound below as a template global.
template_text = """
---
license: {{ license }}
---

# {{ new_model_id }}

A collection of GGUF quantised versions of [{{ model_id }}]({{ model_url }}).

The main branch model is quantised using GGUF format {{ quant_format }}.

GGUF is a format that replaces GGML, which is no longer supported by llama.cpp.
An incomplete list of clients and libraries that are known to support GGUF:
* [llama.cpp](https://github.com/ggerganov/llama.cpp). The source project for GGUF. Offers a CLI and a server option.
* [text-generation-webui](https://github.com/oobabooga/text-generation-webui), the most widely used web UI, with many features and powerful extensions. Supports GPU acceleration.
* [KoboldCpp](https://github.com/LostRuins/koboldcpp), a fully featured web UI, with GPU accel across all platforms and GPU architectures. Especially good for story telling.
* [GPT4All](https://gpt4all.io/index.html), a free and open source local running GUI, supporting Windows, Linux and macOS with full GPU accel.
* [LM Studio](https://lmstudio.ai/), an easy-to-use and powerful local GUI for Windows and macOS (Silicon), with GPU acceleration. Linux available, in beta as of 27/11/2023.
* [LoLLMS Web UI](https://github.com/ParisNeo/lollms-webui), a great web UI with many interesting and unique features, including a full model library for easy model selection.
* [Faraday.dev](https://faraday.dev/), an attractive and easy to use character-based chat GUI for Windows and macOS (both Silicon and Intel), with GPU acceleration.
* [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), a Python library with GPU accel, LangChain support, and OpenAI-compatible API server.
* [candle](https://github.com/huggingface/candle), a Rust ML framework with a focus on performance, including GPU support, and ease of use.
* [ctransformers](https://github.com/marella/ctransformers), a Python library with GPU accel, LangChain support, and OpenAI-compatible AI server.
"""

# Create a Jinja template object.
# The quantisation format used to be hard-coded as "Q4_K_M" in the text, which
# made the card wrong whenever another format was chosen in the widget. It is
# now supplied as a template global (upper-cased, matching the .gguf file
# name), so render() callers need no extra argument and the default widget
# value still produces "Q4_K_M".
jinja_template = Environment().from_string(
    template_text.strip(),
    globals={"quant_format": quant_format.value.upper()},
)

In [16]:
# Substitute the current widget values into the template to produce the
# model card text; the source model link is derived from its ID.
content = jinja_template.render(
    license=license.value,
    new_model_id=new_model_id.value,
    model_id=model_id.value,
    model_url=f"https://huggingface.co/{model_id.value}",
)

# Write the card as README.md inside the "<name>-gguf" upload folder
card = ModelCard(content)
card.save(f"{new_model_name[1]}-gguf/README.md")

### Push to HuggingFace Hub

In [19]:
api = HfApi()

# Target repository on the Hub: the new model ID with a "-gguf" suffix.
gguf_repo_id = f"{new_model_id.value}-gguf"

# Create the model repository (a no-op if it already exists), then upload the
# local "<name>-gguf" folder into it. The upload call stays last so that its
# result is shown as the cell output.
api.create_repo(repo_id=gguf_repo_id, token=HF_TOKEN, repo_type="model", exist_ok=True)
api.upload_folder(
    repo_id=gguf_repo_id,
    folder_path=f"{new_model_name[1]}-gguf",
    token=HF_TOKEN,
)

mistral-7b-alpaca.Q4_K_M.gguf:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/CorticalStack/mistral-7b-alpaca-gguf/commit/4e34b4a467fa17a54b2cb4540d1b0cf5f1843db1', commit_message='Upload folder using huggingface_hub', commit_description='', oid='4e34b4a467fa17a54b2cb4540d1b0cf5f1843db1', pr_url=None, pr_revision=None, pr_num=None)