In [17]:
import os
from dotenv import load_dotenv
# Pull variables from a local .env file into the process environment,
# then read the Hugging Face access token (None when it is not set).
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

In [29]:
from huggingface_hub import HfApi, create_repo

In [18]:
import ipywidgets as widgets
from IPython.display import display
# Text box holding the Hugging Face model ID; later cells read `model_id.value`.
model_id = widgets.Text(value='CorticalStack/OpenHermes-Mistral-7B-GGUF', description='Model ID', disabled=False)
display(model_id)

Text(value='CorticalStack/OpenHermes-Mistral-7B-GGUF', description='Model ID')

In [2]:
import ipywidgets as widgets
from IPython.display import display

# Report the square of a number on stdout
def square_number(x):
    """Print a sentence giving ``x`` multiplied by itself."""
    squared = x * x
    print(f"The square of {x} is {squared}")

# Integer slider: starts at 1, ranges over 0..10 in steps of 1
int_slider = widgets.IntSlider(value=1, min=0, max=10, step=1, description='Number:')

# Render the slider first ...
display(int_slider)

# ... then an output area underneath it that will hold the printed result
output = widgets.Output()
display(output)

# Slider callback: replace the output area's content with the new square
def on_value_change(change):
    """Clear the output widget and print the square of ``change['new']`` into it."""
    with output:
        output.clear_output()
        new_value = change['new']
        square_number(new_value)

# Register the callback so it fires whenever the slider's `value` trait changes
int_slider.observe(on_value_change, names=['value'])

IntSlider(value=1, description='Number:', max=10)

Output()

In [25]:
model_name = str(model_id.value).split('/')

# Download model
!git clone https://huggingface.co/{model_id.value}



fatal: destination path 'OpenHermes-Mistral-7B-GGUF' already exists and is not an empty directory.


In [26]:
model_name

['CorticalStack', 'OpenHermes-Mistral-7B-GGUF']

In [30]:
# @title # 🛸 GGUF
# @markdown ### ✨ Quantization parameters

QUANTIZATION_FORMAT = "q4_k_m" # @param {type:"string"}
QUANTIZATION_METHODS = QUANTIZATION_FORMAT.replace(" ", "").split(",")
# Install llama.cpp
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && git pull && make clean && LLAMA_CUBLAS=1 make
!pip install -r llama.cpp/requirements.txt

# Convert to fp16
fp16 = f"{model_name[0]}/{model_name[1]}.fp16.bin"
!python llama.cpp/convert.py {model_id.value} --outtype f16 --outfile {fp16}

# Quantize the model for each method in the QUANTIZATION_METHODS list
for method in QUANTIZATION_METHODS:
    qtype = f"{model_name[0]}/{model_name[1]}.gguf"
    !./llama.cpp/quantize {fp16} {qtype} {method}

# Defined in the secrets tab in Google Colab
api = HfApi()

# Create empty repo
create_repo(
    repo_id = f"{model_name[0]}/{model_name[1]}-GGUF",
    repo_type="model",
    exist_ok=True,
    token=HF_TOKEN
)

# Upload gguf files
api.upload_folder(
    folder_path={model_name[1]},
    repo_id=f"{model_name[0]}/{model_name[1]}-GGUF",
    allow_patterns=["*.gguf","$.md"],
    token=HF_TOKEN
)

Cloning into 'llama.cpp'...
remote: Enumerating objects: 17969, done.[K
remote: Counting objects: 100% (7567/7567), done.[K
remote: Compressing objects: 100% (473/473), done.[K
remote: Total 17969 (delta 7341), reused 7154 (delta 7093), pack-reused 10402[K
Receiving objects: 100% (17969/17969), 20.79 MiB | 16.76 MiB/s, done.
Resolving deltas: 100% (12581/12581), done.
Already up to date.
I ccache not found. Consider installing it for faster compilation.
I llama.cpp build info: 
I UNAME_S:   Linux
I UNAME_P:   x86_64
I UNAME_M:   x86_64
I CFLAGS:    -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion 
I CXXFLAGS:  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-

TypeError: expected str, bytes or os.PathLike object, not set

In [None]:
# @title # 🏛️ AWQ
# @markdown ### ✨ Quantization parameters

Q_GROUP_SIZE = 128 # @param {type:"integer"}
ZERO_POINT = True # @param {text:"boolean"}
W_BIT = 4 # @param {type:"integer"}
VERSION = "GEMM" # @param {type:"string"}
SAFETENSORS = True # @param {text:"boolean"}

# Install AutoAWQ
!git clone https://github.com/casper-hansen/AutoAWQ
%cd AutoAWQ
!pip install -e .
!pip install git+https://github.com/huggingface/transformers
!pip install zstandard

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer


quant_path = MODEL_NAME + "-awq"
quant_config = { "zero_point": ZERO_POINT, "q_group_size": Q_GROUP_SIZE, "w_bit": W_BIT, "version": VERSION }

# Load model
PATH = "/content/" + MODEL_NAME
model = AutoAWQForCausalLM.from_pretrained(PATH, safetensors=SAFETENSORS)
tokenizer = AutoTokenizer.from_pretrained(PATH, trust_remote_code=True)

# Quantize
model.quantize(tokenizer, quant_config=quant_config)

# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

# Defined in the secrets tab in Google Colab
hf_token = userdata.get(token)
api = HfApi()

# Create empty repo
create_repo(
    repo_id = f"{username}/{MODEL_NAME}-AWQ",
    repo_type="model",
    exist_ok=True,
    token=hf_token
)

# Upload awq files
api.upload_folder(
    folder_path=quant_path,
    repo_id=f"{username}/{MODEL_NAME}-AWQ",
    token=hf_token
)

In [None]:
# @title # 🔬 EXL2
# @markdown ### ✨ Quantization parameters

BPW = 5.0 # @param {type:"number"}

# Install ExLLamaV2
!git clone https://github.com/turboderp/exllamav2
!pip install -e exllamav2

!mv {MODEL_NAME} base_model
!rm base_mode/*.bin

# Download dataset
!wget https://huggingface.co/datasets/wikitext/resolve/9a9e482b5987f9d25b3a9b2883fc6cc9fd8071b3/wikitext-103-v1/wikitext-test.parquet

# Quantize model
!mkdir quant
!python exllamav2/convert.py \
    -i base_model \
    -o quant \
    -c wikitext-test.parquet \
    -b {BPW}

# Copy files
!rm -rf quant/out_tensor
!rsync -av --exclude='*.safetensors' --exclude='.*' ./base_model/ ./quant/

# Defined in the secrets tab in Google Colab
hf_token = userdata.get(token)
api = HfApi()

# Create empty repo
create_repo(
    repo_id = f"{username}/{MODEL_NAME}-{BPW:.1f}bpw-exl2",
    repo_type="model",
    exist_ok=True,
    token=hf_token
)

# Upload exl2 files
api.upload_folder(
    folder_path=quant,
    repo_id=f"{username}/{MODEL_NAME}-{BPW:.1f}bpw-exl2",
    token=hf_token
)

In [None]:
# @title # 📝 GPTQ
# @markdown ### ✨ Quantization parameters

BITS = 4 # @param {type:"integer"}
GROUP_SIZE = 128 # @param {type:"integer"}
DAMP_PERCENT = 0.01 # @param {type:"number"}

!BUILD_CUDA_EXT=0 pip install -q auto-gptq transformers
import random
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset
import torch
from transformers import AutoTokenizer
out_dir = MODEL_ID + "-GPTQ"

# Load quantize config, model and tokenizer
quantize_config = BaseQuantizeConfig(
    bits=BITS,
    group_size=GROUP_SIZE,
    damp_percent=DAMP_PERCENT,
    desc_act=False,
)
PATH = "/content/" + MODEL_NAME
model = AutoGPTQForCausalLM.from_pretrained(PATH, quantize_config)
tokenizer = AutoTokenizer.from_pretrained(PATH)

# Load data and tokenize examples
n_samples = 1024
data = load_dataset("allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split=f"train[:{n_samples*5}]")
tokenized_data = tokenizer("\n\n".join(data['text']), return_tensors='pt')

# Format tokenized examples
examples_ids = []
for _ in range(n_samples):
    i = random.randint(0, tokenized_data.input_ids.shape[1] - tokenizer.model_max_length - 1)
    j = i + tokenizer.model_max_length
    input_ids = tokenized_data.input_ids[:, i:j]
    attention_mask = torch.ones_like(input_ids)
    examples_ids.append({'input_ids': input_ids, 'attention_mask': attention_mask})

# Quantize with GPTQ
model.quantize(
    examples_ids,
    batch_size=1,
    use_triton=True,
)

# Save model and tokenizer
model.save_quantized(out_dir, use_safetensors=True)
tokenizer.save_pretrained(out_dir)

# Defined in the secrets tab in Google Colab
hf_token = userdata.get(token)
api = HfApi()

# Create empty repo
create_repo(
    repo_id = f"{username}/{MODEL_NAME}-GPTQ",
    repo_type="model",
    exist_ok=True,
    token=hf_token
)

# Upload gptq files
api.upload_folder(
    folder_path=out_dir,
    repo_id=f"{username}/{MODEL_NAME}-GPTQ",
    token=hf_token
)
