In [1]:
# Install AutoGPTQ and Transformers quietly (-q).
# BUILD_CUDA_EXT=0 is set for the install; per the warning printed when the quantized
# model is reloaded further down, this disables compilation of AutoGPTQ's CUDA extensions.
!BUILD_CUDA_EXT=0 pip install -q auto-gptq transformers

In [2]:
import random

from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset
import torch
from transformers import AutoTokenizer


# Hub id of the base checkpoint; quantized artifacts are written to "<model_id>-GPTQ"
model_id = "bigcode/starcoderbase-1b"
out_dir = f"{model_id}-GPTQ"

In [3]:
# Use the first CUDA GPU when torch reports one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [4]:
# Set git's global credential helper to `store`, so the Hugging Face login in the
# next cell (run with --add-to-git-credential) has a configured helper to save the token into.
!git config --global credential.helper store

In [5]:
!huggingface-cli login --token <token> --add-to-git-credential

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [6]:
# Configure 4-bit GPTQ quantization, then fetch the base model and its tokenizer
quantize_config = BaseQuantizeConfig(
    desc_act=False,
    damp_percent=0.01,
    group_size=128,
    bits=4,
)
model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config)
model = model.to(device)  # move the freshly loaded model onto the selected device
tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/442k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/532 [00:00<?, ?B/s]

In [7]:
# Calibration examples handed to model.quantize(): a single tokenized code snippet
examples = [
    tokenizer(snippet)
    for snippet in ("def add(x, y): \n z = x+y \n return z",)
]

In [8]:
%%time

# Quantize with GPTQ
model.quantize(
    examples,
    batch_size=1,
    use_triton=True,
)

# Save model and tokenizer
model.save_quantized(out_dir, use_safetensors=True)
tokenizer.save_pretrained(out_dir)

INFO - Start quantizing layer 1/24
INFO - Quantizing attn.c_attn in layer 1/24...
INFO - Quantizing attn.c_proj in layer 1/24...
INFO - Quantizing mlp.c_fc in layer 1/24...
INFO - Quantizing mlp.c_proj in layer 1/24...
INFO - Start quantizing layer 2/24
INFO - Quantizing attn.c_attn in layer 2/24...
INFO - Quantizing attn.c_proj in layer 2/24...
INFO - Quantizing mlp.c_fc in layer 2/24...
INFO - Quantizing mlp.c_proj in layer 2/24...
INFO - Start quantizing layer 3/24
INFO - Quantizing attn.c_attn in layer 3/24...
INFO - Quantizing attn.c_proj in layer 3/24...
INFO - Quantizing mlp.c_fc in layer 3/24...
INFO - Quantizing mlp.c_proj in layer 3/24...
INFO - Start quantizing layer 4/24
INFO - Quantizing attn.c_attn in layer 4/24...
INFO - Quantizing attn.c_proj in layer 4/24...
INFO - Quantizing mlp.c_fc in layer 4/24...
INFO - Quantizing mlp.c_proj in layer 4/24...
INFO - Start quantizing layer 5/24
INFO - Quantizing attn.c_attn in layer 5/24...
INFO - Quantizing attn.c_proj in layer 5/2

CPU times: user 3min 26s, sys: 10.3 s, total: 3min 36s
Wall time: 3min 21s


('bigcode/starcoderbase-1b-GPTQ/tokenizer_config.json',
 'bigcode/starcoderbase-1b-GPTQ/special_tokens_map.json',
 'bigcode/starcoderbase-1b-GPTQ/vocab.json',
 'bigcode/starcoderbase-1b-GPTQ/merges.txt',
 'bigcode/starcoderbase-1b-GPTQ/added_tokens.json',
 'bigcode/starcoderbase-1b-GPTQ/tokenizer.json')

In [9]:
# Load the quantized checkpoint saved in out_dir back in, together with its tokenizer.
# This rebinds `model` and `tokenizer` to the reloaded objects.
model = AutoGPTQForCausalLM.from_quantized(
    out_dir, use_safetensors=True, use_triton=True, device=device
)
tokenizer = AutoTokenizer.from_pretrained(out_dir)

1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
INFO - The layer lm_head is not quantized.


In [10]:
import locale

# Look up the encoding the locale module reports as preferred and display it
# NOTE(review): presumably a sanity check before the later shell commands — confirm
preferred_encoding = locale.getpreferredencoding()
preferred_encoding

'UTF-8'

In [11]:
from transformers import pipeline, TextGenerationPipeline

# Smoke-test the reloaded quantized model by completing a short code prompt.
# The pipeline instance is bound to `generator` so that it does not shadow the
# `pipeline` factory imported above (which, per the original note, could be used instead).
generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)

# The call returns a list with one dict per generated sequence; print the first one's text
print(generator("def add(x, y)")[0]["generated_text"])

2024-03-14 11:52:41.340902: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-14 11:52:41.340995: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-14 11:52:41.516145: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
The model 'GPTBigCodeGPTQForCausalLM' is not supported for . Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAn

def add(x, y) {
    return x + y;
}

function subtract


In [12]:
!pip install -q huggingface_hub
from huggingface_hub import create_repo, HfApi
# from google.colab import userdata

username = "cosmo3769"
MODEL_NAME = "starcoderbase-1b"
MAIN_PATH = "bigcode"

# Defined in the secrets tab in Google Colab
api = HfApi(token="token")

# Create empty repo
create_repo(
    repo_id = f"{username}/{MODEL_NAME}-GPTQ",
    repo_type="model",
    exist_ok=True,
)

# Upload gguf files
api.upload_folder(
    folder_path=f"{MAIN_PATH}/{MODEL_NAME}-GPTQ",
    repo_id=f"{username}/{MODEL_NAME}-GPTQ",
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


gptq_model-4bit-128g.safetensors:   0%|          | 0.00/968M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/cosmo3769/starcoderbase-1b-GPTQ/commit/99e9d9cfbd22ae76509b29fbdc24ccd77e0682f2', commit_message='Upload folder using huggingface_hub', commit_description='', oid='99e9d9cfbd22ae76509b29fbdc24ccd77e0682f2', pr_url=None, pr_revision=None, pr_num=None)