In [None]:
# Install AutoGPTQ and Transformers quietly (-q) via a shell call to pip.
# BUILD_CUDA_EXT=0 is set as an environment variable for this pip invocation only;
# presumably it tells the auto-gptq build to skip compiling its CUDA extension — TODO confirm.
# NOTE(review): neither package is version-pinned, so results may change between runs.
!BUILD_CUDA_EXT=0 pip install -q auto-gptq transformers

In [None]:
import random

from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset
import torch
from transformers import AutoTokenizer


# Hub id of the checkpoint to quantize, and the local directory (same path with
# a "-GPTQ" suffix) that the quantized artifacts are written to.
model_id = "bigcode/starcoderbase-3b"
out_dir = f"{model_id}-GPTQ"

In [None]:
# Target the first CUDA GPU when torch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [None]:
# Configure git (globally, for the current user) to persist credentials with the "store" helper.
# NOTE(review): presumably needed so the `--add-to-git-credential` login in the next cell
# has a helper to write to — confirm. The "store" helper keeps credentials unencrypted on disk.
!git config --global credential.helper store

In [None]:
!huggingface-cli login --token <token> --add-to-git-credential

In [None]:
# Fetch the tokenizer, describe the GPTQ settings, then load the base checkpoint
# wrapped for quantization.
# NOTE(review): the following cell repeats this load and additionally moves the model
# to `device`; running both cells loads the checkpoint twice.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# GPTQ settings: bits=4, group_size=128, damp_percent=0.01, desc_act disabled.
quantize_config = BaseQuantizeConfig(bits=4, group_size=128, damp_percent=0.01, desc_act=False)

model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config=quantize_config)

In [None]:
# GPTQ settings used for quantization: bits=4, group_size=128,
# damp_percent=0.01, desc_act disabled.
quantize_config = BaseQuantizeConfig(bits=4, group_size=128, damp_percent=0.01, desc_act=False)

# Load the base checkpoint with those settings, then place it on `device`.
model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config)
model = model.to(device)

# Tokenizer that matches the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Calibration input for quantization: a list holding one tokenized code snippet.
# NOTE(review): a single short example is a very small calibration set — confirm this is intended.
examples = [tokenizer("def add(x, y): \n z = x+y \n return z")]

In [None]:
%%time

# Quantize with GPTQ
model.quantize(
    examples,
    batch_size=1,
    use_triton=True,
)

# Save model and tokenizer
model.save_quantized(out_dir, use_safetensors=True)
tokenizer.save_pretrained(out_dir)

In [None]:
# Read the tokenizer and the quantized checkpoint back from `out_dir`, replacing
# the in-memory `tokenizer` and `model` with the saved versions.
tokenizer = AutoTokenizer.from_pretrained(out_dir)
model = AutoGPTQForCausalLM.from_quantized(out_dir, device=device, use_triton=True, use_safetensors=True)

In [None]:
# Show the encoding that the `locale` module reports as preferred.
# The value is only displayed as the cell output; nothing is changed or stored.
import locale
locale.getpreferredencoding()

In [None]:
# Only the pipeline class is imported: importing the `pipeline` factory as well would be
# pointless here because the name `pipeline` is bound to the instance created below.
from transformers import TextGenerationPipeline

# Smoke-test the reloaded model: wrap it with its tokenizer in a text-generation
# pipeline, complete a short prompt and print the generated text of the first result.
pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
print(pipeline("def add(x, y)")[0]["generated_text"])

In [None]:
!pip install -q huggingface_hub
from huggingface_hub import create_repo, HfApi
# from google.colab import userdata

username = "cosmo3769"
MODEL_NAME = "starcoderbase-3b"

# Defined in the secrets tab in Google Colab
api = HfApi(token="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")

# Create empty repo
create_repo(
    repo_id = f"{username}/{MODEL_NAME}-GPTQ",
    repo_type="model",
    exist_ok=True,
)

# Upload gguf files
api.upload_folder(
    folder_path="/kaggle/working/bigcode/starcoderbase-3b-GPTQ",
    repo_id=f"{username}/{MODEL_NAME}-GPTQ",
    allow_patterns=f"*.gptq",
)