## Erstellen der Arbeitsumgebung (einmalig)

In [None]:
# Bash shell
# Choose a directory for the installation
# (here: /home/thsch026/masterarbeit/experiment)
cd /home/thsch026/masterarbeit/experiment

# Create the conda environment named "autogptq" with Python 3.10
conda create --name autogptq python=3.10 -y
conda activate autogptq

# Install the required packages
pip install peft gekko pandas

# Clone the sources
git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ

# Build the sources and install them in editable mode
# (-vvv: verbose build log, --no-build-isolation: build against the packages
# already installed in the active environment)
pip install -vvv --no-build-isolation -e .

## AutoGPTQ Workflow (Kernel: autogptq)

In [9]:
import os
# Make the AutoGPTQ checkout the kernel's working directory; the relative
# output directory used when saving the quantized model resolves against it.
os.chdir("/home/thsch026/masterarbeit/experiment/AutoGPTQ")

## Auswahl des zu quantisierenden Modells

### Quantize Llama 3 8B Instruct

In [6]:
# Model selection for Llama 3 8B Instruct. The Mistral cell below assigns the
# same two names, so run only the cell for the model that should be quantized.
# Source model passed to from_pretrained (presumably a Hugging Face Hub id).
pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
# Output directory for the quantized weights and the tokenizer files.
# NOTE(review): the "_gptq_2" suffix does not obviously match bits=3 in the
# quantization cell — confirm the intended directory name.
quantized_model_dir = "Meta-Llama-3-8B-instruct-v2_gptq_2"

### Quantize Mistral 7B Instruct v0.2

In [12]:
# Model selection for Mistral 7B Instruct v0.2. Running this cell overwrites
# the values assigned by the Llama cell above.
# Source model passed to from_pretrained (presumably a Hugging Face Hub id).
pretrained_model_dir = "mistralai/Mistral-7B-Instruct-v0.2"
# Output directory for the quantized weights and the tokenizer files.
quantized_model_dir = "Mistral-7B-Instruct-v0.2_gptq_3bit"

## Funktion zum Laden des WikiText-2-Datensatzes

In [13]:
def get_wikitext2(nsamples, seed, seqlen, model):
    """Draw random calibration windows from the WikiText-2 training split.

    The train and test splits of ``wikitext-2-raw-v1`` are each joined with
    blank lines and tokenized as one long sequence. ``nsamples`` windows of
    ``seqlen`` consecutive tokens are then cut out of the tokenized training
    text at random start positions.

    Args:
        nsamples: Number of calibration windows to draw.
        seed: Seed applied to the ``random``, numpy and torch generators
            before sampling.
        seqlen: Number of tokens per window.
        model: Name or path handed to ``AutoTokenizer.from_pretrained``.

    Returns:
        A tuple ``(traindataset, testenc)``. ``traindataset`` is a list of
        dicts with the keys ``'input_ids'`` and ``'attention_mask'`` (an
        all-ones tensor shaped like ``input_ids``); ``testenc`` is the
        tokenizer output for the whole test split.

    Raises:
        ValueError: If samples are requested but the tokenized training text
            is too short to cut a window of ``seqlen`` tokens.
    """
    import random

    import numpy as np
    import torch
    from datasets import load_dataset
    from transformers import AutoTokenizer

    traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
    testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')

    # Try the slow tokenizer first and fall back to the fast one. Only regular
    # errors trigger the fallback, so Ctrl-C / SystemExit still propagate.
    try:
        tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
    trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt')
    testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt')

    # Seed every generator with the requested seed (numpy and torch used to be
    # pinned to 0 regardless of the argument).
    random.seed(seed)
    np.random.seed(seed)
    torch.random.manual_seed(seed)

    # Largest admissible start index; randint is inclusive on both ends.
    max_start = trainenc.input_ids.shape[1] - seqlen - 1
    if nsamples > 0 and max_start < 0:
        raise ValueError(
            f"tokenized training text has {trainenc.input_ids.shape[1]} tokens, "
            f"which is too short for windows of seqlen={seqlen}"
        )

    traindataset = []
    for _ in range(nsamples):
        start = random.randint(0, max_start)
        inp = trainenc.input_ids[:, start:start + seqlen]
        attention_mask = torch.ones_like(inp)
        traindataset.append({'input_ids': inp, 'attention_mask': attention_mask})
    return traindataset, testenc

## Durchführen der Quantisierung

In [None]:
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

# Calibration settings handed to get_wikitext2: number of sampled windows,
# seed for the sampling, and number of tokens per window.
NUM_CALIBRATION_SAMPLES = 128
CALIBRATION_SEED = 0
CALIBRATION_SEQLEN = 4096

# Tokenizer of the source model; it is saved next to the quantized weights
# at the end of this cell.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)

quantize_config = BaseQuantizeConfig(
    bits=3,  # quantize the model to 3-bit
    group_size=128,  # it is recommended to set the value to 128
    desc_act=False,  # False can significantly speed up inference, but perplexity may be slightly worse
)

# load the un-quantized model; by default the model is always loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)

# quantize the model; the examples must be a list of dicts whose keys can only
# be "input_ids" and "attention_mask"
print("Quantize Model")
traindataset, testenc = get_wikitext2(
    NUM_CALIBRATION_SAMPLES, CALIBRATION_SEED, CALIBRATION_SEQLEN, pretrained_model_dir
)
model.quantize(traindataset, use_triton=False, batch_size=1, cache_examples_on_gpu=False)

print("Saving model...")
# save the quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)
# The tokenizer loaded above has not been modified, so it is saved directly
# instead of being loaded from the hub a second time.
tokenizer.save_pretrained(quantized_model_dir)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
usage: quant_with_alpaca.py [-h] [--pretrained_model_dir PRETRAINED_MODEL_DIR] [--quantized_model_dir QUANTIZED_MODEL_DIR] [--bits {2,3,4,8}] [--group_size GROUP_SIZE] [--desc_act]
                            [--num_samples NUM_SAMPLES] [--save_and_reload] [--fast_tokenizer] [--use_triton] [--per_gpu_max_memory PER_GPU_MAX_MEMORY] [--cpu_max_memory CPU_MAX_MEMORY]
                            [--quant_batch_size QUANT_BATCH_SIZE] [--trust_remote_code]

options:
  -h, --help            show this help message and exit
  --pretrained_model_dir PRETRAINED_MODEL_DIR
  --quantized_model_dir QUANTIZED_MODEL_DIR
  --bits {2,3,4,8}
  --group_size GROUP_SIZE
                        group size, -1 means no grouping or full rank
  --desc_act            whether to quantize with desc_act
  --num_samples NUM_SAMPLES
                        how many samples will be used to quantize model
  --save_and_reload     whether save quantized model to disk and reload back
  --fast_tokenizer      whether use fast tokenizer
  --use_triton          whether use triton to speedup at inference
  --per_gpu_max_memory PER_GPU_MAX_MEMORY
                        max memory used to load model per gpu
  --cpu_max_memory CPU_MAX_MEMORY
                        max memory used to offload model to cpu
  --quant_batch_size QUANT_BATCH_SIZE
                        examples batch size for quantization