# Quantization with AutoGPTQ for opt-6.7b

```
第三周作业一: 
1、使用 GPTQ 量化 OPT-6.7B 模型。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/quantization/AutoGPTQ_opt-2.7b.ipynb ） ------> this notebook is for this task.
2、使用 AWQ 量化 Facebook OPT-6.7B 模型。Facebook OPT 模型地址： https://huggingface.co/facebook?search_models=opt
课程代码： https://github.com/DjangoPeng/LLM-quickstart/blob/main/quantization/AWQ_opt-2.7b.ipynb
https://github.com/DjangoPeng/LLM-quickstart/blob/main/quantization/AWQ-opt-125m.ipynb

第三周作业二： 根据硬件资源情况，在 AdvertiseGen 数据集上使用 QLoRA 微调 ChatGLM3-6B 至少 10K examples，观察 Loss 变化情况，并对比微调前后模型输出结果。
课程代码： 
https://github.com/DjangoPeng/LLM-quickstart/blob/main/peft/peft_qlora_chatglm.ipynb
https://github.com/DjangoPeng/LLM-quickstart/blob/main/peft/peft_chatglm_inference.ipynb


```

In [1]:
# Imports & Function Definition - (Run once)
import time
from typing import List, Optional, Union

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

def _find_quantized_layer(model):
    """Return one quantized linear layer of *model* to inspect, or None.

    The OPT layout (first decoder layer's ``q_proj``) is tried first so OPT
    checkpoints report the same layer as before. Any other architecture falls
    back to the first sub-module exposing a ``qweight`` attribute.
    """
    try:
        return model.model.decoder.layers[0].self_attn.q_proj
    except (AttributeError, IndexError, TypeError):
        pass
    for module in model.modules():
        if hasattr(module, "qweight"):
            return module
    return None


def _report_quantized_attrs(layer, attr_names=("qweight", "qzeros", "scales")):
    """Print whether *layer* has each attribute in *attr_names*, with its type and dtype."""
    print("\nüîé Checking for quantized attributes (even if not in __dict__):")
    for attr in attr_names:
        if not hasattr(layer, attr):
            print(f"   ‚ùå Not found: '{attr}'")
            continue
        value = getattr(layer, attr)
        attr_type = type(value).__name__
        is_tensor = isinstance(value, torch.Tensor)
        dtype = value.dtype if is_tensor else "N/A (not a tensor)"
        print(f"   ‚úÖ Found: '{attr}' | Type: {attr_type} | Tensor: {is_tensor} | Dtype: {dtype}")


def run_quantized_model(
    model_name_or_path: str,
    output_dir: Optional[str] = None,
    bits: int = 4,
    group_size: int = 128,
    dataset: Union[str, List[str]] = "wikitext2",
    desc_act: bool = False,
    text: str = "Merry Christmas! I'm glad to",
    max_new_tokens: int = 64,
    device_map: str = "auto"
):
    """
    Quantize a causal LM with GPTQ, save it, and run a short generation test.

    Args:
        model_name_or_path (str): Model identifier or path (e.g., "facebook/opt-2.7b")
        output_dir (str | None): Where to save the quantized model and its
            tokenizer (default: models/<model_name>-gptq)
        bits (int): Quantization bits (e.g., 4)
        group_size (int): GPTQ group size
        dataset (str | list[str]): Calibration dataset name (e.g., "wikitext2")
            or a list of calibration text samples
        desc_act (bool): Whether to use desc_act in GPTQ
        text (str): Input text for generation test
        max_new_tokens (int): Max tokens to generate
        device_map (str): Device mapping, e.g., "auto"

    Returns:
        tuple: ``(quant_model, tokenizer)`` - the quantized model and the
        tokenizer loaded from ``model_name_or_path``.
    """
    # Define quantization config
    quantization_config = GPTQConfig(
        bits=bits,
        group_size=group_size,
        dataset=dataset,
        desc_act=desc_act,
    )

    print(f"üîÉ Loading quantized model: {model_name_or_path}, using dataset: {dataset}, with bits: {bits}, with group_size: {group_size}")
    # Passing the GPTQ config to from_pretrained loads the model with GPTQ applied
    quant_model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        quantization_config=quantization_config,
        device_map=device_map
    )

    # Inspect one quantized layer (debugging aid only). This must never raise:
    # the quantized model has not been saved yet at this point.
    quant_layer = _find_quantized_layer(quant_model)
    if quant_layer is None:
        print("No quantized linear layer found; skipping attribute check.")
    else:
        print(f"‚úÖ Loaded quantized attention layer: {quant_layer.__class__.__name__}")
        _report_quantized_attrs(quant_layer)

    # Define output directory (default: models/<model>-gptq)
    if output_dir is None:
        model_name = model_name_or_path.split("/")[-1]  # e.g., "opt-125m"
        output_dir = f"models/{model_name}-gptq"

    # Save the quantized model
    print(f"üíæ Saving quantized model to: {output_dir}")
    quant_model.save_pretrained(output_dir)

    # Load tokenizer (always from original model_name_or_path) and store a
    # copy next to the weights so the output directory is self-contained.
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    tokenizer.save_pretrained(output_dir)

    # Generate text (inference test)
    print(f"Generating text for input: '{text}'")
    # perf_counter is monotonic, so the interval cannot be skewed by clock changes
    start_time = time.perf_counter()

    # Put the inputs on the model's own device instead of assuming GPU 0
    inputs = tokenizer(text, return_tensors="pt").to(quant_model.device)
    out = quant_model.generate(**inputs, max_new_tokens=max_new_tokens)

    processing_time_sec = time.perf_counter() - start_time
    print(f"\n‚è±Ô∏è  Processing time (inference): {processing_time_sec:.3f} seconds")

    generated_text = tokenizer.decode(out[0], skip_special_tokens=True)

    print("\nüìù Generated Text:")
    print(generated_text)

    return quant_model, tokenizer

In [2]:
# Run quantization for the selected model. This only needs to be done once
# per model: run_quantized_model saves the result to disk.

# Small in-memory calibration set, an alternative to the "wikitext2" dataset
# (pass dataset=custom_dataset to use it).
custom_dataset = [
    "The quick brown fox jumps over the lazy dog.",
    "GPTQ is a powerful 4-bit quantization algorithm for large language models.",
]

# Any GPTQ-compatible model can be selected here:
#   "facebook/opt-125m" - smallest model, for a faster test
#   "facebook/opt-6.7b" - keeps failing with OutOfMemoryError
# Work-arounds tried for the opt-6.7b OutOfMemoryError:
#   - custom_dataset instead of "wikitext2" to reduce VRAM: did not work
#   - bits=8 instead of 4: did not work
#   - group_size=512 instead of 128: outcome not recorded
MODEL_NAME_OR_PATH = "facebook/opt-2.7b"

# Run with the defaults (wikitext2 calibration). Optional overrides:
#   output_dir="models/my-quantized-model", bits=8, group_size=512,
#   dataset=custom_dataset, desc_act=False, text="Hello, how are you?",
#   max_new_tokens=32
quant_model, tokenizer = run_quantized_model(MODEL_NAME_OR_PATH)

# Example of a run calibrated on a custom dataset instead (disabled):
# run_quantized_model(
#     model_name_or_path=MODEL_NAME_OR_PATH,
#     # output_dir="models/my-quantized-model",
#     # bits=4,
#     # group_size=128,
#     dataset=[
#         "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm.",
#         "Quantization enables running large language models efficiently on consumer hardware.",
#         "The quick brown fox jumps over the lazy dog.",
#     ],
#     # desc_act=False,
#     # text="Hello, how are you?",
#     # max_new_tokens=100,
# )

üîÉ Loading quantized model: facebook/opt-2.7b, using dataset: wikitext2, with bits: 4, with group_size: 128


CUDA extension not installed.
CUDA extension not installed.


Quantizing model.decoder.layers blocks :   0%|          | 0/32 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

‚úÖ Loaded quantized attention layer: QuantLinear

üîé Checking for quantized attributes (even if not in __dict__):
   ‚úÖ Found: 'qweight' | Type: Tensor | Tensor: True | Dtype: torch.int32
   ‚úÖ Found: 'qzeros' | Type: Tensor | Tensor: True | Dtype: torch.int32
   ‚úÖ Found: 'scales' | Type: Tensor | Tensor: True | Dtype: torch.float16
üíæ Saving quantized model to: models/opt-2.7b-gptq




Generating text for input: 'Merry Christmas! I'm glad to'

‚è±Ô∏è  Processing time (inference): 6.308 seconds

üìù Generated Text:
Merry Christmas! I'm glad to see you're still around.
Thanks! I'm still here, just not posting as much.


In [3]:
# Load Quantized Model & Run Inference (Fast, No Quantization) - run as many times as needed
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time

# Base model whose saved GPTQ copy should be loaded. The value chosen in the
# quantization cell is reused when it is still defined in this session;
# otherwise (e.g. after a kernel restart) the default below is used, so this
# cell runs without repeating the quantization.
# Other candidates: "facebook/opt-125m" (smallest model for a faster test),
# "facebook/opt-6.7b".
MODEL_NAME_OR_PATH = globals().get("MODEL_NAME_OR_PATH", "facebook/opt-2.7b")

# Directory the quantized model is read from: models/<model_name>-gptq
model_name = MODEL_NAME_OR_PATH.split("/")[-1]  # e.g., "opt-2.7b"
SAVED_MODEL_DIR = f"models/{model_name}-gptq"

# Load the tokenizer from the ORIGINAL model name/path
print(f"üöÄ Loading tokenizer from: {MODEL_NAME_OR_PATH}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)

# Load the quantized model from the saved directory
print(f"üöÄ Loading quantized model from: {SAVED_MODEL_DIR}")
model = AutoModelForCausalLM.from_pretrained(
    SAVED_MODEL_DIR,
    device_map="auto"
)

# Inference
def generate_text(text: str, max_new_tokens: int = 64) -> str:
    """Generate a continuation of *text* with the loaded model and print it.

    Uses the module-level ``tokenizer`` and ``model``. The reported time covers
    tokenization, generation and decoding.

    Args:
        text (str): Prompt to continue.
        max_new_tokens (int): Maximum number of tokens to generate.

    Returns:
        str: The decoded output sequence, with special tokens skipped.
    """
    print(f"Generating text for: {text}")
    # perf_counter is monotonic, so the interval cannot be skewed by clock changes
    start_time = time.perf_counter()
    # Put the inputs on the model's own device instead of assuming GPU 0
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    result = tokenizer.decode(out[0], skip_special_tokens=True)
    elapsed = time.perf_counter() - start_time
    print(f"Time taken {elapsed:.2f} seconds")
    print(f"Generated text: {result}")
    return result

# Smoke-test the reloaded model on two prompts; `result` ends up holding the
# text generated for the last prompt.
result = generate_text("Merry Christmas! I'm glad to")
result = generate_text("The woman worked as a")

üöÄ Loading tokenizer from: facebook/opt-2.7b
üöÄ Loading quantized model from: models/opt-2.7b-gptq
Generating text for: Merry Christmas! I'm glad to
Time taken 6.24 seconds
Generated text: Merry Christmas! I'm glad to see you're still around.
Thanks! I'm still here, just not posting as much.
Generating text for: The woman worked as a
Time taken 19.10 seconds
Generated text: The woman worked as a nurse at the hospital.

The woman was taken to the hospital in critical condition.

The woman was taken to the hospital in critical condition.

The woman was taken to the hospital in critical condition.

The woman was taken to the hospital in critical condition.

The woman was taken to
