# Quantization with AutoGPTQ for opt-6.7b

```
第三周作业一: 
1、使用 GPTQ 量化 OPT-6.7B 模型。课程代码（ https://github.com/DjangoPeng/LLM-quickstart/blob/main/quantization/AutoGPTQ_opt-2.7b.ipynb ） ------> this notebook is for this task.
2、使用 AWQ 量化 Facebook OPT-6.7B 模型。Facebook OPT 模型地址： https://huggingface.co/facebook?search_models=opt
课程代码： https://github.com/DjangoPeng/LLM-quickstart/blob/main/quantization/AWQ_opt-2.7b.ipynb
https://github.com/DjangoPeng/LLM-quickstart/blob/main/quantization/AWQ-opt-125m.ipynb

第三周作业二： 根据硬件资源情况，在 AdvertiseGen 数据集上使用 QLoRA 微调 ChatGLM3-6B 至少 10K examples，观察 Loss 变化情况，并对比微调前后模型输出结果。
课程代码： 
https://github.com/DjangoPeng/LLM-quickstart/blob/main/peft/peft_qlora_chatglm.ipynb
https://github.com/DjangoPeng/LLM-quickstart/blob/main/peft/peft_chatglm_inference.ipynb


```

In [1]:
# Imports & Function Definition - (Run once)
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
import torch
import time

def _report_quantized_layer(layer) -> None:
    """Print the class of *layer* and which GPTQ tensors it exposes."""
    print(f"✅ Loaded quantized attention layer: {layer.__class__.__name__}")

    # hasattr/getattr are used (rather than __dict__) so attributes exposed
    # through the module's own lookup machinery are found as well.
    print("\n🔎 Checking for quantized attributes (even if not in __dict__):")
    for attr in ("qweight", "qzeros", "scales"):
        if not hasattr(layer, attr):
            print(f"   ❌ Not found: '{attr}'")
            continue
        value = getattr(layer, attr)
        attr_type = type(value).__name__
        is_tensor = isinstance(value, torch.Tensor)
        dtype = value.dtype if is_tensor else "N/A (not a tensor)"
        print(f"   ✅ Found: '{attr}' | Type: {attr_type} | Tensor: {is_tensor} | Dtype: {dtype}")


def run_quantized_model(
    model_name_or_path: str,
    output_dir: "str | None" = None,
    bits: int = 4,
    group_size: int = 128,
    dataset: "str | list[str]" = "wikitext2",
    desc_act: bool = False,
    text: str = "Merry Christmas! I'm glad to",
    max_new_tokens: int = 64,
    device_map: str = "auto",
    max_memory: "dict | None" = None,
):
    """
    Quantize a causal LM with GPTQ, save it, and run one generation as a test.

    Args:
        model_name_or_path: Model identifier or path (e.g., "facebook/opt-2.7b").
        output_dir: Where to save the quantized model. When None, defaults to
            "models/<last path component of model_name_or_path>-gptq".
        bits: Passed to GPTQConfig(bits=...).
        group_size: Passed to GPTQConfig(group_size=...).
        dataset: Calibration data passed to GPTQConfig(dataset=...); either a
            dataset name such as "wikitext2" or a list of raw text samples.
        desc_act: Passed to GPTQConfig(desc_act=...).
        text: Prompt used for the generation test.
        max_new_tokens: Max tokens to generate in the test.
        device_map: Forwarded to from_pretrained, e.g. "auto".
        max_memory: Optional per-device memory limits forwarded to
            from_pretrained, e.g. {0: "16GiB", "cpu": "30GiB"}. Left out of
            the call entirely when None.

    Returns:
        Tuple of (quantized model, tokenizer).
    """
    quantization_config = GPTQConfig(
        bits=bits,
        group_size=group_size,
        dataset=dataset,
        desc_act=desc_act,
    )

    print(f"🔃 Loading quantized model: {model_name_or_path}, using dataset: {dataset}, with bits: {bits}, with group_size: {group_size}")
    load_kwargs = {
        "quantization_config": quantization_config,
        "device_map": device_map,
    }
    if max_memory is not None:
        load_kwargs["max_memory"] = max_memory
    # Passing a GPTQConfig here makes from_pretrained run the quantization.
    quant_model = AutoModelForCausalLM.from_pretrained(model_name_or_path, **load_kwargs)

    # Optional debugging aid. The attribute path below is the OPT layout; for
    # other architectures skip the report instead of failing before the save.
    try:
        first_attn_layer = quant_model.model.decoder.layers[0].self_attn.q_proj
    except (AttributeError, IndexError):
        print("ℹ️  Skipping layer inspection: model has no model.decoder.layers[0].self_attn.q_proj")
    else:
        _report_quantized_layer(first_attn_layer)

    # Define output directory (default: models/<model>-gptq)
    if output_dir is None:
        model_name = model_name_or_path.split("/")[-1]  # e.g., "opt-125m"
        output_dir = f"models/{model_name}-gptq"

    print(f"💾 Saving quantized model to: {output_dir}")
    quant_model.save_pretrained(output_dir)

    # The tokenizer always comes from the original model_name_or_path.
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    print(f"Generating text for input: '{text}'")
    # perf_counter is the monotonic clock intended for measuring intervals.
    start_time = time.perf_counter()

    # Send inputs to wherever the model reports its device instead of
    # assuming GPU 0, so non-default device_map values keep working.
    inputs = tokenizer(text, return_tensors="pt").to(quant_model.device)
    out = quant_model.generate(**inputs, max_new_tokens=max_new_tokens)

    processing_time_sec = time.perf_counter() - start_time
    print(f"\n⏱️  Processing time (inference): {processing_time_sec:.3f} seconds")

    generated_text = tokenizer.decode(out[0], skip_special_tokens=True)

    print("\n📝 Generated Text:")
    print(generated_text)

    return quant_model, tokenizer

In [2]:
# Run Quantization (Do Once) for the defined model
# Any GPTQ-compatible checkpoint can be used here. Smaller ones are quicker
# to try out:
#   "facebook/opt-125m"
#   "facebook/opt-2.7b"
# NOTE: opt-6.7b keeps hitting CUDA OutOfMemoryError with the settings below.
MODEL_NAME_OR_PATH = "facebook/opt-6.7b"

# Attempts made so far to lower VRAM use during quantization:
#   - a tiny custom calibration dataset instead of "wikitext2" -- did not help
#   - bits=8 instead of 4 -- did not help
#   - a larger group_size than 128 (the note said 512; the call passes 256)
# A more descriptive sample that was also tried as calibration text:
#   "GPTQ is a powerful 4-bit quantization algorithm for large language models."
custom_dataset = ["ok"]

# Quantize using the custom calibration dataset defined above. Arguments not
# listed (output_dir, desc_act, text, max_new_tokens) keep their defaults.
quant_model, tokenizer = run_quantized_model(
    model_name_or_path=MODEL_NAME_OR_PATH,
    bits=8,
    group_size=256,
    dataset=custom_dataset,
)

# Alternative kept for reference: default bits/group_size with a slightly
# larger custom calibration set.
# run_quantized_model(
#     model_name_or_path=MODEL_NAME_OR_PATH,
#     dataset=[
#         "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm.",
#         "Quantization enables running large language models efficiently on consumer hardware.",
#         "The quick brown fox jumps over the lazy dog.",
#     ],
# )

🔃 Loading quantized model: facebook/opt-6.7b, using dataset: ['ok'], with bits: 8, with group_size: 256


CUDA extension not installed.
CUDA extension not installed.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Quantizing model.decoder.layers blocks :   0%|          | 0/32 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU 

In [None]:
# Load Quantized Model & Run Inference (Fast, No Quantization) - run as many times as needed
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time

from pathlib import Path

# Model whose saved GPTQ checkpoint should be loaded.
# If the quantization cell already defined MODEL_NAME_OR_PATH, that value is
# reused; otherwise (e.g. in a fresh kernel) fall back to the default below so
# this cell does not fail with NameError.
# Smaller alternatives for a faster test:
#   "facebook/opt-125m"
#   "facebook/opt-2.7b"
MODEL_NAME_OR_PATH = globals().get("MODEL_NAME_OR_PATH", "facebook/opt-6.7b")

model_name = MODEL_NAME_OR_PATH.split("/")[-1]  # e.g., "opt-2.7b"
SAVED_MODEL_DIR = f"models/{model_name}-gptq"

# Fail early with a clear message when the checkpoint was never written,
# rather than letting from_pretrained treat the path as a hub repo id.
if not Path(SAVED_MODEL_DIR).is_dir():
    raise FileNotFoundError(
        f"Quantized model directory not found: {SAVED_MODEL_DIR}. "
        "Run the quantization cell successfully first."
    )

# Load the tokenizer from the ORIGINAL model name/path
print(f"🚀 Loading tokenizer from: {MODEL_NAME_OR_PATH}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)

# Load the already-quantized weights from disk (no quantization happens here)
print(f"🚀 Loading quantized model from: {SAVED_MODEL_DIR}")
model = AutoModelForCausalLM.from_pretrained(
    SAVED_MODEL_DIR,
    device_map="auto",
)

# Inference
def generate_text(text, max_new_tokens=100):
    """Generate a continuation of *text* with the loaded model and print it.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Prompt to continue.
        max_new_tokens: Upper bound on newly generated tokens (default 100).

    Returns:
        The decoded output of ``model.generate`` with special tokens removed.
    """
    print(f"Generating text for: {text}")
    # perf_counter is the monotonic clock intended for measuring intervals.
    start_time = time.perf_counter()
    # Send inputs to the device the model reports instead of assuming GPU 0.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    result = tokenizer.decode(out[0], skip_special_tokens=True)
    elapsed = time.perf_counter() - start_time
    print(f"Time taken {elapsed:.2f} seconds")
    print(f"Generated text: {result}")
    return result

# Smoke-test the reloaded model on two prompts; `result` keeps the last output.
for prompt in ("Merry Christmas! I'm glad to", "The woman worked as a"):
    result = generate_text(prompt)