# Quantizing Language Models: A Hands-On Guide

This notebook walks you through quantizing a small LLM step by step. You'll:
1. Load a model in different precisions
2. Compare memory usage and inference speed
3. Evaluate quality using perplexity
4. Apply GPTQ quantization for maximum compression

**Requirements:**
- Colab with GPU (T4 is fine)
- ~30 minutes

This notebook accompanies [rlbook.ai/chapters/quantization](https://rlbook.ai/chapters/quantization)

## Setup

In [None]:
# Install required packages
!pip install -q transformers accelerate bitsandbytes
!pip install -q auto-gptq optimum
!pip install -q datasets matplotlib

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
from tqdm.notebook import tqdm
import time
import gc

# Report the PyTorch build and, when CUDA is visible, device 0's name and memory
cuda_ok = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    gpu_name = torch.cuda.get_device_name(0)
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {gpu_name}")
    print(f"Memory: {gpu_total_bytes / 1e9:.1f} GB")

In [None]:
# Qwen2.5-0.5B is small enough to run on a free Colab GPU
MODEL_NAME = "Qwen/Qwen2.5-0.5B"

# A single tokenizer is shared by every precision variant loaded below
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Reuse EOS as the padding token when the checkpoint does not define one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

vocab_size = len(tokenizer)
print(f"Tokenizer loaded: vocab size = {vocab_size}")

## Part 1: Loading Models at Different Precisions

Let's load the same model in float16, 8-bit, and 4-bit and compare.

In [None]:
def load_model(precision="fp16"):
    """
    Load MODEL_NAME at the requested precision.

    Args:
        precision: 'fp16' (half-precision weights), '8bit' (bitsandbytes
            8-bit), or '4bit' (bitsandbytes NF4 with double quantization
            and float16 compute).

    Returns:
        The model returned by AutoModelForCausalLM.from_pretrained, loaded
        with device_map="auto".

    Raises:
        ValueError: if `precision` is not one of the supported values.
    """
    # Validate the argument first so a typo fails before any work is done.
    if precision == "fp16":
        load_kwargs = {"torch_dtype": torch.float16}
    elif precision == "8bit":
        # Pass 8-bit settings through quantization_config, the same way the
        # 4-bit branch does, instead of the bare load_in_8bit=True kwarg.
        load_kwargs = {
            "quantization_config": BitsAndBytesConfig(load_in_8bit=True),
        }
    elif precision == "4bit":
        load_kwargs = {
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
            ),
        }
    else:
        raise ValueError(f"Unknown precision: {precision}")

    # Collect garbage first so tensors that just became unreachable are
    # released, then return the freed blocks from PyTorch's cache.
    gc.collect()
    torch.cuda.empty_cache()

    return AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        **load_kwargs,
    )

In [None]:
# Start with the half-precision baseline
print("Loading float16 model...")
model_fp16 = load_model("fp16")

# The parameter count drives the size estimates printed here and later
n_params = sum(param.numel() for param in model_fp16.parameters())
fp16_bytes = n_params * 2  # two bytes per fp16 parameter
print(f"Model parameters: {n_params / 1e6:.1f}M")
print(f"Theoretical fp16 size: {fp16_bytes / 1e9:.2f} GB")

In [None]:
# Load the 8-bit variant via load_model's "8bit" branch; kept in its own
# variable so it can be compared side by side with the other precisions.
print("Loading 8-bit model...")
model_8bit = load_model("8bit")
print("8-bit model loaded!")

In [None]:
# Load the 4-bit variant via load_model's "4bit" branch; kept in its own
# variable so it can be compared side by side with the other precisions.
print("Loading 4-bit model...")
model_4bit = load_model("4bit")
print("4-bit model loaded!")

## Part 2: Comparing Memory Usage

In [None]:
def get_memory_usage():
    """Return torch.cuda.memory_allocated() in GB, or 0 when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return 0
    allocated_bytes = torch.cuda.memory_allocated()
    return allocated_bytes / 1e9

# Note: this reading covers all three models at once because they are all
# still loaded; in practice you would load one at a time.
print(f"Current GPU memory: {get_memory_usage():.2f} GB")

# Bytes per parameter at each precision -> theoretical size in GB
bytes_per_param = {'fp16': 2, '8bit': 1, '4bit': 0.5}
theoretical = {
    precision: n_params * n_bytes / 1e9
    for precision, n_bytes in bytes_per_param.items()
}

print("\nTheoretical model sizes:")
for name, size in theoretical.items():
    print(f"  {name}: {size:.2f} GB")

## Part 3: Generation Quality Comparison

In [None]:
def generate(model, prompt, max_tokens=50, temperature=0.7):
    """
    Sample a continuation of `prompt` and time the generation call.

    Args:
        model: Causal LM exposing `.device` and `.generate()`.
        prompt: Text to continue.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature forwarded to `model.generate`.

    Returns:
        (generated, elapsed): the decoded continuation with the prompt
        excluded and surrounding whitespace stripped, and the wall-clock
        seconds spent inside `model.generate`.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    start = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=tokenizer.pad_token_id
        )
    elapsed = time.perf_counter() - start

    # Drop the prompt by token position rather than by character count:
    # decoding does not always reproduce the prompt text exactly, so slicing
    # the decoded string by len(prompt) can cut the continuation in the
    # wrong place.
    new_token_ids = outputs[0][prompt_len:]
    generated = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

    return generated, elapsed

In [None]:
# Test prompts
prompts = [
    "Reinforcement learning is a type of machine learning where",
    "The capital of France is",
    "To train a neural network, you need to",
]

models = {
    "FP16": model_fp16,
    "8-bit": model_8bit,
    "4-bit": model_4bit,
}

# generate() samples, so reseed before every call: each model then draws from
# the same random stream and the output is reproducible across runs. Without
# this, differences between rows mix quantization effects with sampling noise.
SEED = 42

print("Generation Comparison")
print("=" * 70)

for prompt in prompts:
    print(f"\nPrompt: {prompt}...\n")
    print("-" * 70)

    for name, model in models.items():
        torch.manual_seed(SEED)
        generated, elapsed = generate(model, prompt, max_tokens=30)
        print(f"{name} ({elapsed:.2f}s): {generated[:80]}...")

    print()

## Part 4: Perplexity Evaluation

Perplexity is the standard metric for language model quality. Lower is better.

In [None]:
def compute_perplexity(model, texts, max_length=512):
    """
    Compute token-level perplexity of `model` over a list of texts.

    Each text is tokenized on its own (truncated to `max_length` tokens) and
    scored with the model's language-modeling loss. Per-text mean losses are
    combined into one corpus-level mean, weighted by the number of predicted
    tokens in each text.

    Args:
        model: Causal LM exposing `.device`, `.eval()` and a forward pass
            that returns `.loss` when given `labels`.
        texts: Iterable of strings to score.
        max_length: Maximum number of tokens kept per text.

    Returns:
        exp(mean negative log-likelihood per predicted token).

    Raises:
        ValueError: if no text yields at least one predicted token.
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    for text in tqdm(texts, desc="Computing perplexity"):
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length
        ).to(model.device)

        # The causal-LM loss is a mean over next-token predictions, and a
        # sequence of n tokens only has n - 1 of them (the first token has no
        # context). Weight by n - 1 so the corpus mean is exact, and skip
        # sequences too short to produce a prediction (their loss is undefined).
        n_predicted = inputs["input_ids"].shape[1] - 1
        if n_predicted < 1:
            continue

        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])

        total_loss += outputs.loss.item() * n_predicted
        total_tokens += n_predicted

    if total_tokens == 0:
        raise ValueError("No tokens to evaluate: `texts` is empty or every text is too short")

    avg_loss = total_loss / total_tokens
    perplexity = np.exp(avg_loss)

    return perplexity

In [None]:
# Load evaluation dataset
print("Loading WikiText-2 test set...")
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

# Keep the first 100 lines whose stripped length exceeds 100 characters
long_lines = [line for line in dataset["text"] if len(line.strip()) > 100]
texts = long_lines[:100]
print(f"Using {len(texts)} samples for evaluation")

In [None]:
# Score every loaded model on the same evaluation texts
results = {}

for name, model in models.items():
    print(f"\nEvaluating {name}...")
    results[name] = compute_perplexity(model, texts)
    print(f"{name} perplexity: {results[name]:.2f}")

In [None]:
# Bar chart of perplexity per precision
fig, ax = plt.subplots(figsize=(10, 5))

names, ppls = list(results), list(results.values())
colors = ['#22d3ee', '#4ade80', '#fbbf24']

bars = ax.bar(names, ppls, color=colors)

# Print each bar's value just above it
for bar, ppl in zip(bars, ppls):
    x_center = bar.get_x() + bar.get_width() / 2
    y_label = bar.get_height() + 0.5
    ax.text(x_center, y_label, f'{ppl:.2f}',
            ha='center', va='bottom', fontsize=12)

ax.set_ylabel('Perplexity (lower is better)')
ax.set_title('Perplexity by Quantization Level')
# Leave 20% headroom so the value labels stay inside the axes
ax.set_ylim(0, max(ppls) * 1.2)

plt.tight_layout()
plt.show()

# Relative perplexity change versus the FP16 baseline, in percent
baseline = results['FP16']
print("\nQuality loss relative to FP16:")
for name, ppl in results.items():
    delta_pct = (ppl - baseline) / baseline * 100
    print(f"  {name}: {delta_pct:+.1f}%")

## Part 5: Speed Benchmarking

In [None]:
def benchmark_speed(model, prompt, n_tokens=100, n_runs=3):
    """
    Benchmark greedy generation speed.

    Runs one short warmup generation, then times `n_runs` greedy generations
    of up to `n_tokens` new tokens each.

    Args:
        model: Causal LM exposing `.device` and `.generate()`.
        prompt: Text used as the generation prefix.
        n_tokens: Maximum number of new tokens per timed run.
        n_runs: Number of timed runs to average over (must be >= 1).

    Returns:
        (tokens_per_sec, avg_time): throughput based on the tokens actually
        generated, and the mean wall-clock seconds per run.

    Raises:
        ValueError: if `n_runs` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be >= 1, got {n_runs}")

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]
    use_cuda = torch.cuda.is_available()

    # Warmup
    with torch.no_grad():
        model.generate(
            **inputs,
            max_new_tokens=10,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id
        )

    times = []
    generated_counts = []
    for _ in range(n_runs):
        # Wait for pending GPU work so the timer brackets only this run.
        if use_cuda:
            torch.cuda.synchronize()
        start = time.perf_counter()

        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=n_tokens,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id
            )

        if use_cuda:
            torch.cuda.synchronize()
        times.append(time.perf_counter() - start)
        # Generation can stop early at EOS, so count what was really produced
        # instead of assuming all n_tokens were generated.
        generated_counts.append(output_ids.shape[1] - prompt_len)

    avg_time = np.mean(times)
    tokens_per_sec = np.mean(generated_counts) / avg_time

    return tokens_per_sec, avg_time

In [None]:
# Time every loaded model on the same prompt
prompt = "The history of artificial intelligence began in"

speed_results = {}
print("Speed Benchmark (100 tokens)")
print("-" * 40)

for name, model in models.items():
    throughput, avg_seconds = benchmark_speed(model, prompt)
    speed_results[name] = throughput
    print(f"{name}: {throughput:.1f} tokens/sec ({avg_seconds:.2f}s)")

In [None]:
# Bar chart of generation throughput per precision
fig, ax = plt.subplots(figsize=(10, 5))

names, speeds = list(speed_results), list(speed_results.values())

bars = ax.bar(names, speeds, color=colors)

# Print each bar's value just above it
for bar, speed in zip(bars, speeds):
    x_center = bar.get_x() + bar.get_width() / 2
    y_label = bar.get_height() + 1
    ax.text(x_center, y_label, f'{speed:.1f}',
            ha='center', va='bottom', fontsize=12)

ax.set_ylabel('Tokens per second')
ax.set_title('Generation Speed by Quantization Level')

plt.tight_layout()
plt.show()

# Throughput ratio versus the FP16 baseline
baseline_speed = speed_results['FP16']
print("\nSpeedup relative to FP16:")
for name, speed in speed_results.items():
    ratio = speed / baseline_speed
    print(f"  {name}: {ratio:.2f}x")

## Part 6: GPTQ Quantization (Higher Quality 4-bit)

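GPTQ is a calibration-based method: it uses sample data to choose 4-bit weights that minimize each layer's output error, which often gives better quality than the data-free 4-bit quantization in bitsandbytes.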

In [None]:
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

# GPTQ configuration: settings are collected in one dict and forwarded
# unchanged to BaseQuantizeConfig.
gptq_settings = {
    "bits": 4,
    "group_size": 128,
    "desc_act": True,
    "damp_percent": 0.1,
}
quantize_config = BaseQuantizeConfig(**gptq_settings)

print("GPTQ Configuration:")
print(f"  Bits: {quantize_config.bits}")
print(f"  Group size: {quantize_config.group_size}")

In [None]:
# Create calibration data
def get_calibration_data(n_samples=128, seq_len=512):
    """
    Build GPTQ calibration examples from the WikiText-2 training split.

    Args:
        n_samples: Maximum number of examples to return.
        seq_len: Maximum number of tokens kept per example.

    Returns:
        A list of dicts, one per example, each holding the tokenizer's
        "input_ids" and "attention_mask" tensors. AutoGPTQ's `quantize`
        reads both keys from every example, so bare input-id tensors are
        not sufficient.
    """
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

    data = []
    for sample in dataset:
        # Check the budget before adding, so n_samples=0 returns no examples.
        if len(data) >= n_samples:
            break

        text = sample['text'].strip()
        # Skip headings and blank lines; only longer passages are kept.
        if len(text) <= 100:
            continue

        tokens = tokenizer(text, return_tensors="pt",
                           max_length=seq_len, truncation=True)
        data.append({
            "input_ids": tokens["input_ids"],
            "attention_mask": tokens["attention_mask"],
        })

    return data

# Build the calibration set once; it is passed to model_gptq.quantize() below.
print("Creating calibration data...")
calibration_data = get_calibration_data(n_samples=128)
print(f"Created {len(calibration_data)} calibration samples")

In [None]:
# Clear memory before GPTQ.
# Dropping only model_fp16 / model_8bit / model_4bit frees nothing, because
# the `models` dict and the leftover loop variable `model` still reference the
# same objects. Remove every name that points at them; pop() with a default
# also keeps this cell safe to re-run.
for _stale_name in ("model_fp16", "model_8bit", "model_4bit", "models", "model"):
    globals().pop(_stale_name, None)

# Collect first so the now-unreachable models are destroyed, then hand the
# freed blocks back from PyTorch's CUDA cache.
gc.collect()
torch.cuda.empty_cache()

print("Loading model for GPTQ...")
model_gptq = AutoGPTQForCausalLM.from_pretrained(
    MODEL_NAME,
    quantize_config=quantize_config,
    torch_dtype=torch.float16
)

In [None]:
# Run GPTQ quantization on model_gptq using the calibration examples built above.
# This is the slow step of the notebook.
print("Running GPTQ quantization (this may take a few minutes)...")
model_gptq.quantize(calibration_data)
print("Quantization complete!")

In [None]:
# Save the quantized model
save_path = "./qwen-0.5b-gptq-4bit"
print(f"Saving to {save_path}...")
model_gptq.save_quantized(save_path)
tokenizer.save_pretrained(save_path)
print("Saved!")

# Reload the saved checkpoint for inference, following AutoGPTQ's documented
# quantize -> save_quantized -> from_quantized workflow. Nothing earlier in
# this notebook places the model being quantized on the GPU, so the cells
# below would otherwise run on whatever device quantize() left it on.
inference_device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Reloading quantized model on {inference_device}...")
model_gptq = AutoGPTQForCausalLM.from_quantized(save_path, device=inference_device)
gc.collect()
torch.cuda.empty_cache()
print("Reloaded!")

In [None]:
# Smoke-test the GPTQ model with one sampled completion
print("\nTesting GPTQ model...")
prompt = "Reinforcement learning is a type of machine learning where"
inputs = tokenizer(prompt, return_tensors="pt").to(model_gptq.device)

sampling_kwargs = dict(max_new_tokens=50, do_sample=True, temperature=0.7)
with torch.no_grad():
    outputs = model_gptq.generate(**inputs, **sampling_kwargs)

# The decoded sequence contains the prompt followed by the continuation
first_sequence = outputs[0]
response = tokenizer.decode(first_sequence, skip_special_tokens=True)
print(f"Prompt: {prompt}")
print(f"Response: {response}")

In [None]:
# Evaluate GPTQ model on the same texts used for the other precisions
print("\nEvaluating GPTQ model...")
ppl_gptq = compute_perplexity(model_gptq, texts)
print(f"GPTQ 4-bit perplexity: {ppl_gptq:.2f}")

# Compare to bitsandbytes 4-bit.
# `results` still holds the score measured earlier on the same `texts`, so the
# comparison uses the real number; no reload or estimate is needed.
ppl_bnb = results.get('4-bit')
if ppl_bnb is not None:
    print(f"BnB 4-bit perplexity: {ppl_bnb:.2f}")
    gptq_vs_bnb = (ppl_gptq - ppl_bnb) / ppl_bnb * 100
    print(f"GPTQ vs BnB 4-bit: {gptq_vs_bnb:+.1f}%")

## Summary

We've demonstrated:
1. **bitsandbytes** for quick quantization (8-bit and 4-bit)
2. **Perplexity evaluation** to measure quality
3. **Speed benchmarking** to measure throughput
4. **GPTQ** for higher-quality 4-bit quantization

Key findings:
- 8-bit quantization typically has <1% quality loss
- 4-bit quantization has ~3-5% quality loss
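- Quantization can provide 2-4x speedup when inference is memory-bound and optimized low-bit kernels are used; with bitsandbytes on a model this small, the quantized variants may run slower than FP16, so check your own numbers from Part 5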
- GPTQ gives better quality than naive quantization

For more details, see [rlbook.ai/chapters/quantization](https://rlbook.ai/chapters/quantization)