# Lecture 5: Quantization (Part I) - Basics & PTQ

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gaurav-redhat/transformer_problems/blob/efficientml-course/efficientml_course/05_quantization_1/demo.ipynb)

Post-Training Quantization: FP32 → INT8 with 4x memory savings!


In [None]:
!pip install torch -q
import torch
import torch.nn as nn

# Manual quantization demo
def quantize_tensor(x, num_bits=8):
    """Affine (asymmetric) min/max quantization of a tensor.

    The observed range ``[x.min(), x.max()]`` is mapped linearly onto the
    unsigned integer grid ``[0, 2**num_bits - 1]``.

    Args:
        x: Non-empty tensor to quantize.
        num_bits: Width of the integer grid, between 1 and 63 inclusive.

    Returns:
        A tuple ``(q, scale, zero_point, x_dequant)``:

        * ``q`` -- quantized values. ``torch.uint8`` for up to 8 bits,
          otherwise the narrowest signed integer dtype that can hold
          ``2**num_bits - 1`` (so wider grids no longer wrap around).
        * ``scale`` -- 0-dim tensor, real-valued step between two levels.
        * ``zero_point`` -- 0-dim tensor, the grid position of real 0.
          It is intentionally left unrounded (a float), so this is a
          simplified demo rather than a true integer zero point.
        * ``x_dequant`` -- ``x`` reconstructed from ``q``, for measuring
          the quantization error.

    Raises:
        ValueError: If ``num_bits`` is outside ``[1, 63]``.
    """
    if not 1 <= num_bits <= 63:
        raise ValueError(f"num_bits must be in [1, 63], got {num_bits}")

    qmin, qmax = 0, 2**num_bits - 1

    # Pick a storage dtype wide enough for qmax; uint8 only fits 8 bits.
    if num_bits <= 8:
        q_dtype = torch.uint8
    elif num_bits <= 15:
        q_dtype = torch.int16
    elif num_bits <= 31:
        q_dtype = torch.int32
    else:
        q_dtype = torch.int64

    # Find scale and zero point
    min_val, max_val = x.min(), x.max()
    scale = (max_val - min_val) / (qmax - qmin)
    # A constant tensor has zero range; fall back to a unit step so the
    # divisions below stay finite instead of producing inf/NaN.
    scale = torch.where(scale > 0, scale, torch.ones_like(scale))
    zero_point = qmin - min_val / scale

    # Quantize
    q = torch.clamp(torch.round(x / scale + zero_point), qmin, qmax)

    # Dequantize (for comparison)
    x_dequant = (q - zero_point) * scale

    return q.to(q_dtype), scale, zero_point, x_dequant

# Example
# Round-trip a small random tensor through 8-bit quantization.
x = torch.randn(4, 4)
q, scale, zp, x_recon = quantize_tensor(x, num_bits=8)

# Summary numbers for the report below.
mean_abs_error = (x - x_recon).abs().mean()
fp32_bytes = x.numel() * 4  # 4 bytes per FP32 element
int8_bytes = q.numel()      # 1 byte per quantized element

print("Original FP32 tensor:", x, sep="\n")
print(f"\nQuantized INT8 (scale={scale:.4f}, zero_point={zp:.1f}):")
print(q)
print(f"\nReconstruction error: {mean_abs_error:.6f}")
print(f"Memory: {fp32_bytes} bytes (FP32) → {int8_bytes} bytes (INT8) = 4x reduction!")


In [None]:
# PyTorch dynamic quantization
# Two-layer MLP (512 -> 256 -> 10) used as the quantization target.
# Layers are created in the same order as they appear in the network.
hidden_layer = nn.Linear(512, 256)
activation = nn.ReLU()
output_layer = nn.Linear(256, 10)
model = nn.Sequential(hidden_layer, activation, output_layer)

# Get model size
def get_size_mb(model):
    """Return the serialized size of ``model``'s state dict in MiB.

    The size is measured by saving ``model.state_dict()`` to an in-memory
    buffer. Summing ``model.parameters()`` is not enough here: modules
    produced by dynamic quantization keep their weights in packed params
    that are not registered as parameters, so a parameter-based count
    reports 0 for them (and the compression ratio then divides by zero).
    Serializing the state dict also accounts for buffers.

    Args:
        model: Any ``nn.Module`` (float or quantized).

    Returns:
        Size in MiB as a float. It includes a small, fixed serialization
        overhead on top of the raw tensor bytes.
    """
    import io  # local import keeps this cell self-contained

    buffer = io.BytesIO()
    torch.save(model.state_dict(), buffer)
    return buffer.getbuffer().nbytes / 1024 / 1024

# Measure the float model first so the baseline is reported up front.
original_mb = get_size_mb(model)
print(f"Original model size: {original_mb:.2f} MB")

# Convert every nn.Linear to its dynamically quantized int8 counterpart.
quantized_model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec={nn.Linear},
    dtype=torch.qint8,
)

# Report the quantized footprint and the resulting ratio.
quantized_mb = get_size_mb(quantized_model)
print(f"Quantized model size: {quantized_mb:.2f} MB")
print(f"Compression: {original_mb / quantized_mb:.1f}x")
