# LLM Quantization Benchmark
## Comparing 7 quantization methods on Qwen2-0.5B

Methods covered:
| Method | Type | Bits | Key Idea |
|---|---|---|---|
| **FP16 Baseline** | — | 16 | No quantization |
| **RTN INT8** | PTQ | 8 | Round-to-nearest, per-channel scale |
| **RTN INT4** | PTQ | 4 | Same, more aggressive |
| **SmoothQuant** | PTQ | 8 | Migrate outliers from activations to weights |
| **GPTQ** | PTQ | 4 | Hessian-based error compensation, column-wise |
| **AWQ** | PTQ | 4 | Activation-aware weight channel scaling |
| **Palletization 4-bit** | Codebook | 4 | K-means LUT, Apple CoreML style |
| **QLoRA NF4** | PTQ | ~4.1 | Normal Float quantile grid + double quant |


In [None]:
# Setup
import torch
import json
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
from pathlib import Path

# Notebook-wide configuration, defined before the banner so the banner can
# report the device that was actually selected.
MODEL_ID = 'Qwen/Qwen2-0.5B'
# Prefer the MPS backend when PyTorch reports it as available; otherwise CPU.
DEVICE = 'mps' if torch.backends.mps.is_available() else 'cpu'

# Environment banner.
print(f"PyTorch: {torch.__version__}")
print(f"Device: {'MPS (Apple Silicon)' if DEVICE == 'mps' else 'CPU'}")

## 1. Intuition: What each method does to the weight distribution

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from quant_methods.palletization import kmeans_lut, uniform_lut, NF4_LEVELS
from quant_methods.qlora_nf4 import NF4_LEVELS

# Synthetic stand-in for a weight vector: 256 seeded Gaussian samples scaled
# by 0.02 (the scale the notebook treats as typical for LLM weights).
torch.manual_seed(42)
W = 0.02 * torch.randn(256)

bits = 4
n_entries = 1 << bits       # number of representable codes: 16
qmax = n_entries // 2 - 1   # largest positive signed code: 7 for INT4

# Largest magnitude in W; the uniform grid and the NF4 grid are both scaled by it.
absmax = W.abs().max()

# ── Method 1: RTN INT4 (uniform grid) ────────────────────────────────────────
# Round to the nearest integer code, clamp to [-qmax-1, qmax], then rescale.
scale_rtn = absmax / qmax
W_rtn = torch.clamp(torch.round(W / scale_rtn), min=-qmax - 1, max=qmax) * scale_rtn
rtn_levels = scale_rtn * torch.arange(-qmax - 1, qmax + 1).float()

# ── Method 2: K-means Palletization ──────────────────────────────────────────
# kmeans_lut returns a lookup table and per-weight indices into it;
# dequantization is a plain gather.
km_lut, km_idx = kmeans_lut(W, n_entries)
W_pall = km_lut[km_idx]

# ── Method 3: NF4 ────────────────────────────────────────────────────────────
# Normalize by absmax, snap every weight to the closest NF4 level, rescale.
W_norm = W / absmax
dists = torch.abs(W_norm[:, None] - NF4_LEVELS[None, :])
nf4_idx = torch.argmin(dists, dim=1)
W_nf4 = absmax * NF4_LEVELS[nf4_idx]
nf4_levels_scaled = absmax * NF4_LEVELS

from pathlib import Path

# One panel per method: original vs quantized histogram, with the method's
# quantization levels drawn as faint vertical lines and the MSE in the title.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Quantization Grid vs Weight Distribution\n(same 4-bit budget, different level placement)',
             fontsize=13, fontweight='bold')

# (panel title, dequantized weights, level positions, colour)
methods = [
    ('RTN INT4\n(uniform grid)', W_rtn, rtn_levels, '#e74c3c'),
    ('Palletization 4-bit\n(k-means LUT)', W_pall, km_lut, '#2ecc71'),
    ('NF4\n(normal quantile grid)', W_nf4, nf4_levels_scaled, '#3498db'),
]

# The reference histogram is identical in every panel, so convert it once.
W_np = W.numpy()

for ax, (title, W_q, levels, color) in zip(axes, methods):
    # Original distribution underneath, quantized distribution on top
    ax.hist(W_np, bins=40, alpha=0.4, color='gray', label='Original')
    ax.hist(W_q.numpy(), bins=40, alpha=0.6, color=color, label='Quantized')

    # Quantization levels as vertical lines
    for lv in levels:
        ax.axvline(lv.item(), color=color, alpha=0.3, linewidth=0.8)

    # Mean squared reconstruction error against the unquantized weights
    mse = (W_q - W).pow(2).mean().item()
    ax.set_title(f'{title}\nMSE = {mse:.2e}', fontsize=11)
    ax.set_xlabel('Weight value')
    ax.set_ylabel('Count')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories, and this cell runs before the
# benchmark cell, so make sure the output folder exists first.
Path('results').mkdir(parents=True, exist_ok=True)
plt.savefig('results/weight_distribution_comparison.png', dpi=150, bbox_inches='tight')
plt.show()
print('Key insight: k-means and NF4 cluster their levels where weights are dense')
print('RTN wastes levels on sparse regions near ±max')

## 2. Palletization deep-dive: K-means vs Uniform LUT vs RTN

In [None]:
from quant_methods.palletization import compare_lut_methods
from transformers import AutoModelForCausalLM

# Load the FP16 checkpoint so the LUT variants are compared on real weights.
print('Loading Qwen2-0.5B to compare LUT methods on real weights...')
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

# Collect one comparison dict per qualifying layer, keyed by module name,
# stopping once 20 layers have been gathered.
results_lut = {}
for name, module in model.named_modules():
    # Only modules that carry a weight and whose class name contains 'Linear'.
    if getattr(module, 'weight', None) is None:
        continue
    if 'Linear' not in type(module).__name__:
        continue
    if module.weight.numel() > 1000:  # skip tiny layers
        results_lut[name] = compare_lut_methods(module.weight.data, bits=4)
    if len(results_lut) >= 20:
        break

# Summary statistics: pull each metric out of the per-layer dicts, in layer order.
km_mses = [results_lut[layer]['kmeans_palletization_mse'] for layer in results_lut]
uni_mses = [results_lut[layer]['uniform_palletization_mse'] for layer in results_lut]
rtn_mses = [results_lut[layer]['rtn_int4_mse'] for layer in results_lut]
improvements = [results_lut[layer]['kmeans_vs_rtn_improvement'] for layer in results_lut]

print(f'\nAverage MSE across {len(results_lut)} layers:')
print(f'  RTN INT4:              {np.mean(rtn_mses):.2e}')
print(f'  Uniform Palletization: {np.mean(uni_mses):.2e}')
print(f'  K-means Palletization: {np.mean(km_mses):.2e}')
print(f'\nK-means improvement over RTN: {np.mean(improvements):.2f}x lower MSE')

from pathlib import Path

# Plot: per-layer MSE bars (left) and the distribution of the k-means-over-RTN
# improvement ratio (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5))

x = np.arange(len(results_lut))
w = 0.28  # bar width; three bars per layer sit at x - w, x, x + w
ax1.bar(x - w, rtn_mses,  w, label='RTN INT4',              color='#e74c3c', alpha=0.85)
ax1.bar(x,     uni_mses,  w, label='Uniform Palletization', color='#f39c12', alpha=0.85)
ax1.bar(x + w, km_mses,   w, label='K-means Palletization', color='#2ecc71', alpha=0.85)
ax1.set_xlabel('Layer index')
ax1.set_ylabel('Quantization MSE')
ax1.set_title('Quantization MSE per Layer\n(lower = better, same 4-bit budget)')
ax1.legend()
ax1.grid(True, alpha=0.3, axis='y')
ax1.set_xticks(x)
ax1.tick_params(axis='x', rotation=45, labelsize=7)
# Label each tick with the last component of the module path
ax1.set_xticklabels([k.split('.')[-1] for k in results_lut.keys()], rotation=45)

mean_improvement = np.mean(improvements)
ax2.hist(improvements, bins=15, color='#2ecc71', edgecolor='white', alpha=0.85)
ax2.axvline(1.0, color='red', linestyle='--', linewidth=2, label='Break-even (1x)')
ax2.axvline(mean_improvement, color='#27ae60', linestyle='--',
            linewidth=2, label=f'Mean: {mean_improvement:.2f}x')
# The summary above reports this ratio as "x lower MSE" with break-even at 1x
# and higher meaning k-means wins, so it is RTN MSE over k-means MSE.
# NOTE(review): confirm against compare_lut_methods' definition of
# 'kmeans_vs_rtn_improvement'.
ax2.set_xlabel('MSE improvement ratio (RTN MSE / k-means MSE, higher = k-means wins more)')
ax2.set_ylabel('Layer count')
ax2.set_title('Distribution of K-means Palletization\nImprovement over RTN INT4')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories; make sure the folder exists.
Path('results').mkdir(parents=True, exist_ok=True)
plt.savefig('results/palletization_vs_rtn.png', dpi=150, bbox_inches='tight')
plt.show()

# Drop the notebook's reference to the FP16 model before the next cells run.
del model

## 3. Run Full Benchmark (takes 20-40 min depending on hardware)

In [None]:
# Run benchmark — you can subset methods for quick iteration:
# e.g. methods_to_run = ['fp16_baseline', 'rtn_int8', 'rtn_int4', 'palletization_4bit']

from benchmark import run_benchmark, METHODS

# Select every key registered in METHODS and hand them to run_benchmark
# together with the 'results' directory path.
# NOTE(review): presumably run_benchmark writes results/results.json there
# (the visualization cell reads that file) — confirm in benchmark.py.
methods_to_run = [*METHODS.keys()]  # all methods
results = run_benchmark(methods_to_run, Path('results'))

In [None]:
# Or load pre-computed results
# with open('results/results.json') as f:
#     results = json.load(f)

## 4. Results Visualization

In [None]:
# ── Load results ──────────────────────────────────────────────────────────────
# Read the benchmark output back from disk so this cell also works without
# re-running the benchmark in the same session.
with open('results/results.json') as f:
    results = json.load(f)

# Split the per-method records into parallel lists, one per field, keeping
# the order of the JSON object.
methods, ppls, mems, lats, comprs, bits_arr = (
    [entry[field] for entry in results.values()]
    for field in (
        'label',
        'perplexity',
        'memory_mb',
        'latency_ms_token',
        'compression_ratio',
        'bits',
    )
)

# Fixed colour per method label; labels missing from this table fall back to
# a neutral grey when `colors` is built below.
type_colors = {
    'FP16 Baseline':       '#95a5a6',
    'RTN INT8':            '#e74c3c',
    'RTN INT4':            '#c0392b',
    'GPTQ INT4':           '#8e44ad',
    'AWQ INT4':            '#2980b9',
    'SmoothQuant INT8':    '#e67e22',
    'Palletization 4-bit': '#27ae60',
    'QLoRA NF4':           '#16a085',
}
colors = [type_colors.get(label, '#7f8c8d') for label in methods]

def _barh_with_labels(ax, values, xlabel, title, pad, fmt):
    """Draw one horizontal bar per method on `ax` and write each value beside its bar.

    `pad` is the horizontal gap (in data units) between the bar end and the
    text; `fmt` is a str.format template applied to each value. Returns the
    bar container produced by `ax.barh`.
    """
    rects = ax.barh(methods, values, color=colors, edgecolor='white', height=0.6)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    for rect, value in zip(rects, values):
        ax.text(rect.get_width() + pad, rect.get_y() + rect.get_height() / 2,
                fmt.format(value), va='center', fontsize=9)
    ax.grid(True, alpha=0.3, axis='x')
    return rects


# 2x2 dashboard: perplexity, memory, quality-vs-memory scatter, latency.
fig = plt.figure(figsize=(16, 12))
fig.suptitle('LLM Quantization Benchmark: Qwen2-0.5B\n'
             'All methods at comparable bit-width | Lower PPL = better | Lower memory = better',
             fontsize=13, fontweight='bold', y=1.01)

gs = gridspec.GridSpec(2, 2, hspace=0.4, wspace=0.35)

# ── Plot 1: Perplexity (lower = better) ──────────────────────────────────────
ax1 = fig.add_subplot(gs[0, 0])
bars = _barh_with_labels(ax1, ppls, 'Perplexity (lower = better)',
                         'Perplexity on WikiText-2', 0.1, '{:.2f}')

# ── Plot 2: Memory footprint ──────────────────────────────────────────────────
ax2 = fig.add_subplot(gs[0, 1])
bars2 = _barh_with_labels(ax2, mems, 'Memory Footprint (MB) — lower = better',
                          'Model Memory Usage', 5, '{:.0f}MB')

# ── Plot 3: PPL vs Memory scatter (Pareto frontier) ──────────────────────────
ax3 = fig.add_subplot(gs[1, 0])
for label, ppl_val, mem_val, shade in zip(methods, ppls, mems, colors):
    ax3.scatter(mem_val, ppl_val, color=shade, s=120, zorder=5,
                edgecolors='white', linewidth=1.5)
    # Name each point, nudged slightly up and to the right of the marker
    ax3.annotate(label, (mem_val, ppl_val), textcoords='offset points',
                 xytext=(6, 3), fontsize=8, color=shade)
ax3.set_xlabel('Memory (MB)')
ax3.set_ylabel('Perplexity')
ax3.set_title('Quality vs Memory Trade-off\n(bottom-left = Pareto optimal)')
ax3.grid(True, alpha=0.3)

# ── Plot 4: Latency comparison ────────────────────────────────────────────────
ax4 = fig.add_subplot(gs[1, 1])
bars4 = _barh_with_labels(ax4, lats, 'Latency (ms / token) — lower = better',
                          'Decode Latency (autoregressive)', 0.2, '{:.1f}ms')

# Shared legend below the grid: one swatch per entry of the colour table.
patches = [mpatches.Patch(label=name, color=shade) for name, shade in type_colors.items()]
fig.legend(handles=patches, loc='lower center', ncol=4, bbox_to_anchor=(0.5, -0.05),
           fontsize=9, framealpha=0.9)

plt.savefig('results/benchmark_results.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Summary Table

In [None]:
import pandas as pd

# Reference perplexity for the delta column, looked up once instead of once
# per row. When a method subset without 'fp16_baseline' was benchmarked the
# key is absent; fall back to NaN so the table still renders (the delta
# column is then NaN) instead of raising KeyError.
fp16_ppl = results.get('fp16_baseline', {}).get('perplexity', float('nan'))

# One row per benchmarked method, best (lowest) perplexity first.
df = pd.DataFrame([
    {
        'Method': r['label'],
        'Bits': r['bits'],
        'PPL ↓': r['perplexity'],
        'Δ PPL vs FP16': round(r['perplexity'] - fp16_ppl, 3),
        'Memory (MB) ↓': r['memory_mb'],
        'Compress ↑': f"{r['compression_ratio']:.2f}x",
        'Latency (ms/tok) ↓': r['latency_ms_token'],
    }
    for r in results.values()
]).sort_values('PPL ↓')

print(df.to_string(index=False))

# Static closing notes, emitted in one call: every list item becomes one
# output line and an empty string produces the blank separator line.
print(
    '\n── Key Takeaways ──────────────────────────────────────────────────',
    'Palletization 4-bit:',
    '  • Non-uniform LUT adapts to weight distribution',
    '  • Comparable PPL to GPTQ/AWQ without Hessian computation',
    '  • Apple ANE has native gather-LUT support → real speedup on-device',
    '  • CoreML: use coremltools.optimize.torch.palettization.DKMPalettizer',
    '',
    'GPTQ/AWQ:',
    '  • Best PTQ perplexity at INT4 — use when accuracy is paramount',
    '  • GPTQ: better quality, slower to quantize (Hessian per layer)',
    '  • AWQ: slightly worse PPL, much faster to quantize (activation stats only)',
    '',
    'NF4:',
    '  • Best effective bits/weight with double quantization (~4.127 bpw)',
    '  • Designed for QLoRA fine-tuning, not pure inference',
    '',
    'SmoothQuant:',
    '  • Enables W8A8 quantization (weight + activation INT8)',
    '  • Real benefit requires INT8 GEMM kernel (NVIDIA/ANE hardware)',
    '  • PPL sits between RTN INT8 and GPTQ INT4',
    sep='\n',
)

## 6. CoreML Palletization (macOS only)

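This cell converts the Qwen2-0.5B model to CoreML with real 4-bit palletization.
By default it runs coremltools' post-training palettizer (k-means, no fine-tuning).
Apple's DKM (Differentiable K-means) algorithm, which needs a fine-tuning loop, is
included as a commented-out alternative (Option B); it is the production
version of what we implemented above.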

```python
# Run this on macOS with coremltools >= 7.0
# pip install coremltools
```

In [None]:
# ── CoreML / Apple-native palletization ──────────────────────────────────────
# Requires: macOS, coremltools >= 7.0
# pip install coremltools

import sys

def run_coreml_palletization():
    """Palettize the model to 4 bits with coremltools and save a CoreML package.

    Steps: load the FP32 checkpoint named by MODEL_ID, apply coremltools'
    PostTrainingPalettizer (4-bit, per-grouped-channel, group size 128),
    trace a logits-only wrapper with TorchScript, convert the trace with
    ``ct.convert`` and save it to ``results/qwen2_palletized_4bit.mlpackage``.

    Returns:
        None. If coremltools cannot be imported, an install hint is printed
        and the function returns without doing any work.
    """
    try:
        import coremltools as ct
        from coremltools.optimize.torch.palettization import (
            DKMPalettizer,
            DKMPalettizerConfig,
            PostTrainingPalettizer,
            PostTrainingPalettizerConfig,
        )
    except ImportError:
        print('coremltools not available. Install with: pip install coremltools')
        print('Requires macOS.')
        return

    from pathlib import Path

    import numpy as np
    from transformers import AutoModelForCausalLM, AutoTokenizer

    class _LogitsOnly(torch.nn.Module):
        """Wrap a causal LM so that forward() returns just the logits tensor.

        The wrapped model is called with return_dict=False and
        use_cache=False, so the trace has a single tensor output that
        matches the single 'logits' output declared to ct.convert below.
        """

        def __init__(self, lm):
            super().__init__()
            self.lm = lm

        def forward(self, input_ids):
            return self.lm(input_ids=input_ids, use_cache=False, return_dict=False)[0]

    print('Loading model...')
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float32, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # ── Option A: Post-training palletization (no fine-tuning, fast) ──────────
    print('\nApplying Post-Training Palletization (4-bit, k-means, no fine-tuning)...')
    pt_config = PostTrainingPalettizerConfig.from_dict({
        'global_config': {
            'n_bits': 4,
            'granularity': 'per_grouped_channel',
            'group_size': 128,
            'enable_per_channel_scale': True,  # finer scale, better quality
        }
    })
    pt_palettizer = PostTrainingPalettizer(model, pt_config)
    palettized_model = pt_palettizer.compress()

    # ── Option B: DKM Palettizer (with fine-tuning, best quality) ─────────────
    # print('\nApplying DKM Palettization (fine-tuning aware)...')
    # dkm_config = DKMPalettizerConfig.from_dict({
    #     'global_config': {
    #         'n_bits': 4,
    #         'cluster_dim': 1,
    #         'enable_per_channel_scale': True,
    #     }
    # })
    # palettizer = DKMPalettizer(model, dkm_config)
    # palettizer.prepare()
    # # ... insert fine-tuning loop here ...
    # palettized_model = palettizer.finalize()

    # ── Convert to CoreML ──────────────────────────────────────────────────────
    print('\nConverting to CoreML .mlpackage...')
    example_input = tokenizer('Hello world', return_tensors='pt')
    input_ids = example_input['input_ids']

    # Trace in eval mode without autograd; only inference behaviour is exported.
    wrapped = _LogitsOnly(palettized_model).eval()
    with torch.no_grad():
        traced = torch.jit.trace(wrapped, (input_ids,))

    mlmodel = ct.convert(
        traced,
        # Token ids are integers; without an explicit dtype the input would
        # be declared with TensorType's default (float32).
        inputs=[ct.TensorType(name='input_ids', shape=input_ids.shape, dtype=np.int32)],
        outputs=[ct.TensorType(name='logits')],
        minimum_deployment_target=ct.target.iOS17,  # ANE support
        compute_units=ct.ComputeUnit.ALL,            # CPU + GPU + ANE
    )

    # Make sure the output folder exists before writing the package into it.
    Path('results').mkdir(parents=True, exist_ok=True)
    mlmodel.save('results/qwen2_palletized_4bit.mlpackage')
    print('Saved to results/qwen2_palletized_4bit.mlpackage')
    print('\nModel spec summary:')
    print(mlmodel.get_spec().description)

# Run the conversion when this cell executes. If coremltools cannot be
# imported, the function prints an install hint and returns early.
run_coreml_palletization()