# Project 3, Lightweight LLMs – Task 2

**Supervisor:** Sayedpedram Haeri Boroujeni  
**Course:** CPSC 4420 - Artificial Intelligence  
**Assignment:** Task 2  
**Deadline:** October 27, 2025  

## Contributors
- **Samuel Jordan** (Parts 1 & 2)
- **Gabriel Hillesheim**  
- **Patrick Woods** (Part 3)
  
---

## Table of Contents
1. [Quantization Fundamentals (Weights & Activations)](#part-1-quantization-fundamentals-weights--activations)  
2. [Run the Base Model (SmolLM-135M)](#part-2-run-the-base-model-smollm-135m)  
3. [Implement Weight Quantization (8-bit vs 4-bit)](#part-3-implement-weight-quantization-8-bit-vs-4-bit)  
4. [Implement Static KV-Cache Quantization](#part-4-implement-static-kv-cache-quantization)  

## Part 1: Quantization Fundamentals (Weights & Activations)
*Implement a simple INT8 quantizer in PyTorch for a fully connected layer. Evaluate accuracy before and after quantization.*

In [None]:
# Part 1.1: Simple per-tensor int8 quant/dequant for weights and activations

import torch, torch.nn as nn, torch.nn.functional as F

# ------------------------------
# Quantization: float to int8
# ------------------------------
def quantize_int8_per_tensor(tensor):
    """Quantizes a float tensor symmetrically to int8 and returns (quantized_tensor, scale)."""
    max_abs = tensor.abs().max()
    scale = (max_abs / 127.0).clamp(min=1e-12)
    q = torch.clamp(torch.round(tensor / scale), -127, 127)
    return q.to(torch.int8), scale

# ------------------------------
# Dequantization: int8 to float
# ------------------------------
def dequantize_int8_per_tensor(q, scale):
    """Converts quantized int8 tensor back to float using scale."""
    return q.float() * scale

# ------------------------------
# Quantized Linear Layer (weights and activations)
# ------------------------------
class QuantLinear(nn.Module):
    """
    Linear layer with both weights and activations quantized to int8.
    Dequantizes them to float before performing the matmul (for clarity in this demo).
    """
    def __init__(self, lin: nn.Linear):
        super().__init__()
        # Quantize weights once during initialization
        qW, w_scale = quantize_int8_per_tensor(lin.weight.data.detach())
        self.qweight = nn.Parameter(qW, requires_grad=False)
        self.register_buffer("w_scale", w_scale)
        # Copy bias (still float)
        if lin.bias is not None:
            self.bias = nn.Parameter(lin.bias.data.detach().clone(), requires_grad=False)
        else:
            self.bias = None

    def forward(self, x):
        # Quantize activations each forward pass
        qx, x_scale = quantize_int8_per_tensor(x)
        x_deq = dequantize_int8_per_tensor(qx, x_scale)
        # Dequantize weights
        W = dequantize_int8_per_tensor(self.qweight, self.w_scale)
        # Perform standard linear layer computation
        out = F.linear(x_deq, W, self.bias)
        return out

# ------------------------------
# Compare outputs before and after quantization
# ------------------------------
torch.manual_seed(1)
lin = nn.Linear(128, 10)       # baseline FP32 layer
x = torch.randn(4, 128)        # small batch (4 samples)

ref_out = lin(x)               # baseline FP32 output
q_lin = QuantLinear(lin)       # quantized weights + activations
quant_out = q_lin(x)

# Compute mean relative error between outputs
rel_err = (quant_out - ref_out).abs().mean() / ref_out.abs().mean()

# ------------------------------
# Print results
# ------------------------------
print("Baseline (FP32) output:")
print(ref_out[:2])

print("\nQuantized (weights + activations) output:")
print(quant_out[:2])

print(f"\nRelative mean error: {rel_err.item():.6f}")


Baseline (FP32) output:
tensor([[ 1.2113, -0.2507, -0.1274, -0.8630,  0.7295,  0.9688,  0.5489, -0.2225,
          0.4110,  0.2604],
        [ 0.5140,  0.2072, -0.1497,  0.4514, -0.3403, -0.2416, -0.1721,  1.0568,
         -0.3797, -0.2253]], grad_fn=<SliceBackward0>)

Quantized (weights + activations) output:
tensor([[ 1.2147, -0.2490, -0.1213, -0.8630,  0.7335,  0.9755,  0.5451, -0.2266,
          0.4201,  0.2586],
        [ 0.5119,  0.2036, -0.1504,  0.4556, -0.3398, -0.2423, -0.1785,  1.0499,
         -0.3842, -0.2216]])

Relative mean error: 0.008251


In [None]:
# Part 1.2: Quick accuracy comparison demo with MNIST

import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

device = torch.device("cpu")  # keep CPU to avoid device mismatch with the current QuantLinear
torch.manual_seed(1)

# Data
transform = transforms.Compose([transforms.ToTensor()])
train_ds = datasets.MNIST(root="./data", train=True, download=True, transform=transform)
test_ds  = datasets.MNIST(root="./data", train=False, download=True, transform=transform)

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
test_loader  = DataLoader(test_ds,  batch_size=512, shuffle=False)

# Model (the layer we will quantize is the final classifier)
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28*28, 256)
        self.fc2 = nn.Linear(256, 10)  # we'll quantize THIS one
    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

model = MLP().to(device)

# Train briefly to get non-trivial accuracy
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
epochs = 2  # short for demo

# ------------------------------
# Evaluate model before and after quantization
# ------------------------------
def evaluate(m):
    m.eval()
    correct = total = 0
    with torch.no_grad():
        for xb, yb in test_loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = m(xb)
            pred = logits.argmax(dim=1)
            correct += (pred == yb).sum().item()
            total += yb.numel()
    return correct / total

for _ in range(epochs):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        loss = F.cross_entropy(logits, yb)
        opt.zero_grad(set_to_none=True)
        loss.backward()
        opt.step()

# Evaluate BEFORE quantization
acc_before = evaluate(model)

# Replace the fully connected layer with our INT8 QuantLinear
with torch.no_grad():
    model.fc2 = QuantLinear(model.fc2)  # uses our class defined above
    model = model.to(device)            # stay on CPU for consistency

# Evaluate AFTER quantization
acc_after = evaluate(model)

# ------------------------------
# Print results
# ------------------------------
print(f"\nTest accuracy BEFORE quantization: {acc_before*100:.3f}%")
print(f"Test accuracy AFTER  quantization: {acc_after*100:.3f}%")
print(f"Accuracy drop: {(acc_before-acc_after)*100:.3f}%")

100%|██████████| 9.91M/9.91M [00:00<00:00, 22.5MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 614kB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 5.76MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 9.86MB/s]



Test accuracy BEFORE quantization: 95.300%
Test accuracy AFTER  quantization: 95.330%
Accuracy drop: -0.030%


### Part 1.3: Results
The INT8-quantized model performs essentially the same as the original FP32 model. The difference of −0.03 percentage points is statistically insignificant. In fact, the quantized model scored slightly higher, but that’s just normal random variation due to data order, rounding noise, etc., and isn't seen across different random seeds.

In this application, INT8 quantization successfully reduces model precision and memory/storage cost without hurting accuracy. This demonstrates that our technique is an effective and safe compression strategy for simple models and datasets like MNIST.

## Part 2: Run the Base Model (SmolLM-135M)
*Load SmolLM-135M, run sample prompts, and record baseline latency and GPU memory usage.*

In [None]:
# Part 2.1: SmolLM-135M model
import time, torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Pick device/precision (CUDA if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype  = torch.float16 if device == "cuda" else torch.float32
print(f"Using device: {device.upper()}  |  Precision: {dtype}")

# Load model/tokenizer
model_id = "HuggingFaceTB/SmolLM-135M"
tok = AutoTokenizer.from_pretrained(model_id)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=dtype,
    device_map="auto" if device == "cuda" else None
).to(device).eval()

def run_prompt(prompt, max_new_tokens=250):
    """Generate text and measure latency, throughput, and peak GPU memory."""
    wrapped = f"### Instruction:\n{prompt}\n\n### Response:\n"
    inputs = tok(wrapped, return_tensors="pt").to(device)
    input_len = inputs["input_ids"].shape[-1]

    if device == "cuda":
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

    t0 = time.perf_counter()
    with torch.inference_mode(), torch.autocast(device_type=device, dtype=dtype):
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False, temperature=1.0, top_p=1.0,
            no_repeat_ngram_size=3, repetition_penalty=1.05,
            pad_token_id=tok.pad_token_id, eos_token_id=tok.eos_token_id
        )
    dt = time.perf_counter() - t0

    # Decode only new tokens (avoid echoing the prompt)
    new_tokens = out[0][input_len:]
    text = tok.decode(new_tokens, skip_special_tokens=True).strip()

    gen_tokens = int(new_tokens.shape[0])
    mem_mb = torch.cuda.max_memory_allocated() / (1024**2) if device == "cuda" else 0.0

    return {"prompt": prompt, "latency_s": dt, "gpu_mem_MB": mem_mb, "output": text}

Using device: CPU  |  Precision: torch.float32


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
# Part 2.2: Evaluation on multiple prompts
# ======================================================
# Run a few prompts
# ======================================================
prompts = [
    "Explain quantization in LLMs in one paragraph.",
    "Write a short description of Scooby-Doo and the Mystery Inc. gang.",
    "Explain the process of photosynthesis in one paragraph."
]
results = [run_prompt(p) for p in prompts]

# ======================================================
# Print Results
# ======================================================
lat_list, mem_list = [], []
for i, r in enumerate(results, 1):
    lat_list.append(r["latency_s"])
    mem_list.append(r["gpu_mem_MB"])
    print(f"\n[{i}] Prompt: {r['prompt']}\n")
    print("Output:")
    print(r["output"][:400] + ("..." if len(r["output"]) > 400 else ""))
    print("\nMetrics:")
    print(f"  Latency:         {r['latency_s']:.2f} s")
    print(f"  Peak GPU Memory: {r['gpu_mem_MB']:.2f} MB")
    print("-"*78)

# --- Averages ---
import math
def avg(x): return (sum(x)/len(x)) if x else math.nan
print("\nAverages:")
print(f"  Avg Latency:     {avg(lat_list):.2f} s")
print(f"  Avg Peak GPU MB: {avg(mem_list):.2f} MB")


CPU Autocast only supports dtype of torch.bfloat16, torch.float16 currently.



[1] Prompt: Explain quantization in LLMs in one paragraph.

Output:
The quantization of a sentence is the number of words that are in the sentence, divided by the number
of tokens in the input.
For example, if we have a sentence like "I love my dog", then the quantization would be 1000.
If we have 200 tokens, then the number would be: 2.
This is because the quantized sentence has 2 tokens and 2 words.
In this case, the quantizer would be the number 2, which is 2 t...

Metrics:
  Latency:         20.95 s
  Peak GPU Memory: 0.00 MB
------------------------------------------------------------------------------

[2] Prompt: Write a short description of Scooby-Doo and the Mystery Inc. gang.

Output:
Scooby is a fictional character created by author Dr. Seuss. He is a mischievous boy who loves to play pranks on his friends. One day, he meets a group of kids called the Mystery Inns, who are trying to solve a mystery involving a mysterious creature. Scoob is one of the kids in the Mystery Inn

### Part 2.3: Results
The SmolLM-135M base model was evaluated on three distinct prompts to measure both generation quality and performance metrics such as latency and GPU memory usage. Note that all of my tests were conducted on CPU, resulting in higher latency values and 0 MB GPU memory utilization.

Overall, the SmolLM-135M model produced intelligible but inconsistent and partially incorrect outputs across topics. The average latency on CPU was around 20 seconds per max 250-token generation. Using an instruction-tuned variant or a GPU environment would likely yield both higher-quality responses and significantly improved runtime performance. More varied prompts to text realms like coding, math, text revision, etc. would better evaluate the model across a wider range of tasks, as the current prompts mainly test explanation.

## Part 3: Implement Weight Quantization (8-bit vs 4-bit)
*Use AutoAWQ or bitsandbytes to quantize the model weights. Compare latency, memory usage, and output accuracy for both versions.*

In [None]:
# Part 3.1: SmolLM-135M model and bitsandbytes quantization
import time, torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Pick device/precision (CUDA if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype  = torch.float16 if device == "cuda" else torch.float32
print(f"Using device: {device.upper()}  |  Precision: {dtype}")

# Load model/tokenizer
model_id = "HuggingFaceTB/SmolLM-135M"
tok = AutoTokenizer.from_pretrained(model_id)
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=dtype,
    device_map="auto" if device == "cuda" else None
).to(device).eval()

def run_prompt(prompt, max_new_tokens=250):
    """Generate text and measure latency, throughput, and peak GPU memory."""
    wrapped = f"### Instruction:\n{prompt}\n\n### Response:\n"
    inputs = tok(wrapped, return_tensors="pt").to(device)
    input_len = inputs["input_ids"].shape[-1]

    if device == "cuda":
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

    t0 = time.perf_counter()
    with torch.inference_mode(), torch.autocast(device_type=device, dtype=dtype):
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False, temperature=1.0, top_p=1.0,
            no_repeat_ngram_size=3, repetition_penalty=1.05,
            pad_token_id=tok.pad_token_id, eos_token_id=tok.eos_token_id
        )
    dt = time.perf_counter() - t0

    # Decode only new tokens (avoid echoing the prompt)
    new_tokens = out[0][input_len:]
    text = tok.decode(new_tokens, skip_special_tokens=True).strip()

    gen_tokens = int(new_tokens.shape[0])
    mem_mb = torch.cuda.max_memory_allocated() / (1024**2) if device == "cuda" else 0.0

    return {"prompt": prompt, "latency_s": dt, "gpu_mem_MB": mem_mb, "output": text}

Using device: CUDA  |  Precision: torch.float16


In [None]:
# Part 3.2: Load Base Model and Quantized Models
from transformers import BitsAndBytesConfig

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load base model (FP16)
print("Loading base model (FP16)...")
clear_memory()
base_model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
).eval()
base_memory = get_memory()
print(f"Base model: {base_memory:.1f} MB")

# Load 8-bit model
print("Loading 8-bit model...")
clear_memory()
bnb_8bit = BitsAndBytesConfig(load_in_8bit=True)
model_8bit = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_8bit, device_map="auto"
).eval()
memory_8bit = get_memory()
print(f"8-bit model: {memory_8bit:.1f} MB")

# Load 4-bit model
print("Loading 4-bit model...")
clear_memory()
bnb_4bit = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_4bit, device_map="auto"
).eval()
memory_4bit = get_memory()
print(f"4-bit model: {memory_4bit:.1f} MB")


Loading tokenizer...
Loading base model (FP16)...
Base model: 791.5 MB
Loading 8-bit model...
8-bit model: 791.5 MB
Loading 4-bit model...
4-bit model: 848.5 MB


In [None]:
# Part 3.3: Test All Models
def measure_latency(model, prompt="What is AI?"):
    inputs = tokenizer(f"### Instruction:\n{prompt}\n\n### Response:\n", return_tensors="pt").to(device)

    start = time.perf_counter()
    with torch.inference_mode():
        model.generate(**inputs, max_new_tokens=30, do_sample=False)
    return time.perf_counter() - start

# Test all models
print("Testing all models...")
base_latency = measure_latency(base_model)
latency_8bit = measure_latency(model_8bit)
latency_4bit = measure_latency(model_4bit)

print(f"Base model latency: {base_latency:.3f}s")
print(f"8-bit model latency: {latency_8bit:.3f}s")
print(f"4-bit model latency: {latency_4bit:.3f}s")


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Testing all models...


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Base model latency: 0.591s
8-bit model latency: 2.069s
4-bit model latency: 0.881s


In [None]:
# Part 3.4: Quantization Comparison Results
print("QUANTIZATION COMPARISON RESULTS")
print("=" * 50)

print(f"\nMemory Usage:")
print(f"Base Model (FP16):  {base_memory:.1f} MB")
print(f"8-bit Quantized:    {memory_8bit:.1f} MB")
print(f"4-bit Quantized:    {memory_4bit:.1f} MB")

print(f"\nLatency:")
print(f"Base Model:         {base_latency:.3f}s")
print(f"8-bit Model:        {latency_8bit:.3f}s")
print(f"4-bit Model:        {latency_4bit:.3f}s")

# Calculate memory differences
mem_change_8bit = ((memory_8bit - base_memory) / base_memory) * 100
mem_change_4bit = ((memory_4bit - base_memory) / base_memory) * 100

print(f"\nMemory Usage Change:")
if mem_change_8bit > 0:
    print(f"8-bit: {mem_change_8bit:.1f}% increase")
elif mem_change_8bit < 0:
    print(f"8-bit: {abs(mem_change_8bit):.1f}% reduction")
else:
    print(f"8-bit: No change")

if mem_change_4bit > 0:
    print(f"4-bit: {mem_change_4bit:.1f}% increase")
elif mem_change_4bit < 0:
    print(f"4-bit: {abs(mem_change_4bit):.1f}% reduction")
else:
    print(f"4-bit: No change")

print(f"\nSpeed Comparison:")
if latency_8bit < base_latency:
    print(f"8-bit is {base_latency/latency_8bit:.2f}x faster")
else:
    print(f"8-bit is {latency_8bit/base_latency:.2f}x slower")

if latency_4bit < base_latency:
    print(f"4-bit is {base_latency/latency_4bit:.2f}x faster")
else:
    print(f"4-bit is {latency_4bit/base_latency:.2f}x slower")

print(f"\nQuantization successfully implemented and tested.")


QUANTIZATION COMPARISON RESULTS

Memory Usage:
Base Model (FP16):  791.5 MB
8-bit Quantized:    791.5 MB
4-bit Quantized:    848.5 MB

Latency:
Base Model:         0.591s
8-bit Model:        2.069s
4-bit Model:        0.881s

Memory Usage Change:
8-bit: 0.0% reduction
4-bit: 7.2% increase

Speed Comparison:
8-bit is 3.50x slower
4-bit is 1.49x slower

Quantization successfully implemented and tested.


When running the quanitzation it should use both less gpu memory and decrease the latency. Due to the model being small and running on a gpu that supports FP16, this is not the case. When running on CPU, quantization should help out by both reducing memory and latency.

## Part 4: Implement Static KV-Cache Quantization
*Quantize the model’s KV cache at a fixed bit-width (e.g., 4 bits). Measure the impact on inference speed, memory consumption, and accuracy.*

In [None]:
!pip install -q --upgrade pip
!pip install -q --upgrade transformers accelerate optimum-quanto
!pip install -q ninja "transformers>=4.45" accelerate "optimum-quanto>=0.2.3"

ERROR: To modify pip, please run the following command:
C:\Users\Anthony\anaconda3\envs\kvq\python.exe -m pip install -q --upgrade pip


In [None]:
from transformers.utils import is_optimum_quanto_available
print("optimum-quanto available?", is_optimum_quanto_available())

optimum-quanto available? True


In [1]:
import sys
# 1) Make sure you have the CUDA build of PyTorch in THIS kernel
!{sys.executable} -m pip uninstall -y torch torchvision torchaudio
!{sys.executable} -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# 2) Verify CUDA is available
import torch
print("cuda available:", torch.cuda.is_available())
print("torch:", torch.__version__, "| torch cuda:", torch.version.cuda)
if torch.cuda.is_available():
    print("gpu:", torch.cuda.get_device_name(0))

Found existing installation: torch 2.5.1
Uninstalling torch-2.5.1:
  Successfully uninstalled torch-2.5.1
Found existing installation: torchvision 0.20.1+cu121
Uninstalling torchvision-0.20.1+cu121:
  Successfully uninstalled torchvision-0.20.1+cu121
Found existing installation: torchaudio 2.5.1+cu121
Uninstalling torchaudio-2.5.1+cu121:
  Successfully uninstalled torchaudio-2.5.1+cu121


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
optimum-quanto 0.2.7 requires torch>=2.6.0, but you have torch 2.5.1+cu121 which is incompatible.


Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp310-cp310-win_amd64.whl (6.1 MB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp310-cp310-win_amd64.whl (4.1 MB)
Collecting torch
  Using cached https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp310-cp310-win_amd64.whl (2449.4 MB)
Collecting sympy==1.13.1 (from torch)
  Using cached https://download.pytorch.org/whl/sympy-1.13.1-py3-none-any.whl (6.2 MB)
Installing collected packages: sympy, torch, torchvision, torchaudio

  Attempting uninstall: sympy

    Found existing installation: sympy 1.14.0

   ---------------------------------------- 0/4 [sympy]
   ---------------------------------------- 0/4 [sympy]
   ---------------------------------------- 0/4 [sympy]
    Uninstalling sympy-1.14.0:
   ---------------------------------------- 0/4 [sympy]
   ------

In [2]:
import torch
print(torch.version.cuda)
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

12.1
True
NVIDIA GeForce RTX 3090


In [3]:
import time, math, os, psutil, torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_optimum_quanto_available
assert torch.cuda.is_available(), "CUDA GPU required" #Hard cuda requirement for part 4

#--------------------------------
#Config
#--------------------------------

ModelID = "HuggingFaceTB/SmolLM-135M"
prompts = [
    "Explain quantization in LLMs in one paragraph.",
    "Write a short description of Scooby-Doo and the Mystery Inc. gang.",
    "Explain the process of photosynthesis in one paragraph."
]

#pick device, not really necessary because of CUDA requirement, but cpu can still run, just wont be given a memory value
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device:", device.type)

dataType = torch.float16 if device.type == "cuda" else torch.float32
print(f"Using precision: {dataType}")

#toeknizer model recycled
tokenizer = AutoTokenizer.from_pretrained(ModelID)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(ModelID, dtype=dataType).to(device).eval()
USE_KVQ = is_optimum_quanto_available()
KVQ_CFG = {"backend": "quanto", "nbits": 4} #configure KV cache 4 bit (INT4) via "quanto" backend
print("Use KVQ:", USE_KVQ)
#---------------------------
#Core Measurement
#---------------------------

def runOnce(prompt, max_new_tokens = 1024, kv_quant = False): #core measurement function
  inputs = tokenizer(prompt, return_tensors="pt").to(device) #tokenize and transfer to GPU
  if torch.cuda.is_available():
    torch.cuda.empty_cache() #Reset cuda memory stats
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()
    memBefore = torch.cuda.max_memory_allocated() / (1024 ** 2) #capture memory before generation

  gen_kwargs = {}


  if kv_quant and USE_KVQ:
    gen_kwargs.update(dict(cache_implementation="quantized", cache_config=KVQ_CFG)) #warmup for when KVQ is enabled


  if kv_quant and USE_KVQ and not getattr(runOnce, "_warmed", False):
      _ = model.generate(**inputs, max_new_tokens=1, do_sample=False, **gen_kwargs)
      torch.cuda.synchronize()
      runOnce._warmed = True
      torch.cuda.reset_peak_memory_stats()


  #--------------------------
  #Timed Gen
  #--------------------------
  t0 = time.perf_counter()

  out = model.generate(
    **inputs,
    max_new_tokens=max_new_tokens,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    return_dict_in_generate=True,
    output_scores=True,
    do_sample=False,
    **gen_kwargs
  )

  if torch.cuda.is_available():
    torch.cuda.synchronize()

  dt = time.perf_counter() - t0

  peak = torch.cuda.max_memory_allocated() / (1024 ** 2) #peak VRAM
  kv_delta = max(0.0, peak - memBefore) #Estimate KV cache footprint

  new_ids = out.sequences[:, inputs.input_ids.shape[1]:]
  text = tokenizer.batch_decode(new_ids, skip_special_tokens=True)[0][:200] + "..."

  mem_mb = (torch.cuda.max_memory_allocated() / (1024 ** 2)) if device.type == "cuda" else 0.0
  return {'latency_s': dt, "peak_MB": peak, "mem": mem_mb, "kv_MB": kv_delta, "text": text, "kv_quant": bool(kv_quant) and USE_KVQ}


base = ("This is a natural paragraph about language models and token generation. ") * 120

long_prompt = base + "\n\nContinue with a consise summary"
short_prompt = base + "\n\nExplain quantization in LLMs in one paragraph."


short_baseline_Run = runOnce(short_prompt, max_new_tokens=128, kv_quant=False)
short_kvq = runOnce(short_prompt, max_new_tokens=128, kv_quant=True)

long_baseline_Run = runOnce(long_prompt, max_new_tokens=512, kv_quant=False)
long_kvq = runOnce(long_prompt, max_new_tokens=512, kv_quant=True)

print("Baseline Long:", long_baseline_Run)
print("KV-INT4 Long:", long_kvq)
print("Baseline Short:", short_baseline_Run)
print("KV-INT4 Short:", short_kvq)

if long_baseline_Run["kv_MB"] > 0:
  savings = long_baseline_Run["kv_MB"] - long_kvq["kv_MB"]
  savingsPCT = 100.0 * savings/long_baseline_Run["kv_MB"]
  print(f"KV delta: {long_baseline_Run['kv_MB']:.1f}MB | with INT4: {long_kvq['kv_MB']:.1f}MB")
  print(f"KV savings: {savings:.1f} MB ({savingsPCT:.1f}%)")

Using device: cuda
Using precision: torch.float16
Use KVQ: True


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Baseline Long: {'latency_s': 26.75394090000009, 'peak_MB': 560.96826171875, 'mem': 560.96826171875, 'kv_MB': 296.23095703125, 'text': ' of the article.\n\nThis is a natural paragraph about the article.\n\nThis is a natural paragraph about the article.\n\nThis is a natural paragraph about the article.\n\nThis is a natural paragraph about the ...', 'kv_quant': False}
KV-INT4 Long: {'latency_s': 35.6673008000007, 'peak_MB': 539.37255859375, 'mem': 539.37255859375, 'kv_MB': 274.63525390625, 'text': ' of the article.\n\nThis is a natural paragraph about the article.\n\nThis is a natural paragraph about the article.\n\nThis is a natural paragraph about the article.\n\nThis is a natural paragraph about the ...', 'kv_quant': True}
Baseline Short: {'latency_s': 7.05027599999994, 'peak_MB': 562.35205078125, 'mem': 562.35205078125, 'kv_MB': 305.73974609375, 'text': '\n\nExplain quantization in LLMs in one paragraph.\n\nExplain quantization in LLMs in one paragraph.\n\nExplain quantization in LLMs

In [4]:
def print_results_table(results, title="Results Table"):
    print(f"\n==== {title} ====")
    print(f"{'Type':15s} | {'Latency (s)':12s} | {'KV Δ (MB)':10s} | {'Peak MB':10s} | {'KV Quant?':10s}")
    print("-" * 70)
    for name, r in results:
        print(f"{name:15s} | {r['latency_s']:<12.3f} | {r['kv_MB']:<10.1f} | {r['peak_MB']:<10.1f} | {str(r['kv_quant']):10s}")

results = [
    ("Short Base", short_baseline_Run),
    ("Short KVQ", short_kvq),
    ("Long Base", long_baseline_Run),
    ("Long KVQ", long_kvq),
]

print_results_table(results, title="KV Cache Quantization Results")

if long_baseline_Run["kv_MB"] > 0:
    savings = long_baseline_Run["kv_MB"] - long_kvq["kv_MB"]
    savingsPCT = 100.0 * savings / long_baseline_Run["kv_MB"]
    print(f"\nKV Savings (Long Prompt): {savings:.2f} MB ({savingsPCT:.2f}%)")


==== KV Cache Quantization Results ====
Type            | Latency (s)  | KV Δ (MB)  | Peak MB    | KV Quant? 
----------------------------------------------------------------------
Short Base      | 7.050        | 305.7      | 562.4      | False     
Short KVQ       | 9.314        | 276.1      | 540.8      | True      
Long Base       | 26.754       | 296.2      | 561.0      | False     
Long KVQ        | 35.667       | 274.6      | 539.4      | True      

KV Savings (Long Prompt): 21.60 MB (7.29%)


The findings from part 4 compares the baseline Floating Point 16 generation vs KV-Cache INT4 quantization on the same prompts.

The Key Fields Measured:

- Latency: Decode Time
- Peak: peak GPU memory during generation
- KV MB: Estimated KV-Cache footprint
- KV Quant: Activated or Deactivated upon request