<a href="https://colab.research.google.com/github/guptagundlapalli/Applied_Data_Analytics/blob/master/Quantize_LLMs_to_NVFP4_with_LLM_Compressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*More details in this article: [NVFP4: Same Accuracy with 2.3× Higher Throughput for 4-Bit LLMs](https://kaitchup.substack.com/p/nvfp4-same-accuracy-with-23-higher)*

This notebook shows how to quantize LLMs with NVFP4, using LLM Compressor.
The first part quantizes the weights and prepares the model for quantized activations. Apply this scheme unless you observe a significant drop in accuracy. In that case, use the second part: it quantizes only the weights, and the model uses 16-bit activations during inference. A third part repeats the weight-only recipe while also quantizing the LM head.

The NVFP4 models are compatible with vLLM. You need a Blackwell GPU to run them.

# Installation

In [None]:
!pip install llmcompressor datasets transformers

# Weight and Activation Quantization

In [None]:
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint to quantize; the output directory name is derived from it further down.
MODEL_ID = "meta-llama/Llama-3.3-70B-Instruct"

# Load the model and the tokenizer that belongs to it.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

from datasets import load_dataset
# Calibration settings, gathered in one place so they are easy to tune.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 512
# Token cap used when tokenizing the samples and passed to oneshot() as max_seq_length.
MAX_SEQUENCE_LENGTH = 2048

# Load dataset.
# The slice keeps the first NUM_CALIBRATION_SAMPLES examples of the split; the
# shuffle that follows only changes their order, not which examples are used.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)

# Render every conversation with the model's chat template, i.e. the format the model is trained with.
def preprocess(example):
    """Return ``{"text": ...}`` holding the example's ``messages`` rendered by the chat template.

    The template output is left as a plain string (``tokenize=False``).
    """
    rendered = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return {"text": rendered}


ds = ds.map(preprocess)

# Tokenize the rendered text. Special tokens are NOT added here because the
# chat template has already inserted them (notably the BOS token).
def tokenize(sample):
    """Tokenize ``sample["text"]`` without padding, truncated to MAX_SEQUENCE_LENGTH tokens."""
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


# All columns present before tokenization are removed from the mapped dataset.
ds = ds.map(tokenize, remove_columns=ds.column_names)

# Quantization recipe: apply the NVFP4 scheme to every Linear layer, skipping lm_head.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="NVFP4",
    ignore=["lm_head"],
)

# Run the one-shot pass with the calibration samples prepared above.
oneshot(
    model=model,
    recipe=recipe,
    dataset=ds,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    max_seq_length=MAX_SEQUENCE_LENGTH,
)

# Save the compressed model and its tokenizer under "<model name>-NVFP4".
SAVE_DIR = MODEL_ID.rstrip("/").rsplit("/", 1)[-1] + "-NVFP4"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

config.json:   0%|          | 0.00/879 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/59.6k [00:00<?, ?B/s]

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

model-00004-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00005-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00001-of-00030.safetensors:   0%|          | 0.00/4.58G [00:00<?, ?B/s]

model-00008-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00006-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00007-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00003-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00009-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00010-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00011-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00012-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00013-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00014-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00015-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00016-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00017-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00018-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00019-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00020-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00021-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00022-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00023-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00024-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00025-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00026-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00027-of-00030.safetensors:   0%|          | 0.00/4.66G [00:00<?, ?B/s]

model-00028-of-00030.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00029-of-00030.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00030-of-00030.safetensors:   0%|          | 0.00/2.10G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

data/train_sft-00000-of-00003-a3ecf92756(…):   0%|          | 0.00/244M [00:00<?, ?B/s]

data/train_sft-00001-of-00003-0a1804bcb6(…):   0%|          | 0.00/244M [00:00<?, ?B/s]

data/train_sft-00002-of-00003-ee46ed25cf(…):   0%|          | 0.00/244M [00:00<?, ?B/s]

data/test_sft-00000-of-00001-f7dfac4afe5(…):   0%|          | 0.00/81.2M [00:00<?, ?B/s]

data/train_gen-00000-of-00003-a6c9fb894b(…):   0%|          | 0.00/244M [00:00<?, ?B/s]

data/train_gen-00001-of-00003-d6a0402e41(…):   0%|          | 0.00/243M [00:00<?, ?B/s]

data/train_gen-00002-of-00003-c0db75b92a(…):   0%|          | 0.00/243M [00:00<?, ?B/s]

data/test_gen-00000-of-00001-3d4cd830914(…):   0%|          | 0.00/80.4M [00:00<?, ?B/s]

Generating train_sft split:   0%|          | 0/207865 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/23110 [00:00<?, ? examples/s]

Generating train_gen split:   0%|          | 0/256032 [00:00<?, ? examples/s]

Generating test_gen split:   0%|          | 0/28304 [00:00<?, ? examples/s]

Map:   0%|          | 0/512 [00:00<?, ? examples/s]

Map:   0%|          | 0/512 [00:00<?, ? examples/s]

2025-08-19T10:37:44.931696+0000 | reset | INFO - Compression lifecycle reset
2025-08-19T10:37:44.936240+0000 | from_modifiers | INFO - Creating recipe from modifiers
2025-08-19T10:37:45.638760+0000 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
2025-08-19T10:37:45.639660+0000 | IndependentPipeline | INFO - Inferred `SequentialPipeline` for `QuantizationModifier`


100%|██████████| 2167/2167 [00:28<00:00, 75.56it/s] 
Calibrating weights: 100%|██████████| 2167/2167 [02:51<00:00, 12.65it/s]
Preparing cache: 100%|██████████| 512/512 [00:00<00:00, 709.92it/s]
(1/81): Calibrating: 100%|██████████| 512/512 [00:04<00:00, 111.11it/s]
(1/81): Propagating: 100%|██████████| 512/512 [00:09<00:00, 53.39it/s]
(2/81): Calibrating: 100%|██████████| 512/512 [00:04<00:00, 106.97it/s]
(2/81): Propagating: 100%|██████████| 512/512 [00:05<00:00, 87.18it/s] 
(3/81): Calibrating: 100%|██████████| 512/512 [00:04<00:00, 107.80it/s]
(3/81): Propagating: 100%|██████████| 512/512 [00:05<00:00, 94.70it/s] 
(4/81): Calibrating: 100%|██████████| 512/512 [00:04<00:00, 107.28it/s]
(4/81): Propagating: 100%|██████████| 512/512 [00:05<00:00, 96.31it/s] 
(5/81): Calibrating: 100%|██████████| 512/512 [00:04<00:00, 107.56it/s]
(5/81): Propagating: 100%|██████████| 512/512 [00:05<00:00, 96.75it/s] 
(6/81): Calibrating: 100%|██████████| 512/512 [00:04<00:00, 107.30it/s]
(6/81): Propaga

2025-08-19T10:56:29.426084+0000 | finalize | INFO - Compression lifecycle finalized for 1 modifiers
2025-08-19T10:56:30.342205+0000 | get_model_compressor | INFO - skip_sparsity_compression_stats set to True. Skipping sparsity compression statistic calculations. No sparsity compressor will be applied.


Compressing model: 1047it [39:03,  2.24s/it]


('Llama-3.3-70B-Instruct-NVFP4/tokenizer_config.json',
 'Llama-3.3-70B-Instruct-NVFP4/special_tokens_map.json',
 'Llama-3.3-70B-Instruct-NVFP4/chat_template.jinja',
 'Llama-3.3-70B-Instruct-NVFP4/tokenizer.json')

# Weight-Only Quantization

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

import gc

MODEL_ID = "meta-llama/Llama-3.3-70B-Instruct"

# Drop the reference to any model left over from an earlier section before
# loading a fresh copy. Without this, the old object stays bound to `model`
# while from_pretrained() builds the new one, so both can be resident at once.
# Assigning None (instead of `del`) also works when this cell runs first.
model = None
gc.collect()

# Load model.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Quantization recipe (weight-only, NVFP4A16 scheme):
#   * weights of every Linear layer are quantized to fp4 with per group 16 via ptq
#   * lm_head is skipped
recipe = QuantizationModifier(
    targets="Linear",
    scheme="NVFP4A16",
    ignore=["lm_head"],
)

# Run the one-shot pass; no calibration dataset is passed for this scheme.
oneshot(recipe=recipe, model=model)


# Save the model and tokenizer in compressed-tensors format under "<model name>-NVFP4A16".
SAVE_DIR = MODEL_ID.rstrip("/").rsplit("/", 1)[-1] + "-NVFP4A16"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

2025-08-19T11:42:22.416004+0000 | reset | INFO - Compression lifecycle reset
2025-08-19T11:42:22.420235+0000 | from_modifiers | INFO - Creating recipe from modifiers
2025-08-19T11:42:22.861918+0000 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
2025-08-19T11:42:22.863033+0000 | IndependentPipeline | INFO - Inferred `DataFreePipeline` for `QuantizationModifier`


100%|██████████| 1607/1607 [00:12<00:00, 128.24it/s]
Calibrating weights: 100%|██████████| 1607/1607 [03:38<00:00,  7.35it/s]


2025-08-19T11:46:15.284923+0000 | finalize | INFO - Compression lifecycle finalized for 1 modifiers
2025-08-19T11:46:15.331168+0000 | get_model_compressor | INFO - skip_sparsity_compression_stats set to True. Skipping sparsity compression statistic calculations. No sparsity compressor will be applied.


Compressing model: 1047it [38:35,  2.21s/it]


('Llama-3.3-70B-Instruct-NVFP4A16/tokenizer_config.json',
 'Llama-3.3-70B-Instruct-NVFP4A16/special_tokens_map.json',
 'Llama-3.3-70B-Instruct-NVFP4A16/chat_template.jinja',
 'Llama-3.3-70B-Instruct-NVFP4A16/tokenizer.json')

# Example with LM Head Quantization

*Not supported by vLLM for inference (tested with vLLM v0.10.0)*

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

import gc

MODEL_ID = "meta-llama/Llama-3.3-70B-Instruct"

# Drop the reference to any model left over from an earlier section before
# loading a fresh copy. Without this, the old object stays bound to `model`
# while from_pretrained() builds the new one, so both can be resident at once.
# Assigning None (instead of `del`) also works when this cell runs first.
model = None
gc.collect()

# Load model.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Quantization recipe (weight-only, NVFP4A16 scheme):
#   * weights of every Linear layer are quantized to fp4 with per group 16 via ptq
#   * no ignore list is given, so lm_head is quantized as well
recipe = QuantizationModifier(
    targets="Linear",
    scheme="NVFP4A16",
)

# Run the one-shot pass; no calibration dataset is passed for this scheme.
oneshot(recipe=recipe, model=model)


# Save the model and tokenizer in compressed-tensors format under "<model name>-NVFP4A16LMH".
SAVE_DIR = MODEL_ID.rstrip("/").rsplit("/", 1)[-1] + "-NVFP4A16LMH"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

2025-08-19T12:51:21.897908+0000 | reset | INFO - Compression lifecycle reset
2025-08-19T12:51:21.902218+0000 | from_modifiers | INFO - Creating recipe from modifiers
2025-08-19T12:51:22.363014+0000 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
2025-08-19T12:51:22.364126+0000 | IndependentPipeline | INFO - Inferred `DataFreePipeline` for `QuantizationModifier`


100%|██████████| 1608/1608 [00:01<00:00, 1221.86it/s]
Calibrating weights: 100%|██████████| 1608/1608 [03:39<00:00,  7.33it/s]


2025-08-19T12:55:04.328587+0000 | finalize | INFO - Compression lifecycle finalized for 1 modifiers
2025-08-19T12:55:04.375309+0000 | get_model_compressor | INFO - skip_sparsity_compression_stats set to True. Skipping sparsity compression statistic calculations. No sparsity compressor will be applied.


Compressing model: 1047it [39:28,  2.26s/it]


('Llama-3.3-70B-Instruct-NVFP4A16LMH/tokenizer_config.json',
 'Llama-3.3-70B-Instruct-NVFP4A16LMH/special_tokens_map.json',
 'Llama-3.3-70B-Instruct-NVFP4A16LMH/chat_template.jinja',
 'Llama-3.3-70B-Instruct-NVFP4A16LMH/tokenizer.json')