In [1]:
import os


def prepend_env_path(var_name, entry):
    """Put ``entry`` at the front of the path-list environment variable ``var_name``.

    Unlike plain string concatenation this never leaves a trailing separator
    when the variable was unset or empty (an empty entry is interpreted as the
    current directory), and re-running the cell does not stack duplicate
    copies of ``entry``.

    Returns the new value of the variable.
    """
    current = os.environ.get(var_name, "")
    # Keep the existing entries in order, dropping only earlier copies of `entry`.
    remaining = [part for part in current.split(os.pathsep) if part != entry] if current else []
    os.environ[var_name] = os.pathsep.join([entry, *remaining])
    return os.environ[var_name]


# Target compute capability for CUDA extension builds, and the ptxas binary Triton should use.
os.environ["TORCH_CUDA_ARCH_LIST"] = "12.1"
os.environ["TRITON_PTXAS_PATH"] = "/usr/local/cuda/bin/ptxas"

# Make the system CUDA toolkit take precedence for tools and shared libraries.
# NOTE(review): the dynamic loader normally reads LD_LIBRARY_PATH at process start, so the
# second line mainly affects subprocesses launched from this kernel — confirm that is enough.
prepend_env_path("PATH", "/usr/local/cuda/bin")
prepend_env_path("LD_LIBRARY_PATH", "/usr/local/cuda/lib64")

In [2]:
# Run this cell first, then restart the kernel before running the next cell
!pip uninstall torch -y
!pip install torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0 --index-url https://download.pytorch.org/whl/cu130

!pip install bitsandbytes>=0.43.2
!pip install nvidia-modelopt

Found existing installation: torch 2.9.0+cu130
Uninstalling torch-2.9.0+cu130:
  Successfully uninstalled torch-2.9.0+cu130
Looking in indexes: https://download.pytorch.org/whl/cu130
Collecting torch==2.9.0
  Using cached https://download.pytorch.org/whl/cu130/torch-2.9.0%2Bcu130-cp312-cp312-manylinux_2_28_aarch64.whl.metadata (30 kB)
Using cached https://download.pytorch.org/whl/cu130/torch-2.9.0%2Bcu130-cp312-cp312-manylinux_2_28_aarch64.whl (512.4 MB)
Installing collected packages: torch
Successfully installed torch-2.9.0+cu130
zsh:1: 0.43.2 not found


In [3]:
# Run this cell after restarting the kernel
# Report the PyTorch build and whether CUDA is visible before importing the
# libraries that depend on it.
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Hugging Face Hub login helper plus the model / tokenizer loaders.
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer


# NVIDIA ModelOpt: quantization entry points (`mtq`) and the calibration-data helpers.
import modelopt.torch.quantization as mtq
from modelopt.torch.utils.dataset_utils import create_forward_loop, get_dataset_dataloader

# NOTE(review): `bnb` is not referenced in any of the visible cells — confirm it is still needed.
import bitsandbytes as bnb

PyTorch version: 2.9.0+cu130
CUDA available: True


    Found GPU0 NVIDIA GB10 which is of cuda capability 12.1.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (8.0) - (12.0)
    


In [4]:
# Calibration settings: 128 samples in batches of 8, i.e. 16 calibration batches.
calib_samples = 128
batch_size = 8
dataset_name = "cnn_dailymail"

# Checkpoint that will be loaded and quantized.
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Authenticate with the Hugging Face Hub (renders the interactive login widget).
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Load the checkpoint in bfloat16 and let device_map="auto" handle device placement
# (use this instead of calling .cuda() on the model afterwards).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,  # `dtype` is the current keyword; `torch_dtype` is deprecated
    device_map="auto",
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS for padding — presumably the tokenizer ships without a pad token.
tokenizer.pad_token = tokenizer.eos_token

import sys
if ".." not in sys.path:  # re-running the cell must not pile up duplicate entries
    sys.path.append("..")  # Add parent directory to path

# Force reload the module to pick up changes
import importlib
import quantization_theory_helper
importlib.reload(quantization_theory_helper)

# Size of the unquantized model; the '' key is read as the whole-model total.
from quantization_theory_helper import compute_module_sizes
module_size = compute_module_sizes(model)
print(f"The model size is {module_size[''] * 1e-9} GB")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The model size is 16.060522752 GB


In [6]:
# Calibration data: `calib_samples` examples from `dataset_name`, tokenized with the
# model's tokenizer and delivered in batches of `batch_size` on the GPU.
dataloader = get_dataset_dataloader(
    tokenizer=tokenizer,
    dataset_name=dataset_name,
    num_samples=calib_samples,
    batch_size=batch_size,
    device="cuda",
)

  warn(


In [7]:
# Wrap the calibration dataloader in the callable that is later handed to
# `mtq.quantize` as its `forward_loop` argument.
forward_loop = create_forward_loop(dataloader=dataloader)

In [8]:
# Quantize with ModelOpt's default NVFP4 configuration, passing `forward_loop` so the
# calibration batches can be run during quantization.
# NOTE(review): earlier comments here described switching to FP8, but the config in use is
# NVFP4_DEFAULT_CFG; swap the constant if an FP8 fallback is ever needed.
quant_config = mtq.NVFP4_DEFAULT_CFG
# `model` is rebound to the result, so re-running this cell would feed the
# already-quantized model back into mtq.quantize — reload the model first.
model = mtq.quantize(model, quant_config, forward_loop=forward_loop)

Registered <class 'transformers.models.llama.modeling_llama.LlamaAttention'> to _QuantAttention for KV Cache quantization
Inserted 771 quantizers


100%|██████████| 16/16 [00:35<00:00,  2.20s/it]


In [9]:
# Smoke test of the quantized model. Runs in eager mode (no torch.compile) for
# Blackwell GPU compatibility.
inputs = tokenizer(
    "Hello, my name is",
    return_tensors="pt",
).to(model.device)

outputs = model.generate(max_new_tokens=10, **inputs)

# Decode the first (only) sequence, dropping special tokens, and show it.
print(
    tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
    )
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Hello, my name is Jessica and I'm a 25-year-old woman


In [10]:
from modelopt.torch.export import export_hf_checkpoint

# Destination directory for the exported NVFP4 checkpoint and its tokenizer files.
export_path = "./quantized_model/NVFP4/"

# Write the quantized model to disk as a Hugging Face style checkpoint.
export_hf_checkpoint(
    model,
    export_dir=export_path,
)

# Store the tokenizer next to the weights. As the final expression of the cell,
# the returned tuple of written file paths is displayed.
tokenizer.save_pretrained(export_path)

`torch_dtype` is deprecated! Use `dtype` instead!


('./quantized_model/NVFP4/tokenizer_config.json',
 './quantized_model/NVFP4/special_tokens_map.json',
 './quantized_model/NVFP4/chat_template.jinja',
 './quantized_model/NVFP4/tokenizer.json')

In [11]:
# Measure the model again after quantization.
# `compute_module_sizes` was already imported from the freshly reloaded helper module when
# the model was loaded, so the sys.path / importlib boilerplate is not repeated here —
# repeating it only appended a second ".." entry to sys.path.
module_size = compute_module_sizes(model)
print(f"The model size is {module_size[''] * 1e-9} GB")

The model size is 6.0277502720000005 GB


In [12]:
import os

# Save model config — but only when the export did not already produce one.
# NOTE(review): export_hf_checkpoint is expected to have written config.json into
# export_path already (recent ModelOpt releases also embed the quantization settings
# in it — confirm for the installed version). Unconditionally re-saving
# `model.config` would overwrite that file.
config_file = os.path.join(export_path, "config.json")
if os.path.exists(config_file):
    print("✓ Config already exported")
else:
    model.config.save_pretrained(export_path)
    print("✓ Saved config")

# Create a README model card
model_card = """---
license: llama3.1
base_model: meta-llama/Llama-3.1-8B-Instruct
tags:
  - llama
  - quantized
  - nvidia-modeloptimizer
  - NVFP4
library_name: nvidia-modeloptimizer
---

# Llama-3.1-8B-Instruct Quantized (ModelOpt NVFP4)

This is a quantized version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) using [modelopt](https://github.com/NVIDIA/Model-Optimizer) with NVFP4 weight quantization.

## Model Details

- **Base Model:** meta-llama/Llama-3.1-8B-Instruct
- **Quantization Method:** modelopt NVFP4 Post-Training Quantization (PTQ)    
- **Weight Precision:** NVFP4
- **Original Size:** ~16 GB (bfloat16)
- **Quantized Size:** ~6 GB (nvfp4)

## Usage

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load base model structure
model = AutoModelForCausalLM.from_pretrained(
    "tokenlabsdotrun/Llama-3.1-8B-ModelOpt-NVFP4",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True
)

# Load tokenizer and generate
tokenizer = AutoTokenizer.from_pretrained("tokenlabsdotrun/Llama-3.1-8B-ModelOpt-NVFP4")

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

## License

This model inherits the [Llama 3.1 Community License](https://llama.meta.com/llama3_1/license/).
"""

# os.path.join avoids the doubled slash that f"{export_path}/README.md" produced when
# export_path already ends in "/"; the encoding is pinned so the card is written the
# same way regardless of the platform default.
readme_path = os.path.join(export_path, "README.md")
with open(readme_path, "w", encoding="utf-8") as f:
    f.write(model_card)
print("✓ Created model card")

# Strip any trailing slash before re-adding one, so the message does not end in "//".
export_dir_display = export_path.rstrip("/")
print(f"\nModel saved to {export_dir_display}/")
print("Contents:", os.listdir(export_path))

✓ Saved config
✓ Created model card

Model saved to ./quantized_model/NVFP4//
Contents: ['README.md', 'tokenizer_config.json', 'chat_template.jinja', 'special_tokens_map.json', 'generation_config.json', 'config.json', 'hf_quant_config.json', 'tokenizer.json', 'model-00002-of-00002.safetensors', 'model-00001-of-00002.safetensors', 'model.safetensors.index.json']


In [13]:
from huggingface_hub import create_repo, upload_folder

# Upload to HuggingFace Hub
repo_name = "tokenlabsdotrun/Llama-3.1-8B-ModelOpt-NVFP4"  # Change to your username/repo

try:
    # With exist_ok=True an existing repo is reused rather than created, hence "ready"
    # (set private=True if you want it private).
    create_repo(repo_name, exist_ok=True, private=False)
    print(f"✓ Repository ready: {repo_name}")

    # Upload every file in the export directory as a single commit.
    upload_folder(
        folder_path=export_path,
        repo_id=repo_name,
        repo_type="model",
        commit_message="Upload Llama-3.1-8B quantized with ModelOpt NVFP4"
    )
    print(f"✓ Uploaded to https://huggingface.co/{repo_name}")

except Exception as e:
    print(f"❌ Error: {e}")
    # Re-raise so a failed upload is not mistaken for success: the cell stops
    # with the full traceback instead of finishing silently.
    raise

✓ Repository created: tokenlabsdotrun/Llama-3.1-8B-ModelOpt-NVFP4


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

✓ Uploaded to https://huggingface.co/tokenlabsdotrun/Llama-3.1-8B-ModelOpt-NVFP4
