In [1]:
!pip install transformers quanto torch



In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub id of the checkpoint that is quantized in the rest of the notebook.
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Load the causal-LM weights as bfloat16.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)
model  # bare last expression: the notebook renders the module tree

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
  

In [3]:
# Fetch the tokenizer published under the same hub id as the model.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

In [4]:
# Short prompt used as a generation smoke test; `inputs` is reused by later cells.
text = "Hello, my name is"
inputs = tokenizer(text, return_tensors="pt")

# Ask for at most 10 new tokens, then decode the first returned sequence.
outputs = model.generate(max_new_tokens=10, **inputs)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Hello, my name is Emma and I am a freelance writer and editor based


In [5]:
from quantization_theory_helper import compute_module_sizes

# The entry stored under the empty-string key is what gets reported as the
# size of the whole model; it is scaled by 1e-9 for the "GB" figure.
module_size = compute_module_sizes(model)
total_size = module_size['']
print(f"The model size is {total_size * 1e-9} GB")

The model size is 16.060522752 GB


In [6]:
# Look at one weight matrix (first decoder layer, q_proj) before quantizing.
q_proj_weight = model.model.layers[0].self_attn.q_proj.weight
print(q_proj_weight)

Parameter containing:
tensor([[ 0.0052, -0.0293, -0.0064,  ...,  0.0092, -0.0415, -0.0269],
        [-0.0150, -0.0679, -0.0059,  ..., -0.0149, -0.0498,  0.0197],
        [-0.0173, -0.0391, -0.0040,  ...,  0.0107, -0.0132,  0.0071],
        ...,
        [-0.0035, -0.0383,  0.0781,  ...,  0.0057, -0.0012,  0.0024],
        [-0.0033, -0.0093,  0.0437,  ...,  0.0047, -0.0011,  0.0012],
        [-0.0019, -0.0153,  0.0347,  ...,  0.0111,  0.0004,  0.0042]],
       dtype=torch.bfloat16, requires_grad=True)


In [7]:
from quanto import quantize, freeze 
import torch

In [8]:
from quanto import qint8

# Weight-only quantization of the model in place.
quantize(
    model,
    weights=qint8,     # quanto's own qint8 type, not torch.int8
    activations=None,  # activations are left unquantized
)

In [9]:
# Print the module tree again: the projection layers that were `Linear`
# before the quantize() call are now listed as `QLinear`.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): QLinear(in_features=4096, out_features=4096, bias=False)
          (k_proj): QLinear(in_features=4096, out_features=1024, bias=False)
          (v_proj): QLinear(in_features=4096, out_features=1024, bias=False)
          (o_proj): QLinear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): QLinear(in_features=4096, out_features=14336, bias=False)
          (up_proj): QLinear(in_features=4096, out_features=14336, bias=False)
          (down_proj): QLinear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e

In [10]:
# Same weight, inspected after quantize() but before freeze(): at this point
# it still prints as a bfloat16 Parameter.
q_proj_weight = model.model.layers[0].self_attn.q_proj.weight
print(q_proj_weight)

Parameter containing:
tensor([[ 0.0052, -0.0293, -0.0064,  ...,  0.0092, -0.0415, -0.0269],
        [-0.0150, -0.0679, -0.0059,  ..., -0.0149, -0.0498,  0.0197],
        [-0.0173, -0.0391, -0.0040,  ...,  0.0107, -0.0132,  0.0071],
        ...,
        [-0.0035, -0.0383,  0.0781,  ...,  0.0057, -0.0012,  0.0024],
        [-0.0033, -0.0093,  0.0437,  ...,  0.0047, -0.0011,  0.0012],
        [-0.0019, -0.0153,  0.0347,  ...,  0.0111,  0.0004,  0.0042]],
       dtype=torch.bfloat16, requires_grad=True)


In [11]:
# Freeze the quantized model. After this call the q_proj weight prints as a
# QBytesTensor holding int8 data together with a scale tensor.
freeze(model)

In [12]:
# Same weight once more, now that freeze() has run.
q_proj_weight = model.model.layers[0].self_attn.q_proj.weight
print(q_proj_weight)

QBytesTensor(tensor([[ 11, -65, -14,  ...,  20, -92, -60],
        [-14, -63,  -5,  ..., -14, -46,  18],
        [-26, -58,  -6,  ...,  16, -19,  10],
        ...,
        [ -1, -14,  29,  ...,   2,   0,   1],
        [ -2,  -7,  32,  ...,   3,  -1,   1],
        [ -2, -17,  38,  ...,  12,   0,   5]], dtype=torch.int8), scale=tensor([[0.0005],
        [0.0011],
        [0.0007],
        ...,
        [0.0027],
        [0.0014],
        [0.0009]], dtype=torch.bfloat16), dtype=torch.bfloat16)


In [13]:
# Recompute the per-module sizes for the frozen, quantized model and report
# the entry stored under the empty-string key.
module_size = compute_module_sizes(model)
quantized_total = module_size['']
print(f"The quantized model size is {quantized_total * 1e-9} GB")

The quantized model size is 8.558875144 GB


In [14]:
# Run the same tokenized prompt (`inputs` from the earlier cell) through the
# quantized model and decode the first returned sequence.
outputs = model.generate(max_new_tokens=10, **inputs)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Hello, my name is Mike and I am a 34-year-old male


In [15]:
!pip install ipywidgets python-dotenv

# Save and upload quanto-quantized model to HuggingFace
import os
from dotenv import load_dotenv
from huggingface_hub import login, create_repo, upload_folder
from quanto import safe_save  # Use quanto's save function, not safetensors directly

# Read HF_TOKEN from the environment, after load_dotenv() has had a chance to
# populate it from a local .env file.
load_dotenv()
token = os.getenv("HF_TOKEN")

if not token:
    raise ValueError("HF_TOKEN not found! Create a .env file with: HF_TOKEN=your_token_here")

# Never echo any part of the token: cell outputs are stored in the notebook
# file and travel with it when it is committed or shared.
print("HF_TOKEN loaded from environment")
login(token=token)

# Local folder that collects everything to be uploaded.
save_dir = "./quantized_model"
os.makedirs(save_dir, exist_ok=True)

print("Saving quantized model...")

# Weights go through quanto's safe_save, which handles quantized tensors properly.
weights_file = f"{save_dir}/model.safetensors"
safe_save(model.state_dict(), weights_file)
print("✓ Saved model weights with quanto format")

# Tokenizer files.
tokenizer.save_pretrained(save_dir)
print("✓ Saved tokenizer")

# Model configuration.
model.config.save_pretrained(save_dir)
print("✓ Saved config")

# Create a README model card.
# The usage snippet downloads model.safetensors from the Hub before calling
# safe_load; loading a bare local filename only works for someone who already
# has the file in their working directory.
model_card = """---
license: llama3.1
base_model: meta-llama/Llama-3.1-8B-Instruct
tags:
  - llama
  - quantized
  - quanto
  - int8
library_name: transformers
---

# Llama-3.1-8B-Instruct Quantized (Quanto INT8)

This is a quantized version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) using [quanto](https://github.com/huggingface/optimum-quanto) with INT8 weight quantization.

## Model Details

- **Base Model:** meta-llama/Llama-3.1-8B-Instruct
- **Quantization Method:** quanto
- **Weight Precision:** INT8 (qint8)
- **Original Size:** ~16 GB (bfloat16)
- **Quantized Size:** ~8.5 GB

## Usage

```python
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoModelForCausalLM, AutoTokenizer
from quanto import quantize, freeze, qint8, safe_load

repo_id = "tokenlabsdotrun/Llama-3.1-8B-Quanto-Int8"

# Load base model structure
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True
)

# Quantize structure, then download and load the quantized weights
quantize(model, weights=qint8)
weights_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors")
state_dict = safe_load(weights_path)  # Use quanto's safe_load
model.load_state_dict(state_dict)
freeze(model)

# Load tokenizer and generate
tokenizer = AutoTokenizer.from_pretrained(repo_id)
inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

## License

This model inherits the [Llama 3.1 Community License](https://llama.meta.com/llama3_1/license/).
"""

# Write with an explicit encoding so the result does not depend on the
# platform's default locale.
with open(f"{save_dir}/README.md", "w", encoding="utf-8") as f:
    f.write(model_card)
print("✓ Created model card")

print(f"\nModel saved to {save_dir}/")
print("Contents:", os.listdir(save_dir))

Using token: [REDACTED]...


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Saving quantized model...
✓ Saved model weights with quanto format
✓ Saved tokenizer
✓ Saved config
✓ Created model card

Model saved to ./quantized_model/
Contents: ['README.md', 'model.safetensors', 'tokenizer_config.json', 'chat_template.jinja', 'special_tokens_map.json', 'config.json', 'tokenizer.json']


In [16]:
# Upload to HuggingFace Hub
repo_name = "tokenlabsdotrun/Llama-3.1-8B-Quanto-Int8"  # Change to your username/repo

try:
    # Create the repo (set private=True if you want it private).
    # exist_ok=True makes this a no-op when the repo is already there.
    create_repo(repo_name, exist_ok=True, private=False)
    print(f"✓ Repository created: {repo_name}")

    # Upload every file found in the local save directory.
    upload_folder(
        folder_path=save_dir,
        repo_id=repo_name,
        repo_type="model",
        commit_message="Upload Llama-3.1-8B quantized with quanto int8"
    )
    print(f"✓ Uploaded to https://huggingface.co/{repo_name}")

except Exception as e:
    # Keep the readable message, but re-raise: later cells read from this
    # repo, so a failed upload must stop a Run All instead of being hidden.
    print(f"❌ Error: {e}")
    raise

✓ Repository created: tokenlabsdotrun/Llama-3.1-8B-Quanto-Int8


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

✓ Uploaded to https://huggingface.co/tokenlabsdotrun/Llama-3.1-8B-Quanto-Int8


In [17]:
# Load the tokenizer from the uploaded repo and run the prompt through the
# in-memory model.
tokenizer = AutoTokenizer.from_pretrained(
    "tokenlabsdotrun/Llama-3.1-8B-Quanto-Int8"
)
inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(max_new_tokens=10, **inputs)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Hello, my name is Rachel and I'm a freelance writer and editor.


In [19]:
# Load and use the quantized model from HuggingFace
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from quanto import quantize, freeze, qint8, safe_load
from huggingface_hub import hf_hub_download

# Hub repo that holds the quantized weights, and the checkpoint that supplies
# the model architecture.
repo_id = "tokenlabsdotrun/Llama-3.1-8B-Quanto-Int8"
base_model = "meta-llama/Llama-3.1-8B-Instruct"

# Step 1: Load the base model structure.
# `dtype=` matches the keyword used when the model was first loaded in this
# notebook; `torch_dtype=` is the older, deprecated spelling.
print("Loading base model structure...")
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True
)

# Step 2: Apply the same qint8 weight quantization structure that was used
# when the checkpoint was saved
print("Applying quantization structure...")
quantize(model, weights=qint8)

# Step 3: Download and load the quantized weights
print("Loading quantized weights...")
weights_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors")
state_dict = safe_load(weights_path)
model.load_state_dict(state_dict)

# Step 4: Freeze the model (required after loading quanto weights)
freeze(model)
print("✓ Model loaded and ready!")

# Step 5: Load tokenizer and generate
tokenizer = AutoTokenizer.from_pretrained(repo_id)
# Fall back to the EOS token when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the prompt and move every tensor to the model's device.
text = "Hello, my name is"
inputs = tokenizer(text, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Pass the pad token id explicitly; setting tokenizer.pad_token alone does not
# reach generate(), which otherwise picks its own default and logs a notice.
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    pad_token_id=tokenizer.pad_token_id,
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Loading base model structure...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Applying quantization structure...
Loading quantized weights...
✓ Model loaded and ready!


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Hello, my name is Chris. I am a 32-year-old man
