
# TinyQuant Quickstart

This Colab shows how to:
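1) Install TinyQuant together with the optional bitsandbytes backend for 4-bit (NF4) quantization.
2) Load a small Llama model (Llama 3.2 1B) from Hugging Face.
3) Quantize selected linear layers to NF4 with a single function call.
4) Generate text with the standard `generate()` API.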


In [None]:
# TinyQuant + optional extras (bitsandbytes) for 4-bit/NF4 backends
# Install TinyQuant straight from its GitHub repository (-q: quiet, -U: upgrade
# an already-installed copy).
!pip install -qU "tinyquant @ git+https://github.com/galqiwi/tinyquant"
# Install bitsandbytes plus the Hugging Face libraries used by the cells below.
# NOTE(review): accelerate is presumably required for device_map="auto" in the
# model-loading cell — confirm against the transformers documentation.
!pip install -q bitsandbytes transformers accelerate

In [None]:
import torch
import transformers

In [None]:
# Hugging Face Hub id of the checkpoint to load.
model_id = "unsloth/Llama-3.2-1B"

# Prefer bfloat16 when the GPU supports it and fall back to float16 otherwise.
# The is_available() guard matters on runtimes without a CUDA device: some
# PyTorch releases query the current device inside is_bf16_supported() and
# raise there instead of returning False.
model_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)

# Load the causal-LM weights in the chosen dtype; device_map="auto" delegates
# device placement to the library.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    dtype=model_dtype,
    low_cpu_mem_usage=True,
    attn_implementation="eager",
)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Inputs must live on the same device as the embedding table, so read the
# device from the embedding weights rather than assuming "cuda".
device = model.get_input_embeddings().weight.device

In [None]:
from tinyquant.utils import quantize_matching_linear_layers

# One-line quantization: request the "nf4" scheme for every module whose name
# matches the pattern in the third argument.
# NOTE(review): the pattern presumably glob-matches module names, i.e. the
# q_proj attention projection of each decoder layer — confirm against tinyquant.
# NOTE(review): the return value is discarded and `model` is reused by later
# cells, so the call presumably modifies `model` in place — confirm.
quantize_matching_linear_layers(model, "nf4", "model.layers.*.self_attn.q_proj")

In [None]:
prompt = "Quantization for neural networks helps with "

# Tokenize once and move every returned tensor to the model's device. Keeping
# the whole encoding (not just input_ids) lets generate() receive the
# tokenizer's attention mask instead of having to infer one.
encoded = tokenizer(prompt, return_tensors="pt").to(device)
inputs = encoded["input_ids"]

# Sample up to 100 new tokens after the prompt.
output = model.generate(
    inputs,
    attention_mask=encoded["attention_mask"],
    do_sample=True,
    max_new_tokens=100,
)

# output[0] is the first (and only) sequence in the batch. Special tokens are
# skipped so the printed text contains no tokenizer markers.
print(tokenizer.decode(output[0], skip_special_tokens=True))