# any4 Tutorial

📥 Load a Model and Tokenizer

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

# Checkpoint to load and the device that the model and inputs are moved to.
model_name = "meta-llama/Llama-3.2-1B"
device = "cuda"

# The tokenizer and the model load independently of each other; the weights are
# requested in bfloat16 and then moved to `device`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
).to(device)

# Put the module in evaluation mode. As the last expression of the cell, the
# returned module is also displayed as the cell output.
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [None]:
# Reuse the end-of-sequence token id as the padding token id on the generation config.
# This is done to avoid a Hugging Face warning.
model.generation_config.pad_token_id = model.generation_config.eos_token_id

In [10]:
# Tokenize one fixed prompt into PyTorch tensors, then move the encoded batch to
# `device` so it can be fed to the model.
prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(device)

✍️ Baseline Inference (BF16)

In [11]:
# Sampling (`do_sample=True`) is stochastic: seed PyTorch's random number generators
# so the sampled continuation is repeatable when this cell is re-run.
torch.manual_seed(0)

# Sample up to 256 new tokens continuing the prompt.
outputs = model.generate(**inputs, do_sample=True, max_new_tokens=256)
# Decode the first (and only) sequence in the batch, dropping special tokens.
text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

print(text)

Once upon a time, there was a little boy who loved to ride his bike. He loved the feeling of the wind in his hair, the freedom of the open road, and the exhilaration of riding down a hill. But there was one problem. The little boy was short. He was just 3 feet tall, and he had to sit down on his bike to ride. He knew he could do it, but he also knew that he would never be able to ride his bike without sitting down. He tried and tried, but he just couldn’t get it right. He was frustrated and discouraged. He knew that if he could just get it right, he would be able to ride his bike and have the freedom of the open road. But he kept trying, and trying, and trying. Finally, he gave up. He just couldn’t do it. He was too short. He was too small. He was too weak. He was too weak to ride his bike. He was too weak to ride his bike. He was too weak to ride his bike. He was too weak to ride his bike. He was too weak to ride his bike. He was too weak to ride his bike. He was too weak to ride his 

Let's benchmark the model:

In [None]:
from utils import get_model_size

# Report the model's footprint. The value is divided by 2**30 before printing.
# NOTE(review): assuming `get_model_size` returns a size in bytes (confirm in `utils`),
# the number shown is in binary gigabytes (GiB) even though the label reads "GB".
model_size = get_model_size(model)
print(f"Model Size: {model_size / 2**30:.2f} GB")

 Model Size: 2.79 GB


In [17]:
from utils import benchmark_cuda_only_in_ms

# Measure the GPU-only time with the tutorial inputs and generation settings.
# NOTE(review): warmup=0 and iters=1 time a single call with no warm-up, so the
# figure may include one-off start-up cost — confirm against the helper in `utils`.
model_cuda_time = benchmark_cuda_only_in_ms(
    model,
    warmup=0,
    iters=1,
    **inputs,
    do_sample=True,
    max_new_tokens=256,
)
print(f"GPU Time: {model_cuda_time:.2f} ms")

GPU Time: 20.52 ms


In [None]:
from utils import benchmark_in_ms

# Store the end-to-end measurement under its own name so it neither mislabels the
# value nor overwrites `model_cuda_time`, which holds the GPU-only figure returned by
# `benchmark_cuda_only_in_ms`.
# NOTE(review): warmup=0 and iters=1 time a single call with no warm-up — confirm
# that this is intended.
model_total_time = benchmark_in_ms(
    model,
    warmup=0,
    iters=1,
    **inputs,
    do_sample=True,
    max_new_tokens=256,
)
print(f"Total Time: {model_total_time:.2f} ms")

Total Time: 56.94 ms


Let's evaluate accuracy on different types of tasks

In [None]:
import json
from lm_eval import simple_evaluate

# Evaluate the in-memory model on the PIQA and ARC-Easy tasks through the "hf"
# model type, handing over the already-loaded model and tokenizer objects.
results = simple_evaluate(
    model="hf",
    model_args={"pretrained": model, "tokenizer": tokenizer, "batch_size": 8},
    tasks=["piqa", "arc_easy"],
)

# Show only the per-task metrics, pretty-printed as JSON.
print(json.dumps(results["results"], indent=2))


2025-07-08:04:08:55,834 INFO     [evaluator.py:164] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-07-08:04:08:55,836 INFO     [evaluator.py:188] Initializing hf model, with arguments: {'pretrained': LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_fe

{
  "arc_easy": {
    "alias": "arc_easy",
    "acc,none": 0.6540404040404041,
    "acc_stderr,none": 0.009760749624427512,
    "acc_norm,none": 0.6031144781144782,
    "acc_norm_stderr,none": 0.010039236800583206
  },
  "piqa": {
    "alias": "piqa",
    "acc,none": 0.7421109902067464,
    "acc_stderr,none": 0.01020695666205627,
    "acc_norm,none": 0.7453754080522307,
    "acc_norm_stderr,none": 0.010164432237060466
  }
}


In [67]:
import argparse
from datetime import timedelta

from accelerate import Accelerator, InitProcessGroupKwargs
import bigcode_eval
import bigcode_eval.evaluator
from bigcode_eval.arguments import EvalArguments

from eval import bigcode_default_args

# Handler objects such as InitProcessGroupKwargs must be passed through
# `kwargs_handlers`. The first positional parameter of `Accelerator` is
# `device_placement`, so passing the handler positionally would leave the
# timeout unapplied.
# The 52-week timeout is presumably there so long evaluation runs never hit the
# process-group timeout.
accelerator = Accelerator(
    kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(weeks=52))]
)

# Build the evaluator around the in-memory model and tokenizer; its options are the
# defaults imported from the `eval` module, wrapped in a Namespace.
bigcode_evaluator = bigcode_eval.evaluator.Evaluator(
    accelerator=accelerator,
    model=model,
    tokenizer=tokenizer,
    args=argparse.Namespace(**bigcode_default_args),
)

# Run the "humaneval" task and show its result.
result = bigcode_evaluator.evaluate("humaneval")
print(result)

number of problems for this task is 164


100%|██████████| 164/164 [01:53<00:00,  1.44it/s]

Evaluating generations...





{'pass@1': 0.16463414634146342}


In [69]:
from data import eval_perplexity, task_dataset_configs

# Evaluate perplexity using the "pile-clean" dataset configuration: 10 batches of
# one sequence each, with a maximum sequence length of 2048.
result = eval_perplexity(
    model=model,
    tokenizer=tokenizer,
    batch_size=1,
    max_seq_len=2048,
    num_batches=10,
    **task_dataset_configs["pile-clean"],
)
print(result)

Evaluating perplexity on monology/pile-uncopyrighted on split train...


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

31it [00:02, 12.22it/s, loss=2.31]


Perplexity: 14.16655158996582
14.16655158996582


Now, let's quantize

In [None]:
from quantize import any4, int4, int8, nf4, fp4

# Rebind `model` to the result of `any4`, so every later cell that uses `model`
# runs against the quantized version.
# You can also try `int4(model)` or `int8(model)` to use tinygemm uniform integer quantization kernels.
# NOTE(review): re-running this cell feeds the already-quantized model back into
# `any4`; restart the kernel and reload the model before quantizing again.
model = any4(model)

Quantizing: 100%|██████████| 112/112 [03:46<00:00,  2.02s/layer, model.layers.15.mlp.down_proj]   


Let's peek into the model. We can see that the `Linear` layers have been replaced with `Any4Linear`.

In [71]:
# Print the module tree of the model returned by `any4` to inspect its layer types.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Any4Linear(in_features=2048, out_features=2048, bias=False, group_size=128, per_row=True)
          (k_proj): Any4Linear(in_features=2048, out_features=512, bias=False, group_size=128, per_row=True)
          (v_proj): Any4Linear(in_features=2048, out_features=512, bias=False, group_size=128, per_row=True)
          (o_proj): Any4Linear(in_features=2048, out_features=2048, bias=False, group_size=128, per_row=True)
        )
        (mlp): LlamaMLP(
          (gate_proj): Any4Linear(in_features=2048, out_features=8192, bias=False, group_size=128, per_row=True)
          (up_proj): Any4Linear(in_features=2048, out_features=8192, bias=False, group_size=128, per_row=True)
          (down_proj): Any4Linear(in_features=8192, out_features=2048, bias=False, group_size=128, per_row=True)
    

Let's try generating some text with the quantized model:

In [72]:
# Sampling (`do_sample=True`) is stochastic: seed PyTorch's random number generators
# so the sampled continuation is repeatable when this cell is re-run.
torch.manual_seed(0)

# Sample up to 256 new tokens continuing the prompt.
outputs = model.generate(**inputs, do_sample=True, max_new_tokens=256)
# Decode the first (and only) sequence in the batch, dropping special tokens.
text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

print(text)

Once upon a time, I was a fan of the old-school, “How to be a Rock Star”-type advice books, which have now been replaced by the “How to be a Rock Star in 10 Easy Steps”-type books. The difference is that the first group of books was about how to be a rock star, while the second group of books are about how to be a rock star in 10 easy steps.
I’m not sure what to make of this, but I do think that these books are a bit of a mixed bag. On one hand, you have the old-school advice books that talk about how to be a rock star, while on the other hand, you have the new-school advice books that talk about how to be a rock star in 10 easy steps. I think that these two types of books are both valid and useful, but I also think that there’s something to be said for the old-school advice books, because they’re a bit more personal and they’re a bit more real.
The old-school advice books are a bit more personal and they’re a bit more real. They’re a bit more real because they’re a bit more real, and 

Not bad! Now let's benchmark it.

In [73]:
from utils import get_model_size

# Report the model's footprint. The value is divided by 2**30 before printing.
# NOTE(review): assuming `get_model_size` returns a size in bytes (confirm in `utils`),
# the number shown is in binary gigabytes (GiB) even though the label reads "GB".
model_size = get_model_size(model)
print(f"Model Size: {model_size / 2**30:.2f} GB")

Model Size: 1.47 GB


In [74]:
from utils import benchmark_cuda_only_in_ms

# Measure the GPU-only time with the tutorial inputs and generation settings.
# NOTE(review): warmup=0 and iters=1 time a single call with no warm-up, so the
# figure may include one-off start-up cost — confirm against the helper in `utils`.
model_cuda_time = benchmark_cuda_only_in_ms(
    model,
    warmup=0,
    iters=1,
    **inputs,
    do_sample=True,
    max_new_tokens=256,
)
print(f"GPU Time: {model_cuda_time:.2f} ms")

GPU Time: 18.02 ms


In [75]:
from utils import benchmark_in_ms

# Store the end-to-end measurement under its own name so it neither mislabels the
# value nor overwrites `model_cuda_time`, which holds the GPU-only figure returned by
# `benchmark_cuda_only_in_ms`.
# NOTE(review): warmup=0 and iters=1 time a single call with no warm-up — confirm
# that this is intended.
model_total_time = benchmark_in_ms(
    model,
    warmup=0,
    iters=1,
    **inputs,
    do_sample=True,
    max_new_tokens=256,
)
print(f"Total Time: {model_total_time:.2f} ms")

Total Time: 37.05 ms


Model size reduced from 2.79 GB to 1.47 GB (note that the embedding and language model head have not been quantized, and these tend to be significant in smaller models like Llama 3.2 1B).

GPU time reduced from 20.52 ms to 18.02 ms.

Total time reduced from 56.94 ms to 37.05 ms.

Now let's evaluate accuracy after quantization: 

In [76]:
import json
from lm_eval import simple_evaluate

# Evaluate the in-memory model on the PIQA and ARC-Easy tasks through the "hf"
# model type, handing over the already-loaded model and tokenizer objects.
results = simple_evaluate(
    model="hf",
    model_args={"pretrained": model, "tokenizer": tokenizer, "batch_size": 8},
    tasks=["piqa", "arc_easy"],
)

# Show only the per-task metrics, pretty-printed as JSON.
print(json.dumps(results["results"], indent=2))


2025-07-08:05:09:26,385 INFO     [evaluator.py:164] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-07-08:05:09:26,388 INFO     [evaluator.py:188] Initializing hf model, with arguments: {'pretrained': LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Any4Linear(in_features=2048, out_features=2048, bias=False, group_size=128, per_row=True)
          (k_proj): Any4Linear(in_features=2048, out_features=512, bias=False, group_size=128, per_row=True)
          (v_proj): Any4Linear(in_features=2048, out_features=512, bias=False, group_size=128, per_row=True)
          (o_proj): Any4Linear(in_features=2048, out_features=2048, bias=False, group_size=128, per_row=True)
        )
        (mlp): LlamaMLP(
          (gate_proj): Any4Linear(in_features=2048, out

{
  "arc_easy": {
    "alias": "arc_easy",
    "acc,none": 0.6191077441077442,
    "acc_stderr,none": 0.009964428212260372,
    "acc_norm,none": 0.5778619528619529,
    "acc_norm_stderr,none": 0.010134620524592271
  },
  "piqa": {
    "alias": "piqa",
    "acc,none": 0.7274211099020674,
    "acc_stderr,none": 0.010389256803296016,
    "acc_norm,none": 0.7285092491838956,
    "acc_norm_stderr,none": 0.010376251176596135
  }
}


In [77]:
import argparse
from datetime import timedelta

from accelerate import Accelerator, InitProcessGroupKwargs
import bigcode_eval
import bigcode_eval.evaluator
from bigcode_eval.arguments import EvalArguments

from eval import bigcode_default_args

# Handler objects such as InitProcessGroupKwargs must be passed through
# `kwargs_handlers`. The first positional parameter of `Accelerator` is
# `device_placement`, so passing the handler positionally would leave the
# timeout unapplied.
# The 52-week timeout is presumably there so long evaluation runs never hit the
# process-group timeout.
accelerator = Accelerator(
    kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(weeks=52))]
)

# Build the evaluator around the in-memory model and tokenizer; its options are the
# defaults imported from the `eval` module, wrapped in a Namespace.
bigcode_evaluator = bigcode_eval.evaluator.Evaluator(
    accelerator=accelerator,
    model=model,
    tokenizer=tokenizer,
    args=argparse.Namespace(**bigcode_default_args),
)

# Run the "humaneval" task and show its result.
result = bigcode_evaluator.evaluate("humaneval")
print(result)

number of problems for this task is 164


100%|██████████| 164/164 [02:17<00:00,  1.19it/s]

Evaluating generations...





{'pass@1': 0.11585365853658537}


In [78]:
from data import eval_perplexity, task_dataset_configs

# Evaluate perplexity using the "pile-clean" dataset configuration: 10 batches of
# one sequence each, with a maximum sequence length of 2048.
result = eval_perplexity(
    model=model,
    tokenizer=tokenizer,
    batch_size=1,
    max_seq_len=2048,
    num_batches=10,
    **task_dataset_configs["pile-clean"],
)
print(result)

Evaluating perplexity on monology/pile-uncopyrighted on split train...


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

31it [00:05,  6.11it/s, loss=2.48]

Perplexity: 15.88514232635498
15.88514232635498





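So accuracy stays reasonably close to the BF16 baseline: PIQA drops from 74.2% to 72.7%, ARC-Easy from 65.4% to 61.9%, HumanEval pass@1 from 16.5% to 11.6%, and perplexity on the Pile rises from 14.17 to 15.89. Quantization therefore does not degrade accuracy much on the multiple-choice tasks, though the gap is larger on code generation. Cool!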