In [18]:
!pip install accelerate



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [19]:
import torch
from transformers import AutoModelForCausalLM

model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Load model with optimizations to prevent kernel crash:
# 1. Use torch_dtype=torch.bfloat16 to reduce memory usage by 50%
# 2. Use device_map="auto" to automatically distribute model across available devices
# 3. Use low_cpu_mem_usage=True to minimize CPU memory during loading
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True
)
model

Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 81.59it/s]
Some parameters are on the meta device because they were offloaded to the disk and cpu.


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
  

In [20]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [21]:
text = "Hello, my name is"
inputs = tokenizer(text, return_tensors="pt")
# Move inputs to the same device as model immediately
inputs = {k: v.to(model.device) for k, v in inputs.items()}
outputs = model.generate(**inputs, max_new_tokens=10)

outputs

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


tensor([[128000,   9906,     11,    856,    836,    374,  23880,    323,    358,
           1097,    264,    220,   1187,   1060,   2362,   8954]])

In [22]:
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Hello, my name is Maria and I am a 24 year old female


In [23]:
from quantization_theory_helper import compute_module_sizes
module_sizes = compute_module_sizes(model)
module_sizes

defaultdict(int,
            {'': 16060522752,
             'model': 15009849600,
             'model.embed_tokens': 1050673152,
             'model.embed_tokens.weight': 1050673152,
             'model.layers': 13959168000,
             'model.layers.0': 436224000,
             'model.layers.0.self_attn': 83886080,
             'model.layers.0.self_attn.q_proj': 33554432,
             'model.layers.0.self_attn.q_proj.weight': 33554432,
             'model.layers.0.self_attn.k_proj': 8388608,
             'model.layers.0.self_attn.k_proj.weight': 8388608,
             'model.layers.0.self_attn.v_proj': 8388608,
             'model.layers.0.self_attn.v_proj.weight': 8388608,
             'model.layers.0.self_attn.o_proj': 33554432,
             'model.layers.0.self_attn.o_proj.weight': 33554432,
             'model.layers.0.mlp': 352321536,
             'model.layers.0.mlp.gate_proj': 117440512,
             'model.layers.0.mlp.gate_proj.weight': 117440512,
             'model.layers.0.

In [24]:
print(f"The model size is {module_sizes[''] * 1e-9} GB")

The model size is 16.060522752 GB


In [25]:
print(model.model.layers[0].mlp.down_proj.weight)

Parameter containing:
tensor([[ 0.0121, -0.0148, -0.0086,  ...,  0.0090, -0.0033,  0.0128],
        [ 0.0237, -0.0069, -0.0052,  ...,  0.0018,  0.0193,  0.0024],
        [ 0.0099, -0.0110, -0.0004,  ..., -0.0042,  0.0024, -0.0179],
        ...,
        [ 0.0113, -0.0243,  0.0244,  ..., -0.0208,  0.0085, -0.0139],
        [-0.0157, -0.0093, -0.0126,  ...,  0.0101,  0.0010, -0.0160],
        [-0.0038,  0.0068,  0.0087,  ..., -0.0063, -0.0054, -0.0116]],
       dtype=torch.bfloat16, requires_grad=True)


In [26]:
from quanto import quantize, freeze, qint8

# Use quanto's qtype objects directly
print(f"Before quantization: {model.model.layers[0].mlp.down_proj.weight}")

Before quantization: Parameter containing:
tensor([[ 0.0121, -0.0148, -0.0086,  ...,  0.0090, -0.0033,  0.0128],
        [ 0.0237, -0.0069, -0.0052,  ...,  0.0018,  0.0193,  0.0024],
        [ 0.0099, -0.0110, -0.0004,  ..., -0.0042,  0.0024, -0.0179],
        ...,
        [ 0.0113, -0.0243,  0.0244,  ..., -0.0208,  0.0085, -0.0139],
        [-0.0157, -0.0093, -0.0126,  ...,  0.0101,  0.0010, -0.0160],
        [-0.0038,  0.0068,  0.0087,  ..., -0.0063, -0.0054, -0.0116]],
       dtype=torch.bfloat16, requires_grad=True)


In [27]:
quantize(model, weights=qint8, activations=None)
# print(model)
# print(f"After quantization: {model.model.layers[0].mlp.down_proj.weight}")

In [28]:
freeze(model)

In [29]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): QLinear(in_features=4096, out_features=4096, bias=False)
          (k_proj): QLinear(in_features=4096, out_features=1024, bias=False)
          (v_proj): QLinear(in_features=4096, out_features=1024, bias=False)
          (o_proj): QLinear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): QLinear(in_features=4096, out_features=14336, bias=False)
          (up_proj): QLinear(in_features=4096, out_features=14336, bias=False)
          (down_proj): QLinear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e

In [30]:
print(model.model.layers[0].mlp.down_proj.weight)

QBytesTensor(tensor([[ 25, -31, -18,  ...,  19,  -7,  27],
        [ 68, -20, -15,  ...,   5,  55,   7],
        [ 26, -28,  -1,  ..., -11,   6, -46],
        ...,
        [ 15, -33,  33,  ..., -28,  12, -19],
        [-38, -22, -30,  ...,  24,   2, -38],
        [-10,  17,  22,  ..., -16, -13, -29]], dtype=torch.int8), scale=tensor([[0.0005],
        [0.0003],
        [0.0004],
        ...,
        [0.0007],
        [0.0004],
        [0.0004]], dtype=torch.bfloat16), dtype=torch.bfloat16)


In [31]:
module_sizes = compute_module_sizes(model)
print(f"The quantized model size is {module_sizes[''] * 1e-9} GB")

The quantized model size is 8.558875144 GB


In [32]:
# Move inputs to the same device as the model
inputs = {k: v.to(model.device) for k, v in inputs.items()}
outputs = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


RuntimeError: probability tensor contains either `inf`, `nan` or element < 0