# Required VRAM

In [7]:
from huggingface_hub import get_safetensors_metadata

# Hugging Face Hub repository whose safetensors metadata is inspected.
# Alternative checkpoints to try:
# model_id = "mistralai/Mistral-7B-Instruct-v0.1"
# model_id = 'bert-base-uncased'
model_id = "distilbert/distilgpt2"

# Bytes needed to store a single parameter at each precision considered.
dtype_bytes = {"F32": 4, "F16": 2, "BF16": 2, "F8": 1, "INT8": 1, "INT4": 0.5}

# Multiplier applied on top of the raw weight size (an 18% allowance).
# NOTE(review): presumably covers memory needed beyond the weights themselves
# when the model is loaded for inference -- confirm the source of this figure.
OVERHEAD_FACTOR = 1.18

# 1024**3 bytes is a binary gigabyte (GiB); the report below labels it "GB".
BYTES_PER_GB = 1024**3

metadata = get_safetensors_metadata(model_id)

# The parameter total does not depend on the target precision, so it is summed
# once here instead of on every loop iteration.
total_parameters = sum(metadata.parameter_count.values())

for precision, bytes_per_dtype in dtype_bytes.items():
    memory = ((total_parameters * bytes_per_dtype) / BYTES_PER_GB) * OVERHEAD_FACTOR
    print(f"{model_id=} requires {memory:.2f} GB for {precision} precision")

model_id='distilbert/distilgpt2' requires 0.39 GB for F32 precision
model_id='distilbert/distilgpt2' requires 0.19 GB for F16 precision
model_id='distilbert/distilgpt2' requires 0.19 GB for BF16 precision
model_id='distilbert/distilgpt2' requires 0.10 GB for F8 precision
model_id='distilbert/distilgpt2' requires 0.10 GB for INT8 precision
model_id='distilbert/distilgpt2' requires 0.05 GB for INT4 precision


# KV Caching

In [1]:
import numpy as np
import time
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# Benchmark text generation with and without the key/value cache.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2").to(device)

# Tokenize the prompt and move it to the device once, outside the timed region,
# so the measurements cover generation only.
prompt_inputs = tokenizer("What is KV caching?", return_tensors="pt").to(device)

for use_cache in (True, False):
    times = []
    for _ in tqdm(range(10), desc=f"Running {'with' if use_cache else 'without'} KV caching"):
        # CUDA kernels are launched asynchronously; wait for pending work so the
        # timer starts from an idle device.
        if device == "cuda":
            torch.cuda.synchronize()
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
        start = time.perf_counter()
        model.generate(
            **prompt_inputs,
            use_cache=use_cache,
            max_new_tokens=1000,
            pad_token_id=tokenizer.eos_token_id,
        )
        # Make sure all queued GPU work has finished before stopping the timer.
        if device == "cuda":
            torch.cuda.synchronize()
        end = time.perf_counter()
        times.append(end - start)
    # Report mean and standard deviation over the 10 runs.
    print(f"{'with' if use_cache else 'without'} KV caching: {round(np.mean(times), 3)} +- {round(np.std(times), 3)} seconds")

  from .autonotebook import tqdm as notebook_tqdm
Running with KV caching: 100%|██████████| 10/10 [00:16<00:00,  1.62s/it]


with KV caching: 1.622 +- 0.117 seconds


Running with without KV caching: 100%|██████████| 10/10 [01:12<00:00,  7.22s/it]

without KV caching: 7.216 +- 0.098 seconds





# Precision

In [10]:
from transformers import AutoModel

# Pretrained BERT checkpoint to inspect (e.g., 'bert-base-uncased').
checkpoint = 'bert-base-uncased'
model = AutoModel.from_pretrained(checkpoint)

# Walk every named parameter and report the dtype its tensor is stored in;
# filter on `name` to look at specific layers only.
named_params = model.named_parameters()
for name, param in named_params:
    print(f"{name}: {param.dtype}")

embeddings.word_embeddings.weight: torch.float32
embeddings.position_embeddings.weight: torch.float32
embeddings.token_type_embeddings.weight: torch.float32
embeddings.LayerNorm.weight: torch.float32
embeddings.LayerNorm.bias: torch.float32
encoder.layer.0.attention.self.query.weight: torch.float32
encoder.layer.0.attention.self.query.bias: torch.float32
encoder.layer.0.attention.self.key.weight: torch.float32
encoder.layer.0.attention.self.key.bias: torch.float32
encoder.layer.0.attention.self.value.weight: torch.float32
encoder.layer.0.attention.self.value.bias: torch.float32
encoder.layer.0.attention.output.dense.weight: torch.float32
encoder.layer.0.attention.output.dense.bias: torch.float32
encoder.layer.0.attention.output.LayerNorm.weight: torch.float32
encoder.layer.0.attention.output.LayerNorm.bias: torch.float32
encoder.layer.0.intermediate.dense.weight: torch.float32
encoder.layer.0.intermediate.dense.bias: torch.float32
encoder.layer.0.output.dense.weight: torch.float32
enco