In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

import pynvml

# Precision the model weights are loaded in.
dtype = torch.bfloat16
# Bytes per element, queried from torch itself so that any dtype (float64, int8, ...)
# yields the right width instead of assuming "2 for half precision, otherwise 4".
n_bytes_per_param = torch.empty((), dtype=dtype).element_size()
device = torch.device("cuda")

pynvml.nvmlInit()
# NOTE(review): NVML index 0 is assumed to be the GPU that torch's "cuda" device maps to;
# confirm if CUDA_VISIBLE_DEVICES or a non-default current device is in use.
handle = pynvml.nvmlDeviceGetHandleByIndex(0)


def get_vram():
    """Return the memory NVML currently reports as used on the device, in MiB."""
    return pynvml.nvmlDeviceGetMemoryInfo(handle).used / 2**20  # bytes -> MiB

In [2]:
# Baseline: device memory in use before this notebook puts anything on the GPU.
start_vram = get_vram()

# Moving a throwaway 1x1 tensor to the device is what triggers the CUDA initialisation;
# the tensor itself is dropped straight away.
warmup = torch.ones((1, 1)).to(device)
del warmup

# Whatever is now used on top of the baseline is attributed to CUDA initialisation.
cuda_kernels_vram = get_vram() - start_vram
print(f"CUDA kernels VRAM: {cuda_kernels_vram:.0f} MiB")

CUDA kernels VRAM: 361 MiB


In [3]:
# Checkpoint to profile. Other candidates kept for reference:
#   "gpt2-xl"  (also: gpt2, gpt2-medium, gpt2-large)
#   "mistralai/Mistral-7B-v0.1"
model_name_or_path = "NousResearch/Llama-2-7b-hf"

# Load the weights in `dtype`, then move the module onto the GPU.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=dtype)
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token is None:
    # No pad token defined: reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

# Estimate: every parameter element costs `n_bytes_per_param` bytes (result in MiB).
n_parameters = sum(map(torch.numel, model.parameters()))
model_estimated_vram = (n_parameters * n_bytes_per_param) / 2**20
# Measurement: what NVML reports now, minus the pre-existing usage and the CUDA init cost.
model_actual_vram = get_vram() - cuda_kernels_vram - start_vram

display(model)
print(f"Number of parameters: {(n_parameters / 1e9):.3f} B")
print(f"Estimated VRAM usage: {(model_estimated_vram / 2**10):.3f} GiB")
print(f"Actual VRAM usage: {(model_actual_vram / 2**10):.3f} GiB")
print(f"Error: {((model_actual_vram - model_estimated_vram) * 100 / model_actual_vram):.1f} %")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNo

Number of parameters: 6.738 B
Estimated VRAM usage: 12.551 GiB
Actual VRAM usage: 12.613 GiB
Error: 0.5 %


In [4]:
# Shape of the dummy batch used for the forward-pass measurement.
bs, seq_length = 8, 1024

# Random token ids drawn from [0, len(tokenizer)) and an all-ones attention mask,
# both built on the host and then copied to the GPU.
input_ids = torch.randint(low=0, high=len(tokenizer), size=(bs, seq_length)).to(device)
attention_mask = torch.ones(bs, seq_length).to(device)

In [5]:
# Inference setup: no KV cache, eval mode (the assignment just swallows the returned module).
model.config.use_cache = False
_ = model.eval()

# Single forward pass without autograd bookkeeping.
with torch.no_grad():
    out = model(input_ids=input_ids, attention_mask=attention_mask)

# Record the logits' dimensions and element width for the size estimate made later.
# `out.logits` is read in place on purpose: no extra name is bound to the tensor here.
out_bs, out_sequence_length, out_embedding_size = out.logits.size()
if out.logits.dtype in (torch.float16, torch.bfloat16):
    n_bytes_per_param_out = 2
else:
    n_bytes_per_param_out = 4

# Debugging aid, disabled by default:
#print(torch.cuda.memory_summary())

In [6]:
def _vram_above_baseline():
    """Return current used VRAM (MiB) minus the model, CUDA-init and starting measurements."""
    return get_vram() - model_actual_vram - cuda_kernels_vram - start_vram


# 1) Right after the forward pass: everything the pass left allocated or cached.
total_forward_pass_vram = _vram_above_baseline()
print(f"Total forward pass VRAM usage: {total_forward_pass_vram:.0f} MiB")

# 2) Release cached-but-unused blocks; `out` is still alive, so it is still counted.
torch.cuda.empty_cache()
output_vram = _vram_above_baseline()

# 3) Drop `out` and release again; what is left is a small residue whose origin is not
#    identified in this notebook.
del out
torch.cuda.empty_cache()
eps_vram = _vram_above_baseline()

# Separate the output tensor from the residue, then treat the rest of the
# forward-pass footprint as activations.
output_actual_vram = output_vram - eps_vram
activations_actual_vram = total_forward_pass_vram - output_actual_vram - eps_vram

# Size of the logits computed from their shape and element width, in MiB.
output_estimated_vram = out_bs * out_sequence_length * out_embedding_size * n_bytes_per_param_out / 2**20

print(f"Actual output tensor with bs {out_bs}, seq length {out_sequence_length} and emb size {out_embedding_size} VRAM usage: {output_actual_vram:.0f} MiB")
print(f"Estimated output tensor with bs {out_bs}, seq length {out_sequence_length} and emb size {out_embedding_size} VRAM usage: {output_estimated_vram:.0f} MiB")
print(f"Actual activations VRAM usage: {activations_actual_vram:.0f} MiB")

Total forward pass VRAM usage: 3764 MiB
Actual output tensor with bs 8, seq length 1024 and emb size 32000 VRAM usage: 1024 MiB
Estimated output tensor with bs 8, seq length 1024 and emb size 32000 VRAM usage: 1000 MiB
Actual activations VRAM usage: 2708 MiB


In [7]:
# Architecture hyper-parameters that drive the per-layer activation estimate.
hidden_size = model.config.hidden_size
num_attention_heads = model.config.num_attention_heads
# Differs from num_attention_heads under GQA. Configs that do not define the attribute
# (e.g. the GPT-2 checkpoints listed as alternatives) fall back to one K/V head per query head.
num_key_value_heads = getattr(model.config, "num_key_value_heads", None) or num_attention_heads
# MLP projection width. GPT-2 style configs name it `n_inner`, with None meaning 4 * hidden size.
intermediate_size = (
    getattr(model.config, "intermediate_size", None)
    or getattr(model.config, "n_inner", None)
    or 4 * hidden_size
)
head_dim = hidden_size // num_attention_heads
print(f"Calculating size of activation for single block with:\nhidden size {hidden_size}\nnum attention heads {num_attention_heads}\nnum key value heads {num_key_value_heads}\nintermediate size {intermediate_size}\nhead dim {head_dim}")

# All sizes below are in bytes: element width * number of elements of each tensor.
# The itemisation follows the Llama-style block printed earlier (no dropout, SiLU MLP);
# the commented-out terms are the dropout tensors that this estimate leaves out.
attention_input      = n_bytes_per_param * bs * seq_length * hidden_size
q                    = n_bytes_per_param * bs * seq_length * head_dim * num_attention_heads # for Q @ K.T
k                    = n_bytes_per_param * bs * seq_length * head_dim * num_key_value_heads # num_key_value_heads might be different from num_attention_heads in case of GQA
softmax_output       = n_bytes_per_param * bs * num_attention_heads * seq_length ** 2 # to multiply with V
#softmax_dropout_mask = 1                 * bs * num_attention_heads * seq_length ** 2 # single byte per elem
#dropout_output       = n_bytes_per_param * bs * num_attention_heads * seq_length ** 2
v                    = n_bytes_per_param * bs * seq_length * head_dim * num_key_value_heads
out_proj_input       = n_bytes_per_param * bs * seq_length * num_attention_heads * head_dim
#attention_dropout    = 1                 * bs * seq_length * hidden_size
attention_block = attention_input + q + k + softmax_output + v + out_proj_input

mlp_input        = n_bytes_per_param * bs * seq_length * hidden_size
activation_input = n_bytes_per_param * bs * seq_length * intermediate_size # SiLU
down_proj_input  = n_bytes_per_param * bs * seq_length * intermediate_size
mlp_block = mlp_input + activation_input + down_proj_input

layer_norms = n_bytes_per_param * bs * seq_length * hidden_size * 2 # 2 layer norms

# Total for one decoder layer; integer division reports whole MiB.
layer = attention_block + mlp_block + layer_norms
print(f"Single layer (out of {model.config.num_hidden_layers}) estimated activations VRAM usage: {layer // 2**20} MiB")
print(f"Estimated activations VRAM usage (softmax output + v): {(softmax_output + v) // 2**20} MiB")

Calculating size of activation for single block with:
hidden size 4096
num attention heads 32
num key value heads 32
intermediate size 11008
head dim 128
Single layer (out of 32) estimated activations VRAM usage: 1368 MiB
Estimated activations VRAM usage (softmax output + v): 576 MiB
