In [1]:
import torch
import torch.nn.functional as F

from transformers import AutoModelForCausalLM, AutoTokenizer

import pynvml

dtype = torch.float16
n_bytes_per_param = 2 if dtype in (torch.float16, torch.bfloat16) else 4
device = torch.device("cuda")

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
get_vram = lambda: pynvml.nvmlDeviceGetMemoryInfo(handle).used / 2**20 # MiB

In [2]:
start_vram = get_vram()

# Initializing CUDA kernels
a = torch.ones((1,1)).to(device); del a
cuda_kernels_vram = get_vram() - start_vram
print(f"CUDA kernels VRAM: {cuda_kernels_vram:.0f} MiB")

CUDA kernels VRAM: 361 MiB


# Loading model

In [3]:
#model_name_or_path = "gpt2-xl" # gpt2, gpt2-medium, gpt2-large, gpt2-xl
#model_name_or_path = "mistralai/Mistral-7B-v0.1"
#model_name_or_path = "NousResearch/Llama-2-7b-hf"
model_name_or_path = "microsoft/phi-1_5" # phi-1_5, phi-2

model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=dtype, trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

n_parameters = sum(p.numel() for p in model.parameters())
model_estimated_vram = n_parameters * n_bytes_per_param / 2**20
model_actual_vram = get_vram() - cuda_kernels_vram - start_vram

print(model.config)
print(model)
print(f"Number of parameters: {(n_parameters / 1e9):.3f} B ({n_parameters})")
print(f"Estimated VRAM usage: {(model_estimated_vram):.3f} MiB")
print(f"Actual VRAM usage: {(model_actual_vram):.3f} MiB")
print(f"Error: {((model_actual_vram - model_estimated_vram) * 100 / model_actual_vram):.1f} %")

PhiConfig {
  "_name_or_path": "microsoft/phi-1_5",
  "activation_function": "gelu_new",
  "architectures": [
    "PhiForCausalLM"
  ],
  "attn_pdrop": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/phi-1_5--configuration_phi.PhiConfig",
    "AutoModelForCausalLM": "microsoft/phi-1_5--modeling_phi.PhiForCausalLM"
  },
  "embd_pdrop": 0.0,
  "flash_attn": false,
  "flash_rotary": false,
  "fused_dense": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "phi-msft",
  "n_embd": 2048,
  "n_head": 32,
  "n_head_kv": null,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 2048,
  "resid_pdrop": 0.0,
  "rotary_dim": 32,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.35.2",
  "vocab_size": 51200
}

PhiForCausalLM(
  (transformer): PhiModel(
    (embd): Embedding(
      (wte): Embedding(51200, 2048)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (h): ModuleList(
      (0-23): 24 x ParallelBlock(
        (ln)

In [4]:
bs = 2
seq_length = 128

input_ids = torch.randint(0, len(tokenizer), (bs, seq_length)).to(device)
attention_mask = torch.ones((bs, seq_length)).to(device)

# Inference forward pass

In [5]:
model.config.use_cache = False
_ = model.eval()

with torch.no_grad():
    out = model(input_ids=input_ids, attention_mask=attention_mask)
    # probs = F.softmax(out.logits[:, -1, :], dim=-1) # for inference we need probabilities only over the last token; omit this as it is very small
    
out_bs, out_sequence_length, out_embedding_size = out.logits.shape
n_bytes_per_param_out = 2 if out.logits.dtype in (torch.float16, torch.bfloat16) else 4
print(f"Out tensor dtype: {out.logits.dtype}")

#print(torch.cuda.memory_summary())

Out tensor dtype: torch.float32


In [6]:
total_forward_pass_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram
print(f"Total forward pass VRAM usage: {total_forward_pass_vram:.0f} MiB")

torch.cuda.empty_cache() # calling `free` on allocated memory for forward pass
output_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram
del out
torch.cuda.empty_cache() # calling `free` on allocated memory for `out` tensor
eps_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram # idk what is that, but it is small

output_actual_vram = output_vram - eps_vram
activations_actual_vram = total_forward_pass_vram - output_actual_vram - eps_vram

output_estimated_vram = out_bs * out_sequence_length * out_embedding_size * n_bytes_per_param_out / 2**20

print(f"Actual output tensor with bs {out_bs}, seq length {out_sequence_length} and emb size {out_embedding_size} VRAM usage: {output_actual_vram:.0f} MiB")
print(f"Estimated output tensor with bs {out_bs}, seq length {out_sequence_length} and emb size {out_embedding_size} VRAM usage: {output_estimated_vram:.0f} MiB")
print(f"Actual activations VRAM usage: {activations_actual_vram:.0f} MiB")

Total forward pass VRAM usage: 122 MiB
Actual output tensor with bs 2, seq length 128 and emb size 51200 VRAM usage: 50 MiB
Estimated output tensor with bs 2, seq length 128 and emb size 51200 VRAM usage: 50 MiB
Actual activations VRAM usage: 34 MiB


# Training step

## no optimizer

In [7]:
_ = model.train()
out = model(input_ids=input_ids, attention_mask=attention_mask)
total_train_forward_pass_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Total train forward pass VRAM usage (with output tensor): {total_train_forward_pass_vram:.0f} MiB")

probs = F.softmax(out.logits, dim=-1)
probs_vram = get_vram() - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Actual probs tensor VRAM usage: {probs_vram:.0f} MiB")

loss = F.cross_entropy(probs.permute(0, 2, 1), input_ids) # mapping tokens into themselves
loss_calculation_vram = get_vram() - probs_vram - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Loss calculation VRAM usage: {loss_calculation_vram:.0f} MiB")
loss.backward()
backward_vram = get_vram() - loss_calculation_vram - probs_vram - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Backward calculation VRAM usage: {backward_vram:.0f} MiB")

print(f"Gradients type: {next(model.parameters()).grad.dtype}")

Total train forward pass VRAM usage (with output tensor): 878 MiB
Actual probs tensor VRAM usage: 50 MiB
Loss calculation VRAM usage: 100 MiB
Backward calculation VRAM usage: 2468 MiB
Gradients type: torch.float16


In [8]:
del out
del probs
del loss

torch.cuda.empty_cache() # calling `free` on allocated memory for activations
gradients_total_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
model.zero_grad()
torch.cuda.empty_cache()
eps_2_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
gradients_actual_vram = gradients_total_vram - eps_2_vram
print(f"Actual gradients VRAM usage: {gradients_actual_vram:.0f} MiB")

eps_vram += eps_2_vram
print(f"Random eps VRAM usage: {eps_vram:.0f} MiB")

Actual gradients VRAM usage: 2718 MiB
Random eps VRAM usage: 96 MiB


## SGD

### no momentum

In [9]:
print(f"Expecting to use 0 extra VRAM: {get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram}")
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
_ = model.train()

Expecting to use 0 extra VRAM: 0.0


In [10]:
out = model(input_ids=input_ids, attention_mask=attention_mask)
total_train_forward_pass_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Total train forward pass VRAM usage (with output tensor): {total_train_forward_pass_vram:.0f} MiB")

probs = F.softmax(out.logits, dim=-1)
probs_vram = get_vram() - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Actual probs tensor VRAM usage: {probs_vram:.0f} MiB")

loss = F.cross_entropy(probs.permute(0, 2, 1), input_ids) # mapping tokens into themselves
loss_calculation_vram = get_vram() - probs_vram - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Loss calculation VRAM usage: {loss_calculation_vram:.0f} MiB")
loss.backward()
backward_vram = get_vram() - loss_calculation_vram - probs_vram - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Backward calculation VRAM usage: {backward_vram:.0f} MiB")

optimizer.step()
#optim_step_vram = get_vram() - backward_vram - loss_calculation_vram - probs_vram - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
#print(f"Optim step VRAM usage: {optim_step_vram:.0f} MiB")

print(f"Gradients type: {next(model.parameters()).grad.dtype}")

Total train forward pass VRAM usage (with output tensor): 838 MiB
Actual probs tensor VRAM usage: 50 MiB
Loss calculation VRAM usage: 100 MiB
Backward calculation VRAM usage: 2436 MiB
Gradients type: torch.float16


In [11]:
del out
del probs
del loss

torch.cuda.empty_cache() # calling `free` on allocated memory for activations
gradients_total_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
optimizer.zero_grad()
torch.cuda.empty_cache()
eps_2_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
gradients_actual_vram = gradients_total_vram - eps_2_vram
print(f"Actual gradients VRAM usage: {gradients_actual_vram:.0f} MiB")

eps_vram += eps_2_vram
print(f"Random eps VRAM usage: {eps_vram:.0f} MiB")

Actual gradients VRAM usage: 2704 MiB
Random eps VRAM usage: 96 MiB


### momentum

In [12]:
print(f"Expecting to use 0 extra VRAM: {get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram}")
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
_ = model.train()

Expecting to use 0 extra VRAM: 0.0


In [13]:
out = model(input_ids=input_ids, attention_mask=attention_mask)
total_train_forward_pass_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Total train forward pass VRAM usage (with output tensor): {total_train_forward_pass_vram:.0f} MiB")

probs = F.softmax(out.logits, dim=-1)
probs_vram = get_vram() - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Actual probs tensor VRAM usage: {probs_vram:.0f} MiB")

loss = F.cross_entropy(probs.permute(0, 2, 1), input_ids) # mapping tokens into themselves
loss_calculation_vram = get_vram() - probs_vram - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Loss calculation VRAM usage: {loss_calculation_vram:.0f} MiB")
loss.backward()
backward_vram = get_vram() - loss_calculation_vram - probs_vram - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Backward calculation VRAM usage: {backward_vram:.0f} MiB")

optimizer.step()
#optim_step_vram = get_vram() - backward_vram - loss_calculation_vram - probs_vram - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
#print(f"Optim step VRAM usage: {optim_step_vram:.0f} MiB")

print(f"Gradients type: {next(model.parameters()).grad.dtype}")

Total train forward pass VRAM usage (with output tensor): 838 MiB
Actual probs tensor VRAM usage: 50 MiB
Loss calculation VRAM usage: 100 MiB
Backward calculation VRAM usage: 2436 MiB
Gradients type: torch.float16


In [14]:
del out
del probs
del loss

torch.cuda.empty_cache() # calling `free` on allocated memory for activations
gradients_optimizer_total_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
optimizer.zero_grad()
torch.cuda.empty_cache()
optimizer_total_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
del optimizer
torch.cuda.empty_cache()
eps_2_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
gradients_actual_vram = gradients_optimizer_total_vram - optimizer_total_vram
optimizer_actual_vram = optimizer_total_vram - eps_2_vram
print(f"Actual gradients VRAM usage: {gradients_actual_vram:.0f} MiB")
print(f"Actual optimizer states VRAM usage: {optimizer_actual_vram:.0f} MiB")

eps_vram += eps_2_vram
print(f"Random eps VRAM usage: {eps_vram:.0f} MiB")

Actual gradients VRAM usage: 2702 MiB
Actual optimizer states VRAM usage: 2754 MiB
Random eps VRAM usage: 96 MiB


## Adam

In [15]:
print(f"Expecting to use 0 extra VRAM: {get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram}")
get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
optimizer = torch.optim.AdamW(model.parameters(), lr=0.1)
_ = model.train()

Expecting to use 0 extra VRAM: 0.0


In [16]:
out = model(input_ids=input_ids, attention_mask=attention_mask)
total_train_forward_pass_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Total train forward pass VRAM usage (with output tensor): {total_train_forward_pass_vram:.0f} MiB")

probs = F.softmax(out.logits, dim=-1)
probs_vram = get_vram() - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Actual probs tensor VRAM usage: {probs_vram:.0f} MiB")

loss = F.cross_entropy(probs.permute(0, 2, 1), input_ids) # mapping tokens into themselves
loss_calculation_vram = get_vram() - probs_vram - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Loss calculation VRAM usage: {loss_calculation_vram:.0f} MiB")
loss.backward()
backward_vram = get_vram() - loss_calculation_vram - probs_vram - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
print(f"Backward calculation VRAM usage: {backward_vram:.0f} MiB")

optimizer.step()
#optim_step_vram = get_vram() - backward_vram - loss_calculation_vram - probs_vram - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
#print(f"Optim step VRAM usage: {optim_step_vram:.0f} MiB")

print(f"Gradients type: {next(model.parameters()).grad.dtype}")

Total train forward pass VRAM usage (with output tensor): 838 MiB
Actual probs tensor VRAM usage: 50 MiB
Loss calculation VRAM usage: 100 MiB
Backward calculation VRAM usage: 2436 MiB
Gradients type: torch.float16


In [17]:
del out
del probs
del loss

torch.cuda.empty_cache() # calling `free` on allocated memory for activations
gradients_optimizer_total_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
optimizer.zero_grad()
torch.cuda.empty_cache()
optimizer_total_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
del optimizer
torch.cuda.empty_cache()
eps_2_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
gradients_actual_vram = gradients_optimizer_total_vram - optimizer_total_vram
optimizer_actual_vram = optimizer_total_vram - eps_2_vram
print(f"Actual gradients VRAM usage: {gradients_actual_vram:.0f} MiB")
print(f"Actual optimizer states VRAM usage: {optimizer_actual_vram:.0f} MiB")

eps_vram += eps_2_vram
print(f"Random eps VRAM usage: {eps_vram:.0f} MiB")

Actual gradients VRAM usage: 2702 MiB
Actual optimizer states VRAM usage: 5508 MiB
Random eps VRAM usage: 96 MiB


In [18]:
!nvidia-smi

Thu Dec 14 12:29:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.60.13    Driver Version: 525.60.13    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:81:00.0 Off |                  N/A |
| 34%   48C    P2   135W / 350W |   3209MiB / 24576MiB |     47%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Estimation activations

In [19]:
hidden_size = model.config.hidden_size
num_attention_heads = model.config.num_attention_heads
num_key_value_heads = model.config.num_key_value_heads if hasattr(model.config, "num_key_value_heads") else model.config.num_attention_heads # different from num_attention_heads in case of GQA
intermediate_size = model.config.intermediate_size if hasattr(model.config, "intermediate_size") else 4 * model.config.hidden_size # MLP projection
head_dim = hidden_size // num_attention_heads
print(f"Calculating size of activation for single block with:\nhidden size {hidden_size}\nnum attention heads {num_attention_heads}\nnum key value heads {num_key_value_heads}\nintermediate size {intermediate_size}\nhead dim {head_dim}")

attention_input      = n_bytes_per_param * bs * seq_length * hidden_size
q                    = n_bytes_per_param * bs * seq_length * head_dim * num_attention_heads # for Q @ K.T
k                    = n_bytes_per_param * bs * seq_length * head_dim * num_key_value_heads # num_key_value_heads might be different from num_attention_heads in case of GQA
softmax_output       = n_bytes_per_param * bs * num_attention_heads * seq_length ** 2 # to multiply with V
#softmax_dropout_mask = 1                 * bs * num_attention_heads * seq_length ** 2 # single byte per elem
#dropout_output       = n_bytes_per_param * bs * num_attention_heads * seq_length ** 2
v                    = n_bytes_per_param * bs * seq_length * head_dim * num_key_value_heads
out_proj_input       = n_bytes_per_param * bs * seq_length * num_attention_heads * head_dim
#attention_dropout    = 1                 * bs * seq_length * hidden_size
attention_block = attention_input + q + k + softmax_output + v + out_proj_input

mlp_input        = n_bytes_per_param * bs * seq_length * hidden_size
activation_input = n_bytes_per_param * bs * seq_length * intermediate_size # SiLU
down_proj_input  = n_bytes_per_param * bs * seq_length * intermediate_size
mlp_block = mlp_input + activation_input + down_proj_input

layer_norms = n_bytes_per_param * bs * seq_length * hidden_size * 2 # 2 layer norms

layer = attention_block + mlp_block + layer_norms
print(f"Single layer (out of {model.config.num_hidden_layers}) estimated activations VRAM usage: {layer // 2**20} MiB")
print(f"Estimated activations VRAM usage (softmax output + v): {(softmax_output + v) // 2**20} MiB")

Calculating size of activation for single block with:
hidden size 2048
num attention heads 32
num key value heads 32
intermediate size 8192
head dim 64
Single layer (out of 24) estimated activations VRAM usage: 18 MiB
Estimated activations VRAM usage (softmax output + v): 3 MiB
