In [1]:
import torch
import torch.nn.functional as F
import pynvml
from transformers import AutoModelForCausalLM, AutoTokenizer
from ipyexperiments import IPyExperimentsPytorch

# Run parameters

In [2]:
# Hardware target and checkpoint under test.
# Other checkpoints: microsoft/phi-1_5, microsoft/phi-2, NousResearch/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, gpt2, gpt2-medium, gpt2-large, gpt2-xl
device = torch.device("cuda")
model_name_or_path = "microsoft/phi-1_5"

# Numeric setup: dtype the weights are loaded in, and whether the training step
# runs under autocast with gradient scaling.
dtype = torch.float32
mixed_precision_training = True

# Shape of the synthetic batch fed to the model.
bs = 2
seq_length = 128


def get_optimizer(parameters):
    """Build the optimizer used for the training-step measurement.

    Alternatives to try: SGD(parameters, lr=0.1), SGD(parameters, lr=0.1, momentum=0.9),
    AdamW(parameters, lr=0.1).
    """
    return torch.optim.SGD(parameters, lr=0.1, momentum=0.9)

In [3]:
# Width of a single weight in bytes: half-precision dtypes take 2, anything else is counted as 4.
n_bytes_per_param = 2 if dtype in (torch.float16, torch.bfloat16) else 4

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)


def get_vram():
    """Return the memory NVML reports as used on the device behind `handle`, in MiB."""
    return pynvml.nvmlDeviceGetMemoryInfo(handle).used / 2**20


# Reading taken before the first tensor is moved to the device; later measurements are offsets from it.
start_vram = get_vram()

# Initializing CUDA kernels: push a tiny tensor to the device and drop it again,
# so that whatever is still in use afterwards is booked as CUDA overhead.
a = torch.ones((1, 1)).to(device)
del a
torch.cuda.empty_cache()
cuda_kernels_vram = get_vram() - start_vram
print(f"CUDA kernels VRAM: {cuda_kernels_vram:.0f} MiB")

exp = IPyExperimentsPytorch()

CUDA kernels VRAM: 715 MiB

*** Experiment started with the Pytorch backend
Device: ID 0, NVIDIA GeForce RTX 3090 (24576 RAM)


*** Current state:
RAM:     Used     Free    Total        Util
CPU:    1,599  247,654  257,616 MB   0.62% 
GPU:    1,031   23,544   24,576 MB   4.20% 


･ RAM:  △Consumed    △Peaked    Used Total | Exec time 0:00:00.000
･ CPU:          0          0      1,599 MB |
･ GPU:          0          0      1,031 MB |


# Loading model

In [4]:
# Load the weights in the requested dtype, then move the whole model onto the target device.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=dtype,
    trust_remote_code=True,
)
model = model.to(device)
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
if tokenizer.pad_token is None:
    # No dedicated pad token: reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

# Theoretical footprint (parameter count x bytes per weight) versus the growth actually
# observed since the post-CUDA-init baseline.
n_parameters = sum(weight.numel() for weight in model.parameters())
model_estimated_vram = n_parameters * n_bytes_per_param / 2**20
model_actual_vram = get_vram() - cuda_kernels_vram - start_vram

print(model.config, "=" * 75, model, "=" * 75, sep="\n")
print(f"Number of parameters: {(n_parameters / 1e9):.3f} B ({n_parameters})")
print(f"Model VRAM usage: {(model_actual_vram):.0f} MiB (expected {(model_estimated_vram):.0f} MiB, error {((model_actual_vram - model_estimated_vram) * 100 / model_actual_vram):.1f} %)")
print("=" * 75)

PhiConfig {
  "_name_or_path": "microsoft/phi-1_5",
  "activation_function": "gelu_new",
  "architectures": [
    "PhiForCausalLM"
  ],
  "attn_pdrop": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/phi-1_5--configuration_phi.PhiConfig",
    "AutoModelForCausalLM": "microsoft/phi-1_5--modeling_phi.PhiForCausalLM"
  },
  "embd_pdrop": 0.0,
  "flash_attn": false,
  "flash_rotary": false,
  "fused_dense": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "phi-msft",
  "n_embd": 2048,
  "n_head": 32,
  "n_head_kv": null,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 2048,
  "resid_pdrop": 0.0,
  "rotary_dim": 32,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.35.2",
  "use_cache": false,
  "vocab_size": 51200
}

PhiForCausalLM(
  (transformer): PhiModel(
    (embd): Embedding(
      (wte): Embedding(51200, 2048)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (h): ModuleList(
      (0-23): 24 x Paral

In [5]:
# Synthetic batch: random token ids drawn from [0, len(tokenizer)), with every position attended to.
input_ids = torch.randint(low=0, high=len(tokenizer), size=(bs, seq_length)).to(device)
attention_mask = torch.ones(bs, seq_length).to(device)

･ RAM:  △Consumed    △Peaked    Used Total | Exec time 0:00:00.005
･ CPU:          0          0      2,198 MB |
･ GPU:          0          0      6,443 MB |


# Inference forward pass

In [6]:
# Measure the VRAM cost of one inference forward pass and split it into
# output tensor, transient activations and a residual that is never released.
model.eval()

with torch.no_grad():
    out = model(input_ids=input_ids, attention_mask=attention_mask)
    # probs = F.softmax(out.logits[:, -1, :], dim=-1) # for inference we need probabilities only over the last token; omit this as it is very small

# The last logits dimension is the size of the model's output head (named "embedding size" here).
out_bs, out_sequence_length, out_embedding_size = out.logits.shape
# Ask the tensor for its real element width instead of assuming "2 for fp16/bf16, else 4",
# so the estimate stays correct for any logits dtype (e.g. float64).
n_bytes_per_param_out = out.logits.element_size()
output_estimated_vram = out_bs * out_sequence_length * out_embedding_size * n_bytes_per_param_out / 2**20
print(f"Out tensor dtype: {out.logits.dtype}")

# Three readings, each relative to "baseline + CUDA overhead + model weights".
# The order matters: every step releases one more category of memory.
# 1) right after the pass: everything the pass allocated is still held
total_forward_pass_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram
torch.cuda.empty_cache() # calling `free` on allocated memory for forward pass
# 2) cached blocks released, `out` still alive
output_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram
del out; torch.cuda.empty_cache() # calling `free` on allocated memory for `out` tensor
# 3) `out` released too: what is left is memory that is not returned after the pass.
# NOTE(review): presumably lazily created CUDA/cuBLAS workspaces or kernels -- not confirmed here.
eps_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram

output_actual_vram = output_vram - eps_vram
activations_actual_vram = total_forward_pass_vram - output_actual_vram - eps_vram

print(f"Total forward pass VRAM usage: {total_forward_pass_vram:.0f} MiB")
print(f"Output tensor with bs {out_bs}, seq length {out_sequence_length} and emb size {out_embedding_size} VRAM usage: {output_actual_vram:.0f} MiB (expected {output_estimated_vram:.0f} MiB)")
print(f"Activations VRAM usage: {activations_actual_vram:.0f} MiB")
print(f"Random eps VRAM: {eps_vram:.0f} MiB")
#print(torch.cuda.memory_summary())
print("=" * 75)

Out tensor dtype: torch.float32
Total forward pass VRAM usage: 474 MiB
Output tensor with bs 2, seq length 128 and emb size 51200 VRAM usage: 50 MiB (expected 50 MiB)
Activations VRAM usage: 44 MiB
Random eps VRAM: 380 MiB
･ RAM:  △Consumed    △Peaked    Used Total | Exec time 0:00:00.918
･ CPU:        454          0      2,652 MB |
･ GPU:        380         94      6,823 MB |


# Training step

In [7]:
# Switch the model to training mode, then create the optimizer and the loss scaler
# (gradient scaling is only enabled for mixed-precision runs).
model.train(mode=True)
optimizer = get_optimizer(model.parameters())
scaler = torch.cuda.amp.GradScaler(enabled=mixed_precision_training)

･ RAM:  △Consumed    △Peaked    Used Total | Exec time 0:00:00.002
･ CPU:          0          0      2,652 MB |
･ GPU:          0          0      6,823 MB |


In [8]:
# One full training step (forward, loss, backward, optimizer step) with VRAM readings in between.
# Every reading subtracts everything measured before it, so each variable holds only the
# increment of its own stage.
with torch.autocast(device_type=device.type, dtype=torch.float16, enabled=mixed_precision_training):
    out = model(input_ids=input_ids, attention_mask=attention_mask)
    total_train_forward_pass_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram

    # The probabilities are materialised only so that their footprint can be measured;
    # they must not be used as the input of the loss (see below).
    probs = F.softmax(out.logits, dim=-1)
    probs_vram = get_vram() - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram

    # F.cross_entropy expects unnormalised logits and applies log-softmax internally.
    # Feeding it `probs` would apply softmax twice, so the raw logits are used instead.
    # permute moves the class dimension to position 1, as cross_entropy requires for (N, C, d1) input.
    loss = F.cross_entropy(out.logits.permute(0, 2, 1), input_ids) # mapping tokens into themselves
    loss_calculation_vram = get_vram() - probs_vram - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram

scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
# Taken after the optimizer step, so this increment covers backward and the optimizer step together.
backward_vram = get_vram() - loss_calculation_vram - probs_vram - total_train_forward_pass_vram - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram

print(f"Model gradients type: {next(model.parameters()).grad.dtype}")
print(f"Total train forward pass VRAM usage (activations + output tensor): {total_train_forward_pass_vram:.0f} MiB" + (f" (expect {(n_parameters * 2 / 2**20):.0f} MiB of these to be for fp16 weights copy)" if mixed_precision_training else ""))
#print(f"Actual probs tensor VRAM usage: {probs_vram:.0f} MiB")
#print(f"Loss calculation VRAM usage: {loss_calculation_vram:.0f} MiB")
#print(f"Backward calculation VRAM usage: {backward_vram:.0f} MiB")

# Drop everything that references the autograd graph, then release memory category by category.
del out
del probs
del loss
torch.cuda.empty_cache() # calling `free` on allocated memory for activations and outputs

# 1) gradients + optimizer state still alive
gradients_optimizer_total_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
# 2) gradients released (set_to_none=True drops the .grad tensors), optimizer state still alive
optimizer.zero_grad(set_to_none=True); torch.cuda.empty_cache()
optimizer_total_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram
# 3) optimizer released as well: whatever is left is additional unexplained residual
del optimizer; torch.cuda.empty_cache()
eps_2_vram = get_vram() - model_actual_vram - cuda_kernels_vram - start_vram - eps_vram

gradients_actual_vram = gradients_optimizer_total_vram - optimizer_total_vram
optimizer_actual_vram = optimizer_total_vram - eps_2_vram
print(f"Gradients VRAM usage: {gradients_actual_vram:.0f} MiB (model weights were {model_actual_vram:.0f} MiB)")
print(f"Actual optimizer states VRAM usage: {optimizer_actual_vram:.0f} MiB")

# NOTE: this accumulates into `eps_vram` in place, and `optimizer` was deleted above,
# so this cell is meant to be executed once per kernel session, right after the setup cell.
eps_vram += eps_2_vram
print(f"Random eps VRAM usage: {eps_vram:.0f} MiB")
print("=" * 75)

Model gradients type: torch.float32
Total train forward pass VRAM usage (activations + output tensor): 3768 MiB (expect 2705 MiB of these to be for fp16 weights copy)
Gradients VRAM usage: 5528 MiB (model weights were 5412 MiB)
Actual optimizer states VRAM usage: 5506 MiB
Random eps VRAM usage: 410 MiB
･ RAM:  △Consumed    △Peaked    Used Total | Exec time 0:00:00.662
･ CPU:          9          0      2,662 MB |
･ GPU:         30     13,668      6,853 MB |


# Estimating activations

In [9]:
# Analytical estimate of the activation memory kept for backward in one transformer layer,
# built from the tensor shapes of the attention block, the MLP block and the layer norms.

# Bytes per activation element: 2 when the forward pass runs in half precision -- either
# through autocast or because the weights themselves are fp16/bf16 -- otherwise 4.
n_bytes_per_param = 2 if mixed_precision_training or dtype in (torch.float16, torch.bfloat16) else 4

hidden_size = model.config.hidden_size
num_attention_heads = model.config.num_attention_heads
# `getattr(...) or fallback` also covers configs that define the attribute but leave it as None.
num_key_value_heads = getattr(model.config, "num_key_value_heads", None) or num_attention_heads # different from num_attention_heads in case of GQA
intermediate_size = getattr(model.config, "intermediate_size", None) or 4 * hidden_size # MLP projection
num_hidden_layers = model.config.num_hidden_layers
head_dim = hidden_size // num_attention_heads
print(f"Calculating size of activation for single block with:\nhidden size {hidden_size}\nnum attention heads {num_attention_heads}\nnum key value heads {num_key_value_heads}\nintermediate size {intermediate_size}\nhead dim {head_dim}\nnum hidden layers {num_hidden_layers}")
print("=" * 75)

# Attention block: all sizes are in bytes.
attention_input      = n_bytes_per_param * bs * seq_length * hidden_size
q                    = n_bytes_per_param * bs * seq_length * head_dim * num_attention_heads # for Q @ K.T
k                    = n_bytes_per_param * bs * seq_length * head_dim * num_key_value_heads # num_key_value_heads might be different from num_attention_heads in case of GQA
softmax_output       = n_bytes_per_param * bs * num_attention_heads * seq_length ** 2 # to multiply with V
softmax_dropout_mask = 1                 * bs * num_attention_heads * seq_length ** 2 # single byte per elem
dropout_output       = n_bytes_per_param * bs * num_attention_heads * seq_length ** 2
v                    = n_bytes_per_param * bs * seq_length * head_dim * num_key_value_heads
out_proj_input       = n_bytes_per_param * bs * seq_length * num_attention_heads * head_dim
attention_dropout    = 1                 * bs * seq_length * hidden_size
#attention_block = attention_input + q + k + softmax_output + v + out_proj_input
attention_block = attention_input + q + k + softmax_output + v + out_proj_input + softmax_dropout_mask + dropout_output + attention_dropout

# MLP block.
mlp_input        = n_bytes_per_param * bs * seq_length * hidden_size
activation_input = n_bytes_per_param * bs * seq_length * intermediate_size # input of the MLP non-linearity
down_proj_input  = n_bytes_per_param * bs * seq_length * intermediate_size
dropout_mask     = 1                 * bs * seq_length * hidden_size # single byte per elem
#mlp_block = mlp_input + activation_input + down_proj_input
mlp_block = mlp_input + activation_input + down_proj_input + dropout_mask

layer_norms = n_bytes_per_param * bs * seq_length * hidden_size * 2 # 2 layer norms

layer = attention_block + mlp_block + layer_norms
print(f"Single layer (out of {num_hidden_layers}) estimated activations VRAM usage: {layer // 2**20} MiB")
print(f"All layers estimated activations VRAM usage: {layer * num_hidden_layers // 2**20} MiB")
print(f"Estimated activations on inference forward pass VRAM usage (softmax output + v): {(softmax_output + v) // 2**20} MiB")
print("=" * 75)

Calculating size of activation for single block with:
hidden size 2048
num attention heads 32
num key value heads 32
intermediate size 8192
head dim 64
num hidden layers 24
Single layer (out of 24) estimated activations VRAM usage: 22 MiB
All layers estimated activations VRAM usage: 528 MiB
Estimated activations on inference forward pass VRAM usage (softmax output + v): 3 MiB
･ RAM:  △Consumed    △Peaked    Used Total | Exec time 0:00:00.002
･ CPU:          0          0      2,662 MB |
･ GPU:          0          0      6,853 MB |
