In [2]:
import numpy as np
from dataclasses import dataclass

In [67]:
@dataclass
class GPTParams:
    """Hyper-parameters of a GPT-style model plus rough size estimates.

    Every estimate is a plain element count: no per-element byte size is
    applied anywhere in this class.
    """

    num_seq: int  # not read by any estimate in this class
    num_embd: int  # width used by every estimate below
    num_head: int  # not read by any estimate in this class
    num_layers: int  # not read here; callers multiply per-layer figures by it

    @property
    def params_per_layer(self):
        """Element count of the weight matrices of a single layer.

        Expressed as a number of ``num_embd x num_embd`` blocks: 3 (QKV),
        1 (projection) and 4 * 2 (FFN). No other terms are added.
        """
        embd_squared = self.num_embd * self.num_embd
        blocks = 3 + 1 + 4 * 2  # QKV + projection + FFN
        return embd_squared * blocks

    def activation_size(self, batch_size, seq_len):
        """Return ``batch_size * seq_len * num_embd``.

        This is the element count of one activation with ``num_embd``
        values per (batch, position) pair.
        """
        positions = batch_size * seq_len
        return positions * self.num_embd

    def matmul_output_activations(self, batch_size, seq_len):
        """Estimate the summed element count of the matmul outputs.

        Terms, in order: QKV (3 activations), the attention term
        (``batch_size * seq_len * seq_len``), two activation-sized outputs,
        the FFN term (16 activations) and one more activation-sized output.
        """
        one_act = self.activation_size(batch_size, seq_len)
        qkv_out = one_act * 3
        # NOTE(review): the attention term has no num_head factor -- confirm
        # whether a per-head score matrix was intended.
        attention_out = batch_size * seq_len * seq_len
        # NOTE(review): the factor 16 is taken as-is; its derivation is not
        # stated in this file -- confirm.
        ffn_out = 16 * one_act
        return (
            qkv_out
            + attention_out
            + one_act
            + one_act
            + ffn_out
            + one_act
        )

In [80]:
# The four GPT2 variants -- small, medium, large, XL -- whose nominal sizes
# are "124M", "355M", "774M" and "1558M".
# Each row is (num_embd, num_head, num_layers); num_seq is 128 for all of them.
gpt2_small, gpt2_med, gpt2_large, gpt2_xl = (
    GPTParams(num_seq=128, num_embd=embd, num_head=heads, num_layers=layers)
    for embd, heads, layers in (
        (768, 12, 12),
        (1024, 16, 24),
        (1280, 20, 36),
        (1600, 25, 48),
    )
)

In [69]:
# GPT2 XL
# NOTE(review): n_embd is 512 here, while gpt2_xl is built with
# num_embd=1600 -- confirm which value this cell is meant to carry.
n_seq, n_embd = 128, 512

In [70]:
# Product of 8, 1024 and 1600, shown in millions.
# NOTE(review): the meaning of each factor is not stated; 1600 matches
# gpt2_xl.num_embd -- confirm what 1024 and 8 stand for.
8 * 1024 * 1600 / 1e6

13.1072

In [71]:
# Weight elements summed over every GPT2 XL layer, shown in millions.
gpt2_xl.num_layers * gpt2_xl.params_per_layer / 1e6

1474.56

In [72]:
# Weight elements (in millions) for `layers_per_ipu` GPT2 XL layers.
layers_per_ipu = 3
gpt2_xl.params_per_layer * layers_per_ipu / 1e6

92.16

In [73]:
# Element count (in millions) of one GPT2 XL activation for this batch shape.
batch_size, seq_len = 16, 128
gpt2_xl.activation_size(batch_size, seq_len) / 1e6

3.2768

# Pipeline scenarios

In [81]:
# Compare activation element counts against the weights held on one IPU.
# The printed values are plain ratios (activation elements / weight elements),
# not percentages scaled by 100.
batch_size = 4
seq_len = 128 * 4
pipeline_size = 16

# One layer's worth of weights is used as the per-IPU parameter count.
params_per_ipu = gpt2_xl.params_per_layer

# Compute each ratio once; the pipeline figure reuses the activation ratio.
act_ratio = gpt2_xl.activation_size(batch_size, seq_len) / params_per_ipu
inter_ratio = gpt2_xl.matmul_output_activations(batch_size, seq_len) / params_per_ipu

print("ACT perc:", act_ratio)
print("INTER perc:", inter_ratio)
# Activation ratio scaled by `pipeline_size`.
print("ACT pipeline perc:", act_ratio * pipeline_size)

ACT perc: 0.10666666666666667
INTER perc: 2.3808
ACT pipleine perc: 1.7066666666666668
