# GPT Model Notebook
A walkthrough notebook that explains, piece by piece, the GPT modeling code provided by @karpathy.

In [4]:
from dataclasses import dataclass
import inspect
import math
import sys
import os
from typing import Optional, Literal

import pickle

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch import Tensor


## PyTorch Documentation

`class torch.nn.Module(*args, **kwargs)` <br>
Base class for all neural network modules. Your models should also subclass this class.

Modules can also contain other Modules, which allows them to be nested in a tree structure. You can assign the submodules as regular attributes:

In [21]:
class Model(nn.Module):
    """Two stacked 5x5 convolutions, each followed by a ReLU.

    Minimal illustration of how submodules assigned as plain attributes
    become part of the parent module.
    """

    def __init__(self):
        super().__init__()
        # 1 input channel -> 20 feature maps -> 20 feature maps, 5x5 kernels
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=20, kernel_size=5)

    def forward(self, x):
        """Apply conv1 -> ReLU -> conv2 -> ReLU to ``x`` and return the result."""
        hidden = F.relu(self.conv1(x))
        activated = F.relu(self.conv2(hidden))
        return activated

## Config

In [68]:
# Config
@dataclass
class GPTConfig:
    """Hyperparameters read by the model-building cells of this notebook.

    The defaults describe the small demo setup used in the breakdown cells;
    the checkpoint-loading cells later rebuild the config with GPT-2 values.
    """

    batch_size: int = 32 # Added for demo purposes (only used to size the random demo tensors)
    block_size: int = 256  # maximum sequence length: rows of the position embedding and side of the causal mask
    vocab_size: int = 65  # demo vocabulary size; GPT-2 checkpoints use 50257 (the original note cited 50304 — presumably a padded size, TODO confirm)
    n_layer: int = 12  # number of stacked Block modules
    n_head: int = 12  # attention heads; n_embd must be divisible by this
    n_embd: int = 768  # embedding / hidden width
    dropout: float = 0.0  # probability passed to every nn.Dropout
    bias: bool = True  # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster

# Module-level instance that the following cells read their sizes from.
config = GPTConfig()

## Layer Normalization

`torch.nn.LayerNorm(normalized_shape, eps=1e-05, elementwise_affine=True, bias=True, device=None, dtype=None)`

Applies Layer Normalization over a mini-batch of inputs

$$ y = \frac{x - \mathbb{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} \cdot \gamma + \beta $$

The mean and standard-deviation are calculated over the last D dimensions, where D is the dimension of normalized_shape. For example, if normalized_shape is (3, 5) (a 2-dimensional shape), the mean and standard-deviation are computed over the last 2 dimensions of the input (i.e. input.mean((-2, -1))). 
γ and β are learnable affine transform parameters of normalized_shape if elementwise_affine is True. The standard-deviation is calculated via the biased estimator, equivalent to torch.var(input, unbiased=False).

In [44]:
class LayerNorm(nn.Module):
    """
    Layer normalization over the last dimension with a switchable bias.

    ``weight`` (the gain) is always learned; the additive ``bias`` is only
    created when ``bias`` is truthy and is ``None`` otherwise.
    """

    def __init__(self, ndim, bias):
        super().__init__()
        # gain starts at one and shift at zero, i.e. an identity affine map
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, input):
        """Normalize ``input`` over its trailing ``ndim``-sized dimension."""
        return F.layer_norm(
            input,
            normalized_shape=self.weight.shape,
            weight=self.weight,
            bias=self.bias,
            eps=1e-5,
        )

## GELU

`class torch.nn.GELU(approximate='none')` <br>

Applies the Gaussian Error Linear Units function:


$$ \mathrm{GELU}(x) = x \cdot \Phi(x) $$
where $\Phi(x)$ is the cumulative distribution function of the standard Gaussian distribution.

![GELU](images/gelu.png)

There are several reasons why GELU is used as an activation function:

1. Non-Linearity: GELU introduces non-linearity into the model. This is essential for deep learning models to be able to approximate complex functions and patterns in the data.

2. Smooth Gradient: The GELU function is smooth and differentiable everywhere. This means that it has a well-behaved gradient which can aid in stable and efficient training of deep neural networks.

3. Saturation Behavior: Unlike the sigmoid and tanh functions, GELU saturates on one side only. For very negative inputs the output approaches zero, which reduces the impact of large negative outliers. For very positive inputs it does not saturate: it approaches the identity ($\mathrm{GELU}(x) \approx x$), much like ReLU.

4. Empirical Success: GELU has been empirically observed to improve the performance of transformer models like BERT compared to other activation functions like ReLU or LeakyReLU. This empirical success has led to its widespread adoption.

5. Properties between ReLU and tanh: GELU acts somewhat as a bridge between the ReLU and tanh activation functions. It offers the gating properties of ReLU (which helps in avoiding vanishing gradients) and the smooth transition of tanh.

6. Adaptive Behavior: The shape of the GELU curve allows the network to decide whether to allow information through (like a ReLU) or gate it (somewhat like a sigmoid). This adaptability is useful for tasks where certain information might need to be selectively emphasized or de-emphasized.

## Multi-layer Perceptron

In [45]:
class MLP(nn.Module):
    """Position-wise feed-forward network: expand 4x, GELU, project back, dropout.

    Reads ``n_embd``, ``bias`` and ``dropout`` from ``config``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width  # the inner layer is four times wider than the embedding
        self.c_fc = nn.Linear(width, hidden, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(hidden, width, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Return ``dropout(c_proj(gelu(c_fc(x))))``; the last dimension stays ``n_embd``."""
        return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))

## Causal Self Attention
See the attention notebook for a dedicated walk-through of the attention heads.

### Code Implementation

In [77]:
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention where a position attends only to itself and earlier positions.

    Reads ``n_embd``, ``n_head``, ``bias``, ``dropout`` and ``block_size``
    from ``config``. Uses ``F.scaled_dot_product_attention`` when the
    installed PyTorch provides it, otherwise an explicit masked softmax.
    """

    def __init__(self, config):
        super().__init__()
        # every head receives an equally sized slice of the embedding vector
        assert config.n_embd % config.n_head == 0
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout

        # a single Linear emits query, key and value for all heads at once
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # projection applied after the heads are concatenated again
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # dropout on the attention weights and on the block output
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        # the fused kernel only exists in PyTorch >= 2.0
        self.flash = hasattr(F, "scaled_dot_product_attention")
        if not self.flash:
            print("WARNING: using slow attention. Flash requires PyTorch >= 2.0")
            # lower-triangular ones: row t keeps columns 0..t, i.e. the past only
            size = config.block_size
            causal = torch.tril(torch.ones(size, size))
            self.register_buffer("bias", causal.view(1, 1, size, size))

    def forward(self, x):
        """Map ``x`` of shape (B, T, C) to an attention output of the same shape."""
        batch, seq_len, width = x.size()
        head_size = width // self.n_head

        def to_heads(t):
            # (B, T, C) -> (B, nh, T, hs)
            return t.view(batch, seq_len, self.n_head, head_size).transpose(1, 2)

        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = to_heads(k)
        q = to_heads(q)
        v = to_heads(v)

        if self.flash:
            # dropout is only active while training
            drop_p = self.dropout if self.training else 0
            y = F.scaled_dot_product_attention(
                q, k, v, attn_mask=None, dropout_p=drop_p, is_causal=True
            )
        else:
            # (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
            scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            hidden = self.bias[:, :, :seq_len, :seq_len] == 0
            scores = scores.masked_fill(hidden, float("-inf"))
            weights = self.attn_dropout(F.softmax(scores, dim=-1))
            y = weights @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

        # put the heads next to each other again: (B, nh, T, hs) -> (B, T, C)
        y = y.transpose(1, 2).contiguous().view(batch, seq_len, width)

        return self.resid_dropout(self.c_proj(y))

### Breakdown

In [48]:
# Show the two config values that size the projection below.
print(config.n_embd, config.bias)
# One Linear produces the key, query and value projections for all heads at once:
# its output width is 3 * n_embd (the embedding dimension is tripled, not the
# sequence dimension), and a later cell splits it back into three n_embd-wide chunks.
c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
c_attn

768 True


Linear(in_features=768, out_features=2304, bias=True)

In [49]:
# (B, T, C) = (batch size, sequence length, embedding dimensionality (n_embd))
print(config.batch_size, config.block_size, config.n_embd)
# random stand-in for a batch of embedded tokens; only its shape matters here
x = torch.rand(size=(config.batch_size, config.block_size, config.n_embd))
# unpack the three sizes so later cells can refer to them as B, T and C
(
    B,
    T,
    C,
) = x.size()
print(B, T, C)

32 256 768
32 256 768


In [50]:
# Project x to 3 * n_embd features, then cut the last dimension (dim=2) into
# three n_embd-wide tensors: query, key and value.
q, k, v = c_attn(x).split(config.n_embd, dim=2)
print(q.shape, k.shape, v.shape)

torch.Size([32, 256, 768]) torch.Size([32, 256, 768]) torch.Size([32, 256, 768])


In [51]:
# Reshape each projection from (B, T, C) to (B, nh, T, hs): the embedding axis is
# cut into n_head chunks of size C // n_head, then the head axis is swapped in
# front of the sequence axis.
k = k.view(B, T, config.n_head, C // config.n_head).transpose(1, 2)  # (B, nh, T, hs)
q = q.view(B, T, config.n_head, C // config.n_head).transpose(1, 2)  # (B, nh, T, hs)
v = v.view(B, T, config.n_head, C // config.n_head).transpose(1, 2)  # (B, nh, T, hs)

In [53]:
# after the head split all three tensors are (B, nh, T, hs)
print(k.shape, q.shape, v.shape)

torch.Size([32, 12, 256, 64]) torch.Size([32, 12, 256, 64]) torch.Size([32, 12, 256, 64])


In [54]:
# transposing the last two axes of k lets the matmul contract the head-size axis:
# (B, nh, T, hs) @ (B, nh, hs, T) -> (B, nh, T, T)
print(q.shape, k.transpose(-2, -1).shape)
print((q @ k.transpose(-2, -1)).shape)

torch.Size([32, 12, 256, 64]) torch.Size([32, 12, 64, 256])
torch.Size([32, 12, 256, 256])


In [63]:
# scaled dot product attention: scores are divided by sqrt(head size)
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
# lower triangular matrix for autoregressive masking: torch.tril zeroes every
# entry above the diagonal, so row t keeps only columns 0..t (the past).
# Without tril the matrix is all ones and nothing would be masked.
bias = torch.tril(torch.ones(config.block_size, config.block_size)).view(
    1, 1, config.block_size, config.block_size
)
# future positions get -inf so that softmax assigns them zero weight
att = att.masked_fill(bias[:, :, :T, :T] == 0, float("-inf"))
# apply softmax over the key axis; every row sums to one
att = F.softmax(att, dim=-1)
print(att.shape)

torch.Size([32, 12, 256, 256])


In [64]:
# dropout applied to the attention weights; it never changes the shape
attn_dropout = nn.Dropout(config.dropout)
att = attn_dropout(att)
print(att.shape)

torch.Size([32, 12, 256, 256])


In [65]:
# each output position is the attention-weighted sum of the value vectors
y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
print(y.shape)

torch.Size([32, 12, 256, 64])


In [72]:
# glue attention heads back together: (B, nh, T, hs) -> (B, T, nh, hs) -> (B, T, C)
# .contiguous() is called before .view because transpose returns a non-contiguous tensor
y = (
    y.transpose(1, 2).contiguous().view(B, T, C)
)  # re-assemble all head outputs side by side
print(y.shape)

torch.Size([32, 256, 768])


In [76]:
# output projection: n_embd -> n_embd, applied to the re-assembled heads
c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
# regularization: dropout on the attention output
resid_dropout = nn.Dropout(config.dropout)

# apply output projection and dropout; the shape stays (B, T, C)
y = resid_dropout(c_proj(y))
print(y.shape)

torch.Size([32, 256, 768])


## Attention Block

In [78]:
class Block(nn.Module):
    """One transformer layer: attention and MLP sub-layers, each with a residual connection.

    Both sub-layers receive a layer-normalized copy of their input; the
    un-normalized input is what gets added back.
    """

    def __init__(self, config):
        super().__init__()
        width, use_bias = config.n_embd, config.bias
        self.ln_1 = LayerNorm(width, bias=use_bias)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(width, bias=use_bias)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        attended = x + self.attn(self.ln_1(x))  # first residual connection
        return attended + self.mlp(self.ln_2(attended))  # second residual connection


## Transformer

### Module Dict and Trainable Parameters

In [80]:
# Named parts of the transformer, created in the order wte, wpe, drop, h, ln_f:
# token embedding, position embedding, embedding dropout, the stack of Blocks
# and the final layer norm.
transformer = nn.ModuleDict(
    {
        "wte": nn.Embedding(config.vocab_size, config.n_embd),
        "wpe": nn.Embedding(config.block_size, config.n_embd),
        "drop": nn.Dropout(config.dropout),
        "h": nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        "ln_f": LayerNorm(config.n_embd, bias=config.bias),
    }
)

In [83]:
def test_init_weights(module):
    """Stand-in for a weight-initialization hook: print the module it receives."""
    print(module)

# test _init_weights method
# nn.Module.apply calls the function on every submodule and on the module itself,
# so the printout shows each module an init hook would be handed.
transformer.apply(test_init_weights)

Embedding(65, 768)
Embedding(256, 768)
Dropout(p=0.0, inplace=False)
LayerNorm()
Linear(in_features=768, out_features=2304, bias=True)
Linear(in_features=768, out_features=768, bias=True)
Dropout(p=0.0, inplace=False)
Dropout(p=0.0, inplace=False)
CausalSelfAttention(
  (c_attn): Linear(in_features=768, out_features=2304, bias=True)
  (c_proj): Linear(in_features=768, out_features=768, bias=True)
  (attn_dropout): Dropout(p=0.0, inplace=False)
  (resid_dropout): Dropout(p=0.0, inplace=False)
)
LayerNorm()
Linear(in_features=768, out_features=3072, bias=True)
GELU(approximate='none')
Linear(in_features=3072, out_features=768, bias=True)
Dropout(p=0.0, inplace=False)
MLP(
  (c_fc): Linear(in_features=768, out_features=3072, bias=True)
  (gelu): GELU(approximate='none')
  (c_proj): Linear(in_features=3072, out_features=768, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)
Block(
  (ln_1): LayerNorm()
  (attn): CausalSelfAttention(
    (c_attn): Linear(in_features=768, out_features=

ModuleDict(
  (wte): Embedding(65, 768)
  (wpe): Embedding(256, 768)
  (drop): Dropout(p=0.0, inplace=False)
  (h): ModuleList(
    (0-11): 12 x Block(
      (ln_1): LayerNorm()
      (attn): CausalSelfAttention(
        (c_attn): Linear(in_features=768, out_features=2304, bias=True)
        (c_proj): Linear(in_features=768, out_features=768, bias=True)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (ln_2): LayerNorm()
      (mlp): MLP(
        (c_fc): Linear(in_features=768, out_features=3072, bias=True)
        (gelu): GELU(approximate='none')
        (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm()
)

In [85]:
# test iteration over named parameters: yields (qualified name, Parameter) pairs
# NOTE(review): this prints every full tensor; printing p.shape instead would
# keep the output readable.
for pn, p in transformer.named_parameters():
    print(pn, p)

wte.weight Parameter containing:
tensor([[ 0.6899, -1.7424, -0.1061,  ..., -0.0829, -0.8804, -1.2164],
        [ 1.7504, -0.0238,  0.1950,  ..., -0.1088, -0.7475,  0.4840],
        [ 0.3681,  0.8410,  1.0086,  ..., -0.7974,  0.6872, -1.1189],
        ...,
        [ 0.4298,  1.1135,  0.1752,  ..., -1.6236, -2.2124, -0.2306],
        [-0.3770,  0.4423, -0.2479,  ..., -0.5750, -0.7649,  0.8233],
        [-2.2920,  1.2476,  0.6346,  ...,  0.8003,  0.6904, -1.1140]],
       requires_grad=True)
wpe.weight Parameter containing:
tensor([[ 0.3292,  1.0306, -0.1584,  ...,  0.6496,  0.4486, -1.5912],
        [-1.9679, -0.1105, -0.8930,  ..., -0.2746, -1.6689, -0.4446],
        [ 0.7996, -0.0865, -2.1194,  ..., -0.7560, -0.0603,  0.3939],
        ...,
        [ 1.5839,  0.5889, -0.7035,  ..., -1.7717, -0.4523, -1.1871],
        [-1.7193,  0.0959,  0.1120,  ..., -0.4279, -0.2383,  1.3885],
        [ 0.3065, -0.1857,  0.9104,  ..., -0.7224, -0.3663, -0.9336]],
       requires_grad=True)
h.0.ln_1.wei

### LM Head

In [91]:
# language-model head: maps each n_embd-wide vector to one logit per vocabulary
# entry, without a bias term
lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
print(lm_head)

Linear(in_features=768, out_features=65, bias=False)


#### Training Inference: Target Token Prediction

In [107]:
# Stand-in for the transformer output: (batch_size, block_size, n_embd)
x = torch.rand(size=(config.batch_size, config.block_size, config.n_embd))
# Random target token ids. The upper bound follows config.vocab_size (the same
# value that sizes lm_head) instead of a hard-coded 65, so the targets remain
# valid class indices if the vocabulary size changes.
targets = torch.randint(
    0, config.vocab_size, size=(config.batch_size, config.block_size)
)
logits = lm_head(x)
print(logits.shape)

# Flatten to (B*T, vocab) logits and (B*T,) targets so every position counts
# as one classification example; targets equal to -1 are ignored.
loss = F.cross_entropy(
    logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1
)
print(loss)

torch.Size([32, 256, 65])
tensor(4.2265, grad_fn=<NllLossBackward0>)


#### Generate Text

In [102]:
# stand-in for the transformer output: (batch_size, block_size, n_embd)
x = torch.rand(size=(config.batch_size, config.block_size, config.n_embd))
logits = lm_head(x)
print(logits.shape)
# this takes the last time dimension (with all but next token revealed) to predict the next token
# indexing with -1 drops the sequence axis: (B, T, vocab) -> (B, vocab)
print(logits[:, -1, :].shape)

torch.Size([32, 256, 65])
torch.Size([32, 65])


### Load Pre-Trained Model

In [115]:
from src.models.gpt import GPT
from transformers import GPT2LMHeadModel

# config
# name of the pretrained checkpoint; also selects the size preset in the next cell
model_type = "gpt2"
# optional overrides (only "dropout" is consulted later); None becomes an empty dict
override_args=None
override_args = override_args or {}
print(override_args)

  from .autonotebook import tqdm as notebook_tqdm


{}


In [123]:
# Pick the architecture sizes that belong to the chosen checkpoint name.
config_args = {
    "gpt2": {"n_layer": 12, "n_head": 12, "n_embd": 768},  # 124M params
    "gpt2-medium": {"n_layer": 24, "n_head": 16, "n_embd": 1024},  # 350M params
    "gpt2-large": {"n_layer": 36, "n_head": 20, "n_embd": 1280},  # 774M params
    "gpt2-xl": {"n_layer": 48, "n_head": 25, "n_embd": 1600},  # 1558M params
}[model_type]
print("forcing vocab_size=50257, block_size=1024, bias=True")
# these three values are the same for every GPT model checkpoint
config_args.update(vocab_size=50257, block_size=1024, bias=True)
# dropout is the only setting a caller may override
if "dropout" in override_args:
    print(f"overriding dropout rate to {override_args['dropout']}")
    config_args["dropout"] = override_args["dropout"]

forcing vocab_size=50257, block_size=1024, bias=True


In [124]:
# final keyword arguments that will be passed to GPTConfig
print(config_args)

{'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'vocab_size': 50257, 'block_size': 1024, 'bias': True}


In [125]:
# create a from-scratch initialized minGPT model
# NOTE: this rebinds the notebook-wide `config`; cells run afterwards see the
# GPT-2 sizes instead of the demo defaults.
config = GPTConfig(**config_args)
model = GPT(config)
sd = model.state_dict()
sd_keys = sd.keys()
sd_keys = [
    k for k in sd_keys if not k.endswith(".attn.bias")
]  # discard this mask / buffer, not a param
# peek at the first few parameter names
sd_keys[:10]

number of parameters: 123.65M


['transformer.wte.weight',
 'transformer.wpe.weight',
 'transformer.h.0.ln_1.weight',
 'transformer.h.0.ln_1.bias',
 'transformer.h.0.attn.c_attn.weight',
 'transformer.h.0.attn.c_attn.bias',
 'transformer.h.0.attn.c_proj.weight',
 'transformer.h.0.attn.c_proj.bias',
 'transformer.h.0.ln_2.weight',
 'transformer.h.0.ln_2.bias']

In [126]:
# init a huggingface/transformers model
# from_pretrained fetches the checkpoint named by model_type (the recorded output
# shows the download); its state dict is the source of the weights copied below
model_hf = GPT2LMHeadModel.from_pretrained(model_type)
sd_hf = model_hf.state_dict()

Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 301kB/s]
Downloading pytorch_model.bin: 100%|██████████| 548M/548M [01:03<00:00, 8.57MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 124/124 [00:00<00:00, 38.1kB/s]


In [128]:
# Copy the pretrained tensors into our model, checking names and shapes on the way.
# The two attention buffers (".attn.masked_bias" and the ".attn.bias" mask) are
# not parameters and are left out.
sd_keys_hf = [
    k
    for k in sd_hf.keys()
    if not k.endswith((".attn.masked_bias", ".attn.bias"))
]
# The openai checkpoints store these weights in a "Conv1D" module while this
# model uses a vanilla Linear, so they have to be transposed on import.
transposed = [
    "attn.c_attn.weight",
    "attn.c_proj.weight",
    "mlp.c_fc.weight",
    "mlp.c_proj.weight",
]
assert len(sd_keys_hf) == len(
    sd_keys
), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
with torch.no_grad():
    for k in sd_keys_hf:
        if k.endswith(tuple(transposed)):
            # Conv1D weight: shapes must match once reversed, then copy the transpose
            assert sd_hf[k].shape[::-1] == sd[k].shape
            sd[k].copy_(sd_hf[k].t())
        else:
            # every other tensor is copied as-is
            assert sd_hf[k].shape == sd[k].shape
            sd[k].copy_(sd_hf[k])

## Configure Optimizer

In [145]:
@dataclass
class OptimizerConfig:
    """Settings consumed by the optimizer-construction cells.

    Declared as a dataclass (like ``GPTConfig``) so individual values can be
    overridden at construction time, e.g. ``OptimizerConfig(learning_rate=1e-3)``.
    ``OptimizerConfig()`` with no arguments yields the same values as before.
    """

    learning_rate: float = 6e-4  # passed to AdamW as lr
    weight_decay: float = 1e-1  # applied only to the "decay" parameter group
    beta1: float = 0.9  # first AdamW beta
    beta2: float = 0.95  # second AdamW beta
    device: Literal["cpu", "cuda", "cuda:0", "cuda:1", "mps"] = "cpu"
    # the fused AdamW path is only enabled when this is "cuda"
    device_type: Literal["cpu", "cuda"] = "cpu"


optimizer_config = OptimizerConfig()

In [146]:
# NOTE: the method is referenced but not called, so the cell displays the bound
# method's repr (which embeds the model's module tree), not the parameters.
model.named_parameters

<bound method Module.named_parameters of GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)>

In [140]:
# Keep only the parameters that will actually be trained, keyed by name.
param_dict = {pn: p for pn, p in model.named_parameters() if p.requires_grad}
# Two optimizer groups: tensors with 2 or more dimensions (matmul weights and
# embeddings) get weight decay; everything else (biases, layernorm gains) does not.
decay_params = [p for p in param_dict.values() if p.dim() >= 2]
nodecay_params = [p for p in param_dict.values() if p.dim() < 2]
optim_groups = [
    dict(params=decay_params, weight_decay=optimizer_config.weight_decay),
    dict(params=nodecay_params, weight_decay=0.0),
]
# element counts, for the summary below
num_decay_params = sum(map(torch.numel, decay_params))
num_nodecay_params = sum(map(torch.numel, nodecay_params))
print(
    f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters"
)
print(
    f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters"
)

num decayed parameter tensors: 50, with 124,318,464 parameters
num non-decayed parameter tensors: 98, with 121,344 parameters


In [149]:
# Build AdamW; the fused implementation is requested only when this PyTorch
# build exposes the `fused` argument and the configured device type is CUDA.
fused_available = "fused" in inspect.signature(torch.optim.AdamW).parameters
use_fused = fused_available and optimizer_config.device_type == "cuda"
extra_args = {"fused": True} if use_fused else {}
optimizer = torch.optim.AdamW(
    optim_groups,
    lr=optimizer_config.learning_rate,
    betas=(optimizer_config.beta1, optimizer_config.beta2),
    **extra_args,
)
print(f"using fused AdamW: {use_fused}")
optimizer

using fused AdamW: False


AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.95)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0006
    maximize: False
    weight_decay: 0.1

Parameter Group 1
    amsgrad: False
    betas: (0.9, 0.95)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0006
    maximize: False
    weight_decay: 0.0
)

## Estimate Model Flop Utilization (MFU)

This seems to be useful mostly in GPU settings so I will skip it for now.

MFU Calculation
![test](images/mfu_appendix.png)

In [158]:
   def estimate_mfu(model, config, fwdbwd_per_iter, dt, flops_promised=312e12):
        """Estimate model flops utilization (MFU) as a fraction of a device's peak FLOPS.

        Args:
            model: object exposing ``get_num_params()``.
            config: object with ``n_layer``, ``n_head``, ``n_embd`` and ``block_size``.
            fwdbwd_per_iter: number of forward/backward passes per iteration.
            dt: wall-clock seconds one iteration took; must be non-zero.
            flops_promised: peak FLOPS of the reference device. Defaults to
                312e12, the A100 bfloat16 peak, so existing calls are unchanged.

        Returns:
            Achieved FLOPS per second divided by ``flops_promised``.
        """
        # first estimate the number of flops we do per iteration.
        # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
        N = model.get_num_params()
        cfg = config
        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd // cfg.n_head, cfg.block_size
        # 6*N covers the parameter matmuls, 12*L*H*Q*T the attention term
        flops_per_token = 6 * N + 12 * L * H * Q * T
        flops_per_fwdbwd = flops_per_token * T
        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
        # express our flops throughput as a ratio of the reference peak
        flops_achieved = flops_per_iter * (1.0 / dt)  # per second
        mfu = flops_achieved / flops_promised
        return mfu

### Benchmarking Call

In [None]:
# simple benchmarking
# NOTE(review): this cell has no execution count and relies on names that are not
# defined in the visible part of this notebook (time, get_batch, ctx, batch_size)
# as well as on a CUDA device — confirm the setup before running it.
torch.cuda.synchronize()
for stage, num_steps in enumerate([10, 20]): # burnin, then benchmark
    t0 = time.time()
    X, Y = get_batch('train')
    for k in range(num_steps):
        with ctx:
            logits, loss = model(X, Y)
        # fetch the next batch before the backward pass
        X, Y = get_batch('train')
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        lossf = loss.item()
        print(f"{k}/{num_steps} loss: {lossf:.4f}")
    # wait for queued GPU work so the timing covers the whole stage
    torch.cuda.synchronize()
    t1 = time.time()
    dt = t1-t0
    # NOTE(review): this calls a method on the model with two arguments, not the
    # four-argument estimate_mfu function defined in the previous cell.
    mfu = model.estimate_mfu(batch_size * 1 * num_steps, dt)
    # only the second stage (after burn-in) is reported
    if stage == 1:
        print(f"time per iteration: {dt/num_steps*1000:.4f}ms, MFU: {mfu*100:.2f}%")


## Generate Tokens

In [5]:
@dataclass
class GenerateConfig:
    """Settings for the token-generation walkthrough.

    Declared as a dataclass (like ``GPTConfig``) so values can be overridden
    at construction time, e.g. ``GenerateConfig(max_new_tokens=3)``.
    ``GenerateConfig()`` with no arguments yields the same values as before.
    """

    max_new_tokens: int = 10  # number of sampling steps in the generation loop
    num_samples: int = 10
    block_size: int = 10  # context longer than this is cropped to the last block_size tokens
    start: str = "\n"  # prompt text that is encoded into the initial token ids
    device: str = "cpu"
    # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    temperature: float = 0.8
    seed: int = 1337
    # evaluated once, when the class is defined
    dtype: str = (
        "bfloat16"
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        else "float16"
    )
    # NOTE(review): absolute, machine-specific path; point this at your own
    # tokenizer directory before running the loading cell.
    tokenizer_dpath: str = (
        "/Users/endric.daues/workspace/projects/deep_learning/gpt/tokenizers"
    )
    # keep only the top_k largest logits when sampling; None disables the filter
    top_k: Optional[int] = 200


generate_config = GenerateConfig()
    

### Load Tokenizer

In [6]:
# Build the path of the tokenizer metadata pickle.
# NOTE: tokenizer_dpath is an absolute path, so os.path.join discards the leading
# "data" component — the path printed below starts at tokenizer_dpath.
meta_path = os.path.join("data", generate_config.tokenizer_dpath, "meta.pkl")
# NOTE(review): load_meta is computed but not consulted in this cell; open()
# below raises if the file is missing.
load_meta = os.path.exists(meta_path)

print(f"Loading meta from {meta_path}...")
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open(meta_path, "rb") as f:
    meta = pickle.load(f)
# TODO want to make this more general to arbitrary encoder/decoder schemes
# stoi is used to look up ids by character (encode), itos to look up text by id (decode)
stoi, itos = meta["token_id_map"], meta["id_token_map"]

Loading meta from /Users/endric.daues/workspace/projects/deep_learning/gpt/tokenizers/meta.pkl...


In [7]:
def encode(s):
    """Translate ``s`` into a list of token ids, one per element, via the global ``stoi``.

    An element that is missing from ``stoi`` raises ``KeyError``.
    """
    token_ids = []
    for ch in s:
        token_ids.append(stoi[ch])
    return token_ids

def decode(l):
    """Turn the ids in ``l`` back into text via the global ``itos``.

    Ids that are not present in ``itos`` are rendered as ``"[UNK]"``.
    """
    pieces = (itos[i] if i in itos else "[UNK]" for i in l)
    return "".join(pieces)

In [9]:
# encode the prompt text into token ids
start_ids = encode(generate_config.start)
print(start_ids)
# indexing with [None, ...] adds a leading batch axis: (t,) -> (1, t)
torch.tensor(start_ids, dtype=torch.long, device=generate_config.device)[
    None, ...
]

[0]


tensor([[0]])

In [211]:
# encode the prompt and add a batch axis: idx has shape (1, t)
start_ids = encode(generate_config.start)
print(start_ids)
idx = torch.tensor(start_ids, dtype=torch.long, device=generate_config.device)[
            None, ...
        ]

# Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
# the sequence max_new_tokens times, feeding the predictions back into the model each time.
# Most likely you'll want to make sure to be in model.eval() mode of operation for this.
# NOTE: no random seed is set in this cell, so the sampled tokens differ between runs.
for _ in range(generate_config.max_new_tokens):
    # if the sequence context is growing too long we must crop it at block_size
    # NOTE: idx_cond is not used below, because the logits in this demo are random
    # numbers rather than the output of a model called on idx_cond.
    idx_cond = (
        idx
        if idx.size(1) <= generate_config.block_size
        else idx[:, -generate_config.block_size :]
    )
    # placeholder logits of shape (1, 256, 65) standing in for a model forward pass
    logits = torch.rand([1, 256, 65])
    # keep only the last position and scale by the temperature
    logits = logits[:, -1, :] / generate_config.temperature


    # top-k filtering: everything below the k-th largest logit is set to -inf.
    # With top_k=200 and 65 logits, k is clamped to 65, so nothing is removed here.
    if generate_config.top_k is not None:
        v, _ = torch.topk(logits, min(generate_config.top_k, logits.size(-1)))
        logits[logits < v[:, [-1]]] = -float("Inf")

    # print(logits.shape)
    # turn logits into probabilities and draw one token id from them
    probs = F.softmax(logits, dim=-1)
    # print(probs)
    idx_next = torch.multinomial(probs, num_samples=1)
    print(idx_next)
    # append the sampled id to the running sequence and show the decoded text so far
    idx = torch.cat((idx, idx_next), dim=1)
    print(decode(idx[0].tolist()))
    
    
    
    

tensor([[39]])

a
tensor([[7]])

a-
tensor([[22]])

a-J
tensor([[37]])

a-JY
tensor([[43]])

a-JYe
tensor([[15]])

a-JYeC
tensor([[16]])

a-JYeCD
tensor([[51]])

a-JYeCDm
tensor([[5]])

a-JYeCDm'
tensor([[21]])

a-JYeCDm'I
