## 4.1 Coding an LLM architecture

In [25]:
# Model hyperparameters. The dict is unpacked as keyword arguments
# (`DummyGPTModel(**GPT_CONFIG_124M)`), so the keys line up with the
# constructor's parameter names.
GPT_CONFIG_124M = {
    "vocab_size": 50257,  # rows of the token-embedding table and width of the output logits
    "context_length": 1024,  # rows of the positional-embedding table, i.e. the longest indexable sequence
    "emb_dim": 768,  # width of every token and position embedding vector
    "n_heads": 12,  # forwarded to each transformer block; the dummy block ignores it
    "n_layers": 12,  # number of transformer blocks stacked in the model
    "drop_rate": 0.1,  # probability given to the embedding dropout layer
    "qkv_bias": False  # forwarded to each transformer block; the dummy block ignores it
}

In [26]:
import torch
import torch.nn as nn


class DummyGPTModel(nn.Module):
    """Skeleton GPT-style model wired together from placeholder components.

    Token and positional embeddings are summed, passed through dropout, a stack
    of ``DummyTransformerBlock`` modules and a ``DummyLayerNorm`` (both of which
    return their input unchanged), and finally projected to one logit per
    vocabulary entry.

    Args:
        vocab_size: Number of rows in the token-embedding table and number of
            output logits per position.
        context_length: Number of rows in the positional-embedding table.
        emb_dim: Width of the embedding vectors.
        n_heads: Forwarded to every transformer block.
        n_layers: How many transformer blocks to stack.
        drop_rate: Dropout probability applied to the summed embeddings.
        qkv_bias: Forwarded to every transformer block.
        **kwargs: Any further settings, forwarded to every transformer block.
    """

    def __init__(
        self,
        vocab_size,
        context_length,
        emb_dim,
        n_heads,
        n_layers,
        drop_rate,
        qkv_bias=False,
        **kwargs,
    ):
        super().__init__()

        # Each block is handed the complete configuration, extras included.
        block_config = dict(
            vocab_size=vocab_size,
            context_length=context_length,
            emb_dim=emb_dim,
            n_heads=n_heads,
            n_layers=n_layers,
            drop_rate=drop_rate,
            qkv_bias=qkv_bias,
            **kwargs,
        )

        # Submodules are created in a fixed order so that random weight
        # initialisation and the printed module tree stay the same.
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(context_length, emb_dim)
        self.drop_emb = nn.Dropout(drop_rate)

        blocks = []
        for _ in range(n_layers):
            blocks.append(DummyTransformerBlock(**block_config))
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = DummyLayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map a two-dimensional tensor of token ids to per-position logits.

        Args:
            in_idx: Token ids; must be two-dimensional, with the sequence along
                the second dimension.

        Returns:
            Logits whose last dimension has ``vocab_size`` entries.
        """
        _, num_tokens = in_idx.shape
        # Position ids 0..num_tokens-1, created on the same device as the input.
        positions = torch.arange(num_tokens, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)


class DummyTransformerBlock(nn.Module):
    """Placeholder transformer block that hands its input back untouched.

    The constructor takes the same configuration keywords as ``DummyGPTModel``
    (plus arbitrary extras) but stores none of them and creates no parameters.
    """

    def __init__(
        self,
        vocab_size,
        context_length,
        emb_dim,
        n_heads,
        n_layers,
        drop_rate,
        qkv_bias=False,
        **kwargs,
    ):
        # The arguments exist only so the block can be built from the full
        # model configuration; none of them is used.
        super().__init__()

    def forward(self, x):
        """Return ``x`` itself, without copying or modifying it."""
        return x


class DummyLayerNorm(nn.Module):
    """Placeholder normalization layer that performs no normalization.

    Args:
        normalized_shape: Accepted but unused.
        eps: Accepted but unused.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        # Nothing is stored and no parameters are created.
        super().__init__()

    def forward(self, x):
        """Return ``x`` itself, unchanged."""
        return x

# Build the model from the config; as the cell's last expression, its module tree is displayed.
DummyGPTModel(**GPT_CONFIG_124M)

DummyGPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): DummyTransformerBlock()
    (1): DummyTransformerBlock()
    (2): DummyTransformerBlock()
    (3): DummyTransformerBlock()
    (4): DummyTransformerBlock()
    (5): DummyTransformerBlock()
    (6): DummyTransformerBlock()
    (7): DummyTransformerBlock()
    (8): DummyTransformerBlock()
    (9): DummyTransformerBlock()
    (10): DummyTransformerBlock()
    (11): DummyTransformerBlock()
  )
  (final_norm): DummyLayerNorm()
  (out_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [27]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each prompt to a 1-D tensor of token ids and stack them into one
# (prompt, token) batch. Both prompts encode to four ids, so stacking works.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [28]:
# Seed before construction so the randomly initialised weights, and with them the logits, are reproducible.
torch.manual_seed(42)
model = DummyGPTModel(**GPT_CONFIG_124M)
# NOTE(review): the freshly built module is presumably still in training mode, so the
# embedding dropout would be active here — confirm, and call model.eval() if that is not intended.
logits = model(batch)
# The `=` specifier prints the expression text together with its value.
print(f"{logits} ({logits.shape=})")

tensor([[[ 0.7739,  0.0181, -0.0797,  ...,  0.3098,  0.8177, -0.6049],
         [-0.8063,  0.8920, -1.0962,  ..., -0.4378,  1.1056,  0.1939],
         [-0.8459, -1.0176,  0.4964,  ...,  0.4581, -0.3293,  0.2320],
         [ 0.4098, -0.3144, -1.0831,  ...,  0.7491,  0.7018,  0.4715]],

        [[ 0.2911,  0.1596, -0.2137,  ...,  0.5173,  0.7380, -0.7045],
         [-0.4064,  0.6045, -0.4485,  ..., -0.5616,  0.4590, -0.1384],
         [-0.6108,  0.7148,  1.2499,  ..., -0.7925, -0.5328,  0.4794],
         [ 0.9423,  0.1867, -0.5557,  ...,  0.4156,  0.1756,  1.9882]]],
       grad_fn=<UnsafeViewBackward0>) (logits.shape=torch.Size([2, 4, 50257]))


## 4.2 Normalizing activations with layer normalization

In [29]:
# Seed so that both the random inputs and the Linear layer's initial weights are reproducible.
torch.manual_seed(42)
batch_example = torch.randn(2, 5)  # 2 rows of 5 values each
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())  # map 5 -> 6 values per row, then zero out negatives
out = layer(batch_example)
print(out)
# Per-row statistics over the last dimension; keepdim=True keeps them as 2x1 columns
# so they broadcast against `out`.
mean = out.mean(dim=-1, keepdim=True)
# No `unbiased` argument is passed, so torch's default (divide by N-1) estimator is used,
# unlike the LayerNorm class, which passes unbiased=False.
var = out.var(dim=-1, keepdim=True)
print(f"{mean=}\n{var=}")

tensor([[0.0000, 0.1842, 0.0052, 0.7233, 0.0000, 0.5298],
        [0.0000, 0.0000, 0.0000, 0.2237, 0.0000, 0.7727]],
       grad_fn=<ReluBackward0>)
mean=tensor([[0.2404],
        [0.1661]], grad_fn=<MeanBackward1>)
var=tensor([[0.0982],
        [0.0963]], grad_fn=<VarBackward0>)


The first value in the `mean` tensor is the mean of the first row, and the second value is the mean of the second row. The same holds for the variance.

`keepdim=True` means the output tensor retains the same number of dimensions as the input tensor, even though the operation reduces the tensor along the dimension specified via `dim`; this is why the statistics above have shape `[2, 1]` rather than `[2]`.
`dim` specifies the dimension along which the statistic is computed. Here `dim=-1` selects the last dimension, so one mean and one variance are computed per row.

In [30]:
# Standardise every row of `out` to zero mean and unit variance.
# The row statistics are computed from `out` right here rather than read from the
# `mean`/`var` globals: this cell rebinds those two names below, so reading them
# would make a second run of the cell normalise with the already-normalised
# statistics (mean ~ 0, variance ~ 1) and silently return roughly `out`.
# No epsilon is added, so a row with zero variance would produce inf/nan.
out_norm = (out - out.mean(dim=-1, keepdim=True)) / torch.sqrt(out.var(dim=-1, keepdim=True))
# Statistics of the normalised rows, bound to the same names as before.
mean = out_norm.mean(dim=-1, keepdim=True)
var = out_norm.var(dim=-1, keepdim=True)
print("Normalized layer outputs:\n", out_norm)
print(f"Mean:\n{mean}")
print(f"Variance:\n{var}")

Normalized layer outputs:
 tensor([[-0.7672, -0.1794, -0.7506,  1.5410, -0.7672,  0.9234],
        [-0.5351, -0.5351, -0.5351,  0.1857, -0.5351,  1.9546]],
       grad_fn=<DivBackward0>)
Mean:
tensor([[0.0000e+00],
        [7.4506e-09]], grad_fn=<MeanBackward1>)
Variance:
tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


In [31]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension of the inputs; also the length of
            the ``scale`` and ``shift`` parameter vectors.
        eps: Constant added to the variance before taking the square root, so a
            zero-variance input does not cause a division by zero.
            Defaults to ``1e-5``.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps  # Prevent division by zero
        # Initialised to ones/zeros, so a fresh layer only standardises its input.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Standardise ``x`` along its last dimension, then scale and shift it.

        Args:
            x: Tensor whose last dimension has ``emb_dim`` entries.

        Returns:
            A tensor shaped like ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False selects the divide-by-N variance.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [36]:
# Run the LayerNorm module on the raw example inputs and check the per-row statistics.
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)
mean = out_ln.mean(dim=-1, keepdim=True)
# unbiased=False matches the variance estimator used inside LayerNorm.forward.
var = out_ln.var(dim=-1, unbiased=False, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[-1.1921e-08],
        [ 3.2037e-08]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)
