# Lab

In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,omegaconf --conda

Python implementation: CPython
Python version       : 3.11.8
IPython version      : 8.22.2

numpy    : 1.26.4
pandas   : 2.2.1
polars   : 0.20.18
omegaconf: 2.3.0

conda environment: torch_p11



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [3]:
import torch
from torch import nn, Tensor
import torch.nn.functional as F

In [38]:
seed: int = 123

GPT_CONFIG_124M: dict[str, Any] = {
    "vocab_size": 50_257,
    "context_length": 1_024,
    "emb_dim": 768,
    "n_heads": 12,  # Number of attention heads
    "n_layers": 12,
    "drop_rate": 0.1,  # Dropout rate
    "qkv_bias": False,
}

In [5]:
class SelfAttention(nn.Module):
    def __init__(self, in_feats: int, out_feats: int) -> None:
        super().__init__()

        # Size: (seq_len, emb_dim)
        self.query_weights = nn.Linear(in_feats, out_feats, bias=False)
        self.key_weights = nn.Linear(in_feats, out_feats, bias=False)
        self.value_weights = nn.Linear(in_feats, out_feats, bias=False)

    def forward(self, x: Tensor) -> Tensor:
        # b_size, seq_len, emb_dim = x.shape
        # (b_size, emb_dim, seq_len) @ (seq_len, emb_dim) -> (b_size, emb_dim, emb_dim)
        query = self.query_weights(x)
        key = self.key_weights(x)
        value = self.value_weights(x)

        # Attention scores
        # (b_size, emb_dim, seq_len) @ (seq_len, emb_dim) -> (b_size, emb_dim, emb_dim)
        attn_scores: Tensor = torch.matmul(query, key.transpose(-1, -2))
        attn_weights: Tensor = F.softmax(attn_scores / key.shape[1] ** 0.5, dim=-1)
        # (seq_len, emb_dim) @ (b_size, emb_dim, emb_dim) -> (b_size, seq_len, emb_dim)
        context_vector: Tensor = torch.matmul(attn_weights, value)
        return context_vector

In [6]:
vocab_size: int = 27
embedding_dim: int = 5
context_size: int = 8
batch_size: int = 2

input_seq: Tensor = torch.rand(
    size=(batch_size, context_size, embedding_dim), dtype=torch.float32
)
self_attn: SelfAttention = SelfAttention(embedding_dim, embedding_dim)
context_vector: Tensor = self_attn(input_seq)
context_vector

tensor([[[-0.1268,  0.2215,  0.2211,  0.0352,  0.1281],
         [-0.1261,  0.2206,  0.2204,  0.0339,  0.1301],
         [-0.1264,  0.2221,  0.2206,  0.0354,  0.1285],
         [-0.1298,  0.2196,  0.2243,  0.0342,  0.1274],
         [-0.1276,  0.2200,  0.2218,  0.0346,  0.1279],
         [-0.1278,  0.2188,  0.2224,  0.0333,  0.1288],
         [-0.1292,  0.2210,  0.2237,  0.0351,  0.1274],
         [-0.1265,  0.2204,  0.2207,  0.0339,  0.1295]],

        [[-0.2475,  0.2025,  0.3551, -0.0171,  0.1869],
         [-0.2461,  0.2054,  0.3540, -0.0152,  0.1852],
         [-0.2463,  0.2076,  0.3551, -0.0145,  0.1836],
         [-0.2456,  0.2084,  0.3543, -0.0140,  0.1834],
         [-0.2457,  0.2098,  0.3550, -0.0135,  0.1823],
         [-0.2456,  0.2077,  0.3541, -0.0138,  0.1833],
         [-0.2452,  0.2077,  0.3535, -0.0138,  0.1838],
         [-0.2475,  0.2017,  0.3548, -0.0174,  0.1875]]],
       grad_fn=<UnsafeViewBackward0>)

In [47]:
class CausalSelfAttention(nn.Module):
    def __init__(self, in_feats: int, out_feats: int, context_size: int) -> None:
        super().__init__()

        # Size: (seq_len, emb_dim)
        self.query_weights = nn.Linear(in_feats, out_feats, bias=False)
        self.key_weights = nn.Linear(in_feats, out_feats, bias=False)
        self.value_weights = nn.Linear(in_feats, out_feats, bias=False)

        self.register_buffer(
            "mask", torch.triu(torch.ones(context_size, context_size), diagonal=1)
        )

    def forward(self, x: Tensor) -> Tensor:
        b_size, seq_len, emb_dim = x.shape
        # (b_size, emb_dim, seq_len) @ (seq_len, emb_dim) -> (b_size, emb_dim, emb_dim)
        query = self.query_weights(x)
        key = self.key_weights(x)
        value = self.value_weights(x)

        # Attention scores
        # (b_size, emb_dim, seq_len) @ (seq_len, emb_dim) -> (b_size, emb_dim, emb_dim)
        attn_scores: Tensor = torch.matmul(query, key.transpose(-1, -2))
        # Apply mask (inplace)
        attn_scores.masked_fill_(self.mask.bool()[:seq_len, :seq_len], -torch.inf)

        attn_weights: Tensor = F.softmax(attn_scores / key.shape[1] ** 0.5, dim=-1)
        # (seq_len, emb_dim) @ (b_size, emb_dim, emb_dim) -> (b_size, seq_len, emb_dim)
        context_vector: Tensor = torch.matmul(attn_weights, value)
        return context_vector

In [48]:
torch.manual_seed(seed)

input_seq: Tensor = torch.rand(
    size=(batch_size, context_size, embedding_dim), dtype=torch.float32
)
causal_self_attn: CausalSelfAttention = CausalSelfAttention(
    embedding_dim, embedding_dim, context_size
)
context_vector: Tensor = causal_self_attn(input_seq)
context_vector  # .shape

tensor([[[-0.2318,  0.0440,  0.2652, -0.4445,  0.0685],
         [-0.2561, -0.1346,  0.1961, -0.4771, -0.0082],
         [-0.1968, -0.0901,  0.1365, -0.4585, -0.0332],
         [-0.2134, -0.1038,  0.1803, -0.4520,  0.0084],
         [-0.2601, -0.1058,  0.2350, -0.5068,  0.0147],
         [-0.2830, -0.1005,  0.2751, -0.5042,  0.0258],
         [-0.3018, -0.1143,  0.2859, -0.5648,  0.0235],
         [-0.2846, -0.1180,  0.2660, -0.5321,  0.0213]],

        [[-0.4226, -0.0772,  0.4811, -0.6062,  0.2411],
         [-0.4354, -0.0966,  0.4816, -0.6960,  0.2383],
         [-0.4003, -0.0931,  0.4299, -0.6829,  0.1654],
         [-0.3426, -0.0509,  0.4048, -0.5703,  0.1671],
         [-0.2918, -0.0457,  0.3191, -0.5597,  0.0942],
         [-0.2636, -0.0650,  0.2737, -0.5394,  0.0660],
         [-0.2520, -0.0853,  0.2341, -0.5581,  0.0223],
         [-0.2528, -0.0887,  0.2248, -0.5806,  0.0233]]],
       grad_fn=<UnsafeViewBackward0>)

In [20]:
context_vector.masked_fill(
    mask=torch.triu(torch.ones(5, 5), diagonal=1), value=-torch.inf
)

RuntimeError: The size of tensor a (5) must match the size of tensor b (8) at non-singleton dimension 1

In [11]:
torch.triu(torch.ones(5, 5), diagonal=1)

tensor([[0., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1.],
        [0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0.]])

In [18]:
torch.inf, np.inf

(inf, inf)