# Transformer: Shakespeare

based on: https://github.com/karpathy/build-nanogpt

In this example, a transformer model that closely follows the GPT-2 architecture is built. The building blocks are now implemented manually as custom modules instead of being assembled in a `nn.Sequential` container.

In [None]:
import requests
import time
from itertools import cycle

import auto_compyute as ac
import auto_compyute.nn.functional as F

# seed the library's random number generator with a fixed value
ac.set_random_seed(0)

# run on the GPU when one is available, otherwise fall back to the CPU
if ac.gpu_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
DEVICE

In [None]:
CTX_LEN = 8  # context length: number of tokens in each input sequence
EMB_DIM = 64  # embedding dimension (model dimension)
N_HEADS = 4  # number of attention heads in each attention block
N_BLOCKS = 6  # number of transformer blocks/layers
BATCH_SIZE = 64  # batch size handed to the training and validation dataloaders

## Prepare Data

In [None]:
# download data
DATA_URL = (
    "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
)
response = requests.get(DATA_URL, timeout=10)
# fail loudly on an HTTP error instead of tokenizing (and training on) an error page
response.raise_for_status()
data = response.text

# create a character-level tokenizer
chars = sorted(set(data))  # every distinct character, in a deterministic order
vocab = dict(enumerate(chars))  # token id -> character
ivocab = {c: i for i, c in vocab.items()}  # character -> token id


def encode(text: str) -> list[int]:
    """Translate ``text`` character by character into token ids.

    Raises:
        KeyError: If ``text`` contains a character that is not in ``ivocab``.
    """
    lookup = ivocab.__getitem__
    return list(map(lookup, text))


def decode(token_ids: list[int]) -> str:
    """Translate a sequence of token ids back into the text they represent.

    Args:
        token_ids: Token ids, each of which must be a key of ``vocab``.

    Returns:
        The concatenation of the characters the ids map to.

    Raises:
        KeyError: If an id is not present in ``vocab``.
    """
    # `token_id` instead of `id` so the builtin is not shadowed
    return "".join(vocab[token_id] for token_id in token_ids)


# number of distinct tokens; `vocab` holds exactly one entry per element of `chars`
vocab_size = len(vocab)
vocab_size

In [None]:
# encode data
data_enc = ac.tensor(encode(data)).int()

# create the datasets. y is just x shifted by one (the next token to predict).
# Every sample needs CTX_LEN input tokens plus one more token for the shifted target, so one token
# is held back when counting samples. Without this, a corpus whose length is an exact multiple of
# CTX_LEN would produce a final target window that is one token short and could not be stacked.
n_samples = (len(data_enc) - 1) // CTX_LEN
sample_starts = range(0, n_samples * CTX_LEN, CTX_LEN)
X = ac.stack(*[data_enc[s : s + CTX_LEN] for s in sample_starts])
y = ac.stack(*[data_enc[s + 1 : s + CTX_LEN + 1] for s in sample_starts])

print(X[:4])
print(y[:4])

In [None]:
# shuffle the sample indices, then cut them 90/10 into a training and a validation part
n_total = len(X)
idx = ac.randperm(n_total)
n = int(n_total * 0.9)
train_idx, val_idx = idx[:n], idx[n:]

X_train, y_train = X[train_idx], y[train_idx]
X_val, y_val = X[val_idx], y[val_idx]

print(f"{X_train.shape=}")
print(f"{y_train.shape=}")
print(f"{X_val.shape=}")
print(f"{y_val.shape=}")

## Build the Transformer Model

In [None]:
from auto_compyute import nn


class Transformer(nn.Module):
    """Transformer model following GPT-2.

    Token and position embeddings are summed, passed through a stack of ``Block``s, normalized and
    projected back onto the vocabulary by a head that shares its weight with the token embedding.

    Args:
        n_emb: Number of entries in the token embedding table; also the output width of the head.
        emb_dim: Embedding (model) dimension.
        seq_len: Number of entries in the position embedding table, i.e. the longest supported input.
        n_heads: Number of attention heads, forwarded to every block.
        n_layers: Number of blocks.
        mask: Attention mask, forwarded unchanged to every block.
        dropout: Dropout value, forwarded to every block.
    """

    def __init__(self, n_emb, emb_dim, seq_len, n_heads, n_layers, mask, dropout=0):
        super().__init__()
        self.token_emb = nn.Embedding(n_emb, emb_dim)
        self.pos_emb = nn.Embedding(seq_len, emb_dim)
        self.token_emb.w.data *= emb_dim**-0.5  # scaling used by GPT-2
        self.pos_emb.w.data *= emb_dim**-0.5  # scaling used by GPT-2

        out_scale = (2 * n_layers) ** -0.5  # scaling used by GPT-2
        self.blocks = nn.Modulelist(
            Block(emb_dim, n_heads, mask, dropout, out_scale) for _ in range(n_layers)
        )

        # one-element tuple (note the trailing comma), matching the Layernorm calls in `Block`;
        # `(emb_dim)` without the comma is just the bare int
        self.head_ln = nn.Layernorm((emb_dim,))
        self.head = nn.Linear(emb_dim, n_emb, bias=False)
        # weight tying: the head reuses the (already scaled) token embedding matrix
        self.head.w = self.token_emb.w

        # position indices 0..seq_len-1 with a leading axis of size 1
        self.pos = nn.Buffer(ac.arange(seq_len).view(1, -1))

    def forward(self, x):
        """Compute the head output for a batch of token ids.

        Args:
            x: Token ids; the size of the last axis is taken as the sequence length and selects
                how many position embeddings are used.

        Returns:
            The output of the head, whose last axis has ``n_emb`` entries.
        """
        # only the first `x.shape[-1]` positions are looked up, so shorter inputs are supported
        x = self.token_emb(x) + self.pos_emb(self.pos[:, : x.shape[-1]])
        for block in self.blocks:
            x = block(x)
        x = self.head(self.head_ln(x))
        return x


class Block(nn.Module):
    """Transformer block: a self-attention sublayer followed by an MLP sublayer.

    Each sublayer is applied to a layer-normalized copy of its input, passed through dropout and
    added back onto the input (residual connection).
    """

    def __init__(self, emb_dim, n_heads, mask, dropout, out_scale):
        """Create the sublayers.

        Args:
            emb_dim: Embedding (model) dimension.
            n_heads: Number of attention heads.
            mask: Attention mask handed to the self-attention module.
            dropout: Dropout value, used by the attention module and both dropout layers.
            out_scale: Factor the selected weights are multiplied with after construction.
        """
        super().__init__()

        self.attn_ln = nn.Layernorm((emb_dim,))
        self.attn = nn.MultiHeadSelfAttention(emb_dim, n_heads, mask, dropout)
        # NOTE(review): GPT-2 applies this scaling to the projection that writes back into the
        # residual stream (the attention *output* projection), as is done for `mlp.down` below.
        # Here it is applied to the qkv projection instead - confirm whether the attention module's
        # output projection was intended.
        self.attn.qkv.w.data *= out_scale  # scaling used by GPT-2
        self.attn_dropout = nn.Dropout(dropout)

        self.mlp_ln = nn.Layernorm((emb_dim,))
        self.mlp = MLP(emb_dim)
        self.mlp.down.w.data *= out_scale  # scaling used by GPT-2
        self.mlp_dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the attention sublayer, then the MLP sublayer, each with a residual connection."""
        # normalize -> attention -> dropout, added onto the unnormalized input
        x = x + self.attn_dropout(self.attn(self.attn_ln(x)))
        # normalize -> MLP -> dropout, added onto the result of the attention sublayer
        x = x + self.mlp_dropout(self.mlp(self.mlp_ln(x)))
        return x


class MLP(nn.Module):
    """Feed-forward block: linear expansion to four times the width, GELU, linear projection back."""

    def __init__(self, n_emb):
        super().__init__()
        hidden_dim = 4 * n_emb
        self.up = nn.Linear(n_emb, hidden_dim)
        self.down = nn.Linear(hidden_dim, n_emb)

    def forward(self, x):
        """Return ``down(gelu(up(x)))``."""
        return self.down(F.gelu(self.up(x)))

In [None]:
# causal self-attention mask: a CTX_LEN x CTX_LEN matrix of -inf reduced to its part above the
# main diagonal via `triu(1)` (presumably zeros elsewhere, as with the usual `triu` - confirm)
neg_inf = float("-inf")
mask = ac.full(CTX_LEN, CTX_LEN, value=neg_inf).triu(1)
mask

In [None]:
# collect the hyperparameters first, then build the model and move it to the selected device
model_kwargs = {
    "n_emb": vocab_size,
    "emb_dim": EMB_DIM,
    "seq_len": CTX_LEN,
    "n_heads": N_HEADS,
    "n_layers": N_BLOCKS,
    "mask": mask,
}
model = Transformer(**model_kwargs).to(DEVICE)

model.n_params

## Training

In [None]:
# dataloaders: the validation loader is explicitly created without shuffling
train_dl = nn.Dataloader((X_train, y_train), BATCH_SIZE, DEVICE)
val_dl = nn.Dataloader((X_val, y_val), BATCH_SIZE, DEVICE, shuffle_data=False)
val_steps = len(val_dl)  # divisor used to average the validation loss

# optimizer over all model parameters
optim = nn.optimizers.AdamW(model.parameters(), learning_rate=3e-4)

In [None]:
# training parameters
MAX_STEPS = 2500
VAL_INTERVAL = 250


def _endless_batches(dataloader):
    """Yield batches forever, starting a new pass over ``dataloader`` whenever one is exhausted.

    Stops instead of spinning if a pass yields no batch at all.
    """
    while True:
        n_batches = 0
        for batch in dataloader():
            n_batches += 1
            yield batch
        if n_batches == 0:
            return


step = 1
# `itertools.cycle` would store every batch of the first pass and replay exactly those batches, in
# the same order, from then on. Re-invoking the dataloader keeps nothing cached and lets it produce
# a new pass each epoch.
# The loop targets are deliberately not named `x`/`y`, so the dataset tensor `y` is not overwritten.
for x_batch, y_batch in _endless_batches(train_dl):
    # training
    model.train()
    start = time.perf_counter()

    loss = F.cross_entropy_loss(model(x_batch), y_batch)
    loss.backward()
    optim.update_params()
    optim.reset_param_grads()

    dt = time.perf_counter() - start
    tok_per_s = BATCH_SIZE * CTX_LEN / dt

    # validation
    if step > 1 and step % VAL_INTERVAL == 0:
        model.eval()
        with ac.no_autograd_tracking():
            # mean loss over all validation batches
            val_loss = sum(
                F.cross_entropy_loss(model(x_val), y_val).item() for x_val, y_val in val_dl()
            )
            val_loss /= val_steps
            print(f"\n---\nval_loss {val_loss:.4f}\n---\n")

    print(f"{step}/{MAX_STEPS} | loss {loss.item():.4f} | dt {dt:.4f} s | {tok_per_s:.1f} tok/s")
    step += 1

    if step > MAX_STEPS:
        break