In [1]:
import jupyter_black

jupyter_black.load()

In [2]:
from types import SimpleNamespace

config = SimpleNamespace()


def update_config(config, dictionary):
    for k, v in dictionary.items():
        setattr(config, k, v)
    return config


def show_config(config: SimpleNamespace):
    for k in sorted(config.__dict__):
        print(f"{k}: {getattr(config, k)}")

In [3]:
from torch.utils.data import Dataset
import torch


class FaiscaDataset(Dataset):
    def __init__(self, data: str, context_length: int):
        chars = sorted(list(set(data)))
        data_size, self.vocab_size = len(data), len(chars)
        print(f"Vocab size: {self.vocab_size}")
        print(f"Data size: {data_size}")

        self.input_ids = []
        self.target_ids = []
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}

        token_ids = [self.stoi[c] for c in data]

        for i in range(0, len(token_ids) - context_length):
            input_ids = token_ids[i : i + context_length]
            target_ids = token_ids[i + 1 : i + context_length + 1]
            self.input_ids.append(input_ids)
            self.target_ids.append(target_ids)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return torch.tensor(self.input_ids[idx]), torch.tensor(self.target_ids[idx])

    def encode(self, text: str) -> torch.Tensor:
        return torch.tensor([self.stoi[c] for c in text], dtype=torch.long)

    def decode(self, idx: torch.Tensor) -> str:
        return "".join([self.itos[i.item()] for i in idx])


with open("../datasets/bocage.txt", "r", encoding="ISO-8859-1") as f:
    text = f.read()

config.context_length = 240
dataset = FaiscaDataset(text, context_length=config.context_length)
x, y = dataset[0]
print(f"{x.shape=}, {y.shape=}")

Vocab size: 99
Data size: 74935
x.shape=torch.Size([240]), y.shape=torch.Size([240])


In [4]:
from torch import nn


class MultiHeadAttention(nn.Module):
    def __init__(
        self,
        dimension_in: int,
        dimension_out: int,
        context_length: int,
        dropout: float,
        num_heads: int,
        qkv_bias: bool,
    ):
        super().__init__()
        if dimension_out % num_heads != 0:
            raise ValueError(
                f"dimension_out must be divisible by num_heads, got {dimension_out} and {num_heads}"
            )

        self.dimension_out = dimension_out
        self.num_heads = num_heads
        self.head_dim = dimension_out // num_heads

        self.W_query = nn.Linear(dimension_in, dimension_out, bias=qkv_bias)
        self.W_key = nn.Linear(dimension_in, dimension_out, bias=qkv_bias)
        self.W_value = nn.Linear(dimension_in, dimension_out, bias=qkv_bias)
        self.out_projection = nn.Linear(dimension_out, dimension_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask", torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        batch_size, number_of_tokens, dimension_in = x.shape

        # x shape: (batch_size, number_of_tokens, dimension_in)

        # pass through the linear layers
        keys = self.W_key(x)  # shape (batch_size, number_of_tokens, dimension_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # split into heads
        keys = keys.view(
            batch_size, number_of_tokens, self.num_heads, self.head_dim
        )  # shape (batch_size, number_of_tokens, num_heads, head_dim)
        queries = queries.view(
            batch_size, number_of_tokens, self.num_heads, self.head_dim
        )
        values = values.view(
            batch_size, number_of_tokens, self.num_heads, self.head_dim
        )

        # transpose to get the shape right for the attention scores
        keys = keys.transpose(
            1, 2
        )  # shape (batch_size, num_heads, number_of_tokens, head_dim)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # compute the attention scores
        attention_scores = queries @ keys.transpose(2, 3)

        # create the mask
        mask_bool = self.mask.bool()[:number_of_tokens, :number_of_tokens]

        # apply the mask
        attention_scores.masked_fill_(mask_bool, -torch.inf)

        # compute the attention weights
        attention_weights = torch.softmax(
            attention_scores / keys.shape[-1] ** 0.5, dim=-1
        )
        attention_weights = self.dropout(attention_weights)

        # compute the context vector
        context_vector = (attention_weights @ values).transpose(1, 2)

        # combine the heads
        context_vector = context_vector.contiguous().view(
            batch_size, number_of_tokens, self.dimension_out
        )
        context_vector = self.out_projection(context_vector)

        return context_vector

In [5]:
mha = MultiHeadAttention(
    dimension_in=16,
    dimension_out=32,
    context_length=10,
    dropout=0.1,
    num_heads=4,
    qkv_bias=True,
)

# Example input: batch_size=2, number_of_tokens=10, dimension_in=16
x = torch.randn(2, 10, 16)

# Forward pass
print(f"{x.shape=}")
output = mha(x)
print(f"{output.shape=}")

x.shape=torch.Size([2, 10, 16])
output.shape=torch.Size([2, 10, 32])


In [6]:
class LayerNorm(nn.Module):
    def __init__(self, embedding_dimension: int):
        super().__init__()
        self.eps = 1e-6  # small constant to avoid division by zero
        self.scale = nn.Parameter(torch.ones(embedding_dimension))
        self.bias = nn.Parameter(torch.zeros(embedding_dimension))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.bias

In [7]:
ln = LayerNorm(embedding_dimension=16)
x = torch.randn(2, 10, 16)
print(f"{x.shape=}, mean={x.mean()}, std={x.std()}")
print(f"{ln(x).shape=}, mean={ln(x).mean()}, std={ln(x).std()}")

x.shape=torch.Size([2, 10, 16]), mean=-0.10188867896795273, std=1.0522814989089966
ln(x).shape=torch.Size([2, 10, 16]), mean=-5.9604645663569045e-09, std=1.0015655755996704


In [8]:
class GELU(nn.Module):
    """
    Gelu activation:
    xi > 0 -> output ~ xi
    -2 < xi < 2 -> output ~ 0
    xi << -2 -> output ~ 0
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        return (
            0.5
            * x
            * (
                1
                + torch.tanh(
                    torch.sqrt(torch.tensor(2.0 / torch.pi))
                    * (x + 0.044715 * torch.pow(x, 3))
                )
            )
        )

In [9]:
glu = GELU()
x = torch.randn(2, 10, 16)
print(f"{x.shape=}")
print(f"{glu(x).shape=}")

x.shape=torch.Size([2, 10, 16])
glu(x).shape=torch.Size([2, 10, 16])


In [10]:
class FeedForward(nn.Module):
    def __init__(self, embedding_dimension: int, hidden_expansion_factor: int = 4):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(
                embedding_dimension, hidden_expansion_factor * embedding_dimension
            ),
            nn.GELU(),
            nn.Linear(
                hidden_expansion_factor * embedding_dimension, embedding_dimension
            ),
        )

    def forward(self, x):
        return self.layers(x)

In [11]:
x = torch.randn(2, 10, 16)
print(f"{x.shape=}")

ff = FeedForward(embedding_dimension=16, hidden_expansion_factor=4)
print(f"{ff(x).shape=}")

x.shape=torch.Size([2, 10, 16])
ff(x).shape=torch.Size([2, 10, 16])


In [12]:
class TransformerBlock(nn.Module):
    def __init__(
        self,
        embedding_dimension: int,
        context_length: int,
        num_heads: int,
        qkv_bias: bool,
        dropout_rate: float,
    ):
        super().__init__()
        self.attention = MultiHeadAttention(
            dimension_in=embedding_dimension,
            dimension_out=embedding_dimension,
            context_length=context_length,
            dropout=dropout_rate,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
        )
        self.feed_forward = FeedForward(
            embedding_dimension=embedding_dimension,
        )
        self.norm1 = LayerNorm(embedding_dimension=embedding_dimension)
        self.norm2 = LayerNorm(embedding_dimension=embedding_dimension)
        self.drop_shortcut = nn.Dropout(p=dropout_rate)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # shortcut connection // attention
        shortcut = x
        x = self.norm1(x)
        x = self.attention(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        # shortcut connection // feed forward
        shortcut = x
        x = self.norm2(x)
        x = self.feed_forward(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        return x

In [13]:
x = torch.randn(2, 10, 16)
print(f"{x.shape=}")

tb = TransformerBlock(
    embedding_dimension=16,
    context_length=10,
    num_heads=4,
    qkv_bias=True,
    dropout_rate=0.1,
)
print(f"{tb(x).shape=}")

x.shape=torch.Size([2, 10, 16])
tb(x).shape=torch.Size([2, 10, 16])


In [14]:
class FaiscaGPT(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embedding_dimension: int,
        context_length: int,
        num_layers: int,
        num_heads: int,
        dropout_rate: float,
        qkv_bias: bool,
    ):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embedding_dimension)
        self.positional_embedding = nn.Embedding(context_length, embedding_dimension)
        self.dropout_embedding = nn.Dropout(p=dropout_rate)

        self.transformer_blocks = nn.Sequential(
            *[
                TransformerBlock(
                    embedding_dimension=embedding_dimension,
                    context_length=context_length,
                    num_heads=num_heads,
                    qkv_bias=qkv_bias,
                    dropout_rate=dropout_rate,
                )
                for _ in range(num_layers)
            ]
        )

        self.final_layer_norm = LayerNorm(embedding_dimension=embedding_dimension)
        self.out_head = nn.Linear(embedding_dimension, vocab_size, bias=False)

        n_params = sum(p.numel() for p in self.transformer_blocks.parameters())
        n_params_million = n_params / 1e6
        print(f"Number of parameters in transformer blocks: {n_params_million:.2f}M")

        n_params_all = sum(p.numel() for p in self.parameters())
        n_params_all_million = n_params_all / 1e6
        print(f"Number of parameters in all layers: {n_params_all_million:.2f}M")

    def forward(self, in_idx: torch.Tensor) -> torch.Tensor:
        _, sequence_length = in_idx.shape
        token_embeddings = self.token_embedding(in_idx)
        positional_embeddings = self.positional_embedding(
            torch.arange(sequence_length, device=in_idx.device)
        )
        x = token_embeddings + positional_embeddings
        x = self.dropout_embedding(x)
        x = self.transformer_blocks(x)
        x = self.final_layer_norm(x)
        logits = self.out_head(x)
        return logits

In [15]:
config = update_config(
    config,
    {
        "vocab_size": 50257,  # Vocabulary size
        "context_length": 1024,  # Context length
        "embedding_dimension": 768,  # Embedding dimension
        "num_heads": 12,  # Number of attention heads
        "num_layers": 12,  # Number of layers
        "dropout_rate": 0.1,  # Dropout rate
        "qkv_bias": False,  # QKV bias
    },
)


fgpt = FaiscaGPT(
    vocab_size=config.vocab_size,
    embedding_dimension=config.embedding_dimension,
    context_length=config.context_length,
    num_layers=config.num_layers,
    num_heads=config.num_heads,
    dropout_rate=config.dropout_rate,
    qkv_bias=config.qkv_bias,
)

torch.manual_seed(42)
x = torch.randint(0, config.vocab_size, (1, config.context_length))

print(f"{x.shape=}")
print(f"{fgpt(x).shape=}")

Number of parameters in transformer blocks: 85.03M
Number of parameters in all layers: 163.01M
x.shape=torch.Size([1, 1024])
fgpt(x).shape=torch.Size([1, 1024, 50257])


In [16]:
import pathlib

config.context_length = 240

bocage_text = pathlib.Path("../datasets/bocage-mini.txt").read_text(
    encoding="ISO-8859-1"
)
dataset = FaiscaDataset(bocage_text, context_length=config.context_length)


config = update_config(
    config,
    {
        "vocab_size": dataset.vocab_size,  # Vocabulary size
        "context_length": config.context_length,  # Context length
        "embedding_dimension": 768,  # Embedding dimension
        "num_heads": 12,  # Number of attention heads
        "num_layers": 12,  # Number of layers
        "dropout_rate": 0.1,  # Dropout rate
        "qkv_bias": False,  # QKV bias
    },
)

model = FaiscaGPT(
    vocab_size=config.vocab_size,
    embedding_dimension=config.embedding_dimension,
    context_length=config.context_length,
    num_layers=config.num_layers,
    num_heads=config.num_heads,
    dropout_rate=config.dropout_rate,
    qkv_bias=config.qkv_bias,
)


input_text = "Hello, I am"
encoded = dataset.encode(input_text).unsqueeze(0)

print(f"\nInput text: {input_text}")
print(f"Encoded input: {encoded}\n")


max_new_tokens = 125

for _ in range(max_new_tokens):
    idx_cond = encoded[:, -config.context_length :]
    with torch.no_grad():
        logits = model(idx_cond)

    logits = logits[:, -1, :]
    idx_next = torch.argmax(logits, dim=-1, keepdim=True)

    encoded = torch.cat((encoded, idx_next), dim=1)

print(f"\nGenerated: {encoded}")
decoded = dataset.decode(encoded[0])
print(f"Decoded: {decoded}\n")

Vocab size: 62
Data size: 20479
Number of parameters in transformer blocks: 85.03M
Number of parameters in all layers: 85.31M

Input text: Hello, I am
Encoded input: tensor([[20, 40, 47, 47, 50,  7,  1, 21,  1, 36, 48]])


Generated: tensor([[20, 40, 47, 47, 50,  7,  1, 21,  1, 36, 48, 33,  7, 53,  4, 24, 55, 40,
         58, 13, 48, 33, 37, 15, 44, 14, 24, 38, 38, 44, 14, 17, 15, 60, 15,  2,
         33, 55, 10, 32, 35, 60, 56, 17, 46, 60, 16, 61, 39, 28, 18, 34, 27, 31,
         42, 58, 27, 33, 55, 35, 60, 44, 20, 18, 10, 27,  7, 36,  5,  8,  0, 60,
         17, 49, 17, 61, 48, 22, 17, 15, 20, 49, 47, 28, 27, 61, 39, 33, 59, 13,
         10, 55, 46, 16, 21,  3, 27, 61, 33, 60, 10, 47, 28, 49,  7, 10, 27, 61,
         33, 55, 57, 53, 43, 33, 32, 52, 34,  2, 44, 37,  0, 58, 44,  3, 38, 44,
          4, 11, 58, 44, 14,  6, 55, 47, 28, 45]])
Decoded: Hello, I amW,r'MtewAmWbCiBMcciBECyC!Wt:V_yuEkyDzdRFYPUgwPWt_yiHF:P,a(-
yEnEzmJECHnlRPzdWxA:tkDI"PzWy:lRn,:PzWtvrhWVqY!ib
wi"ci';wiB)tlRj



In [17]:
from torch.utils.data import DataLoader


config.test_size = 0.15
split_idx = int(config.test_size * len(dataset))
print(f"Split index: {split_idx}")

config.batch_size = 5
print(f"Batch size: {config.batch_size}")

config.num_workers = 0  # jupyter does not support multiprocessing
print(f"Number of workers: {config.num_workers}")

config.shuffle = True
config.drop_last = True

train_dataloader = DataLoader(
    FaiscaDataset(
        bocage_text[:split_idx],
        context_length=config.context_length,
    ),
    batch_size=config.batch_size,
    shuffle=config.shuffle,
    drop_last=config.drop_last,
    num_workers=config.num_workers,
)
val_dataloader = DataLoader(
    FaiscaDataset(
        bocage_text[split_idx:],
        context_length=config.context_length,
    ),
    batch_size=config.batch_size,
    shuffle=config.shuffle,
    drop_last=config.drop_last,
    num_workers=config.num_workers,
)

print(config)

Split index: 3035
Batch size: 5
Number of workers: 0
Vocab size: 55
Data size: 3035
Vocab size: 62
Data size: 17444
namespace(context_length=240, vocab_size=62, embedding_dimension=768, num_heads=12, num_layers=12, dropout_rate=0.1, qkv_bias=False, test_size=0.15, batch_size=5, num_workers=0, shuffle=True, drop_last=True)


In [18]:
split_idx

3035

In [19]:
len(dataset)

20239

In [20]:
len(train_dataloader)

559

In [21]:
training_losses, validation_losses, track_tokens_seen = [], [], []
tokens_seen = 0
global_step = -1

config.num_epochs = 10
config.learning_rate = 1e-3
config.weight_decay = 0.1
config.device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
config.eval_freq = 5
config.eval_iter = 1

optimizer = torch.optim.AdamW(
    model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay
)

model.to(config.device)


def calculate_loss(input_batch, target_batch, model):
    input_batch, target_batch = (
        input_batch.to(config.device),
        target_batch.to(config.device),
    )
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(
        logits.flatten(0, 1), target_batch.flatten()
    )
    return loss


for epoch in range(config.num_epochs):
    print(f"Epoch {epoch}")
    model.train()
    size_train_dataloader = len(train_dataloader)
    batch_num = 0

    for input_batch, target_batch in train_dataloader:
        optimizer.zero_grad()
        loss = calculate_loss(input_batch, target_batch, model)
        loss.backward()
        optimizer.step()
        tokens_seen += input_batch.numel()
        global_step += 1
        batch_num += 1

        # run evaluation
        if global_step % config.eval_freq == 0:
            model.eval()
            with torch.no_grad():
                total_train_loss = 0
                for i, (input_batch, target_batch) in enumerate(train_dataloader):
                    if i < config.eval_iter:
                        loss = calculate_loss(input_batch, target_batch, model)
                        total_train_loss += loss.item()
                    else:
                        break

                train_loss = total_train_loss / config.eval_iter

                total_validation_loss = 0
                for i, (input_batch, target_batch) in enumerate(val_dataloader):
                    if i < config.eval_iter:
                        loss = calculate_loss(input_batch, target_batch, model)
                        total_validation_loss += loss.item()
                    else:
                        break

                validation_loss = total_validation_loss / config.eval_iter

                print(
                    f"Epoch: {epoch} - Step: {global_step} - Train Loss: {train_loss} - Validation Loss: {validation_loss} - Batch: {batch_num} (out of {size_train_dataloader})"
                )

                training_losses.append(train_loss)
                validation_losses.append(validation_loss)
                track_tokens_seen.append(tokens_seen)

                model.train()

    # print sample
    model.eval()
    start_context = "Murchas no horror "
    context_size = model.positional_embedding.weight.shape[0]
    encoded = dataset.encode(start_context).unsqueeze(0)

    max_new_tokens = 125

    with torch.no_grad():
        for _ in range(max_new_tokens):
            idx_cond = encoded[:, -config.context_length :]
            logits = model(idx_cond)
            logits = logits[:, -1, :]
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)
            encoded = torch.cat((encoded, idx_next), dim=1)

    print(f"\nGenerated: {encoded}")
    decoded = dataset.decode(encoded[0])
    print(f"Decoded: {decoded}\n")

    model.train()

Epoch 0
Epoch: 0 - Step: 0 - Train Loss: 5.072232723236084 - Validation Loss: 6.737091064453125 - Batch: 1 (out of 559)
Epoch: 0 - Step: 5 - Train Loss: 3.4928996562957764 - Validation Loss: 4.627834796905518 - Batch: 6 (out of 559)
Epoch: 0 - Step: 10 - Train Loss: 3.298999547958374 - Validation Loss: 5.103976726531982 - Batch: 11 (out of 559)
Epoch: 0 - Step: 15 - Train Loss: 3.1710705757141113 - Validation Loss: 4.973865032196045 - Batch: 16 (out of 559)
Epoch: 0 - Step: 20 - Train Loss: 3.195740222930908 - Validation Loss: 4.918081760406494 - Batch: 21 (out of 559)
Epoch: 0 - Step: 25 - Train Loss: 3.0871100425720215 - Validation Loss: 5.096863746643066 - Batch: 26 (out of 559)
Epoch: 0 - Step: 30 - Train Loss: 3.0517594814300537 - Validation Loss: 5.171359062194824 - Batch: 31 (out of 559)
Epoch: 0 - Step: 35 - Train Loss: 2.9863762855529785 - Validation Loss: 4.900959491729736 - Batch: 36 (out of 559)
Epoch: 0 - Step: 40 - Train Loss: 2.9792404174804688 - Validation Loss: 4.83649

KeyboardInterrupt: 

In [None]:
len(train_dataloader)

0

In [None]:
show_config(config)

batch_size: 16
context_length: 240
device: mps
drop_last: True
dropout_rate: 0.1
embedding_dimension: 768
learning_rate: 0.001
num_epochs: 3
num_heads: 12
num_layers: 12
num_workers: 4
qkv_bias: False
shuffle: True
test_size: 0.15
vocab_size: 99
weight_decay: 0.1
