## 🚘 Transformer

#### 📚 Libraries

In [1]:
import os

# Relative paths used later in the notebook (e.g. "data/02_primary/corpus.txt")
# are resolved against this directory.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
os.chdir("/Users/dewith/Repos/gpt-from-scratch")

In [2]:
import re
import logging

import pandas as pd
from datasets import load_dataset
from unidecode import unidecode

import torch
from torch import nn
from torch.nn import functional as F
from torchviz import make_dot

In [3]:
# force=True removes handlers already attached to the root logger, so
# re-running this cell re-applies the configuration instead of being a no-op.
logging.basicConfig(level=logging.DEBUG, force=True)
# Logger used by every cell below.
LOGGER = logging.getLogger(__name__)

#### 🏹 Functions

In [5]:
"""
This module provides utility functions for working with data.
The `get_corpus_text` function is used to get the corpus text from a dataset.
The `get_data_split` function is used to split the data into train and val sets.
The `Tokenizer` class is used to encode and decode text.
The `Batcher` class is used to generate batches of data.
"""


def _load_raw_corpus(local_path, hf_path):
    """Return ``(dataframe, source)`` for the raw corpus, preferring the local CSV."""
    if local_path is not None:
        try:
            return pd.read_csv(local_path), local_path
        except Exception as e:  # pylint: disable=broad-except
            if hf_path is None:
                # No fallback available: surface the original error.
                raise
            LOGGER.warning("│   ├── Local path raised exception:")
            LOGGER.error("│   ├── \t%s", e)
            LOGGER.warning("│   ├── Loading from Hugging Face dataset.")

    dataset = load_dataset(hf_path)
    return dataset["train"].to_pandas(), hf_path


def get_corpus_text(local_path=None, hf_path=None, is_clean=False):
    """Load the corpus text, cleaning and caching it when it is still raw.

    Args:
        local_path: Path to a local file. With ``is_clean=False`` it is read
            as a CSV with a ``doc_text`` column; with ``is_clean=True`` it is
            read as an already cleaned UTF-8 text file.
        hf_path: Hugging Face dataset identifier whose ``train`` split has a
            ``doc_text`` column. Used when no local path is given, or as a
            fallback when reading the local CSV raises.
        is_clean: Whether ``local_path`` already points to a cleaned corpus.

    Returns:
        The cleaned corpus as a single string. When the corpus had to be
        cleaned it is also written to ``data/02_primary/corpus.txt``.

    Raises:
        ValueError: If neither path is given for a raw corpus, or if
            ``is_clean`` is set without a ``local_path``.
        FileNotFoundError: If the clean corpus file does not exist.
    """
    if is_clean:
        if local_path is None:
            raise ValueError("A local path is required to load a clean corpus.")
        with open(local_path, "r", encoding="utf-8") as f:
            corpus_text_clean = f.read()
        LOGGER.info("│   ├── Loaded already clean corpus from %s", local_path)
        return corpus_text_clean

    if local_path is None and hf_path is None:
        raise ValueError("At least one of local or hf path must be passed.")

    corpus_df, path = _load_raw_corpus(local_path, hf_path)
    LOGGER.info("│   ├── Loaded from %s", path)

    # Flatten the documents into one line and collapse runs of spaces.
    corpus_text = "\n".join(corpus_df["doc_text"])
    corpus_text_clean = corpus_text.replace("\n", " ").replace("\r", " ")
    corpus_text_clean = re.sub(r" +", " ", corpus_text_clean)

    # Drop every character that is not a word character, whitespace or one of
    # the listed punctuation marks. The hyphen is escaped on purpose: an
    # unescaped ``.-^`` is parsed as a character range, which silently removed
    # "-" from the text while keeping "<" and ">".
    pat = r'[^\w\s!"·$%&/()=?¿\\|@#+,\.\-^\*;:_\[\]\{\} !¡¿?,\.@#$%^&\*]'
    corpus_text_clean = re.sub(pat, "", corpus_text_clean)
    corpus_text_clean = corpus_text_clean.lower()
    # Transliterate to ASCII after filtering.
    corpus_text_clean = unidecode(corpus_text_clean)
    LOGGER.info("│   ├── Corpues cleaned and normalized")

    # Cache the cleaned corpus so later runs can pass is_clean=True.
    with open("data/02_primary/corpus.txt", "w", encoding="utf-8") as f:
        f.write(corpus_text_clean)

    return corpus_text_clean


def get_data_split(corpus_text, tokenizer, train_frac=0.90):
    """Encode the corpus and split it into training and validation tensors.

    Args:
        corpus_text: Text to encode.
        tokenizer: Object exposing ``encode(text)`` that returns a list of
            integer token ids.
        train_frac: Fraction of the tokens (from the start of the corpus)
            assigned to the training set. Defaults to 0.90.

    Returns:
        A ``(train_data, val_data)`` tuple of 1-D ``torch.long`` tensors; the
        validation tensor holds the tokens after the training prefix.

    Raises:
        ValueError: If ``train_frac`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError("train_frac must be between 0 and 1.")

    data = torch.tensor(tokenizer.encode(corpus_text), dtype=torch.long)
    train_size = int(len(data) * train_frac)
    return data[:train_size], data[train_size:]


class Tokenizer:
    """Character-level tokenizer for encoding and decoding text.

    The vocabulary is the sorted set of characters found in the corpus plus
    one extra ``"[UNK]"`` token, which always receives the highest id.

    Attributes are:
        stoi: mapping from token (character or ``"[UNK]"``) to integer id.
        itos: inverse mapping from integer id to token.
        vocab_size: number of tokens, including ``"[UNK]"``.
    """

    def __init__(self, corpus_text):
        stoi, itos = self._get_token_maps(corpus_text)
        self.stoi = stoi
        self.itos = itos
        self.vocab_size = len(stoi)

    def encode(self, text: str) -> list:
        """Encode text to integers; unseen characters map to the ``[UNK]`` id."""
        unk_id = self.stoi["[UNK]"]
        return [self.stoi.get(char, unk_id) for char in text]

    def decode(self, integers: list) -> str:
        """Decode list of integers to text."""
        return "".join(self.itos[i] for i in integers)

    def _get_token_maps(self, corpus_text):
        """Build the ``(stoi, itos)`` lookup tables from the corpus characters."""
        chars = sorted(set(corpus_text))
        stoi = {char: i for i, char in enumerate(chars)}
        # The unknown token takes the next free integer id. Using max(stoi)
        # here would store the largest *character* instead of an id, and
        # max() would also fail on an empty corpus.
        stoi["[UNK]"] = len(chars)
        itos = {i: token for token, i in stoi.items()}
        return stoi, itos


class Batcher:
    """Draws random fixed-length windows from the train or validation tokens."""

    # pylint: disable=too-few-public-methods

    def __init__(self, train_data, val_data, batch_size, block_size):
        self.train_data, self.val_data = train_data, val_data
        self.batch_size, self.block_size = batch_size, block_size

    def get_batch(self, split):
        """Return ``(x, y)`` where ``y`` is ``x`` shifted one token ahead.

        ``split == "train"`` samples from the training data; any other value
        samples from the validation data. Both outputs have ``batch_size``
        rows of ``block_size`` tokens.
        """
        if split == "train":
            source = self.train_data
        else:
            source = self.val_data

        width = self.block_size
        # One random window start per batch row.
        starts = torch.randint(len(source) - width, (self.batch_size,))
        inputs = torch.stack([source[s : s + width] for s in starts])
        targets = torch.stack([source[s + 1 : s + width + 1] for s in starts])
        return inputs, targets

In [6]:
"""Transformer model for language modeling."""


class Head(nn.Module):
    """Single causal self-attention head."""

    # pylint: disable=too-few-public-methods

    def __init__(self, block_size=8, num_embeds=64, head_size=8, dropout=0.1):
        super().__init__()
        self.head_size = head_size
        # Bias-free projections from the embedding space to the head space.
        self.key = nn.Linear(num_embeds, head_size, bias=False)
        self.query = nn.Linear(num_embeds, head_size, bias=False)
        self.value = nn.Linear(num_embeds, head_size, bias=False)
        # Lower-triangular mask: position i may only attend to positions <= i.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("tril", causal_mask)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply scaled, causally masked dot-product attention to ``x``.

        ``x`` must be 3-dimensional; its last dimension feeds the linear
        projections and the output's last dimension is ``head_size``.
        """
        _, seq_len, _ = x.shape
        queries, keys, values = self.query(x), self.key(x), self.value(x)

        # Scaled query/key affinities for every pair of positions.
        affinities = queries @ keys.transpose(-2, -1)
        affinities = affinities / (self.head_size**0.5)

        # Hide future positions before normalising.
        future = self.tril[:seq_len, :seq_len] == 0
        attn = torch.softmax(affinities.masked_fill(future, float("-inf")), dim=-1)

        # Weighted sum of the values.
        return self.dropout(attn) @ values


class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, concatenated and projected."""

    # pylint: disable=too-few-public-methods
    # pylint: disable=too-many-arguments

    def __init__(self, num_heads, block_size, num_embeds, head_size, dropout):
        super().__init__()
        # nn.ModuleList registers every head as a sub-module. With a plain
        # Python list the heads' parameters are invisible to parameters(),
        # .to(device), state_dict() and train()/eval(), so they would never be
        # optimised, moved to the GPU, saved, or have their dropout disabled.
        self.heads = nn.ModuleList(
            [Head(block_size, num_embeds, head_size, dropout) for _ in range(num_heads)]
        )
        # The concatenated head outputs are num_heads * head_size wide; this
        # equals num_embeds when head_size == num_embeds // num_heads.
        self.proj = nn.Linear(num_heads * head_size, num_embeds)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x``, concatenate on the last axis and project.

        The output's last dimension is ``num_embeds``.
        """
        out = torch.cat([head(x) for head in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


class FeedForward(nn.Module):
    """Position-wise MLP: expand to four times the width, ReLU, project back."""

    # pylint: disable=too-few-public-methods

    def __init__(self, num_embeds, dropout=0.1):
        super().__init__()
        hidden_width = 4 * num_embeds
        layers = [
            nn.Linear(num_embeds, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, num_embeds),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``; the last dimension keeps its size."""
        out = self.net(x)
        return out


class Block(nn.Module):
    """Transformer block: attention (communication), then an MLP (computation).

    Layer normalisation is applied before each sub-layer and each sub-layer's
    output is added back onto its input.
    """

    # pylint: disable=too-few-public-methods
    # pylint: disable=too-many-arguments

    def __init__(self, num_heads, block_size, num_embeds, head_size, dropout):
        super().__init__()
        self.layer_norm1 = nn.LayerNorm(num_embeds)
        self.attention_heads = MultiHeadAttention(
            num_heads=num_heads,
            block_size=block_size,
            num_embeds=num_embeds,
            head_size=head_size,
            dropout=dropout,
        )
        self.layer_norm2 = nn.LayerNorm(num_embeds)
        self.feed_forward = FeedForward(num_embeds, dropout=dropout)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = self.attention_heads(self.layer_norm1(x))
        x = x + attended
        transformed = self.feed_forward(self.layer_norm2(x))
        return x + transformed


class Transformer(nn.Module):
    """Decoder-only transformer for next-token language modeling.

    Args:
        vocab_size: Number of distinct token ids.
        num_embeds: Width of the token and position embeddings.
        block_size: Maximum sequence length; also the number of learned
            position embeddings.
        num_heads: Attention heads per block.
        head_size: Output width of each attention head.
        num_layers: Number of stacked transformer blocks.
        dropout: Dropout probability passed to every block.
    """

    # pylint: disable=too-many-instance-attributes
    # pylint: disable=too-many-arguments

    def __init__(
        self,
        vocab_size: int,
        num_embeds: int = 32,
        block_size: int = 8,
        num_heads: int = 4,
        head_size: int = 8,
        num_layers: int = 3,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.num_embeds = num_embeds
        self.block_size = block_size
        self.head_size = head_size
        # Learned embeddings for token identity and for position in the window.
        self.token_embed_table = nn.Embedding(vocab_size, num_embeds)
        self.position_embed_table = nn.Embedding(block_size, num_embeds)
        self.blocks = nn.Sequential(
            *[
                Block(num_heads, block_size, num_embeds, head_size, dropout)
                for _ in range(num_layers)
            ]
        )
        self.layer_norm = nn.LayerNorm(num_embeds)
        # Maps the final hidden states to one logit per vocabulary entry.
        self.linear_head = nn.Linear(num_embeds, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: Integer token ids of shape (B, T), with T <= ``block_size``.
            targets: Optional integer token ids of shape (B, T).

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (B, T, vocab_size)
            and ``loss`` is the mean cross-entropy, or ``None`` when
            ``targets`` is ``None``.
        """
        _, t = idx.shape

        tok_emb = self.token_embed_table(idx)  # (B, T, E)
        # Create the positions directly on idx's device instead of copying.
        positions = torch.arange(t, device=idx.device)
        pos_emb = self.position_embed_table(positions)  # (T, E)
        x = tok_emb + pos_emb  # (B, T, E), pos_emb broadcasts over the batch
        x = self.blocks(x)  # (B, T, E)
        x = self.layer_norm(x)  # (B, T, E)
        logits = self.linear_head(x)  # (B, T, C)

        if targets is None:
            return logits, None

        # cross_entropy expects (N, C) logits and (N,) class indices.
        batch, steps, classes = logits.shape
        loss = F.cross_entropy(
            logits.view(batch * steps, classes), targets.reshape(-1)
        )
        return logits, loss

    def generate(self, idx, length):
        """Sample ``length`` new tokens autoregressively.

        Args:
            idx: Integer token ids of shape (B, T) used as the prompt.
            length: Number of tokens to append.

        Returns:
            Tensor of shape (B, T + length): the prompt followed by the
            sampled tokens.
        """
        generated_sequence = idx.clone()
        # Sample in eval mode so dropout does not perturb the predicted
        # distribution, then restore whatever mode the caller had set.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(length):
                    # The model only has block_size position embeddings, so
                    # condition on the most recent block_size tokens.
                    context = generated_sequence[:, -self.block_size :]

                    logits, _ = self(context)  # (B, T, C)
                    probs = F.softmax(logits[:, -1, :], dim=-1)  # (B, C)
                    next_token = torch.multinomial(probs, 1)  # (B, 1)

                    generated_sequence = torch.cat(
                        tensors=[generated_sequence, next_token], dim=-1
                    )
        finally:
            self.train(was_training)
        return generated_sequence

    def viz(self, x, y):
        """Return the torchviz graph of the loss computed for batch ``(x, y)``."""
        viz = make_dot(
            self(x, y)[1],
            params=dict(self.named_parameters()),
            show_attrs=False,
            show_saved=False,
        )
        return viz

In [7]:
"""
This module provides functions for evaluating the model. The `estimate_loss`
function is used to estimate the loss of the model on the training and
validation sets.
"""

@torch.no_grad()
def estimate_loss(model, batcher, eval_iters):
    """Estimate the mean loss of the model on the training and validation sets.

    Args:
        model: Module called as ``model(x, y)`` that returns ``(logits, loss)``.
        batcher: Object exposing ``get_batch(split)`` that returns ``(x, y)``.
        eval_iters: Number of random batches averaged per split.

    Returns:
        Dict with keys ``"train"`` and ``"val"`` mapping to the mean loss of
        each split as a 0-d tensor.
    """
    # Remember the caller's mode so evaluation does not silently flip an
    # eval-mode model back to training.
    was_training = model.training

    # Batches are produced by the batcher, not the model, so move them to the
    # model's device before the forward pass (needed when the model is on GPU).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    out = {}
    model.eval()
    try:
        for split in ("train", "val"):
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                x, y = batcher.get_batch(split)
                if device is not None:
                    x, y = x.to(device), y.to(device)
                _, loss = model(x, y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train(was_training)
    return out

#### 🚤 Main Execution

Data preparation

In [8]:
LOGGER.info("Training the transformer model with self attention")

# Data: use the cached clean corpus when present, otherwise rebuild it from
# the raw CSV (falling back to the Hugging Face dataset).
LOGGER.info("├── Loading the dataset")
dataset_local_path = "data/02_primary/corpus.txt"
try:
    corpus = get_corpus_text(local_path=dataset_local_path, is_clean=True)
except FileNotFoundError:
    dataset_local_path = "data/01_raw/secop_corpus.csv"
    dataset_hf_path = "dewithsan/secop_corpus_clean"
    corpus = get_corpus_text(local_path=dataset_local_path, hf_path=dataset_hf_path)

# Build the vocabulary from the corpus, then encode and split it.
tokenizer = Tokenizer(corpus)
train_data, val_data = get_data_split(corpus_text=corpus, tokenizer=tokenizer)
LOGGER.info("│   └── Data tokenized and train/val splitted")
LOGGER.info("│")

INFO:__main__:Training the transformer model with self attention
INFO:__main__:├── Loading the dataset
INFO:__main__:│   ├── Loaded already clean corpus from data/02_primary/corpus.txt
INFO:__main__:│   └── Data tokenized and train/val splitted
INFO:__main__:│


Defining hyperparameters

In [9]:
# Hyperparameters
LOGGER.info("├── Defining hyperparameters")

# Batching: sequences per batch and tokens per sequence.
batch_size, block_size = 16, 24

# Architecture: head_size is chosen so the heads together span the embedding.
num_embeds, num_heads = 80, 8
head_size = num_embeds // num_heads
num_layers = 3
dropout = 0.2

# Optimisation.
learning_rate = 1e-3
max_steps = 5000
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Reporting: how often to log the step loss and to run the loss estimate.
step_loss_interval = 100
eval_interval, eval_iters = 500, 100

INFO:__main__:├── Defining hyperparameters


Model definition and visualization

In [10]:
# Model definition
LOGGER.info("├── Defining the model")
vocab_size = tokenizer.vocab_size
model = Transformer(
    vocab_size=vocab_size,
    num_embeds=num_embeds,
    block_size=block_size,
    num_heads=num_heads,
    head_size=head_size,
    num_layers=num_layers,
    dropout=dropout,
).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
batcher = Batcher(
    train_data=train_data,
    val_data=val_data,
    batch_size=batch_size,
    block_size=block_size,
)
LOGGER.info("│   ├── Model created with vocab of %s", vocab_size)
LOGGER.info("│   ├── Optimizer: AdamW with lr %s", learning_rate)
LOGGER.info("│   ├── Device: %s", device)
LOGGER.info("│   └── Batcher size: %s", batch_size)
LOGGER.info("│")

# Rendering of the computation graph is switched off.
if False:
    LOGGER.info("├── Visualizing the model")
    viz_path = "data/04_models/transformer.png"
    x_viz, y_viz = (batch.to(device) for batch in batcher.get_batch("train"))
    model_viz = model.viz(x_viz, y_viz)
    # Split "name.ext" once: the stem is the output file name, ext the format.
    viz_parts = viz_path.rsplit(".", maxsplit=1)
    model_viz.render(filename=viz_parts[0], format=viz_parts[-1], cleanup=True)
    LOGGER.info("│   └── Model visualization saved at %s", viz_path)

INFO:__main__:├── Defining the model
INFO:__main__:│   ├── Model created with vocab of 97
INFO:__main__:│   ├── Optimizer: AdamW with lr 0.001
INFO:__main__:│   ├── Device: cpu
INFO:__main__:│   └── Batcher size: 16
INFO:__main__:│


In [11]:
# Model training
LOGGER.info("├── Training the model with %s steps", max_steps)
# range(max_steps + 1) so the last step (== max_steps) is also logged/evaluated.
for step in range(max_steps + 1):
    # Forward pass
    xb, yb = batcher.get_batch("train")
    xb, yb = xb.to(device), yb.to(device)
    _, loss = model(xb, yb)

    # Backward pass; gradients are cleared right after the update so the next
    # step starts from zero.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    if step % step_loss_interval == 0:
        loss_str = f"{loss.item():.4f}"
        LOGGER.info("│   ├── Step %s ~ Loss: %s", step, loss_str)

    # Evaluate the model and log the losses
    if step % eval_interval == 0:
        losses = estimate_loss(model, batcher, eval_iters)
        train_loss = f"{losses['train']:.4f}"
        val_loss = f"{losses['val']:.4f}"
        LOGGER.info("│   │   ├── Train loss: %s", train_loss)
        LOGGER.info("│   │   └── Val loss:   %s", val_loss)

LOGGER.info("│   └── Model training completed")
LOGGER.info("│")

LOGGER.info("├── Generating text")
# The model is still in training mode after the loop; switch to eval so
# dropout is disabled while sampling.
model.eval()
# Start from a single random token id.
context = torch.randint(tokenizer.vocab_size, (1, 1)).to(device)
generated_tokens = model.generate(context, 80)
generated_text = tokenizer.decode(generated_tokens[0].tolist())
LOGGER.info("│   ├── Text generated")
LOGGER.info("│   └── %s", generated_text.replace("\n", " "))
LOGGER.info("│")

LOGGER.info("└── Saving the model")
local_path = "data/04_models/transformer_model.pth"
# Only the weights (state_dict) are saved, not the optimizer state.
torch.save(model.state_dict(), local_path)
LOGGER.info("    └── Model saved at %s", local_path)

INFO:__main__:├── Training the model with 5000 steps
INFO:__main__:│   ├── Step 0 ~ Loss: 4.7092
INFO:__main__:│   │   ├── Train loss: 4.4706
INFO:__main__:│   │   └── Val loss:   4.4671
INFO:__main__:│   ├── Step 100 ~ Loss: 2.5229
INFO:__main__:│   ├── Step 200 ~ Loss: 2.3175
INFO:__main__:│   ├── Step 300 ~ Loss: 2.2343
INFO:__main__:│   ├── Step 400 ~ Loss: 2.3830
INFO:__main__:│   ├── Step 500 ~ Loss: 2.2409
INFO:__main__:│   │   ├── Train loss: 2.2866
INFO:__main__:│   │   └── Val loss:   2.2548
INFO:__main__:│   ├── Step 600 ~ Loss: 2.1925
INFO:__main__:│   ├── Step 700 ~ Loss: 2.4422
INFO:__main__:│   ├── Step 800 ~ Loss: 2.1084
INFO:__main__:│   ├── Step 900 ~ Loss: 2.1943
INFO:__main__:│   ├── Step 1000 ~ Loss: 2.3896
INFO:__main__:│   │   ├── Train loss: 2.1996
INFO:__main__:│   │   └── Val loss:   2.2198
INFO:__main__:│   ├── Step 1100 ~ Loss: 2.1915
INFO:__main__:│   ├── Step 1200 ~ Loss: 2.1884
INFO:__main__:│   ├── Step 1300 ~ Loss: 2.1566
INFO:__main__:│   ├── Step 1400