## GPT Development

In [None]:
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Read the whole corpus into memory as a single string
with open('tinyshakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# The vocabulary is every distinct character in the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables: integer id -> character and character -> integer id
int_to_char = dict(enumerate(chars))
char_to_int = {c: i for i, c in int_to_char.items()}


def encode(x: str) -> list[int]:
    """Map a string to the list of integer ids of its characters."""
    return [char_to_int[c] for c in x]


def decode(x: list[int]) -> str:
    """Map a list of integer ids back to the string they spell."""
    return ''.join(int_to_char[i] for i in x)


# Round-trip a short sample through the character-level tokeniser
sample = 'Hello World!'
print(encode(sample))
print(decode(encode(sample)))


In [None]:
# Encode the whole corpus once, then hold out the final 10% for validation
data = torch.tensor(encode(text), dtype=torch.long)
split = int(len(data) * 0.9)  # index of the first validation token
train_data = data[:split]
val_data = data[split:]

In [None]:
# A window of block_size + 1 tokens packs block_size training examples:
# every prefix of x is a context and the token that follows it is the target
block_size = 8
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    ctx, tgt = x[:t+1], y[t]
    print(f'context: {ctx.tolist()} -> target: {tgt}')

Training with these different lengths of contexts from a size of 1 to block_size is important to ensure the transformer learns to deal with different context lengths. This is useful during inference because the model can generate text from as little as one character of context.

In [None]:
batch_size = 4  # Sequences drawn per batch
block_size = 8  # Longest context the model predicts from


def get_batch(split: str) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Draw a random batch of contexts and their next-token targets.

    'train' reads from train_data; any other value reads from val_data.
    Returns (x, y), each of shape (batch_size, block_size), where y is x
    shifted one token ahead.
    """
    source = val_data if split != 'train' else train_data
    # One random window start per sequence; the upper bound leaves room for the shifted target
    starts = torch.randint(len(source) - block_size, (batch_size,))
    contexts, targets = [], []
    for start in starts:
        contexts.append(source[start:start+block_size])
        targets.append(source[start+1:start+block_size+1])
    return torch.stack(contexts), torch.stack(targets)


# Sample one training batch and inspect it
xb, yb = get_batch('train')
print(f'xb: {xb}\nyb: {yb}')

# xb and yb both have shape (batch_size, block_size)

In [None]:
# Walk every sequence in the batch and show each (context, target) pair it contains
for b in range(batch_size):
    row_x, row_y = xb[b], yb[b]
    for t in range(block_size):
        print(f'context: {row_x[:t+1].tolist()} -> target: {row_y[t]}')

### Bigram Model

In [None]:
# B - batch size, T - block size (time step), C - embedding dimension (vocab size)

class BigramLanguageModel(nn.Module):
    """
    Bigram language model: the logits for the next token are read directly from a
    (vocab_size, vocab_size) embedding table indexed by the current token.
    """

    def __init__(self):
        super().__init__()
        # Row i holds the next-token logits that follow token i
        self.token_embed_table = nn.Embedding(vocab_size, vocab_size) # (B,T) -> (B,T,C)

    def forward(self, x: torch.Tensor, y: Optional[torch.Tensor] = None) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Look up next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            x: (B,T) tensor of token ids.
            y: optional (B,T) tensor of target token ids.

        Returns:
            (logits, loss). Without y, logits has shape (B,T,C) and loss is None.
            With y, logits is flattened to (B*T,C) and loss is a scalar tensor.
        """
        logits = self.token_embed_table(x)

        if y is None:
            loss = None
        else:
            B, T, C = logits.shape
            # Flatten batch and sequence dimensions to use F.cross_entropy
            logits = logits.view(B*T, C)
            y = y.view(B*T)
            loss = F.cross_entropy(logits, y)
        return logits, loss

    @torch.no_grad() # Sampling needs no gradients; skip building the autograd graph
    def generate(self, x: torch.Tensor, max_tokens: int) -> torch.Tensor:
        """
        Extend each sequence in x by max_tokens sampled tokens.

        Args:
            x: (B,T) tensor of token ids used as the starting context.
            max_tokens: number of tokens to append.

        Returns:
            (B,T+max_tokens) tensor: the input followed by the sampled tokens.
        """
        for _ in range(max_tokens):
            # Get the previous predictions
            logits, _ = self(x)
            # Keep only the last prediction
            logits = logits[:, -1, :] # (B,C)
            # Apply softmax to convert logits into probabilities
            probs = F.softmax(logits, dim=-1) # (B,C)
            # Sample from the probability distribution
            x_next = torch.multinomial(probs, num_samples=1) # (B,1)
            # Concatenate the new prediction to the previous context
            x = torch.cat([x, x_next], dim=1) # (B,T+1)
        return x
    
model = BigramLanguageModel()

# Total number of scalar values across all model parameters
total_params = 0
for p in model.parameters():
    total_params += p.numel()
print(f'Model parameters: {total_params}')

# Sample 32 tokens from the untrained model, starting from token id 0
context = torch.zeros((1, 1), dtype=torch.long)
sample_ids = model.generate(context, max_tokens=32)[0]
print(decode(sample_ids.tolist()))

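The integer associated with each character is used as an index to look up the corresponding row in the embedding table. This row is a trainable vector representation of the character. In this bigram model the row has size `vocab_size` and is read directly as the next-token logits; in the full GPT model it has size `n_embed`.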

In [None]:
# Hyperparameters
batch_size = 32 # Sequences to process in parallel
max_iters = 2500 # Iterations to train the model
lr = 1e-2 # Learning rate

# Log roughly ten times per run; the floor of 1 keeps the modulo valid when max_iters < 10
log_interval = max(1, max_iters // 10)

# Training the model
optimiser = torch.optim.AdamW(model.parameters(), lr=lr)

for i in range(max_iters):

    # Get a batch of context and target sequences
    xb, yb = get_batch('train')

    # Compute the gradients and update the weights
    _, loss = model(xb, yb) # Forward pass
    optimiser.zero_grad(set_to_none=True)
    loss.backward()
    optimiser.step()

    # Report the loss of the batch just processed (not a stale value from the previous step)
    if i % log_interval == 0 or i == max_iters - 1:
        print(f'iteration {i}, loss: {loss.item()}')

In [None]:
# Sample 32 tokens from the model, starting from token id 0
context = torch.zeros((1, 1), dtype=torch.long)
generated = model.generate(context, max_tokens=32)
print(decode(generated[0].tolist()))

### Self-Attention

In [None]:
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# Version 1. Bag of words with explicit loops. Calculate x[b,t] = mean_{t'<=t} x[b,t']
xbow_1 = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xbow_1[b, t] = torch.mean(x[b, :t+1], 0)

# Version 2. Parallelised. W is a lower triangular matrix which can be used for weighted aggregation
W = torch.tril(torch.ones(T, T))
W = W / W.sum(1, keepdim=True) # Normalise each row so its weights average the visible positions
xbow_2 = W @ x

# Version 3. Parallelised. Uses softmax. W represents the same lower triangular matrix as before
tril = torch.tril(torch.ones(T, T))
W = torch.zeros((T, T))
W = W.masked_fill(tril == 0, float('-inf')) # Future positions get -inf, i.e. zero weight after softmax
W = F.softmax(W, dim=-1)
xbow_3 = W @ x

# Check that the three methods are equivalent. The three versions accumulate in
# different orders, so float32 round-off differs slightly; the default atol of 1e-8
# is too tight for entries close to zero and can report a spurious False.
torch.allclose(xbow_1, xbow_2, atol=1e-6) and torch.allclose(xbow_1, xbow_3, atol=1e-6)


In [None]:
# B - batch size, T - block size (time step), C - embedding dimension, H - head size

# Single head of causal self-attention (the 1/sqrt(H) scaling is not applied here)
head_size = 16
n_embed = 32

# Bias-free linear projections of each token into key, query and value space
key = nn.Linear(n_embed, head_size, bias=False) # (B,T,C) -> (B,T,H)
query = nn.Linear(n_embed, head_size, bias=False) # (B,T,C) -> (B,T,H)
value = nn.Linear(n_embed, head_size, bias=False) # (B,T,C) -> (B,T,H)
k, q, v = key(x), query(x), value(x)

# Raw affinities: dot product of every query with every key
W = torch.matmul(q, k.transpose(-2, -1)) # (B,T,H) @ (B,H,T) -> (B,T,T)

# Causal mask: positions above the diagonal (the future) get -inf, hence zero weight after softmax
tril = torch.tril(torch.ones(T, T))
W = F.softmax(W.masked_fill(tril == 0, float('-inf')), dim=-1)

# Each output row is the attention-weighted sum of the value vectors
out = torch.matmul(W, v) # (B,T,T) @ (B,T,H) -> (B,T,H)

**Notes:**
- Attention is a communication mechanism. It can be viewed as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- In the attention layer of a Transformer, every token is attending to a finite list of tokens previously in the sequence. This is called causal self-attention.
- There is no notion of space. Attention simply acts over a set of vectors. This is why tokens need to be positionally encoded.
- Examples across the batch dimension are treated independently and never interact with each other.
- In an encoder attention block just delete the single line that performs masking with `tril`, allowing all tokens to communicate with each other and not just the previous ones. The block implemented above is called a decoder attention block because it has triangular masking and is used in autoregressive settings like language modelling.
- 'Self-attention' just means that the keys and the values are produced from the same source as the queries (`x` in this case). In 'cross attention', the queries still get produced from `x`, but the keys and values come from a different source (such as an encoder module).
- 'Scaled' attention additionally multiplies `W` by $1/\sqrt{H}$ (i.e. divides it by $\sqrt{H}$, where $H$ is the head size). This ensures that when the input `Q` and `K` are of unit variance, `W` has unit variance as well and softmax will stay diffuse and not saturate (see below).

In [None]:
# Unit-variance keys and queries; scaling by 1/sqrt(head_size) is applied to their dot products
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
scale = head_size**-0.5
W = q @ k.transpose(-2, -1) * scale

# Variances of the keys, the queries and the scaled attention scores
k.var(), q.var(), W.var()

In [None]:
logits = torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])

# Small logits give a diffuse distribution
print(torch.softmax(logits, dim=-1))
# Scaling the logits up concentrates the probability mass, approaching a one-hot vector
print(torch.softmax(logits * 10, dim=-1))

## GPT Tokeniser Development

A token is a sequence of characters in a text that serves as a unit. Furthermore, tokenisation is the process of converting a text into a sequence of tokens. Tokenisation is critical to the correct functioning of transformers, and bad tokenisation can cause issues with the model's performance irrespective of the model architecture. If tokenisation is not done correctly, transformer models can struggle to spell words, struggle with non-English words, struggle with simple arithmetic, and even produce unintended outputs (see [SolidGoldMagikarp](https://www.lesswrong.com/posts/aPeJE8bSo6rAFoLqg/solidgoldmagikarp-plus-prompt-generation)).

Recall the GPT model in `gpt.py`:
```Python
class GPTLanguageModel(nn.Module):
    """GPT Decoder model. Consists of an embedding layer, transformer blocks, and a linear head."""

    def __init__(self):
        super().__init__()
        self.token_embed_table = nn.Embedding(vocab_size, n_embed) # (B,T) -> (B,T,C)
        # etc.
```
Tokens are the fundamental 'atoms' at the input of transformers. Each token (character) is used as an index to look up the corresponding row in the embedding table, where this row is a trainable vector (of size `n_embed`) representation of the token. Using characters as tokens is a naive approach due to the transformers having a limited context window (1024 tokens for GPT-2) in which tokens can attend to each other. Chunk vocabularies are used to tokenise text into character chunks instead of individual characters. These chunk vocabularies are constructed using the Byte Pair Encoding (BPE) algorithm (popularised in the [GPT-2 paper](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)). Using character chunks as tokens allows the model to attend to a wider portion of the text, which can improve performance.

Note that the tokeniser is completely separate from the transformer model. It has a separate training dataset to train the vocabulary on the BPE algorithm. The tokeniser then encodes/decodes between text and sequences of tokens. The transformer model only sees the tokens and never directly deals with any text.

[Tiktokeniser](https://tiktokenizer.vercel.app/) provides a visualisation into differences between various tokenisers available for GPT models. Use 'gpt2' and 'cl100k_base' as the model names to compare the tokenisation of GPT-2 and GPT-4.

In [None]:
text = 'Hello 你好'

code_points = list(map(ord, text)) # One unicode code point per character
utf8_bytes = [*text.encode('utf-8')] # One or more bytes per character
print(code_points)
print(utf8_bytes)

# For non-ASCII characters the utf-8 bytes differ from the code point because
# utf-8 is a variable-length encoding: 20320 becomes the three bytes 228 189 160.

### Helper Functions

In [None]:
def consecutive_pairs(ints: list[int]) -> dict[tuple[int, int], int]:
    """
    Count how often each pair of adjacent integers occurs in the list.
    Keys appear in order of first occurrence.
    Example: [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
    """
    counts = {}
    for left, right in zip(ints[:-1], ints[1:]):
        key = (left, right)
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
    return counts

# Count adjacent byte pairs in a tiny example string
text = 'abcab'
tokens = [*text.encode('utf-8')]

freq_pairs = consecutive_pairs(tokens)
print(freq_pairs)

In [None]:
def replace_pair(ints: list[int], pair: tuple[int, int], new_int: int) -> list[int]:
    """
    Replace all consecutive occurrences of a pair of integers with a new integer.

    The scan runs left to right and does not overlap matches: once a pair is
    replaced, scanning resumes after it. Elements are compared directly, so any
    indexable sequence of ints (list, tuple, bytes) is accepted.

    Args:
        ints: sequence of integers to scan.
        pair: the two adjacent integers to look for.
        new_int: the integer written in place of each match.

    Returns:
        A new list; the input is not modified.

    Example: ints=[1, 2, 3, 1, 2], pair=(1, 2), new_int=4 -> [4, 3, 4]
    """
    first, second = pair
    last = len(ints) - 1 # Index of the final element; a pair cannot start there
    new_ints = []
    i = 0
    while i < len(ints):
        # If not at the last position AND the pair matches, replace it
        if i < last and ints[i] == first and ints[i+1] == second:
            new_ints.append(new_int)
            i += 2
        else:
            new_ints.append(ints[i])
            i += 1
    return new_ints

# Merge the most frequent pair into a new token with id 256
max_pair = max(freq_pairs, key=lambda pair: freq_pairs[pair])
new_tokens = replace_pair(tokens, pair=max_pair, new_int=256)
print(f'{tokens} -> {new_tokens}')

### Training via Byte Pair Encoding (BPE)

In [None]:
# Load new text from a file.
with open('tinyshakespeare.txt', 'r', encoding='utf-8') as file:
    text = file.read()

vocab_size = 265 # Desired vocabulary size

assert vocab_size >= 256
n_merges = vocab_size - 256 # Ids 0-255 are the raw bytes; every id above that is a learned merge
tokens = list(text.encode('utf-8'))
merges = {} # (left token, right token) -> merged token id, in the order the merges were learned
vocab = {i: bytes([i]) for i in range(256)} # token id -> the byte string it expands to

# Merge the most frequent pair n_merges times to create new tokens
for i in range(n_merges):
    # Find the most frequent consecutive pair of tokens
    freq_pairs = consecutive_pairs(tokens)
    if not freq_pairs:
        # Fewer than two tokens remain, so max() below would raise ValueError on an empty dict
        print(f'Stopping after {i}/{n_merges} merges: no pairs left to merge')
        break
    max_pair = max(freq_pairs, key=freq_pairs.get)
    # Create a new token and assign it to an unused integer
    new_token = 256 + i
    tokens = replace_pair(tokens, max_pair, new_token)
    # Store the merge and the new token in the vocab
    merges[max_pair] = new_token
    vocab[new_token] = vocab[max_pair[0]] + vocab[max_pair[1]]
    print(f'{i+1}/{n_merges}: {max_pair} -> {new_token}')

print(vocab)

In [None]:
# The merges shorten the sequence: compare the utf-8 byte count with the token count
n_bytes = len(text.encode('utf-8'))
n_tokens = len(tokens)
print('New token length:', n_tokens)
print(f'Compression ratio: {n_bytes / n_tokens:.2f}')

### Decoding and Encoding

In [None]:
def decode(tokens: list[int]) -> str:
    """Turn a sequence of token ids back into text by concatenating their bytes."""
    raw = bytearray()
    for token in tokens:
        raw += vocab[token]
    # errors='replace' swaps undecodable bytes for U+FFFD instead of raising
    return bytes(raw).decode('utf-8', errors='replace')

# Not every byte sequence is valid utf-8, and a language model may generate such a
# sequence. With errors='replace' the invalid bytes decode to the unicode
# replacement character (U+FFFD) rather than raising UnicodeDecodeError.
# For example, a lone byte 128 (0x80) is a continuation byte and is not valid by itself.
print(decode([128]))

In [None]:
def encode(text: str) -> list[int]:
    """
    Turn a string into token ids by replaying the learned merges on its utf-8 bytes.

    Merges are applied in the order they were learned: on each pass the adjacent
    pair with the lowest merged token id is replaced, until no adjacent pair
    appears in `merges`.
    """
    tokens = list(text.encode('utf-8'))
    while len(tokens) >= 2:
        candidates = consecutive_pairs(tokens)
        # Rank each pair by its merged token id; pairs that were never merged rank as infinity
        earliest = min(candidates, key=lambda pair: merges.get(pair, float('inf')))
        if earliest not in merges:
            break # None of the remaining pairs has a learned merge
        tokens = replace_pair(tokens, earliest, merges[earliest])
    return tokens

print(encode('the quick brown fox'))

In [None]:
# Round-trip check: decoding the encoded text should give the original text back
round_trip = decode(encode(text))
text == round_trip

### Tokenisation Notes

The tokeniser is represented using just the learned `merges` and `vocab` variables and can encode and decode text using the BPE algorithm.

**Splitting Text via RegEx Patterns (GPT-2)**

Instead of directly encoding each string for tokenisation, the string is split up into a list of strings using regular expressions. All the strings in this list are processed independently by the tokeniser. Therefore merges can only happen within the same string. The results are then concatenated together to form the final tokenised string. This ensures that some consecutive pairs of characters are not merged together (e.g. 'e ').

The regular expression below is from the [GPT-2 tokeniser](https://github.com/openai/gpt-2/blob/master/src/encoder.py). The patterns `'s|'t|'re|'ve|'m|'ll|'d` match common contractions, however it only considers the ASCII apostrophe (') and not the unicode apostrophe (’). Furthermore, they do not ignore case and so will not match `'S|'T|'RE|'VE|'M|'LL|'D`. That is, `I'm` will be tokenised as `I`, `'m` and `I'M` will be tokenised as `I`, `'`, `M`. These are limitations of the GPT-2 tokeniser.

GPT-2 also uses one special token which denotes the end of text, `<|endoftext|>`.

In [None]:
import regex as re

# GPT-2 split pattern: contractions, letter runs, digit runs, other symbol runs, whitespace
regex = re.compile(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")
print(regex.findall("I'm. I'M"))

**Vocabulary Size**

Large vocabulary sizes increase the number of tokens that the model can represent. Therefore, tokens can express more information in a shorter sequence. This allows transformers to attend to more tokens in the sequence and improves the model's ability to learn long-range dependencies. However, larger vocabulary sizes mean that the embedding table is larger and hence more computationally expensive to train. Furthermore, large vocabulary sizes mean that each unique token is less likely to be seen in the training data, hence the vector representation of the token may be under-trained, leading to worse overall performance.

### Test the Tokeniser

In [None]:
import os

from gpt.tokeniser.gpt import GPTTokeniser

# Special tokens to be added to the vocabulary. GPT-4 uses these tokens
special_tokens = {
    '<|endoftext|>': 100257,
    '<|fim_prefix|>': 100258,
    '<|fim_middle|>': 100259,
    '<|fim_suffix|>': 100260,
    '<|endofprompt|>': 100276
}

# Load new text from a file
with open('input.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Create a tokeniser and do 64 merges
tokeniser = GPTTokeniser()
vocab_size = 256 + 64
tokeniser.train(text, vocab_size=vocab_size, verbose=True)

# Register special tokens
tokeniser.register_special_tokens(special_tokens)

# Verify that the encode and decode functions are inverses
assert text == tokeniser.decode(tokeniser.encode(text, 'all'))

# Verify that save/load work as expected.
# NOTE(review): save('tmp') presumably writes 'tmp.tkn', since that is the file loaded and removed below; confirm.
tokeniser.save('tmp')
try:
    tokeniser = GPTTokeniser('tmp.tkn')

    # Verify that the encode and decode functions are inverses
    assert text == tokeniser.decode(tokeniser.encode(text, 'all'))
finally:
    # Remove the temporary file even when loading or the assertion fails
    if os.path.exists('tmp.tkn'):
        os.remove('tmp.tkn')