In [26]:
# Props to this sensei
# https://www.youtube.com/watch?v=kCc8FmEb1nY&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=8

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import torch.nn.functional as F

from tqdm import tqdm # progress bar

## Hyper-parameters

In [28]:
text_file = "tiny-shakespeare.txt"
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000 # total # of iterations for different batches
learning_rate = 4e-4
n_embd = 64 # embedding dimension
n_head = 4 # number of heads we'd like
n_layer = 4 # number of decoder blocks
dropout = 0.2 # dropout probability for the embedding layer, residual connections and attention weights

# printing related variables for training
eval_interval = 10
eval_iters = 200

device = ("cuda" if torch.cuda.is_available() else "mps"
          if torch.backends.mps.is_available() else "cpu")
print(f"Using {device} device")
torch.set_default_device(device)

Using cuda device


## Reading Data

In [29]:
# read file
with open(text_file, "r") as f:
    text = f.read()
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [30]:
vocab_list = sorted(list(set(text))) # more like a unique_char_list
vocab_size = len(vocab_list)

print(f"All the characters in the text: {''.join(vocab_list)}")
print(f"Length of the characters: {vocab_size}")

All the characters in the text: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Length of the characters: 65


## Tokenizer (character based, index/ascii)

In [31]:
class MyTokenizer:
    def __init__(self):
        self.char_to_index = None
        self.index_to_char = None

    def fit(self, char_list):  
        self.char_to_index = {char: idx for idx, char in enumerate(char_list)}
        self.index_to_char = {idx: char for char, idx in self.char_to_index.items()}

    def encode_index(self, input_str):
        return [self.char_to_index[char] for char in input_str]

    def decode_index(self, encoded_list):
        return ''.join([self.index_to_char[idx] for idx in encoded_list])

    @staticmethod
    def ascii_tokenizer(char):
        return ord(char)

    @staticmethod
    def ascii_decoder(ascii_value):
        return chr(ascii_value)

    def encode_combined(self, input_str, use_ascii=False):
        if use_ascii:
            return [self.ascii_tokenizer(char) for char in input_str]
        else:
            return self.encode_index(input_str)

    def decode_combined(self, encoded_list, use_ascii=False):
        if use_ascii:
            return ''.join([self.ascii_decoder(ascii_value) for ascii_value in encoded_list])
        else:
            return self.decode_index(encoded_list)

In [32]:
# Example usage:
tokenizer = MyTokenizer()
tokenizer.fit(vocab_list)

input_str = "Hello there"
encoded_list_ascii = tokenizer.encode_combined(input_str, use_ascii=True)
decoded_str_ascii = tokenizer.decode_combined(encoded_list_ascii, use_ascii=True)

encoded_list_index = tokenizer.encode_combined(input_str, use_ascii=True)
decoded_str_index = tokenizer.decode_combined(encoded_list_index, use_ascii=True)

print("Original String:", input_str)
print("Encoded List (ASCII):", encoded_list_ascii)
print("Decoded String (ASCII):", decoded_str_ascii)

print("Encoded List (Index):", encoded_list_index)
print("Decoded String (Index):", decoded_str_index)

Original String: Hello there
Encoded List (ASCII): [72, 101, 108, 108, 111, 32, 116, 104, 101, 114, 101]
Decoded String (ASCII): Hello there
Encoded List (Index): [72, 101, 108, 108, 111, 32, 116, 104, 101, 114, 101]
Decoded String (Index): Hello there


## Batching

In [33]:
# Encode all the data and split into train and val
encoded_data = torch.tensor(tokenizer.encode_combined(text))
n = int( 0.9 * len(encoded_data))

train_data = encoded_data[:n]
val_data = encoded_data[n:]

train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [34]:
x = train_data[0:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    
    print(f"when input is {context} the target: {target}")

when input is tensor([18], device='cuda:0') the target: 47
when input is tensor([18, 47], device='cuda:0') the target: 56
when input is tensor([18, 47, 56], device='cuda:0') the target: 57
when input is tensor([18, 47, 56, 57], device='cuda:0') the target: 58
when input is tensor([18, 47, 56, 57, 58], device='cuda:0') the target: 1
when input is tensor([18, 47, 56, 57, 58,  1], device='cuda:0') the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15], device='cuda:0') the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47], device='cuda:0') the target: 58
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58], device='cuda:0') the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47], device='cuda:0') the target: 64
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64], device='cuda:0') the target: 43
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43], device='cuda:0') the target: 52
when input is tensor([18, 47,

In [35]:
def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # tensor of random indexes (batch_size, )
    x = torch.stack([data[i:i+block_size] for i in ix]) # from the random index go until block_size, for all random indexes
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # from the random index+1 go until block_size+1, for all random indexes
    return x,y 

xb, yb = get_batch("train") # xb -> input to the transformer
print("inputs: ")
print(xb.shape)
print(xb)

print("targets: ")
print(yb.shape)
print(yb)

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs: 
torch.Size([32, 256])
tensor([[46, 43,  1,  ...,  1, 43, 63],
        [53, 61,  1,  ..., 43,  1, 53],
        [50, 50,  1,  ..., 24, 43, 58],
        ...,
        [ 1, 57, 59,  ..., 50, 39, 41],
        [53, 53,  1,  ..., 46, 47, 56],
        [61, 39, 56,  ...,  1, 53, 44]], device='cuda:0')
targets: 
torch.Size([32, 256])
tensor([[43,  1, 53,  ..., 43, 63, 43],
        [61,  1, 51,  ...,  1, 53, 61],
        [50,  1, 46,  ..., 43, 58,  1],
        ...,
        [57, 59, 41,  ..., 39, 41, 43],
        [53,  1, 46,  ..., 47, 56, 42],
        [39, 56, 42,  ..., 53, 44,  1]], device='cuda:0')
when input is [46] the target: 43
when input is [46, 43] the target: 1
when input is [46, 43, 1] the target: 53
when input is [46, 43, 1, 53] the target: 52
when input is [46, 43, 1, 53, 52] the target: 43
when input is [46, 43, 1, 53, 52, 43] the target: 1
when input is [46, 43, 1, 53, 52, 43, 1] the target: 47
when input is [46, 43, 1, 53, 52, 43, 1, 47] the target: 52
when input is [46, 43

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



## GPT (char level)

In [36]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

@torch.no_grad()
def estimate_loss(): # to reduce noise of the losses
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out


class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


class GPT(nn.Module):
    
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = GPT()
model.to(device)
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)

print(tokenizer.decode_combined(model.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


torch.Size([8192, 65])
tensor(4.3291, device='cuda:0', grad_fn=<NllLossBackward0>)

sxcmEnsn3x$OoTwcskQ-BLUitb3;HCIHWpOlZBYWu-'Z'jsf-&DrV;RfK$Xr
mqXudue'hvfmfygv3QrgANJGoGaDzJb'mg.,E?R


## Train Model

In [37]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


step 0: train loss 4.3362, val loss 4.3351
step 10: train loss 3.7072, val loss 3.7244
step 20: train loss 3.4545, val loss 3.4856
step 30: train loss 3.3404, val loss 3.3712
step 40: train loss 3.2660, val loss 3.2993
step 50: train loss 3.1885, val loss 3.2234
step 60: train loss 3.1060, val loss 3.1424
step 70: train loss 3.0410, val loss 3.0763
step 80: train loss 2.9788, val loss 3.0110
step 90: train loss 2.9283, val loss 2.9590
step 100: train loss 2.8804, val loss 2.9079
step 110: train loss 2.8431, val loss 2.8669
step 120: train loss 2.8065, val loss 2.8271
step 130: train loss 2.7793, val loss 2.7964
step 140: train loss 2.7524, val loss 2.7684
step 150: train loss 2.7312, val loss 2.7444
step 160: train loss 2.7083, val loss 2.7237
step 170: train loss 2.6932, val loss 2.7043
step 180: train loss 2.6740, val loss 2.6897
step 190: train loss 2.6592, val loss 2.6732
step 200: train loss 2.6462, val loss 2.6588
step 210: train loss 2.6367, val loss 2.6480
step 220: train loss 

In [38]:
torch.save(model, "GPT model loss=2.26.model")

In [45]:
# generate from the model
#context = torch.zeros((1, 1), dtype=torch.long, device=device)
context = torch.tensor([tokenizer.encode_combined(list("HAMLET: \n"))], dtype=torch.long, device=device)
print(tokenizer.decode_combined(model.generate(context, max_new_tokens=1000)[0].tolist()))

HAMLET: 
thy owine, ad soucempe for ast
Wheipimt nie herew; wepecome here not: wnow!

LUKE him min thou the pegroud.

When'lly: send, reanes weer, witheathmfed, by dis gays
Youl wit, supus that my out the fo; Beare me, nory an.
You Enow, 'd sullastong and ite
Auppong, ill to tan nonur cevad Tand
To heis brther hy theee, the my wead dre whangr.

Thist moond pay surfe that RO Ereque o' vesitg: sto ulo ohirf r not;
This hast hevers fea dewen my yous?
JEd:
Tout Edleact not scam Gorion: thall!
Sppeh atw I gramst daand Unoke on muse owrd acuger,
So mish emose assute wourss isiprnessh and leve stpat ume,
By borftam nis mut to no'ss veenetl.'s,

WUETE:
Horth, ESwencht inons, das mode le.
Nest Sity shallickind withthe wit palove thoe tery thand
Whunoffasi marent po ame or, manding and mapoul las?
I EOrge it prrefurd norientes, thetu wad oullt fit beswow
We the the to wo and put y aust tunt:
I may yourd ploger, ay thou t,
Mom gin; anon wis t hondlunst berat mest: thare ban
We, dou is thyou there

## Self-Attention
Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additional divides `wei` by 1/sqrt(head_size). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

In [40]:
B,T,C = 4,8,2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [41]:
# version 1: averaging past context with for loops, the weakest form of aggregation
# We want x[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C)) # bag of words
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t,C)
        xbow[b,t] = torch.mean(xprev, 0)
print(x[0])
print(xbow[0])

tensor([[ 0.5658, -0.7962],
        [-1.0757, -0.0236],
        [ 0.2094,  1.2075],
        [-0.5380, -0.5993],
        [ 0.3070,  0.6486],
        [-0.0895, -1.0398],
        [-1.1796,  0.1757],
        [-0.5434, -0.6277]], device='cuda:0')
tensor([[ 0.5658, -0.7962],
        [-0.2549, -0.4099],
        [-0.1002,  0.1292],
        [-0.2096, -0.0529],
        [-0.1063,  0.0874],
        [-0.1035, -0.1005],
        [-0.2572, -0.0610],
        [-0.2930, -0.1319]], device='cuda:0')


In [42]:
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T)) # Takes only the lower triangular part of of matrix, others are zero
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)
torch.allclose(xbow, xbow2) # same thing with version 3, but faster

True

In [43]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

In [44]:
# version 4: self-attention!
# difference from other is this creates a non-uniform weighted averages
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)

# let's see a single Head perform self-attention
key = nn.Linear(C, n_head, bias=False) # What do I contain?
query = nn.Linear(C, n_head, bias=False) # What am I looking for?
value = nn.Linear(C, n_head, bias=False) # How the token will communicate to you, 
k = key(x)   # (B, T, head_size)
q = query(x) # (B, T, head_size)
wei =  q @ k.transpose(-2, -1) # (B, T, head_size) @ (B, head_size, T) ---> (B, T, T)
wei *= n_head**-0.5 # scaled, otherwise softmax would be way too peaky!!

tril = torch.tril(torch.ones(T, T))
#wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf')) # remove this to make it encoder attention block, allowing all tokens to communicate
wei = F.softmax(wei, dim=-1) 

v = value(x)
out = wei @ v
#out = wei @ x

out.shape # (B, T, head_size)

torch.Size([4, 8, 4])

## Train a GPT model to add n-digit numbers
https://github.com/karpathy/minGPT/tree/master/projects/adder