In [1]:
with open('input.txt', 'r') as f:
    text = f.read()

In [2]:
print(len(text))

1115394


In [3]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


The code below is a simple way of encoding strings. Other uses of token libraries are SentencePiece (subword tokenizer, used by Google) and tiktoken (used by ChatGPT, has 50257 tokens, way more than the 65 characters that we have).

In [5]:
#this is a mapping from characters to integers
# and back, for easy encoding and decoding

# chars = !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

stoi = {c: i for i, c in enumerate(chars)} #means strings to integers
itos = {i: c for i, c in enumerate(chars)} #means integers to strings
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

print(encode('singapore'))
print(decode(encode('singapore'))) #Google uses SentencePiece, GPT uses tiktoken

[57, 47, 52, 45, 39, 54, 53, 56, 43]
singapore


In [6]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) #The first 1000 characters encoded as integers

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [7]:
#split into train and validation set
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [8]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

When we put [18, 47, 56, 57, 58,  1, 15, 47, 58] into a transformer, there will be 8 examples: In the context of 18, 47 comes next. When 18 & 47, 56 comes next. So on...

In [9]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'When input is {context}, the target is {target}.') #This are the eight examples that come for the first 9 characters

When input is tensor([18]), the target is 47.
When input is tensor([18, 47]), the target is 56.
When input is tensor([18, 47, 56]), the target is 57.
When input is tensor([18, 47, 56, 57]), the target is 58.
When input is tensor([18, 47, 56, 57, 58]), the target is 1.
When input is tensor([18, 47, 56, 57, 58,  1]), the target is 15.
When input is tensor([18, 47, 56, 57, 58,  1, 15]), the target is 47.
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is 58.


In [10]:
torch.manual_seed(1337)
batch_size = 4 #how many independent sequences will we process in parallel?
block_size = 8 #what is the maximum context length for predictions?

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) #generate batch_size number of random offsets, between 0 and len(data) - block_size
    x = torch.stack([data[i:i+block_size] for i in ix]) #first block_size characters starting at each offset
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) #next block_size characters starting at each offset
    return x, y

xb, yb = get_batch('train')
print(xb.shape, xb.dtype) #4 sequences of 8 characters each
print(xb) #the first 4 sequences of 8 characters
print(yb.shape, yb.dtype) #4 sequences of 8 characters each
print(yb) #the next 4 sequences of 8 characters each, shifted by one character

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1] #the first t+1 characters of the b-th sequence
        target = yb[b, t] #the t-th character of the b-th sequence
        print(f'When input is {context.tolist()}, the target is {target}.') #This are the 4 examples that come for the first 8 characters of each of the 4 sequences

torch.Size([4, 8]) torch.int64
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
torch.Size([4, 8]) torch.int64
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
When input is [24], the target is 43.
When input is [24, 43], the target is 58.
When input is [24, 43, 58], the target is 5.
When input is [24, 43, 58, 5], the target is 57.
When input is [24, 43, 58, 5, 57], the target is 1.
When input is [24, 43, 58, 5, 57, 1], the target is 46.
When input is [24, 43, 58, 5, 57, 1, 46], the target is 43.
When input is [24, 43, 58, 5, 57, 1, 46, 43], the target is 39.
When input is [44], the target is 53.
When input is [44, 53], the target is 56.
When input is [44, 53, 56], the target is 1.
When input is [44, 53, 56, 1], the target is 58.
When input is [44, 5

In [11]:
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


In [19]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        #each toeken directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) #65 x 65

    def forward(self, idx, targets):
        logits = self.token_embedding_table(idx) #(B, T, C) Batch, Time, Channels

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape #B is batch size, T is block size, C is vocab size
            logits = logits.view(B*T, C) #stretch the array
            targets = targets.view(B*T) #one dimensionalize the targets
            loss = F.cross_entropy(logits, targets) #Negative Log Likelihood Loss, in PyTorch doc, they look for B x C x T instead. So the previous step will re-shape logits to that format

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        #idx is B by T array of indices in the current context
        for _ in range(max_new_tokens):
            #get the logits for the next token
            logits, loss = self(idx, None)
            #focus on the last time step
            logits = logits[:, -1, :]
            #apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            #sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) #(B, 1)
            #append the sampled token to the input sequence
            idx = torch.cat((idx, idx_next), dim=1) #(B , T+1)
        return idx  
    

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape) #4 x 8 x 65, 4 sequences of 8 characters each, each character has a probability distribution over the 65 characters
print(loss)

idx = torch.zeros(1,1, dtype=torch.long) #start with the first character, which is the first character in the vocabulary
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist())) #generate 100 characters, starting with the first character in the vocabulary

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


Above creates an Embedding table, so following our sample xb, the first index '24' will pluck the 24th row in the table, second index '43' will pluck the 43rd. Logits is like the score for the next character in sequence, we are predicting for what comes next. Original model looks random as the model is not trained on context.

In [20]:
#create a PyTorch optimizer
#AdamW is a variant of Adam that decouples weight decay from the optimization step
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3) 

In [26]:
batch_size = 32
for steps in range(10000):
    xb, yb = get_batch('train') #get a batch of data
    logits, loss = m(xb, yb) #forward pass
    optimizer.zero_grad(set_to_none=True) #zero the gradients   
    loss.backward() #backward pass
    optimizer.step() #update the parameters

print(loss.item())

2.3132691383361816


In [30]:
print(decode(m.generate(idx = torch.zeros(1, 1, dtype=torch.long), max_new_tokens=500)[0].tolist())) #generate 100 characters, starting with the first character in the vocabulary


Thaueerrpanat thathe, s
WAnd Fithee.

Nobys
I ld mb, qun mod y wousa darketwave nghof IORitok has whodirate are atit G t hant m,
HAn
CELUSt y XENTha bu.
Wh blinouket th thiviglecldewist, trveayokeanguror mepes ' wexfalle spprdswhyealaiate foulokiou:
Beron, dopetuletougr ldr;
S:
I mby ongow:
An icinfr
Pasouserou' fenas'stigous.
Touser m an rore, I m; ist arasound theithathis hthe'le is t V:
W:
Be th Myoua Counaksor s ct?

Totris t eat;
DUCAn doy ho shembeervon worse anthaus, moulit ardidond, t rk


Tokens are not talking to one another.

##The mathematical trick in self-attention.

In [63]:
# toy example

torch.manual_seed(1337)
B, T, C = 4, 8, 2
x= torch.randn(B,T,C)
x.shape

#How to get our tokens to talk to one another? Work backwards.

torch.Size([4, 8, 2])

In [64]:
xbow = torch.zeros((B,T,C)) #bow stands for bag of words, which is a way to represent the input sequence as a weighted sum of the input tokens
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1]
        xbow[b, t] = torch.mean(xprev, 0)


In [65]:
wei = torch.tril(torch.ones(T, T)) 
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) -> (B,T,C)
torch.allclose(xbow, xbow2)

False

It should be True, but I am not sure why it is False. I inspected the numbers and they looked the same...?

In [56]:
xbow[0], xbow2[0]


(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

In [None]:
#version 3: using Softmax

tril = torch.tril(torch.ones(T, T))
wei = torch.zeros(T, T)
wei = wei.masked_fill(tril == 0, float('-inf')) #set the upper triangular part to -inf
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x # (B, T, T) @ (B, T, C) -> (B,T,C)
torch.allclose(xbow2, xbow3) 

True

In [None]:
#version 4: self-attention

torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# self attention Head
head_size = 16
key = nn.Linear(C, head_size, bias=False) #key projection
query = nn.Linear(C, head_size, bias=False) #query projection
value = nn.Linear(C, head_size, bias=False) #value projection
k = key(x) #B, T, head_size
q = query(x) #B, T, head_size
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) -> (B, T, T)

tril = torch.tril(torch.ones(T, T))
#wei = torch.zeros(T, T)
wei = wei.masked_fill(tril == 0, float('-inf')) #set the upper triangular part to -inf, nodes are not allowed to communicate with future
wei = F.softmax(wei, dim=-1) #apply softmax to get probabilities, exponentially + normalize
#in encoder blocks, the masking is removed, but in decoder blocks, the masking is applied
v = value(x) #B, T, head_size

out = wei @ v # (B, T, T) @ (B, T, C) -> (B,T,C)

out.shape

torch.Size([4, 8, 16])

In [None]:
wei[0]  #These are the weights, and not uniform.

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

How can a vowel find consonants in the past, but also in a data-dependent way? The beauty of self-attention: every single token emits 2 vectors - a query and a key. Query vector is 'What am I looking for' and the Key vector is 'What do I contain?'. The main affinity between these two vectors is to dot product both keys and queries to produce a weight. The higher that dot product, the higher the weight, means it is very relatable.

Self-attention: key, queries, values come from the same source\
Cross-attention: separate set of nodes, where we pull information and apply them into our original set of nodes

Batch normalization layer: ensure that each neuron is unit Gaussian distribution, 0 mean and 1 SD.

In [75]:
class BatchNormId:

    def __init__(self, dim, eps = 1e-5, momentum = 0.1):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)


    def __call__(self, x):
        xmean = x.mean(1, keepdim=True) #mean over the batch dimension
        xvar = x.var(1, keepdim=True, unbiased=False) #batch variance
        xhat = (x - xmean)/ torch.sqrt(xvar + self.eps) # normalize the input to unit variance
        self.out = self.gamma * xhat + self.beta # scale and shift the normalized input
        return self.out
    
    def parameters(self):
        return [self.gamma, self.beta]
    
torch.manual_seed(1337)
module = BatchNormId(100)
x = torch.randn(32, 100) #32 samples, 100 features
x = module(x)
x.shape #32 samples, 100 features

torch.Size([32, 100])

In [76]:
x[:,0].mean(), x[:,0].std() #mean and std of the first feature

(tensor(0.1476), tensor(0.8847))