In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

## Step 1: Import the text and convert it into a data set

In [2]:
# Build the character-level vocabulary from the raw corpus.
# Every character matters -- whitespace included -- because the model has to
# learn where spaces and newlines go as well.
with open('input.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# The vocabulary is every distinct character, in sorted order so that the
# character -> id assignment is stable from run to run.
char_data = sorted(set(text))
vocab_size = len(char_data)
vocab_size

65

In [3]:
# Lookup tables between characters and their integer token ids.
# stoi: character -> id, itos: id -> character (both follow char_data's order).
stoi = {ch: i for i, ch in enumerate(char_data)}
itos = dict(enumerate(char_data))


def encode(s):
    """Turn a string (or any iterable of characters) into a list of token ids.

    Raises KeyError if a character is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(s):
    """Turn an iterable of integer token ids back into a single string.

    The per-id characters are joined so the result is a str, not a list.
    Raises KeyError if an id is not in the vocabulary.
    """
    return ''.join(itos[i] for i in s)

## Step 2: Tokenize the data set

### 2.1: Encode the dataset as a torch.tensor, then split into train and val datasets

In [4]:
# Encode the whole corpus once as a tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)

# The first 90% is used for training; the remainder is held out for validation.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

### 2.2 Initiate a block size for computational efficiency. As a small example, make the outputs predict the next token based on our set up

In [5]:
block_size = 8
# Inputs and targets are the same stretch of text offset by one position:
# y[i] is the token that comes right after x[i].
x = train_data[:block_size]
y = train_data[1:block_size+1]

# Every prefix of x is a training context, and its target is the token that
# follows that prefix -- so one block yields block_size training examples.
for end in range(1, block_size + 1):
    context = x[:end]
    target = y[end - 1]
    print(f'context to model: {context}, target for model to learn: {target}')

context to model: tensor([18]), target for model to learn: 47
context to model: tensor([18, 47]), target for model to learn: 56
context to model: tensor([18, 47, 56]), target for model to learn: 57
context to model: tensor([18, 47, 56, 57]), target for model to learn: 58
context to model: tensor([18, 47, 56, 57, 58]), target for model to learn: 1
context to model: tensor([18, 47, 56, 57, 58,  1]), target for model to learn: 15
context to model: tensor([18, 47, 56, 57, 58,  1, 15]), target for model to learn: 47
context to model: tensor([18, 47, 56, 57, 58,  1, 15, 47]), target for model to learn: 58


### 2.3: Now we want to learn the data in terms of batches. This is recreating how the model will be learning the inputs. We formulate the inputs as context to the model, and then we have targets that the model will learn which we write as y

In [6]:
# Seed the RNG so the randomly sampled batches below are reproducible.
torch.manual_seed(1337)
batch_size, block_size = 4, 8  # sequences per batch, tokens per sequence

# based on batch and block sizes, generate small batch of data with inputs x and targets y
def get_batch(split):
    """Sample a random mini-batch of (inputs, targets) from one data split.

    split: 'train' selects train_data; any other value selects val_data.

    Returns (x, y): two tensors built by stacking batch_size windows of
    block_size consecutive tokens each. y holds the same windows as x shifted
    one position to the right, i.e. the next-token targets.

    batch_size and block_size are read from the notebook globals at call time.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # Pick batch_size random start offsets; size=(batch_size,) asks for a 1-D
    # tensor. `high` is exclusive, so the largest start still leaves room for
    # the target window, which ends one token later than the input window.
    starts = torch.randint(low=0, high=len(source)-block_size, size=(batch_size,))

    # Cut one input window and one shifted target window per offset, then
    # stack each list into a single tensor with one window per row.
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start+block_size])
        targets.append(source[start+1:start+block_size+1])

    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and look at what is inside it.
xb, yb = get_batch('train')

print('------')
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)
print('------')

# Spell out every (context, target) pair the model learns from in this batch.
# Rows are independent sequences; positions within a row form the "time"
# dimension, because each block is treated as a short time series.
for row_inputs, row_targets in zip(xb, yb):
    for t in range(block_size):
        # The +1 makes the context include position t itself, so the very
        # first context is one token long rather than empty.
        context = row_inputs[:t+1]

        # Targets were already shifted by one when the batch was built, so
        # index t lines up with the token that follows the context.
        target = row_targets[t]
        print(f'context: {context}, target: {target}')

------
inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
------
context: tensor([24]), target: 43
context: tensor([24, 43]), target: 58
context: tensor([24, 43, 58]), target: 5
context: tensor([24, 43, 58,  5]), target: 57
context: tensor([24, 43, 58,  5, 57]), target: 1
context: tensor([24, 43, 58,  5, 57,  1]), target: 46
context: tensor([24, 43, 58,  5, 57,  1, 46]), target: 43
context: tensor([24, 43, 58,  5, 57,  1, 46, 43]), target: 39
context: tensor([44]), target: 53
context: tensor([44, 53]), target: 56
context: tensor([44, 53, 56]), target: 1
context: tensor([44, 53, 56,  1]), target: 58
context: tensor([44, 53, 56,  1, 58]), target: 46
co

### Step 3: Create a Bigram language model

#### 3.1: Method 1 - Using only a feed forward approach

In [7]:
torch.manual_seed(1337)


class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token scores come from one lookup table.

    Subclassing nn.Module lets PyTorch register the embedding weights as
    parameters (so ``parameters()`` finds them) and routes ``model(idx)`` to
    ``forward``.

    The whole model is a single ``nn.Embedding(vocab_size, vocab_size)``.
    Row ``i`` of that table holds the scores (logits) for which token follows
    token ``i``. A forward pass therefore just plucks out the row for each
    input token instead of multiplying a one-hot vector by a weight matrix.

    Args:
        vocab_size: number of distinct tokens in the data set.
    """

    def __init__(self, vocab_size):
        # Always call the parent constructor first so nn.Module's bookkeeping
        # exists before any layer is assigned.
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Look up next-token logits and, when targets are given, the loss.

        Args:
            idx: integer token ids of shape (B, T), a batch of inputs.
            targets: optional integer token ids holding B*T elements (for
                example shape (B, T)), the desired next token at each position.

        Returns:
            ``(logits, loss)``. Without targets, logits has shape (B, T, C)
            with C = vocab_size and loss is None. With targets, logits is
            returned flattened to (B*T, C) and loss is the cross-entropy
            between those logits and the flattened targets.
        """
        # (B, T) ids -> (B, T, C): one row of scores per input position.
        logits = self.token_embedding_table(idx)

        # generate() calls this without targets, so the loss is optional.
        if targets is None:
            loss = None
        else:
            # F.cross_entropy wants (minibatch, C) logits and (minibatch,)
            # targets, so fold batch and time into one B*T dimension.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Extend every sequence in idx by max_new_tokens sampled tokens.

        Args:
            idx: integer token ids of shape (B, T), the starting context.
            max_new_tokens: how many tokens to append to each sequence.

        Returns:
            Tensor of shape (B, T + max_new_tokens): idx with the sampled
            tokens appended along the time dimension.

        Runs under ``torch.no_grad()``: the sampled ids are integers, so no
        gradient can flow through them and building an autograd graph on
        every step would only cost memory.
        """
        for _ in range(max_new_tokens):
            # Only the logits are needed here; without targets the loss is None.
            logits, _ = self(idx)

            # The prediction for the next token lives at the last time step.
            logits = logits[:, -1, :]  # (B, C)

            # Softmax over the channel dimension turns scores into a
            # probability for each token in the vocabulary.
            probs = F.softmax(logits, dim=-1)  # (B, C)

            # Draw one token per sequence from that categorical distribution.
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)

            # Append along the time dimension: (B, T) -> (B, T+1).
            idx = torch.cat((idx, idx_next), dim=1)

        return idx
        
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)

# Generation starts from a single token with id 0, as a (batch=1, time=1)
# tensor of 64-bit integers (torch.long).
starting_point = torch.zeros((1, 1), dtype=torch.long)
generated = m.generate(starting_point, max_new_tokens=50)
decode(generated[0].tolist())

"\nSr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLER"

### 3.2 Now let's train the model

#### 3.2.1. A common choice to make is the Adam optimizer. But we'll be using the weight decay version of the Adam optimizer

In [8]:
# Because the model is an nn.Module, m.parameters() yields every registered
# weight, so the optimizer can be handed all of them in one call.
optimizer = torch.optim.AdamW(params=m.parameters(), lr=1e-3)

And now we train on a batch size of 32

In [9]:
%%time
batch_size = 32

for steps in range(10_000):
    # get the batchs from get_batch
    xb, yb = get_batch('train')
    
    # obtain logits and loss from the created BLM
    logits, loss = m(xb, yb)
    
    # zero out the gradients before doing the backward pass so you don't have an 
    # accumulation of gradients. 
    # we don't want accumulation because it will ruin the training process
    optimizer.zero_grad(set_to_none=True)
    
    # then do backward pass according to the AdamW optimizer
    loss.backward()
    
    # then with this backward pass, update the model
    optimizer.step()

CPU times: user 47.3 s, sys: 27.6 ms, total: 47.3 s
Wall time: 7.89 s


In [10]:
# Loss of the last mini-batch seen by the training loop (one batch, not an average).
loss

tensor(2.3164, grad_fn=<NllLossBackward0>)

In [11]:
# Sample 200 new tokens from the model, again starting from token id 0.
starting_point = torch.zeros((1, 1), dtype=torch.long)
sample_ids = m.generate(starting_point, max_new_tokens=200)[0].tolist()
print(decode(sample_ids))


Whmer thoun s's:Conchuntilalllevise sthat dy hangilyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y heltieiengerofo'dsssit ey
KIN d pe wither vouprroutherc


### 4. Understanding self attention from the ground up

#### 4.1. How do we constrain the context window? 

These models should not have access to future time series data. Otherwise that defeats the purpose of training a model to be able to predict. 

We're going to work with a really basic example of predicting the next token and work our way up to self attention.

In [14]:
torch.manual_seed(1337)
# Toy dimensions for the averaging experiments:
#   B = batch: number of independent sequences (4)
#   T = time: number of tokens in each sequence (8)
#   C = channels: size of the vector describing each token (just 2 here)
B = 4
T = 8
C = 2
x = torch.randn(B, T, C)  # random stand-in for the embedded model inputs
x.shape

torch.Size([4, 8, 2])


A common way to be able to predict the next token is to average over all previous tokens that the model has previously seen. 

##### 4.1.1. The most brute force way to average over the previous context to predict the next token

In [46]:
%%time

# instantiate our bag of words, i.e. what the model has seen previously
# we'll be averaging over this
xbow = torch.zeros((B, T, C))

# then we'll brute force iterate through each batch and time in order to take the average
# and predict the next token
for b in range(B):
    for t in range(T):
        # create vector representing all previous tokens
        # need the plus one so that you don't start predicting from an empty vector
        xprev = x[b, :t+1]
        
        # now bag of words will contain the average of each word 
        # as we continue to see context
        #print(xprev)
        xbow[b, t] = torch.mean(xprev, dim=0)
        #print(xbow)

CPU times: user 1.94 ms, sys: 0 ns, total: 1.94 ms
Wall time: 1.31 ms


In [43]:
# Last time step of the first sequence: the average over all of its tokens.
xbow[0, -1]

tensor([-0.0341,  0.1332])

So in this batch, the last value in xbow represents the average over all of the previous tokens.

Now this is all well for getting the averages, but it's not good for efficiency especially as models scale up. We need to do things differently. 

##### 4.1.2. Using triangular matrices to represent context windows

In [49]:
torch.manual_seed(42)
# Swapping in one of these for `a` makes it easier to trace where each entry
# of c comes from:
#   torch.tensor(((1., 1., 1.), (2., 2., 2.), (3., 3., 3.)))
#   torch.ones(3, 3)   # plain all-ones matrix, no triangular mask
lower = torch.tril(torch.ones(3, 3))        # ones on and below the diagonal
a = lower / lower.sum(dim=1, keepdim=True)  # scale each row to sum to 1
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
for label, matrix in (('a=', a), ('b=', b), ('c=', c)):
    print(label)
    print(matrix)
    print('----------')

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
----------
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
----------
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])
----------


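We can apply the same trick to our inputs: multiplying by a row-normalized lower-triangular matrix gives, at each time step, the average of that token and all the tokens before it. In other words, it is a weighted average over the context, computed with a single matrix multiply.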

In [79]:
%%time
# dim(weights) needs to be (T, T) since we are weighting each observation
# in the time series
wei = torch.tril(torch.ones(T, T))

'''
    In order to normalize the row how we expect, we need to have keepdim=True
    because otherwise the tensor broadcasting will not work accordingly
    
    If you don't specify keepdim=True, PyTorch will make up the dimensions on the left
    so that we end up normalizing the columns even though we say dim=1
'''
wei = wei / torch.sum(wei, dim=1, keepdim=True)
print(wei.shape) # dim = (T, T), and now we're averaging

# and we create our new bag of words with this trick
# (T, T) @ (B, T, C) ==> (B, T, T) @ (B, T, C) = (B, T, C)
xbow2 = wei @ x
xbow2

torch.Size([8, 8])
CPU times: user 3.16 ms, sys: 890 µs, total: 4.05 ms
Wall time: 777 µs


tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])

In [72]:
# Row sums of wei, once without and once with keepdim.
# NOTE(review): this cell's execution count is lower than that of the cell
# that builds the keepdim=True row-normalized wei; re-run it after that cell
# and confirm the sums -- with that normalization every row should sum to 1.
wei.sum(dim=1), wei.sum(dim=1, keepdim=True)

(tensor([1.0000, 1.5000, 1.8333, 2.0833, 2.2833, 2.4500, 2.5929, 2.7179]),
 tensor([[1.0000],
         [1.5000],
         [1.8333],
         [2.0833],
         [2.2833],
         [2.4500],
         [2.5929],
         [2.7179]]))

In [66]:
# Shapes of the same two reductions: keepdim=True keeps the summed axis
# around with size 1 instead of dropping it.
wei.sum(dim=1).shape, wei.sum(dim=1, keepdim=True).shape

(torch.Size([8]), torch.Size([8, 1]))

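`xbow2` is a plain uniform average: once the rows of `wei` are normalized with `keepdim=True`, every token in the context gets exactly the same weight, which is a fairly crude way to summarize it.

Let's implement this with softmax instead, which normalizes by exponentiating the weights (an exponential averaging).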

In [97]:
%%time
# create the traingular matrix and weights as before, this time let weights
# start at zero for better results--we also do a similar thing for sentence generation
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))

# now e^(-inf) = 1/e^(inf) = 0, so we need to reinstantiate the weights with this
# so that the softmax will work properly
wei = wei.masked_fill(tril == 0, float('-inf'))

# we want to softmax on the rows of the weights
wei = F.softmax(wei, dim=1)
# now that we have our normalized weights, we need to multiply as before
xbow3 = wei @ x

CPU times: user 1.24 ms, sys: 7 µs, total: 1.25 ms
Wall time: 743 µs


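And now we have weights in a form that can be made trainable: here they all start at zero, but whatever values they hold, the mask ensures each position only draws on the context available at that point in the observation window.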

##### 4.1.4. Now onto actual self attention

Self attention intends to be a "smarter" averaging. We still want to average over the past, but if something is more relevant for the next predicted token, we want this to flow up to the surface. This is what self attention intends to solve. I am not sure exactly how this is done yet. 

We are going to characterize each token with two feature vectors: Queries and keys. 

**Queries**: What am I looking for

**Keys**: What do I contain

For example let's look at the following sentence: 

All that glitters is not gold

When we're looking at "glitters", we want to hope that "gold" is relevant enough to have a high probability of appearing within the next token prediction. When we're actually doing the learning process, we take the 'query' vector of "glitters" and dot product it with the 'key' vectors of the other words (tokens) in the sentence. Each dot product creates a score. That score indicates how relevant that token is to "glitters", i.e. how much weight it gets when we average over the context. We will be training the model so that the query, key and value vectors are each tuned through their own learned projection.

This score is really the attention score. The "self" attention part comes from how all of these vectors are from the inputs themselves. 

Then the "value" vectors supply the content that actually gets aggregated: once the query/key dot products are softmax'd into weights, the output for a token is the weighted sum of the values. This is what gives the result its actual context dependence. For instance, gold isn't the only thing that glitters; diamonds also glitter.

In [98]:
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# A single head of self-attention. Every token is projected three ways, each
# by its own bias-free linear layer mapping C channels to head_size:
#   key   -- what the token contains
#   query -- what the token is looking for
#   value -- what the token contributes to the output, used in place of raw x
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

# With no bias, each call is just x times the transposed weight matrix.
k, q, v = key(x), query(x), value(x)  # each (B, T, head_size)

# Batched matmul: only the last two dimensions multiply as matrices, so
# (B, T, 16) @ (B, 16, T) -> (B, T, T). Scores are scaled by 1/sqrt(head_size).
scale = head_size ** -0.5
wei = (q @ k.transpose(-2, -1)) * scale

# Same masking and normalization as in the averaging example: hide future
# positions, then softmax each row into weights that sum to 1.
tril = torch.tril(torch.ones(T, T))
wei = F.softmax(wei.masked_fill(tril == 0, float('-inf')), dim=-1)
out = wei @ v