In [4]:
# The initial step is to import a dataset
!curl -s -O https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [5]:
# Load the whole downloaded corpus into memory as a single string so we can inspect it.
with open('input.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

In [6]:
# Report the size of the corpus, measured in characters.
print("length of dataset in characters: ", len(text))
# Also preview the first 100 characters to see what the raw text looks like.
print(text[:100])

length of dataset in characters:  1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [7]:
# The vocabulary is the sorted list of distinct characters that occur in the text.
chars = sorted(set(text))
vocab_size = len(chars)
# Show every vocabulary character on one line, followed by the vocabulary size.
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [8]:
# Character-level tokenization: map every vocabulary character to an integer id and back.
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}


# Named functions (rather than lambdas bound to a name) so that they carry a
# docstring and show up with a meaningful name in tracebacks.
def encode(x):
    """Convert a string into the list of its character ids.

    Raises:
        KeyError: if ``x`` contains a character that is not in the vocabulary.
    """
    return [char_to_idx[ch] for ch in x]


def decode(x):
    """Convert an iterable of character ids back into a string.

    Raises:
        KeyError: if an id is not in the vocabulary.
    """
    return ''.join(idx_to_char[i] for i in x)


# Round-trip check: encoding and then decoding must give back the original string.
print(encode('hello'))
print(decode(encode('hello')))

[46, 43, 50, 50, 53]
hello


In [9]:
# Next we encode the whole corpus and store the resulting token ids in a PyTorch tensor.
import torch
# One token id per character; dtype is torch.long (64-bit integers).
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.type())
# The first 100 token ids correspond to the 100 characters previewed earlier.
print(data[:100])

torch.Size([1115394]) torch.LongTensor
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [10]:
# Split the token stream in order: the first 90% is used for training,
# the remaining 10% is held out for validation.
train_size = int(0.9 * len(data))
train_data, val_data = data[:train_size], data[train_size:]

In [11]:
# We define the context window size, also called block size
block_size = 8
# Given a block size of 8, we can train the transformer to predict the second element in the sequence given the first element,
# the third element given the first two, and so on. So a single block of 8 tokens (plus the token that follows it)
# actually contains 8 training examples.
# This helps the network in two ways: it learns to predict the next element no matter if the context is short or long,
# and it makes training more efficient since all these examples are obtained from one chunk of data.
x = train_data[:block_size]
y = train_data[1:block_size+1] # same window as x, but starting one token later: y[t] is the token that follows x[t]

for t in range(block_size):
    context = x[:t+1] # the first t+1 tokens of x, i.e. x[0] .. x[t]
    target = y[t] # the token that comes right after the context in the data
    print(f"when the context is {context} the target is {target}")
    

when the context is tensor([18]) the target is 47
when the context is tensor([18, 47]) the target is 56
when the context is tensor([18, 47, 56]) the target is 57
when the context is tensor([18, 47, 56, 57]) the target is 58
when the context is tensor([18, 47, 56, 57, 58]) the target is 1
when the context is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when the context is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when the context is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [12]:
# Fix the random seed so that the sampled batches are reproducible.
torch.manual_seed(1337)
batch_size = 4 # number of independent sequences that are processed in parallel
block_size = 8 # length of each sequence, i.e. the context window size

# Now we make a function that generates a batch of training data or validation data based on the input
def get_batch(split):
    """Sample a random batch of (input, target) blocks.

    ``split`` selects the source: 'train' reads from ``train_data``; any other
    value reads from ``val_data``. The module-level ``batch_size`` and
    ``block_size`` are looked up at call time.

    Returns a pair ``(x, y)`` of tensors of shape (batch_size, block_size),
    where ``y`` holds the same windows as ``x`` starting one token later.
    """
    source = train_data if split == 'train' else val_data
    # Draw one random start offset per sequence. The upper bound is exclusive,
    # so the target window (which ends one token later) always fits in the data.
    starts = torch.randint(0, source.size(0) - block_size, (batch_size,)).tolist()
    inputs = [source[s:s + block_size] for s in starts]
    targets = [source[s + 1:s + 1 + block_size] for s in starts]
    return torch.stack(inputs), torch.stack(targets)
    
# Sample one training batch and show the raw input and target tensors.
x_example, y_example = get_batch('train')
print(f"Input shape: {x_example.shape}, Target shape: {y_example.shape}")
print(f"Inputs: {x_example}")
print(f"Targets: {y_example}")
print("\n-----------------\n")

# Now we loop over the batch and print, for every sequence and every position,
# the context seen so far and the token the model should predict next.
for batch in range(batch_size):
    for t in range(block_size):
        context = x_example[batch, :t+1]
        target = y_example[batch, t]
        print(f"Batch {batch+1}, when the context is {context} the target is {target}")

Input shape: torch.Size([4, 8]), Target shape: torch.Size([4, 8])
Inputs: tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets: tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])

-----------------

Batch 1, when the context is tensor([24]) the target is 43
Batch 1, when the context is tensor([24, 43]) the target is 58
Batch 1, when the context is tensor([24, 43, 58]) the target is 5
Batch 1, when the context is tensor([24, 43, 58,  5]) the target is 57
Batch 1, when the context is tensor([24, 43, 58,  5, 57]) the target is 1
Batch 1, when the context is tensor([24, 43, 58,  5, 57,  1]) the target is 46
Batch 1, when the context is tensor([24, 43, 58,  5, 57,  1, 46]) the target is 43
Batch 1, when the context is tensor([24, 43, 58,  5, 57,  1, 46,

In a **large language model (LLM)**, **logits** are the raw, unnormalized output values produced by the model just before applying a **softmax function** to convert them into probabilities.

We process a batch of token blocks in parallel using the `forward()` method. For each token in the blocks, the model outputs one logit per token in the vocabulary, i.e. a score for every possible next token; applying softmax to these logits gives the predicted next-token probabilities. All sequences and all positions are handled simultaneously, which enables the model to learn and make next-token predictions across the entire batch at once.


In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: the next-token logits depend only on the current token.

    The model is a single embedding table of shape (vocab_size, vocab_size).
    Row ``i`` of the table holds the logits over the next token given that the
    current token is ``i``. The values are learned during training.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # The embedding size is a free hyperparameter in general; here it is set to
        # vocab_size so that the embedding of a token can be used directly as logits.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, target=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: tensor of token ids of shape (batch_size, block_size).
            target: optional tensor of token ids with the same shape as ``idx``.

        Returns:
            ``(logits, loss)``. Without ``target``, ``logits`` has shape
            (batch_size, block_size, vocab_size) and ``loss`` is None. With
            ``target``, ``logits`` is flattened to
            (batch_size * block_size, vocab_size) and ``loss`` is the
            cross-entropy between the logits and the targets.
        """
        # NOTE: the logits come straight from the embedding table, without any
        # additional model layers in between.
        logits = self.token_embedding_table(idx)  # (batch_size, block_size, vocab_size)

        if target is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits and (N,) targets, so we merge the
        # batch and time dimensions. The class dimension is read from the logits
        # themselves so the model does not depend on a global `vocab_size`.
        logits = logits.view(-1, logits.size(-1))
        target = target.view(-1)

        # The loss tells us how well the model predicts the given targets.
        loss = F.cross_entropy(logits, target)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients, so do not build autograd graphs
    def generate(self, idx, max_next_tokens):
        """Extend each sequence in ``idx`` by ``max_next_tokens`` sampled tokens.

        Args:
            idx: tensor of token ids of shape (batch_size, block_size).
            max_next_tokens: number of tokens to append to every sequence.

        Returns:
            Tensor of shape (batch_size, block_size + max_next_tokens).
        """
        # NOTE: a bigram model only needs the last token to predict the next one.
        # The whole sequence is passed anyway so that the function stays valid
        # for future models that make use of the full context.
        for _ in range(max_next_tokens):
            # Call the module itself (not .forward) so registered hooks still run.
            logits, _loss = self(idx)
            # Only the prediction at the last position is needed.
            last_logit = logits[:, -1, :]  # (batch_size, vocab_size)
            # Turn the logits into a probability distribution over the vocabulary.
            probabilities = F.softmax(last_logit, dim=-1)  # (batch_size, vocab_size)
            # Sample the next token id from that distribution (not an argmax).
            next_token = torch.multinomial(probabilities, num_samples=1)  # (batch_size, 1)
            # Append the new token to every sequence.
            idx = torch.cat([idx, next_token], dim=1)  # (batch_size, current_length + 1)

        return idx
            
model = BigramLanguageModel(vocab_size)
# In PyTorch, every custom model class that inherits from nn.Module must define a forward method. This method specifies the
# computation the model performs when it processes input data.
# When you call model(x_example, y_example), PyTorch automatically invokes the forward method of the model.
logits, loss = model(x_example, y_example)
# Because targets were passed, forward() returns the logits flattened to (batch_size * block_size, vocab_size).
print(logits.shape)
# Loss of the still untrained model on the example batch.
print(loss)

# Now we make an example of how the model generates text. The input is a single token with id 0, the first entry of the
# vocabulary (the vocabulary printout above starts with a line break, so this should be the newline character).
# The batch size will be 1.
input_sequence = torch.zeros(1, 1, dtype=torch.long)
generated_sequence = model.generate(input_sequence, max_next_tokens=100)
generated_sequence_first_batch = generated_sequence[0].tolist() # the batch contains a single sequence, so we take element 0
generated_text = decode(generated_sequence_first_batch)
# The model has not been trained yet, so the generated text is expected to look random.
print(generated_text)


torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [14]:
# Now we will train the model.
# We will use the AdamW optimizer, which is a popular optimizer for training neural networks.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
# The optimizer takes care of updating the model's parameters based on the computed gradients.

# Training loop
# NOTE: each "epoch" below is one optimization step on a single random mini-batch,
# not a full pass over the training data.
max_epochs = 10000
# NOTE: get_batch() reads the module-level `batch_size`, so rebinding it here changes
# the size of every batch sampled from now on.
batch_size = 32
for epoch in range(max_epochs):
    # Get a batch of training data
    x_batch, y_batch = get_batch('train')
    # Forward pass: compute the loss of the model on the batch
    _, loss = model(x_batch, y_batch)
    # Reset the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)
    # Compute the gradients of the loss with respect to the parameters
    loss.backward()
    # Update the parameters of the model
    optimizer.step()

# Print the loss of the last step. This is the loss on one training mini-batch only,
# so it is a noisy estimate and says nothing about the validation data.
print(loss)

tensor(2.5728, grad_fn=<NllLossBackward0>)


In [15]:
# Now we generate text using the trained model, starting again from a single token with id 0.
input_sequence = torch.zeros(1, 1, dtype=torch.long)
generated_sequence = model.generate(input_sequence, max_next_tokens=100)
# The batch contains a single sequence; convert it to a plain list of token ids.
generated_sequence_first_batch = generated_sequence[0].tolist()
generated_text = decode(generated_sequence_first_batch)
print(generated_text)


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y helti


In [16]:
# Here is a small piece of code that shows how to compute the weights of a single-head, causally masked self-attention mechanism
torch.manual_seed(1337)
# NOTE: this rebinds the module-level batch_size and block_size, which get_batch() also reads.
batch_size,  block_size, embedding_size = 4, 8, 32
# We will create a random tensor of shape (batch_size, block_size, embedding_size)
x = torch.randn(batch_size, block_size, embedding_size)

# Attention block
head_size = 16
key = nn.Linear(embedding_size, head_size, bias=False)
query = nn.Linear(embedding_size, head_size, bias=False)
# We apply the key and query linear layers to the input tensor. All the tokens (batch_size x block_size in total) are processed
# in parallel, and each of them produces a key and a query.
k = key(x) # shape: (batch_size, block_size, head_size)
q = query(x) # shape: (batch_size, block_size, head_size)
# We calculate the dot product between every query and every key. The product between a query and a key indicates the affinity
# between the two tokens (the one the query belongs to and the one the key belongs to). These affinities are de facto the
# attention weights, which specify how important a token is for another token.
# N.B. for k we transpose only the last two dimensions, because the first one is the batch, and elements in different batches are independent.
# NOTE(review): no 1/sqrt(head_size) factor is applied here, so this is the unscaled dot product - confirm whether the scaling
# of "scaled dot-product attention" is meant to be added.
weights = q @ k.transpose(-2, -1) # (batch_size, block_size, head_size) @ (batch_size, head_size, block_size) -> (batch_size, block_size, block_size)

# Now we can build the matrix of all the weights in a computationally efficient way, as explained in the iPad notes.
# The lower-triangular mask sets the affinity towards later positions to -inf, so that after the softmax a token
# puts zero weight on the tokens that come after it.
tril = torch.tril(torch.ones(block_size, block_size))
weights = weights.masked_fill(tril == 0, float('-inf'))
# The softmax is applied along the last dimension, so the attention weights of each token sum to 1.
weights = F.softmax(weights, dim=-1)
# NOTE(review): the weights are applied to the raw input x, not to a separate value projection, which is why the output
# keeps the size embedding_size instead of head_size - confirm this is intended.
output = weights @ x # (batch_size, block_size, block_size) @ (batch_size, block_size, embedding_size) -> (batch_size, block_size, embedding_size)
output.shape


torch.Size([4, 8, 32])

In [17]:
weights

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
         [0.0660, 0.089