In [1]:
# The initial step is to import a dataset
!curl -s -O https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [2]:
# Read the whole downloaded file into memory as a single string so it can be inspected and tokenized.
# The encoding is passed explicitly so that decoding does not depend on the platform default.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
# Report the total number of characters in the corpus.
print("length of dataset in characters: ", len(text))
# Preview the first 100 characters to sanity-check what was loaded.
print(text[:100])

length of dataset in characters:  1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
# The vocabulary is every distinct character that occurs in the text. It is kept as a sorted
# list so that the character -> index assignment built from it is the same on every run.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [5]:
# Tokenization: map every character of the vocabulary to an integer index, and build the inverse mapping.
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = dict(enumerate(chars))


# Two helper functions convert a string to a list of indices and back.
def encode(x):
    """Return the list of integer token ids for the characters of the string `x`.

    Raises:
        KeyError: if `x` contains a character that is not in the vocabulary.
    """
    return [char_to_idx[ch] for ch in x]


def decode(x):
    """Return the string spelled by the iterable of integer token ids `x`.

    Raises:
        KeyError: if an id in `x` is not in the vocabulary.
    """
    return ''.join(idx_to_char[i] for i in x)


# Round-trip check: decoding an encoded string must give back the original string.
print(encode('hello'))
print(decode(encode('hello')))

[46, 43, 50, 50, 53]
hello


In [6]:
# Next we encode the whole dataset and store it as a 1-D PyTorch tensor of token ids.
import torch
# dtype=torch.long (int64): the ids are used further down as indices into an embedding table.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.type())
# Show the first 100 token ids, i.e. the encoded form of the first 100 characters of the text.
print(data[:100])

torch.Size([1115394]) torch.LongTensor
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [7]:
# Split the token stream: the first 90% is used for training and the
# remaining 10% is held out as the validation set.
train_size = int(0.9 * len(data))
train_data, val_data = data[:train_size], data[train_size:]

In [8]:
# The context window size (also called block size) is the number of tokens in one training sequence.
block_size = 8
# A chunk of block_size + 1 consecutive tokens contains block_size training examples: predict the
# second token from the first, the third token from the first two, and so on up to the full window.
# This helps the network in two ways: it learns to predict the next token whether the context is
# short or long, and several examples are obtained from a single chunk of text.
x = train_data[:block_size]
y = train_data[1:block_size+1]  # same window moved one token ahead, so y[t] is the token that follows x[t]

for t in range(block_size):
    # context: the first t+1 tokens of x; target: the token that comes right after that context
    context, target = x[:t+1], y[t]
    print(f"when the context is {context} the target is {target}")
    

when the context is tensor([18]) the target is 47
when the context is tensor([18, 47]) the target is 56
when the context is tensor([18, 47, 56]) the target is 57
when the context is tensor([18, 47, 56, 57]) the target is 58
when the context is tensor([18, 47, 56, 57, 58]) the target is 1
when the context is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when the context is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when the context is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [9]:
torch.manual_seed(1337) # fix the RNG seed so the randomly sampled batch below is reproducible
# NOTE(review): `bath_size` looks like a misspelling of "batch_size". It is left unchanged here
# because get_batch reads this global by name.
bath_size = 4 # number of independent sequences that are trained in parallel
block_size = 8 # length of sequences/context size window

# Now we make a function that generates a batch of training data or validation data based on the input
def get_batch(split):
    """Sample a random batch of input blocks and their next-token targets.

    Args:
        split: 'train' selects the training data; any other value selects the validation data.

    Returns:
        A tuple (x, y) of tensors, each of shape (bath_size, block_size). y[b, t] is the
        token that follows x[b, t] in the selected data.
    """
    source = train_data if split == 'train' else val_data
    # Draw one random start offset per sequence. The upper bound of randint is exclusive, so the
    # largest start is source.size(0) - block_size - 1 and the shifted target slice stays in range.
    starts = torch.randint(0, source.size(0) - block_size, (bath_size,))
    # Each start offset yields an input block and a target block that begins one token later.
    inputs = torch.stack([source[s:s+block_size] for s in starts])
    targets = torch.stack([source[s+1:s+block_size+1] for s in starts])
    return inputs, targets
    
# Draw one example batch from the training split and inspect its shape and contents.
x_example, y_example = get_batch('train')
print(f"Input shape: {x_example.shape}, Target shape: {y_example.shape}")
print(f"Inputs: {x_example}")
print(f"Targets: {y_example}")
print("\n-----------------\n")

# For every sequence in the batch, print each (context, target) training example it contains
for batch in range(bath_size):
    for t in range(block_size):
        context = x_example[batch, :t+1] # the first t+1 tokens of this sequence
        target = y_example[batch, t] # the token that follows that context
        print(f"Batch {batch+1}, when the context is {context} the target is {target}")

Input shape: torch.Size([4, 8]), Target shape: torch.Size([4, 8])
Inputs: tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets: tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])

-----------------

Batch 1, when the context is tensor([24]) the target is 43
Batch 1, when the context is tensor([24, 43]) the target is 58
Batch 1, when the context is tensor([24, 43, 58]) the target is 5
Batch 1, when the context is tensor([24, 43, 58,  5]) the target is 57
Batch 1, when the context is tensor([24, 43, 58,  5, 57]) the target is 1
Batch 1, when the context is tensor([24, 43, 58,  5, 57,  1]) the target is 46
Batch 1, when the context is tensor([24, 43, 58,  5, 57,  1, 46]) the target is 43
Batch 1, when the context is tensor([24, 43, 58,  5, 57,  1, 46,

In a **large language model (LLM)**, **logits** are the raw, unnormalized output values produced by the model just before applying a **softmax function** to convert them into probabilities.

We process a batch of token blocks in parallel using the `forward()` method. For each token in each block, the model outputs one logit per token in the vocabulary, i.e. an unnormalized score for every candidate next token; applying softmax to these logits gives the predicted next-token probabilities. Because all sequences and all positions are handled in a single call, the model learns and makes next-token predictions across the entire batch at once.


In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337) # re-seed so the random initialisation of the model created below is reproducible

class BigramLanguageModel(nn.Module):
    """Bigram language model: the next-token logits depend only on the current token.

    The model is a single lookup table of size vocab_size x vocab_size. Row i holds the
    logits (unnormalized scores) over the next token given that the current token is i.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct tokens; used both as the number of rows of the
                table and as the embedding size, so each embedding can be read as logits.
        """
        super().__init__()
        # For each token of the vocabulary there is a corresponding embedding whose values are
        # learned during training. nn.Embedding creates a table of size
        # vocab_size x embedding_size; here embedding_size is chosen equal to vocab_size.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, target=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: tensor of shape (batch_size, block_size) containing token ids.
                batch_size is the number of sequences processed in parallel and
                block_size is the number of tokens in each sequence.
            target: optional tensor with the same shape as idx holding the id of the
                token that follows each position of idx. When None, no loss is computed.

        Returns:
            A tuple (logits, loss).
            With target given: logits has shape (batch_size * block_size, vocab_size)
            and loss is a scalar tensor.
            With target=None: logits keeps its shape (batch_size, block_size, vocab_size)
            and loss is None.
        """
        logits = self.token_embedding_table(idx)  # shape: (batch_size, block_size, vocab_size)

        # Without targets there is nothing to score (e.g. when only predictions are wanted).
        if target is None:
            return logits, None

        # F.cross_entropy expects logits of shape (N, C) and targets of shape (N,), so the batch
        # and position dimensions are flattened together. The class count is read from the
        # logits themselves so the model does not depend on a global variable.
        logits = logits.view(-1, logits.size(-1))
        target = target.view(-1)

        # The loss tells us how well the model predicts the given targets.
        loss = F.cross_entropy(logits, target)

        return logits, loss
    
model = BigramLanguageModel(vocab_size)
# In PyTorch, every custom model class that inherits from nn.Module must define a forward method. This method specifies the
# computation the model performs when it processes input data.
# When you call model(x_example, y_example), PyTorch automatically invokes the forward method of the model.
logits, loss = model(x_example, y_example)
# The logits come back flattened: one row per (sequence, position) pair and one column per vocabulary token.
print(logits.shape)
# Loss of the untrained model. For comparison, a uniform guess over 65 tokens would give ln(65), about 4.17.
print(loss)


torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)
