In [1]:
# Read the whole corpus file into memory as one string.
with open("input.txt", 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [2]:
len(text)  # ~1M characters

1115394

In [3]:
text[:1000]

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak this in hunger 

In [4]:
# Character-level vocabulary: every distinct character in `text`, sorted.
chars = sorted(set(text))
vocab_size = len(chars)

# Show the vocabulary itself and then its size, one per line.
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


# Tokenization

In [5]:
# Lookup tables between characters and integer token ids.
# Plain dicts are enough: entries are only ever looked up by key.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encode a string into a list of integer token ids, one per character.

    The returned list keeps the order of the characters in `s`.
    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decode a sequence of integer token ids back into a string.

    Raises KeyError if `l` contains an id that is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)


# Round-trip sanity check: decoding the encoding gives the original string back.
print(encode("Hello, world!"))
print(decode(encode("Hello, world!")))

[20, 43, 50, 50, 53, 6, 1, 61, 53, 56, 50, 42, 2]
Hello, world!


## Other ways
- SentencePiece
- tiktoken

In [6]:
import tiktoken

enc = tiktoken.get_encoding("gpt2")

In [7]:
enc.n_vocab  # ~50k, ours is 65

50257

In [8]:
enc.encode("Hello, world!")

[15496, 11, 995, 0]

In [9]:
enc.decode([15496, 11, 995, 0])

'Hello, world!'

## Prepare Dataset

Tokenize/Encode our training set

In [10]:
import torch

data = torch.tensor(encode(text), dtype=torch.long)

data.shape, data.dtype

  cpu = _conversion_method_template(device=torch.device("cpu"))


(torch.Size([1115394]), torch.int64)

In [12]:
data[:1000]  # is 1 big 1D tensor

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

Split into Train/Validation

In [14]:
# Train on the first 90% of the tokens; hold out the remaining 10% for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

## Batches
Training on the whole dataset at once is expensive <br>
So we divide it into chunks with some max sequence length (AKA context size or block size; this is the time dimension `T`, not the batch dimension)

In [18]:
block_size = 8
train_data[:block_size+1]  # +1 cuz we want to predict that one, given the first `block_size` chars

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

We predict at each position (from 0 to block_size - 1) <br>
This is not just for efficiency: it also gets the Transformer used to seeing contexts of every length from 1 to block_size

In [19]:
# One chunk of block_size+1 tokens yields block_size training examples:
# the input at position t is everything up to and including x[t], the target is y[t].
x = train_data[:block_size]       # inputs
y = train_data[1:block_size+1]    # targets: the inputs shifted left by one token
for t in range(block_size):
    ctx, target = x[:t+1], y[t]
    print(f"Given {ctx}, predict {target}")

Given tensor([18]), predict 47
Given tensor([18, 47]), predict 56
Given tensor([18, 47, 56]), predict 57
Given tensor([18, 47, 56, 57]), predict 58
Given tensor([18, 47, 56, 57, 58]), predict 1
Given tensor([18, 47, 56, 57, 58,  1]), predict 15
Given tensor([18, 47, 56, 57, 58,  1, 15]), predict 47
Given tensor([18, 47, 56, 57, 58,  1, 15, 47]), predict 58


Batch = many sequences stacked on top of each other (as GPUs are good at parallel processing, we'll process these sequences in a batch in parallel (all at once)) <br>
Make a batch:

In [23]:
torch.manual_seed(1337)

batch_size = 4  # number of independent sequences to process in parallel
block_size = 8  # max len of each sequence

def get_batch(split):
    """Sample a random batch of input/target sequences.

    `split` equal to 'train' draws from `train_data`; any other value draws
    from `val_data`. Reads the module-level `batch_size` and `block_size`.

    Returns a pair (inputs, targets), each of shape (batch_size, block_size),
    where targets[b, t] is the token that follows inputs[b, t] in the source.
    """
    source = train_data if split == 'train' else val_data

    # Random start offsets. randint's upper bound is exclusive, so the largest
    # start is len(source) - block_size - 1 and the target slice below, which
    # ends at start + block_size + 1 (exclusive), still fits inside `source`.
    starts = torch.randint(len(source) - block_size, (batch_size, ))

    # Each input row is a window of block_size tokens; the matching target row
    # is the same window shifted one token to the right.
    inputs = torch.stack([source[s:s+block_size] for s in starts])
    targets = torch.stack([source[s+1:s+block_size+1] for s in starts])

    return inputs, targets

xb, yb = get_batch("train")
print('Inputs: ', xb.shape)
print('Targets: ', yb.shape)
print('-------')

# Spell out every training example packed into the batch:
# one (context, target) pair per sequence b and per position t.
for b in range(batch_size):
    seq_in, seq_out = xb[b], yb[b]  # b-th input sequence and its targets
    for t in range(block_size):
        ctx = seq_in[:t+1]  # slice end is exclusive, so this is positions 0..t
        target = seq_out[t]
        print(f"Given {ctx}, predict {target}")


Inputs:  torch.Size([4, 8])
Targets:  torch.Size([4, 8])
-------
Given tensor([24]), predict 43
Given tensor([24, 43]), predict 58
Given tensor([24, 43, 58]), predict 5
Given tensor([24, 43, 58,  5]), predict 57
Given tensor([24, 43, 58,  5, 57]), predict 1
Given tensor([24, 43, 58,  5, 57,  1]), predict 46
Given tensor([24, 43, 58,  5, 57,  1, 46]), predict 43
Given tensor([24, 43, 58,  5, 57,  1, 46, 43]), predict 39
Given tensor([44]), predict 53
Given tensor([44, 53]), predict 56
Given tensor([44, 53, 56]), predict 1
Given tensor([44, 53, 56,  1]), predict 58
Given tensor([44, 53, 56,  1, 58]), predict 46
Given tensor([44, 53, 56,  1, 58, 46]), predict 39
Given tensor([44, 53, 56,  1, 58, 46, 39]), predict 58
Given tensor([44, 53, 56,  1, 58, 46, 39, 58]), predict 1
Given tensor([52]), predict 58
Given tensor([52, 58]), predict 1
Given tensor([52, 58,  1]), predict 58
Given tensor([52, 58,  1, 58]), predict 46
Given tensor([52, 58,  1, 58, 46]), predict 39
Given tensor([52, 58,  1,

In [24]:
xb

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])

# Modeling

## Bigram
The simplest model

In [58]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

class BigramLM(nn.Module):
    """Bigram language model: the next token is predicted from the current token only.

    The whole model is one (vocab_size x vocab_size) embedding table. Row `i`
    holds the logits (unnormalised scores) for the token that follows token `i`.
    Training adjusts these scores.
    """

    def __init__(self, vocab_size):
        """Create the lookup table of shape (C, C), where C is `vocab_size`."""
        super().__init__()
        self.token_emb_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: integer token ids of shape (B, T).
            targets: optional integer token ids of shape (B, T). Omitted
                during generation, supplied during training.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C) and
            loss is the cross-entropy over all B*T positions.
        """
        # Each token id selects its row of the table: (B, T) -> (B, T, C).
        logits = self.token_emb_table(idx)

        if targets is None:
            return logits, None

        # F.cross_entropy() expects the class dimension C to be second, so
        # flatten batch and time into one dimension and keep C intact.
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)

        # Negative log-likelihood of the correct next token. Minimising it
        # raises the logit of the correct target at each position.
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip building autograd graphs
    def generate(self, idx, max_new_tokens):
        """Sample `max_new_tokens` tokens, appending them to `idx`.

        Args:
            idx: integer token ids of shape (B, T) used as the starting context.
            max_new_tokens: number of tokens to sample for each sequence.

        Returns:
            Tensor of shape (B, T + max_new_tokens): the original context
            followed by the sampled tokens.

        Note: the full sequence is fed through the model on every step even
        though a bigram model only uses the last token. This is kept on
        purpose so the same loop works for models that look further back.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)  # (B, T, C); loss is None without targets

            # Keep only the last time step: (B, T, C) -> (B, C).
            logits = logits[:, -1, :]
            # Softmax turns the logits into a probability distribution.
            probs = F.softmax(logits, dim=-1)

            # Sample one next token per sequence: (B, 1).
            idx_next = torch.multinomial(probs, num_samples=1)
            # Append it to the running context: (B, T) -> (B, T+1).
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

In [59]:
m = BigramLM(vocab_size)
logits, loss = m(xb, yb)
logits.shape, loss

(torch.Size([32, 65]), tensor(4.8786, grad_fn=<NllLossBackward0>))

In [60]:
# Start from a single sequence holding a single token: shape (B, T) = (1, 1).
# Token id 0 is the first entry of `chars`.
initial = torch.zeros((1, 1), dtype=torch.long)

# Sample 100 new tokens, take the first (and only) sequence of the batch,
# (B, T+100) -> (T+100), and convert it from a tensor to a plain Python list.
generated_text = m.generate(initial, max_new_tokens=100)[0].tolist()

decode(generated_text)  # gives random gibberish as we've not trained it

"\nSr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3"

# Training

In [61]:
# simplest: SGD optimizer
# AdamW is much more advanced and popular, works well
# typical good learning rate: 3e-4
# but for very small models, can get away w/ using higher learning rates
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [69]:
batch_size = 32  # get_batch reads this module-level value

for steps in range(10_000):
    # Clear gradients left over from the previous step.
    optimizer.zero_grad(set_to_none=True)

    # Forward pass on a freshly sampled batch of random sequences.
    xb, yb = get_batch("train")
    logits, loss = m(xb, yb)

    # Backward pass: compute the gradient of the loss w.r.t. each parameter,
    # then let the optimizer update the parameters.
    loss.backward()
    optimizer.step()

# Loss of the last batch only.
print(loss.item())

2.4567582607269287


In [74]:
initial = torch.zeros((1, 1), dtype=torch.long)
generated_text = m.generate(initial, max_new_tokens=500)[0].tolist()
print(decode(generated_text))


HEThal ban, a hevese,
FOLorere, tungsed o.
Ande ht all com f?
foud sas ggord d brd
Ayay,
Wh, whan uspre pl lllurs he ff arame,
BR:
Fo nnglld avexquteecet d anove! s wn Sck,
n iroke.
Whe INasa 'd gnt. thack; tusllthe nd azelsen s fudarave k:d; blinon ancoud h thend t y ntino brthese bufrandef t prro,
TENERLThalld aveno ofrland Oreillongsthador ginthe l my'epr
DI illlore

Four nd ar, by'd tand e t th he:
A py the LINTola veeden the fonokerimeds, w RLO: iongeak.
Sor ce pose os ammachin.'s.
the be
G
