In [1]:
with open("data/input.txt") as f:
    text = f.read()

In [2]:
f"Length of characters: {len(text)}"

'Length of characters: 1115394'

In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
f"Vocab size: {vocab_size}", f"All characters: {''.join(chars)}"

('Vocab size: 65',
 "All characters: \n !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")

In [4]:
stoi = {c:i for i, c in enumerate(chars)}
itos = {i:c for i, c in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda i: ''.join(itos[k] for k in i)

In [5]:
encode("hi there"), decode(encode("hi there"))

([46, 47, 1, 58, 46, 43, 56, 43], 'hi there')

In [6]:
import torch

In [7]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [8]:
n = int(0.85 * len(data))

train_data = data[:n]
val_data = data[n:]

train_data.shape, val_data.shape

(torch.Size([948084]), torch.Size([167310]))

In [9]:
block_size = 16
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43])

### Taking all previous inputs as context for predicting the target

##### Obtaining the next token given all previous tokens. It is sampled from the probability distribution that maps all context tokens and a distribution of next token probabilities is generated, where a sampling strategy picks the next token
$x_{i+1} ∼ q(x_0, x_1, ..., x_i)$

##### Probability of a token being output can be obtained as the conditional probability of the trained distribution given all the context tokens
$p(x_{i+1}) = q(x_{i+1}|x_0, x_1, ..., x_i)$

In [10]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"t = {t}, context = {context}, target = {target}")

t = 0, context = tensor([18]), target = 47
t = 1, context = tensor([18, 47]), target = 56
t = 2, context = tensor([18, 47, 56]), target = 57
t = 3, context = tensor([18, 47, 56, 57]), target = 58
t = 4, context = tensor([18, 47, 56, 57, 58]), target = 1
t = 5, context = tensor([18, 47, 56, 57, 58,  1]), target = 15
t = 6, context = tensor([18, 47, 56, 57, 58,  1, 15]), target = 47
t = 7, context = tensor([18, 47, 56, 57, 58,  1, 15, 47]), target = 58
t = 8, context = tensor([18, 47, 56, 57, 58,  1, 15, 47, 58]), target = 47
t = 9, context = tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47]), target = 64
t = 10, context = tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64]), target = 43
t = 11, context = tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43]), target = 52
t = 12, context = tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52]), target = 10
t = 13, context = tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10]), target = 0
t = 14, context = tensor([18, 47

## Creating Batches

##### Batch Size is the number of batches per example

##### Block size is the maximum context length/window for predicting

In [11]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

In [12]:
def get_batch(split):
    data = train_data if split == "train" else val_data

    ix = torch.randint(len(data) - block_size, (batch_size,))

    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])

    return x, y

In [13]:
xb, yb = get_batch("train")
xb.shape, yb.shape

(torch.Size([4, 8]), torch.Size([4, 8]))

In [14]:
xb, yb

(tensor([[53, 61, 57, 10,  0, 20, 43,  1],
         [39, 41, 43, 42,  1, 58, 46, 43],
         [52, 41, 43,  8,  0,  0, 24, 17],
         [26, 33, 31, 10,  0, 25, 53, 57]]),
 tensor([[61, 57, 10,  0, 20, 43,  1, 58],
         [41, 43, 42,  1, 58, 46, 43,  1],
         [41, 43,  8,  0,  0, 24, 17, 27],
         [33, 31, 10,  0, 25, 53, 57, 58]]))

In [15]:
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"Input is {context}, target is {target}")

Input is tensor([53]), target is 61
Input is tensor([53, 61]), target is 57
Input is tensor([53, 61, 57]), target is 10
Input is tensor([53, 61, 57, 10]), target is 0
Input is tensor([53, 61, 57, 10,  0]), target is 20
Input is tensor([53, 61, 57, 10,  0, 20]), target is 43
Input is tensor([53, 61, 57, 10,  0, 20, 43]), target is 1
Input is tensor([53, 61, 57, 10,  0, 20, 43,  1]), target is 58
Input is tensor([39]), target is 41
Input is tensor([39, 41]), target is 43
Input is tensor([39, 41, 43]), target is 42
Input is tensor([39, 41, 43, 42]), target is 1
Input is tensor([39, 41, 43, 42,  1]), target is 58
Input is tensor([39, 41, 43, 42,  1, 58]), target is 46
Input is tensor([39, 41, 43, 42,  1, 58, 46]), target is 43
Input is tensor([39, 41, 43, 42,  1, 58, 46, 43]), target is 1
Input is tensor([52]), target is 41
Input is tensor([52, 41]), target is 43
Input is tensor([52, 41, 43]), target is 8
Input is tensor([52, 41, 43,  8]), target is 0
Input is tensor([52, 41, 43,  8,  0]),

## Implementing the Bi-gram model

In [16]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

<torch._C.Generator at 0x110b1fd50>

In [17]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(
            vocab_size,
            vocab_size
        )

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)

            targets = targets.view(B*T)

            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1,:]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
    

model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)
logits.shape, loss

(torch.Size([32, 65]), tensor(4.9670, grad_fn=<NllLossBackward0>))

In [18]:
encoded = model.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()
decoded = decode(encoded)
decoded

'\nSKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp\nwnYWmnxKWWev-tDqXErVKLgJ'

In [19]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [21]:
batch_steps = 32

for steps in range(10000):
    xb, yb = get_batch("train")

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    print(f"At {steps} step, the loss is: {loss.item()}")

At 0 step, the loss is: 4.45281457901001
At 1 step, the loss is: 4.821342945098877
At 2 step, the loss is: 4.780389785766602
At 3 step, the loss is: 4.539166450500488
At 4 step, the loss is: 4.395447254180908
At 5 step, the loss is: 4.530900001525879
At 6 step, the loss is: 4.790666103363037
At 7 step, the loss is: 4.204006195068359
At 8 step, the loss is: 4.515846252441406
At 9 step, the loss is: 4.532397747039795
At 10 step, the loss is: 4.280624866485596
At 11 step, the loss is: 4.683449745178223
At 12 step, the loss is: 4.556112766265869
At 13 step, the loss is: 4.445948123931885
At 14 step, the loss is: 4.601198673248291
At 15 step, the loss is: 4.653167247772217
At 16 step, the loss is: 4.711243152618408
At 17 step, the loss is: 4.617818832397461
At 18 step, the loss is: 4.65828800201416
At 19 step, the loss is: 4.478277206420898
At 20 step, the loss is: 4.747836112976074
At 21 step, the loss is: 4.626686096191406
At 22 step, the loss is: 4.774348735809326
At 23 step, the loss is

In [22]:
encoded = model.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()
decoded = decode(encoded)
decoded

"\nA'Te.\njs\nbeZvimm? mad antho hayerstomy, h IFOWk, whercqHassclly.\nING pe in, is, ounnsto?pa vig m BGB"