# 2025-02-12
## Plan
- Get the tiny shakespeare dataset
- Build encoding and decoding logic
- Visualize what context_size is
- Train test/ val split
- Buld get_batch function
- Build bigram model that uses an embedding of vocab size

In [1]:
import torch as t
import requests as r
import torch.nn as nn
import torch.optim as optim

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [2]:
dataset_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
data = r.get(dataset_url)
data = data.text
print(data[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [3]:
unique_chars = sorted(set(list(data)))
c_to_i = { c: i for i, c in enumerate(unique_chars) }
i_to_c = { i: c for i, c in enumerate(unique_chars) }

def encode(chars):
    return [c_to_i[char] for char in chars]

def decode(ints):
    return ''.join(list(i_to_c[i] for i in ints))

print(encode("hiii"))
print(decode(encode("hiiii")))

[46, 47, 47, 47]
hiiii


In [4]:
data_tensor = t.LongTensor(encode(data))
data_tensor

tensor([18, 47, 56,  ..., 45,  8,  0])

In [5]:
data_tensor.shape

torch.Size([1115394])

In [6]:
context_size = 8
x = data_tensor[0:context_size+1]

for i in range(1, len(x)):
    print(f"Context for {x[i]} is {x[:i]}")

Context for 47 is tensor([18])
Context for 56 is tensor([18, 47])
Context for 57 is tensor([18, 47, 56])
Context for 58 is tensor([18, 47, 56, 57])
Context for 1 is tensor([18, 47, 56, 57, 58])
Context for 15 is tensor([18, 47, 56, 57, 58,  1])
Context for 47 is tensor([18, 47, 56, 57, 58,  1, 15])
Context for 58 is tensor([18, 47, 56, 57, 58,  1, 15, 47])


In [7]:
num_train = int(0.8 * len(data_tensor))
train, test = data_tensor[:num_train], data_tensor[num_train:]

In [8]:
len(train), len(test)

(892315, 223079)

In [9]:
train, test

(tensor([18, 47, 56,  ..., 39, 58,  1]),
 tensor([63, 53, 59,  ..., 45,  8,  0]))

In [10]:
batch_size = 4

def get_batch(split):
    to_batch = train if split == "train" else test
    batch_start_idx = t.randint(len(to_batch) - context_size, (4, ))
    x = t.stack([to_batch[i:i+context_size] for i in batch_start_idx])
    y = t.stack([to_batch[i+1:i+1+context_size] for i in batch_start_idx])
    
    return x, y

x, y = get_batch('train')
x, y

(tensor([[50, 43,  6,  0, 35, 46, 43, 56],
         [63,  0, 40, 43, 52, 41, 46, 43],
         [61, 53, 59, 50, 42,  1, 58, 43],
         [ 1, 46, 53, 52, 53, 59, 56,  5]]),
 tensor([[43,  6,  0, 35, 46, 43, 56, 43],
         [ 0, 40, 43, 52, 41, 46, 43, 56],
         [53, 59, 50, 42,  1, 58, 43, 50],
         [46, 53, 52, 53, 59, 56,  5, 42]]))

In [70]:
for batch_idx in range(batch_size):
    for t_idx in range(context_size):
        print(f"Batch {batch_idx}: Context for {y[batch_idx, t_idx]} is {x[batch_idx, :t_idx]}")

Batch 0: Context for 19 is tensor([], dtype=torch.int64)
Batch 0: Context for 24 is tensor([0])
Batch 0: Context for 27 is tensor([ 0, 19])
Batch 0: Context for 33 is tensor([ 0, 19, 24])
Batch 0: Context for 15 is tensor([ 0, 19, 24, 27])
Batch 0: Context for 17 is tensor([ 0, 19, 24, 27, 33])
Batch 0: Context for 31 is tensor([ 0, 19, 24, 27, 33, 15])
Batch 0: Context for 32 is tensor([ 0, 19, 24, 27, 33, 15, 17])
Batch 1: Context for 53 is tensor([], dtype=torch.int64)
Batch 1: Context for 59 is tensor([41])
Batch 1: Context for 57 is tensor([41, 53])
Batch 1: Context for 47 is tensor([41, 53, 59])
Batch 1: Context for 52 is tensor([41, 53, 59, 57])
Batch 1: Context for 1 is tensor([41, 53, 59, 57, 47])
Batch 1: Context for 22 is tensor([41, 53, 59, 57, 47, 52])
Batch 1: Context for 59 is tensor([41, 53, 59, 57, 47, 52,  1])
Batch 2: Context for 47 is tensor([], dtype=torch.int64)
Batch 2: Context for 58 is tensor([59])
Batch 2: Context for 57 is tensor([59, 47])
Batch 2: Context fo

In [96]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.vocab_embedding_dict = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, targets=None):
        # I don't love how this function returns a different shape for logits depending on targets
        # x: (B, T)
        # targets: (B, T)
        logits = self.vocab_embedding_dict(x) # (B, T, C)
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # (B* T, C)
            targets = targets.view(B*T) # (B*T, 1)
            loss = nn.functional.cross_entropy(logits, targets)
        else:
            loss = None

        return logits, loss

    def generate(self, idxs, max_num_tokens):
        # do one step of idxs (B, T) until max tokens
        # that means you do a forward pass, and pick the max probability
        # -ln(p) lower => p is higher, sanity check for p = 1 :check:
        
        for i in range(max_num_tokens):
            logits, _ = self(idxs)
            # get the logits of the last time step
            logits = logits[:, -1, :]
            # get the max
            probs = nn.functional.softmax(logits, dim=-1) # (B, C)
            idx_next = t.multinomial(probs, num_samples=1) # (B, 1)
            idxs = t.cat([idxs, idx_next], dim=1) # (B, T+ 1)
        return idxs

In [97]:
model = BigramLanguageModel(len(i_to_c))
model(x, y) # expect loss of around -ln(1/vocab_size) negative log prob

(tensor([[ 0.3907, -0.2744,  0.2176,  ..., -2.2814,  1.7432, -0.6710],
         [ 0.3483, -0.2885,  0.0237,  ..., -0.4732,  0.5566,  1.4295],
         [ 0.5494, -0.5782, -0.3410,  ...,  1.4509, -0.3217, -0.7092],
         ...,
         [-0.1415,  0.6276,  0.2061,  ..., -0.5163,  1.2369, -1.5646],
         [ 0.1698, -1.7079,  1.5011,  ...,  1.5102,  0.3420, -0.6705],
         [-0.6856, -0.5750, -0.0708,  ...,  1.1784,  0.9538,  1.7870]],
        grad_fn=<ViewBackward0>),
 tensor(4.4908, grad_fn=<NllLossBackward0>))

In [98]:
# expected loss
-t.log(t.Tensor([1 / len(i_to_c)]))

tensor([4.1744])

In [99]:
model.generate(x, 4)

tensor([[ 0, 19, 24, 27, 33, 15, 17, 31, 35, 18, 33,  8],
        [41, 53, 59, 57, 47, 52,  1, 22, 63, 16, 44, 43],
        [59, 47, 58, 57,  1, 46, 47, 58, 11,  3, 59, 61],
        [42,  1, 44, 43, 58, 41, 46,  1,  6, 48, 17, 47]])

In [100]:
y

tensor([[19, 24, 27, 33, 15, 17, 31, 32],
        [53, 59, 57, 47, 52,  1, 22, 59],
        [47, 58, 57,  1, 46, 47, 58, 46],
        [ 1, 44, 43, 58, 41, 46,  1, 63]])

# 2025-02-13
## Plan
- Get the computed forward pass and decode it
- Write a training loop with the Adam optimizer
- Move everything to a script
- Write a self attention block

In [106]:
zero_idxs = t.zeros((1,1), dtype=t.long)
tokens_to_generate = 1000
idxs = model.generate(zero_idxs, tokens_to_generate)
idxs

tensor([[ 0, 35, 34, 24, 33, 28,  5, 64, 38, 39, 50,  3, 14, 26, 49, 24, 58, 44,
         44, 45, 36, 28, 45, 38, 26,  6, 30, 24, 43, 36,  1, 61, 43, 25, 17, 13,
          8, 62, 20, 26, 10, 11, 17, 36, 62, 31,  2, 49,  0, 38, 47, 26, 64, 53,
         15, 28, 54, 64, 47, 53, 26, 18, 57, 26, 17,  9, 11, 39,  7,  5, 11, 63,
         48, 40, 61, 32, 33, 39, 33, 11, 64, 45, 20, 59,  3, 56, 10, 22, 64, 28,
          8, 36, 28, 57, 27, 19, 29, 32, 53,  9, 63]])

In [113]:
decode(idxs[0].tolist())

"\nWVLUP'zZal$BNkLtffgXPgZN,RLeX weMEA.xHN:;EXxS!k\nZiNzoCPpzioNFsNE3;a-';yjbwTUaU;zgHu$r:JzP.XPsOGQTo3y"

In [116]:
# one liner
decode(model.generate(t.zeros((1,1), dtype=t.long), tokens_to_generate)[0].tolist())

"\nKvACBHPZ &$:alsMGrvZ'cM\nfWV&bR,,LQJFrMlRZNqetwPIihwwH&&Dy &Rk;ugA'jCLGJ&$ zKjqtVePNHPojpDfD!zYqyjRme"

In [123]:
lr = 1e-3
optimizer = optim.Adam(model.parameters(), lr=lr)

for i in range(10000):
    optimizer.zero_grad(set_to_none=True)
    x, y = get_batch('train')
    _, loss = model.forward(x, y)
    loss.backward()
    optimizer.step()
    if i % 1000 == 0:
        print(f"loss: {loss}")

loss: 2.6867167949676514
loss: 2.3886053562164307
loss: 2.1273231506347656
loss: 2.8246164321899414
loss: 2.5422158241271973
loss: 2.576198101043701
loss: 2.6322779655456543
loss: 2.833005666732788
loss: 2.434377431869507
loss: 2.2865514755249023


In [126]:
tokens_to_generate = 500
print(decode(model.generate(t.zeros((1,1), dtype=t.long), tokens_to_generate)[0].tolist()))


An

Pou'll thay; CEESANRD m the;
AUS: toigof cke feniren; yof ioll d as aghothipl,
AMENoat tlid toung bur hous be w'den a s?
HAs hetrware.
Maltonen I py ovoung teang burger.
Wh at ar cr m thetu angleng s Inowive wen minoutwis, BRinethis f m nor ERDe or s, hy wharorokn dothel' houe he ot finsor,

T:

AUTithesod w de hont t houlyouly t sthar:
ELINoun st y my ur marthe thithy col he fiote g, wit bil, plert s

LADr k ivenghyotinges whoure;

Fithishireaingupovemprad; ca ano me, stondo s batheeands:
R


Now to implement the attention blocks. First get some understanding of what we're trying to do
Each word is trying to build context of the words around it
simplest way is to average all teh tokens before you

In [22]:
B, T, C = 4, 8, 2
x = t.rand(B, T, C)
x.shape


torch.Size([4, 8, 2])

In [24]:
x[0]

tensor([[0.6900, 0.3497],
        [0.3722, 0.0837],
        [0.7150, 0.2682],
        [0.3834, 0.4518],
        [0.3707, 0.3285],
        [0.1733, 0.5982],
        [0.8305, 0.3529],
        [0.9904, 0.6499]])

In [80]:
avg1 = []
for batch in x:
    avg1.append(t.stack([batch[:i + 1, :].sum(dim=0) / (i + 1) for i in range(len(x[0]))]))
    
avg1 = t.stack(avg1)
avg1.shape

torch.Size([4, 8, 2])

In [81]:
avg1[0], x[0]

(tensor([[0.6900, 0.3497],
         [0.5311, 0.2167],
         [0.5924, 0.2338],
         [0.5401, 0.2883],
         [0.5063, 0.2964],
         [0.4508, 0.3467],
         [0.5050, 0.3476],
         [0.5657, 0.3854]]),
 tensor([[0.6900, 0.3497],
         [0.3722, 0.0837],
         [0.7150, 0.2682],
         [0.3834, 0.4518],
         [0.3707, 0.3285],
         [0.1733, 0.5982],
         [0.8305, 0.3529],
         [0.9904, 0.6499]]))

In [82]:
wei = t.tril(t.ones(T, T))
wei /= wei.sum(dim=1, keepdim=True)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [83]:
avg2 = wei @ x
avg2

tensor([[[0.6900, 0.3497],
         [0.5311, 0.2167],
         [0.5924, 0.2338],
         [0.5401, 0.2883],
         [0.5063, 0.2964],
         [0.4508, 0.3467],
         [0.5050, 0.3476],
         [0.5657, 0.3854]],

        [[0.4816, 0.2730],
         [0.5606, 0.5247],
         [0.6979, 0.4842],
         [0.5526, 0.5649],
         [0.6234, 0.5597],
         [0.6425, 0.5787],
         [0.6249, 0.6259],
         [0.5481, 0.6575]],

        [[0.2985, 0.5692],
         [0.3423, 0.2985],
         [0.3372, 0.2056],
         [0.3000, 0.2645],
         [0.4126, 0.2850],
         [0.4041, 0.2488],
         [0.4826, 0.2492],
         [0.4450, 0.2248]],

        [[0.7437, 0.9064],
         [0.4657, 0.5460],
         [0.5133, 0.5500],
         [0.5575, 0.6121],
         [0.4882, 0.6471],
         [0.5685, 0.6253],
         [0.5439, 0.6299],
         [0.5142, 0.6156]]])

In [84]:
t.allclose(avg2, avg1)

True

In [88]:
tril = t.tril(t.ones(T, T))
wei = t.zeros((T, T))
wei = wei.masked_fill(tril == 0, -t.inf)
wei = nn.functional.softmax(wei, dim=1)

In [89]:
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [90]:
avg3 = wei @ x
t.allclose(avg1, avg3)

True