In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-04-20 01:27:20--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-04-20 01:27:21 (11.4 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [1]:
with open(r"input.txt", 'r+') as f:
    text = f.read()

print(text[:10])

First Citi


In [2]:
char = sorted(set(text))
char = "".join(char)

vocab_size = len(char)
print(char)
print(len(char))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [3]:
# Building a simple tokenizer based on characters.

stoi = { ch:i for i, ch in enumerate(char)}
itos = { i:ch for i, ch in enumerate(char)}

print('a', stoi['a'], itos[stoi['a']])

# Encoding and decoding functions
encode = lambda s: [stoi[ch] for ch in s] # Takes a string and output integers
decode = lambda s: ''.join([itos[i] for i in s]) # Takes a list of integers as input and outpu a string

print(encode('hello'))
print(decode(encode('hello')))

a 39 a
[46, 43, 50, 50, 53]
hello


In [4]:
import torch

data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.size)
print(data[:10])

torch.Size([1115394]) <built-in method size of Tensor object at 0x7d8f79bd6840>
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])


In [5]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [6]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [7]:
# Eg

x = train_data[:block_size]
y = train_data[1:block_size+1]

for i in range(block_size):
    context = x[:i+1]
    target = y[i]

    print(context, target)

tensor([18]) tensor(47)
tensor([18, 47]) tensor(56)
tensor([18, 47, 56]) tensor(57)
tensor([18, 47, 56, 57]) tensor(58)
tensor([18, 47, 56, 57, 58]) tensor(1)
tensor([18, 47, 56, 57, 58,  1]) tensor(15)
tensor([18, 47, 56, 57, 58,  1, 15]) tensor(47)
tensor([18, 47, 56, 57, 58,  1, 15, 47]) tensor(58)


In [8]:
torch.manual_seed(1337)

batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    
    ix = torch.randint(0, len(data) - block_size, (batch_size,))            # Generating random posoitions and getting the data
    x = torch.stack([data[i:i+block_size] for i in ix])     # Stacking in rows
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])

    return x, y

xb, yb = get_batch('train')
print("inputs: ")
print(xb.shape)
print(xb)

print("Targets: ")
print(yb.shape)
print(yb)

inputs: 
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
Targets: 
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [9]:
# To see the batch

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(context, target)

tensor([24]) tensor(43)
tensor([24, 43]) tensor(58)
tensor([24, 43, 58]) tensor(5)
tensor([24, 43, 58,  5]) tensor(57)
tensor([24, 43, 58,  5, 57]) tensor(1)
tensor([24, 43, 58,  5, 57,  1]) tensor(46)
tensor([24, 43, 58,  5, 57,  1, 46]) tensor(43)
tensor([24, 43, 58,  5, 57,  1, 46, 43]) tensor(39)
tensor([44]) tensor(53)
tensor([44, 53]) tensor(56)
tensor([44, 53, 56]) tensor(1)
tensor([44, 53, 56,  1]) tensor(58)
tensor([44, 53, 56,  1, 58]) tensor(46)
tensor([44, 53, 56,  1, 58, 46]) tensor(39)
tensor([44, 53, 56,  1, 58, 46, 39]) tensor(58)
tensor([44, 53, 56,  1, 58, 46, 39, 58]) tensor(1)
tensor([52]) tensor(58)
tensor([52, 58]) tensor(1)
tensor([52, 58,  1]) tensor(58)
tensor([52, 58,  1, 58]) tensor(46)
tensor([52, 58,  1, 58, 46]) tensor(39)
tensor([52, 58,  1, 58, 46, 39]) tensor(58)
tensor([52, 58,  1, 58, 46, 39, 58]) tensor(1)
tensor([52, 58,  1, 58, 46, 39, 58,  1]) tensor(46)
tensor([25]) tensor(17)
tensor([25, 17]) tensor(27)
tensor([25, 17, 27]) tensor(10)
tensor([25

In [10]:
## Bigram Language Model

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets = None):
        logits = self.token_embedding_table(idx) #(B, T, C) (Batchm, time, channel)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape

            logits = logits.view(B*T, C) # (B*T, C)  # For getting loss we need to do this as we cannot feed b,t,c directly
            targets = targets.view(B*T) # (B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens=100):
        for _ in range(max_new_tokens):
            # get the prediction
            logits, loss = self(idx)
            logits = logits[:, -1, :] # (B, C)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1)
            idx = torch.cat([idx, idx_next], dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)

print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)



Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [11]:
optimizer = torch.optim.Adam(m.parameters(), lr = 1e-3)

In [12]:
batch_size = 32

for steps in range(10000):

    #Sample batch
    xb, yb = get_batch('train')

    # Forward pass
    logits, loss = m(xb, yb)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # if steps % 10 == 0:
    #     print(f"Loss at step {steps}: {loss.item():.4f}")

print(loss.item())

2.572469472885132


In [13]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y helti


#### The Mathematical trick in sekf-attention

In [14]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [15]:
torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

We are trying to couple in a way that info frmo prev go to future. We cannot have leakage so that info from, future available in past. We can average so that last one by one.
So we may lose some data but for now it is okay.

In [71]:
# We want x[b,t] = mean_{i<=t} x[b,i] we mean so that that will be for last 2
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t,C)
        xbow[b,t] = torch.mean(xprev, 0)


In [75]:
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)
torch.allclose(xbow, xbow2)  # True 

False

In [76]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)


False

In [82]:
# completed video till: 01:02 

In [None]:
# version 4: self-attention!  Decoder block

torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)

# let's see a single Head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)
wei =  q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

tril = torch.tril(torch.ones(T, T))
#wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [26]:
out

tensor([[[-2.5103e-02, -2.3612e-01, -4.5171e-01,  4.7792e-01,  5.0296e-01,
          -4.0769e-01, -1.2294e-01, -4.6356e-02,  3.1703e-01,  2.4524e-01,
          -1.9162e-01,  5.3467e-01,  3.7165e-01,  2.1231e-01,  5.9100e-01,
           1.8384e-01],
         [ 4.9167e-01, -5.6461e-01, -5.9052e-01,  4.4887e-01,  7.7757e-01,
          -7.6121e-01,  1.4188e-02,  1.6654e-01,  4.7274e-01,  2.1897e-01,
          -5.3842e-01,  5.1860e-01,  4.4462e-01,  4.9806e-01,  7.5010e-01,
           1.0307e-01],
         [ 1.5293e-01,  1.3120e-02, -3.4630e-01,  2.0143e-01,  2.3533e-01,
          -2.3894e-01,  1.7949e-02,  1.8566e-01, -9.8037e-02, -1.6844e-01,
           5.3766e-02,  1.8995e-01,  5.0373e-02, -1.0214e-01, -4.0242e-02,
           6.5572e-01],
         [ 9.3663e-02,  3.5403e-01, -1.0914e-01, -9.4043e-02,  1.0841e-01,
           2.5260e-01,  5.8558e-02, -6.2868e-02, -3.8557e-01, -7.3532e-02,
           8.6651e-02, -8.5347e-02, -2.2574e-01, -1.9268e-01,  1.0780e-01,
           6.0662e-01],
    

In [None]:
# version 4: self-attention!  encoder block used for sentiment classification

torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)

# let's see a single Head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)
wei =  q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)


tril = torch.tril(torch.ones(T, T))
#wei = torch.zeros((T,T))

# If encoder block so what we do is don'y use trill to mask so we have all the matrics rather than triangular matrics as we need for whole and will just generate single output embeddings.
# wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [25]:
out

tensor([[[-2.5103e-02, -2.3612e-01, -4.5171e-01,  4.7792e-01,  5.0296e-01,
          -4.0769e-01, -1.2294e-01, -4.6356e-02,  3.1703e-01,  2.4524e-01,
          -1.9162e-01,  5.3467e-01,  3.7165e-01,  2.1231e-01,  5.9100e-01,
           1.8384e-01],
         [ 4.9167e-01, -5.6461e-01, -5.9052e-01,  4.4887e-01,  7.7757e-01,
          -7.6121e-01,  1.4188e-02,  1.6654e-01,  4.7274e-01,  2.1897e-01,
          -5.3842e-01,  5.1860e-01,  4.4462e-01,  4.9806e-01,  7.5010e-01,
           1.0307e-01],
         [ 1.5293e-01,  1.3120e-02, -3.4630e-01,  2.0143e-01,  2.3533e-01,
          -2.3894e-01,  1.7949e-02,  1.8566e-01, -9.8037e-02, -1.6844e-01,
           5.3766e-02,  1.8995e-01,  5.0373e-02, -1.0214e-01, -4.0242e-02,
           6.5572e-01],
         [ 9.3663e-02,  3.5403e-01, -1.0914e-01, -9.4043e-02,  1.0841e-01,
           2.5260e-01,  5.8558e-02, -6.2868e-02, -3.8557e-01, -7.3532e-02,
           8.6651e-02, -8.5347e-02, -2.2574e-01, -1.9268e-01,  1.0780e-01,
           6.0662e-01],
    