In [6]:
with open("./input.txt", "r", encoding="utf-8") as infile:
    text = infile.read()

In [7]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [8]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [9]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


### Tokenization

In [10]:
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

print(encode("Hello friend"))
print(decode(encode("Hello friend")))

[20, 43, 50, 50, 53, 1, 44, 56, 47, 43, 52, 42]
Hello friend


Trade-off codebook size vs sequence length:

- Large codebook size i.e vocab size -> Small sequence length
- Small codebook size -> Large sequence length

In [11]:
import torch

data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [14]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [15]:
block_size = 8
train_data[: block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [17]:
x = train_data[:block_size]
y = train_data[1 : block_size + 1]

for t in range(block_size):
    content = x[: t + 1]
    target = y[t]
    print(f"when the input is {content} the target: {target}")

when the input is tensor([18]) the target: 47
when the input is tensor([18, 47]) the target: 56
when the input is tensor([18, 47, 56]) the target: 57
when the input is tensor([18, 47, 56, 57]) the target: 58
when the input is tensor([18, 47, 56, 57, 58]) the target: 1
when the input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when the input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when the input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


Predict all the above sequences in parallel using a transformer model.

Need to also handle the batch dimension in the transformer model.

In [34]:
torch.manual_seed(1337)

batch_size = 4  # how many independent sequences will we process in parallel
block_size = 8  # what is the max context length for predictions?


def get_batch(split):
    # generate a small batch of data of inputs x and y
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    return x, y


xb, yb = get_batch("train")

In [35]:
xb, yb

(tensor([[24, 43, 58,  5, 57,  1, 46, 43],
         [44, 53, 56,  1, 58, 46, 39, 58],
         [52, 58,  1, 58, 46, 39, 58,  1],
         [25, 17, 27, 10,  0, 21,  1, 54]]),
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
         [53, 56,  1, 58, 46, 39, 58,  1],
         [58,  1, 58, 46, 39, 58,  1, 46],
         [17, 27, 10,  0, 21,  1, 54, 39]]))

In [36]:
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, : t + 1]
        target = xb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")
    break

when input is [24] the target: 24
when input is [24, 43] the target: 43
when input is [24, 43, 58] the target: 58
when input is [24, 43, 58, 5] the target: 5
when input is [24, 43, 58, 5, 57] the target: 57
when input is [24, 43, 58, 5, 57, 1] the target: 1
when input is [24, 43, 58, 5, 57, 1, 46] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 43


### Basic model

In [50]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)


class BigramLM(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, targets=None):
        logits = self.embedding(x)  # B, T, C (batch, time, channels)
        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)

            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax
            probs = F.softmax(logits, dim=-1)  # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)

            # concatenate current result to existing idx, and run
            # through the loop again to get the next char.
            idx = torch.cat((idx, idx_next), dim=1)

        return idx


m = BigramLM(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss.item())

# Generate 100 new tokens starting from a single zero token (empty context)
# Then decode the generated sequence back to text using our decode function
print(
    decode(
        m.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()
    )
)

torch.Size([32, 65])
4.878634929656982

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


Expected Loss is: -ln(1/|V|) = -ln(1/65) = 4.17

In [52]:
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

In [60]:
batch_size = 32

for steps in range(10000):
    
    xb, yb = get_batch('train')

    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=None)

    loss.backward()
    optimizer.step()

print(loss.item())

2.399778127670288


In [62]:
print(
    decode(
        m.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=300)[0].tolist()
    )
)


TENTUSBRirce htate hy adfouliepastrou hour tonghour wes ovattilis usere g thartet thalire d rth?
Ay! s,
wodollltalrr,
ORE:
Nonge yif Sleng nofuco mare: theve, hoo eg sth nd spe; acourinuof in onceshachr sesmf r, ithern IINGe thes; ovin, macon oto st
OForitourthe He, ispth athilire arethofand bor que


#### Math Trick for self-attention

In [63]:
torch.manual_seed(1337)

B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

We would like these above 8 tokens to talk to each other i.e couple them. Tokens should only communicate with tokens BEFORE them. This is to ensure information only flows from previous context to the current token.

Easiest way to communicate: 
- Just do an average of all the preceding tokens. 
  - Average, however, is an extremely weak form of interaction. 
  - We've lost information about spatial relationships between tokens.

We will see how to bring this back later.

We'll first implemenet the average method.

In [65]:
# We want x[b, t] = mean_{i<=t} x[b, i]
# Similar to bag-of-words approach
xbow = torch.zeros((B, T, C))

for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        xbow[b, t] = torch.mean(xprev, 0)

In [68]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [69]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [71]:
(-0.3596 + 0.1808)/2

-0.0894

First token has the same value in both x and xbow. However, we see that from the 2nd token, it's an average of the previous token and the current token. 3rd token will be an average of the 1st, 2nd and 3rd token and so on.

**THE TRICK:**

We can be very efficient about this using a matrix multiplication. We can create a matrix that will do this averaging for us. 

In [72]:
torch.manual_seed(42)
a = torch.ones(3,3)
b = torch.randint(0, 10, (3,2)).float()
c = a @ b
print("a=")
print(a)
print("b=")
print(b)
print("c=")
print(c)

a=
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c=
tensor([[14., 16.],
        [14., 16.],
        [14., 16.]])


The trick is using a matrix multiply. This matrix will be a lower triangular matrix(tril) with 1s on the diagonal and 0's above the diagonal.

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])

In [77]:
torch.tril(torch.ones((3,3))) 

tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])

Replacing the value of a with the above

In [78]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3,3))
b = torch.randint(0, 10, (3,2)).float()
c = a @ b
print("a=")
print(a)
print("b=")
print(b)
print("c=")
print(c)

a=
tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c=
tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])


We see that c now has the sum of all the previous tokens. We can now divide by the number of tokens to get the average.

In [91]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3,3))
a = a / torch.sum(a, dim=1, keepdim=True)
b = torch.randint(0, 10, (3,2)).float()
c = a @ b
print("a=")
print(a)
print("b=")
print(b)
print("c=")
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


We see that the final value of c is now correct! Let's implement this for our existing token tensor.

In [None]:
# We want x[b, t] = mean_{i<=t} x[b, i]
# Similar to bag-of-words approach
xbow = torch.zeros((B, T, C))

for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        xbow[b, t] = torch.mean(xprev, 0)

In [95]:
# wei: short for weights
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(dim=1, keepdim=True)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [106]:
xbow2 = wei @ x # (T, T) # (B, T, C)
# Pytorch will create batch dimension for wei i.e (B, T, C) @ (B, T, C)
# Then it will matrix multiply
# Final result shape: (B, T, C)

assert torch.allclose(xbow, xbow2)

xbow[0], xbow2[0]

(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

#### Another way to write the above: Softmax

In [109]:
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float("-inf")) # when the value = 0, replace it with '-inf'
wei = F.softmax(wei, dim=1) 
# this works because e^-inf -> 0, as value of -inf increases.
# hence, it's the same 
xbow3 = wei @ x

assert torch.allclose(xbow, xbow3)
assert torch.allclose(xbow2, xbow3)

xbow[0], xbow2[0], xbow3[0]


(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]))

Softmax is a normalizaed exponential function. It's a way to normalize a set of values to a probability distribution.

This is why the value above, when using a softmax, is the same as creating a tril matrix of 1, normalizing it by the row count and multiplying it with the token tensor.

**Why should we use the softmax version?**

- `wei` begins with 0. Think of it as an interaction string/affinity
  - Tells us how much of the tokens in the past should we consider/aggregate
- However, this line: `wei = wei.masked_fill(tril == 0, float("-inf"))`
  - This line says that we cannot communicate with the future tokens i.e their contribution is 0. 

`wei` is set as 0 by us, but it will not always remain 0. These afffinities will be data dependentant. They will start looking at each other, and some tokens will find other tokens more or less interesting. The amount with which they find each other interesting(called `affinities` for now) will be learned by the model.

TLDR:

- We can do weighted average of the tokens using matrix multiplication: Use a tril matrix and softmax