# Load

In [1]:
with open("data/input.txt",'r', encoding="utf-8") as f:
    text = f.read()

print(f"Text length {len(text)}")

chars = sorted(list(set(text)))
vocab_size = len(chars)

Text length 1115394


# Tokenizer

In [2]:
import torch

# Character to Integer
ctoi = {ch:i for i,ch in enumerate(chars)}
# Integer to Character
itoc = {i:ch for i,ch in enumerate(chars)}


encode = lambda string: [ctoi[c] for c in string]
decode = lambda list: ''.join([itoc[i] for i in list])

print(encode("Hello World"))


data = torch.tensor(encode(text), dtype = torch.long)

[20, 43, 50, 50, 53, 1, 35, 53, 56, 50, 42]


# Train Test split

In [3]:
n = int(0.9 * len(data))
train_data = data[:n]
test_data = data[n:]

# Model

In [7]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# HyperParams
block_size = 8
batch_size = 32
max_iters = 3000
eval_interval = 300
learning_rate = 1e-3
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_iters = 200 
n_embed = 32
# Reproducibility
torch.manual_seed(1337)


def get_bath(split):
    data = train_data if split == "train" else test_data
    ix = torch.randint( len(data) - block_size, (batch_size,)  )
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


xb, yb = get_bath("train")


@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()

    for split in ["train","test"]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_bath(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ self attention Layer """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)

        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size))) # Not a parameter, so a buffer

    def forward(self, x):
        B,T,C = x.shape
        k =  self.key(x)
        q = self.query(x)

        wei = q @ k.transpose(-2,-1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T,:T]==0, float('-inf'))
        wei = F.softmax(wei, dim = -1)

        v = self.value(x)

        out = wei @ v

        return out


class MultiHeadAttention(nn.Module):
    """multiple heads of SA in parallel"""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

    def forward(self,x):
        return torch.cat([h(x) for h in self.heads], dim = -1)



class BigramModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed) # (V, E). Call takes (,) to (, ,E)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        # self.sa_head = Head(n_embed) # For self attention = sa
        if (n_embed % 4 != 0):
            self.sa_head = Head(n_embed)
            print("Using single SA head...")
        self.sa_head = MultiHeadAttention(4, int(n_embed / 4))
        self.linear_head = nn.Linear(n_embed, vocab_size) # (E, V)

    def forward(self, idx, targets = None):
        B, T = idx.shape # B = batch size, T = # of context tokens

        # idx is B,T
        token_embeddings = self.token_embedding_table(idx) # (B, T, E)
        position_embeddings = self.position_embedding_table( torch.arange(T, device = device) ) # (T, E)

        x = token_embeddings + position_embeddings # broadcast to (B,T,E)

        x = self.sa_head(x)

        logits = self.linear_head(x) # (B, T, V)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # Because of S.A., we can only use block size number of tokens
            idx_cond = idx[:, -block_size:]

            # predictions
            logits, loss = self(idx_cond) #model call
            logits = logits[:,-1,:] # (B,C)
            probs = F.softmax(logits, dim = -1) # (B,C)
            idx_next = torch.multinomial(probs, num_samples = 1) # Probability sample
            idx = torch.cat((idx, idx_next), dim = 1) # Concat
        return idx
    

model = BigramModel()
m = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)


for iter in range(max_iters):

    if iter % eval_interval == 0:
        losses = estimate_loss()
        ltr = losses["train"]
        ltst = losses["test"]
        print(f"Step {iter} train loss {ltr:.4f}, test loss {ltst:.4f}")

    xb, yb = get_bath("train")

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()


8
Step 0 train loss 4.1734, test loss 4.1750
Step 300 train loss 2.8502, test loss 2.8686
Step 600 train loss 2.6358, test loss 2.6615
Step 900 train loss 2.5512, test loss 2.5605
Step 1200 train loss 2.5059, test loss 2.5103
Step 1500 train loss 2.4535, test loss 2.4653
Step 1800 train loss 2.4204, test loss 2.4357
Step 2100 train loss 2.3946, test loss 2.3938
Step 2400 train loss 2.3660, test loss 2.3864
Step 2700 train loss 2.3499, test loss 2.3732


# Text Generation

In [8]:
context = torch.zeros((1,1), dtype = torch.long, device = device)
print(decode(
    m.generate(context, max_new_tokens=500)[0].tolist()
))


Thourrmet hu, deinor be my anng: this, frre horwh se of sen you then thandell sop.

HAlle mobaice kasotpingot therte Thisht, tha mimis by.
To heler Iten may hur-se isbdow, danf whe-onay bivare myrare werrer aviirt be,
Myoush, ad bor on, wand ses
Tee, bere bpre refsacese plave hern, bonkrusonde whess sun thome yon move theas beat!
The shing, gie il the oth:
In.

He E:
Bathy gorme pve kist thalje bhele.

Fre Rif ing non glorder dose tastes ote, peffamanoup ole eotsst she couky we; fererim hand icu


## Scripting

In [19]:
torch.manual_seed(1337)

B,T,C = 4,8,32
# Bath of 4
# 8 Tokens
x = torch.randn(B,T,C)

# Query: What we look for
# Key: What we find
head_size = 16
key = nn.Linear(C, head_size, bias = False) # Layer takes C channels into head_size ones
query = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias= False)

k = key(x) # B, T, hs
q = query(x) # B, T, hs

wei = q @ k.transpose(-2, -1) # B,T,hs @ B,hs,T  --> B, T, T

# Only look at past tokens
tril = torch.tril(torch.ones(T,T))
# wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = 1)

v = value(x)

out = wei @ v

print("Output is same shape as input but takes into account previous tokens. Model will learn that some tokens are correlated to previous tokens.")
print(out.shape)

# x private information to a token
# query - what I am interested in as a token
# key - what information I have as a token
# value -  what I am comunicated


Output is same shape as input but takes into account previous tokens. Model will learn that some tokens are correlated to previous tokens.
torch.Size([4, 8, 16])
