In [2]:
# Read the entire training corpus file into memory as one Python string.
# The path is relative, so the notebook must be run from the directory containing `data/`.
with open('data/s_char/tiny_shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
# Sanity check: corpus length in characters, followed by its first 100 characters.
print(f"length of text = {len(text)}")
print(text[:100])

length of text = 1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
# Show the vocabulary (comma separated) and then its size on the next line
print(','.join(chars), f"Vocab Size = {vocab_size}", sep='\n')


, ,!,$,&,',,,-,.,3,:,;,?,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z
Vocab Size = 65


In [6]:
# Lookup tables between characters and their integer ids (position in `chars`)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encode a string to a list of integers (one id per character).

    Raises KeyError for a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decode a list of integers back to a string.

    Raises KeyError for an id that is not in the vocabulary.
    """
    return ''.join(itos[i] for i in l)

In [32]:
# Round-trip check: show the ids for a sample string, then confirm that decoding
# those ids gives the original string back.
print(encode("Hello world"))
print(decode(encode("Hello world")))

[20, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
Hello world


In [7]:
# Sub-word tokenizer --> used by GPT2,3...
import tiktoken
enc = tiktoken.get_encoding('gpt2')
# n_vocab = len(enc._mergeable_ranks) + len(enc._special_tokens)
print(f"Vocabulary = {enc.n_vocab}")
tokens = enc.encode('hello i am waveboarding??')

# Merge the regular and the special tokens into one token -> id table,
# then invert it so an id can be mapped back to the token it stands for.
all_tokens = dict(enc._mergeable_ranks)
all_tokens.update(enc._special_tokens)
token_mapping = {tkn_id: tkn for tkn, tkn_id in all_tokens.items()}

# Show which sub-word each id of the sample sentence corresponds to
for tkn_id in tokens:
    print(f"{tkn_id}: {token_mapping.get(tkn_id, 'N/A')}")

Vocabulary = 50257
31373: b'hello'
1312: b' i'
716: b' am'
6769: b' wave'
27794: b'boarding'
3548: b'??'


In [9]:
# Tokenize the entire text and store into torch tensor
import torch
# Encode the whole corpus with the character-level `encode` and keep the ids
# in a single 1-D int64 tensor.
data = torch.tensor(encode(text), dtype=torch.long)
print(f"Shape: {data.shape}, Dtype: {data.dtype}")
print(data[:100]) ## Looks like 0 = \n, 1 = ' '

Shape: torch.Size([1115394]), Dtype: torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [10]:
# Contiguous train/validation split: the first 90% of the token stream is used
# for training and the remaining tail for validation (no shuffling).
tv_split = int(0.9 * len(data))
train_data, val_data = data[:tv_split], data[tv_split:]
print(f"Split Index = {tv_split}, Training len = {len(train_data)}, Validation len = {len(val_data)}")

Split Index = 1003854, Training len = 1003854, Validation len = 111540


In [11]:
block_size = 8  ### Context Window [Max. Context length for predictions]
# Display block_size + 1 tokens: the extra token is the target that follows
# the longest (block_size-long) context.
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [8]:
# Get the examples from `block_size+1` tokens of data: every prefix of the chunk
# is a context and the token right after that prefix is its target, so one chunk
# gives `block_size` (context, target) examples.
# Two-array version, kept for reference:
# x = train_data[:block_size]  ### Input
# y = train_data[1:block_size+1] ### Output labels for the above inputs
# for t in range(block_size):
#     context = x[:t+1]
#     target = y[t]
#     print(f"Context = {context} => Target = {target}")

# Do the same with one array!
chunk = train_data[:block_size+1]
for t in range(block_size):
    # context = tokens 0..t of the chunk, target = the token at position t+1
    context = chunk[:t+1]
    target = chunk[t+1]
    print(f"Context = {context} => Target = {target}")


Context = tensor([18]) => Target = 47
Context = tensor([18, 47]) => Target = 56
Context = tensor([18, 47, 56]) => Target = 57
Context = tensor([18, 47, 56, 57]) => Target = 58
Context = tensor([18, 47, 56, 57, 58]) => Target = 1
Context = tensor([18, 47, 56, 57, 58,  1]) => Target = 15
Context = tensor([18, 47, 56, 57, 58,  1, 15]) => Target = 47
Context = tensor([18, 47, 56, 57, 58,  1, 15, 47]) => Target = 58


In [13]:
def get_batch(split, batch_size):
    """Sample a random batch of input/label sequences from one of the data splits.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.
        batch_size: number of sequences in the batch (processed in parallel).

    Returns:
        (x, y): two tensors with one `block_size`-long sequence per row, where
        y holds the same windows as x shifted one position to the right.
    """
    source = val_data if split != 'train' else train_data
    # One random window start per sequence; the upper bound leaves room for the
    # extra label token at the end of the window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, labels = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        labels.append(source[start + 1:stop + 1])
    # Each sample is stacked as a row
    return torch.stack(inputs), torch.stack(labels)

In [11]:
# Adding the batch dimension to the examples
# Seed torch's global random generator so the random offsets drawn by
# `get_batch` (via torch.randint) are the same on every run.
torch.manual_seed(1337) ## To get repeatability!

def print_batch(batch_size, inputs=None, targets=None):
    """Print every (context, target) example contained in a batch.

    For each of the first `batch_size` rows and each time step t, the context is
    the row's tokens 0..t of `inputs` and the target is `targets[row, t]`.

    Args:
        batch_size: number of rows (sequences) to print.
        inputs: 2-D tensor of input tokens, one sequence per row. Defaults to
            the notebook-level `xb`.
        targets: 2-D tensor of label tokens aligned with `inputs`. Defaults to
            the notebook-level `yb`.

    Raises:
        IndexError: if `batch_size` exceeds the number of rows, or `targets`
            has fewer columns than `inputs`.
    """
    # Fall back to the notebook-level batch so that existing
    # `print_batch(batch_size=...)` calls keep working unchanged.
    if inputs is None:
        inputs = xb
    if targets is None:
        targets = yb
    # Take the sequence length from the tensor itself rather than from the
    # global `block_size`, so the batch being printed fully defines the output.
    num_steps = inputs.shape[1]
    for b in range(batch_size):
        for t in range(num_steps):
            context = inputs[b, :t+1]
            target = targets[b, t]
            print(f"Batch {b}, Block/Time {t} => Context = {context} => Target = {target}")

# Draw one small training batch (4 sequences), show the tensor shapes, then list
# every (context, target) example the batch contains.
xb, yb = get_batch('train', batch_size=4)
print(f"Shapes: xb = {xb.shape}, yb = {yb.shape}")
print_batch(batch_size=4)


Shapes: xb = torch.Size([4, 8]), yb = torch.Size([4, 8])
Batch 0, Block/Time 0 => Context = tensor([24]) => Target = 43
Batch 0, Block/Time 1 => Context = tensor([24, 43]) => Target = 58
Batch 0, Block/Time 2 => Context = tensor([24, 43, 58]) => Target = 5
Batch 0, Block/Time 3 => Context = tensor([24, 43, 58,  5]) => Target = 57
Batch 0, Block/Time 4 => Context = tensor([24, 43, 58,  5, 57]) => Target = 1
Batch 0, Block/Time 5 => Context = tensor([24, 43, 58,  5, 57,  1]) => Target = 46
Batch 0, Block/Time 6 => Context = tensor([24, 43, 58,  5, 57,  1, 46]) => Target = 43
Batch 0, Block/Time 7 => Context = tensor([24, 43, 58,  5, 57,  1, 46, 43]) => Target = 39
Batch 1, Block/Time 0 => Context = tensor([44]) => Target = 53
Batch 1, Block/Time 1 => Context = tensor([44, 53]) => Target = 56
Batch 1, Block/Time 2 => Context = tensor([44, 53, 56]) => Target = 1
Batch 1, Block/Time 3 => Context = tensor([44, 53, 56,  1]) => Target = 58
Batch 1, Block/Time 4 => Context = tensor([44, 53, 56,

In [18]:
# Bigram language model

import torch
import torch.nn as nn
from torch.nn import functional as F

# Re-seed torch's global random generator so the steps below that draw from it
# (batch sampling, token sampling) are repeatable when this cell is re-run.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single lookup table.

    The only parameter is a `vocab_size x vocab_size` embedding table: the row
    for a token holds the logits for the token that follows it, so every
    prediction depends on the current token alone.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Each token gets the logits for the next token from the lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Look up next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids, B x T (batch_size x block_size).
            targets: optional integer tensor of next-token ids, B x T.

        Returns:
            (logits, loss). Without targets, logits is B x T x C and loss is
            None. With targets, logits is flattened to (B*T) x C and loss is
            the scalar cross-entropy over all B*T positions.
        """
        logits = self.token_embedding_table(idx) # B x T x C [vocab_size]
        if targets is None:
            return logits, None
        # F.cross_entropy wants the class dimension second (B x C x T) for
        # multi-dimensional input, so flatten batch and time into one axis.
        # reshape (not view) so non-contiguous tensors are accepted too.
        B, T, C = logits.shape
        logits = logits.reshape(B * T, C)
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per step
    def generate(self, idx, num_tokens):
        """Extend each sequence in `idx` by `num_tokens` sampled tokens.

        Args:
            idx: integer tensor of token ids, B x T, used as the starting text.
            num_tokens: how many tokens to append to every sequence.

        Returns:
            Integer tensor B x (T + num_tokens): the input followed by the
            sampled continuation.
        """
        for _ in range(num_tokens):
            logits, _loss = self(idx) ## logits --> B x T x C
            # focus on last time step (only the last character)
            logits = logits[:, -1, :] ## --> B x C
            # softmax == exp(logits) normalised so each row sums to 1
            probs = F.softmax(logits, dim=-1) ## --> B x C
            # Sample from the probability distribution to get the next idx.
            idx_next = torch.multinomial(probs, num_samples=1) ## --> B x 1
            idx = torch.cat((idx, idx_next), dim=1) ## --> B x T+1
        return idx


# Build the (untrained) model and run one small batch through it
m = BigramLanguageModel(vocab_size)
xb, yb = get_batch('train', batch_size=4)
logits, loss = m(xb, yb)
# Reference value: a model spreading probability uniformly over the n tokens of
# the vocabulary scores -ln(1/n) = ln(n), which is the number printed last.
print(f"Logits Shape = {logits.shape}, Loss = {loss}, Expected loss = -ln(1/n) = {-1 * torch.log(torch.tensor(1/vocab_size))}")

idx = torch.zeros((1,1), dtype=torch.long) ## Create the initial 'text' to generate the continuation --> Using 0 = \n
tokens = m.generate(idx, num_tokens=100)
# tokens is 1 x 101 (the start token plus 100 sampled ones); decode the single row
print(decode(tokens[0].tolist()))


Logits Shape = torch.Size([32, 65]), Loss = 4.705134868621826, Expected loss = ln(1/n) = 4.174387454986572

P-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3!dcb


In [19]:
# create an optimizer object: AdamW over all parameters of the model `m`,
# with a learning rate of 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3) # instead of torch.optim.SGD

In [21]:
batch_size = 32 # Increase batch size to 32
for step in range(10000): ## `n` steps
    # Drop the gradients computed for the previous step
    optimizer.zero_grad(set_to_none=True)
    # Fresh random training batch, xb = B x T
    xb, yb = get_batch('train', batch_size=batch_size)
    logits, loss = m(xb, yb)
    loss.backward()
    # Change the weights based on the gradients
    optimizer.step()
    # Report the loss of the current batch every 1000 steps
    if not step % 1000:
        print(f"Step {step}, Loss = {loss.item()}")

# Loss of the very last batch
print(loss.item())

Step 0, Loss = 2.436946392059326
Step 1000, Loss = 2.48052716255188
Step 2000, Loss = 2.426077127456665
Step 3000, Loss = 2.4889259338378906
Step 4000, Loss = 2.3345420360565186
Step 5000, Loss = 2.4978251457214355
Step 6000, Loss = 2.440619945526123
Step 7000, Loss = 2.3754453659057617
Step 8000, Loss = 2.4478201866149902
Step 9000, Loss = 2.5060737133026123
2.5448358058929443


In [17]:
# Regenerate after training for 100+1000+10000 cycles/steps
# NOTE(review): the step counts above describe earlier runs of the training cell;
# only a 10000-step loop is visible in this notebook -- confirm before relying on them.
idx = torch.zeros((1,1), dtype=torch.long) ## Create the initial 'text' to generate the continuation --> Using 0 = \n
tokens = m.generate(idx, num_tokens=100)
print(decode(tokens[0].tolist()))



ARCKICOMave wap

I RO:
Banleenoalit-blt
INRon

UM: nd kngonesll;
O: pa heore 'ga llis?-sur inidind;



In [135]:
# Basically, checking how cross_entropy is implemented under the hood~
# Deterministic batch: the first 4 consecutive, non-overlapping windows of `data`
xb = torch.stack([data[i:i+block_size] for i in range(0,4*block_size,block_size) ]) # Each sample is stacked as a row!
yb = torch.stack([data[i+1:i+block_size+1] for i in range(0,4*block_size,block_size) ])
print(f"Shapes: {xb.shape}, {yb.shape}")
logits = m.token_embedding_table(xb)
print(f"Logits Shape = {logits.shape}")
B,T,C = logits.shape
loss = F.cross_entropy(logits.view(B*T, C), yb.view(B*T)) # Softmax followed by NLL Loss
print(f"Cross Entropy Loss = {loss.item()}")

# Computing the Softmax in the next three lines
# Subtracting the per-position max is for numerical stability, probs dont change
logits_max = logits - logits.max(2, keepdim=True).values
counts = logits_max.exp() # counts, equivalent to N
probs = counts / counts.sum(2, keepdim=True) # probabilities for next character / Softmax
logprobs = torch.log(probs)
log_softmax = F.log_softmax(logits, dim=2)
print(f"Log Probs Shape = {logprobs.shape}, All Close = {torch.allclose(logprobs, log_softmax)} -- Targets Shape = {yb.shape}")

# Negative log-likelihood: pick the log-prob of each target with a one-hot mask
# and average over the B*T positions.
# num_classes must be given explicitly: otherwise one_hot infers the width from
# the largest id present in yb, which only equals C when the batch happens to
# contain the highest token id, and the multiply below fails to broadcast.
xloss = -1 * torch.sum(logprobs * F.one_hot(yb, num_classes=C)) / (B * T)
print(f"Computed Loss = {xloss.item()}, {torch.isclose(loss, xloss)}")

Shapes: torch.Size([4, 8]), torch.Size([4, 8])
Logits Shape = torch.Size([4, 8, 65])
Cross Entropy Loss = 2.570775270462036
Log Probs Shape = torch.Size([4, 8, 65]), All Close = True -- Targets Shape = torch.Size([4, 8])
Computed Loss = 2.570775032043457, True


In [23]:
def compute_loss(
    logits: torch.Tensor, targets: torch.Tensor | None = None) -> torch.Tensor | None:
    """Compute loss for the predicted logits v/s the targets"""
    if targets is None:
        loss = None
    else:
        # loss = F.cross_entropy(logits, targets) # Does not work because pytorch needs B * C * T for multi-dimensional array
        # So, Reshaping so that cross_entropy works
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
    return loss
