# NanoGPT

Character-based Transformer model trained over Shakespeare books

In [30]:
import torch
import torch.nn as nn
from torch.nn import functional as F

## Initialization

In [31]:
# We always start with a dataset to train on. Let's read it
text = open('tinyshakespeare.txt', 'r').read()

print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115390


In [32]:
# check for gpu support
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [33]:
# Get all unique characters that our dataset has
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [34]:
# Create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s]
decode = lambda i: ''.join([itos[v] for v in i])

print(encode('hii there'))
print(decode(encode('hii there')))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [35]:
# Encode the entire text dataset
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

torch.Size([1115390]) torch.int64


In [36]:
# Let's now slip the data into a train and a validation dataset
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [37]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [38]:
x = train_data[:block_size+1]
y = train_data[1:block_size+1]

for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print(f'when input is {context}, target is {target}')

when input is tensor([18]), target is 47
when input is tensor([18, 47]), target is 56
when input is tensor([18, 47, 56]), target is 57
when input is tensor([18, 47, 56, 57]), target is 58
when input is tensor([18, 47, 56, 57, 58]), target is 1
when input is tensor([18, 47, 56, 57, 58,  1]), target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), target is 58


In [39]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will be processed in parallel?
block_size = 8 # whats the maximum context length for predictions?

def get_batch(split):
  # generate a small batch of data of inputs x and target y
  data = train_data if split == 'train' else val_data

  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)

  return x,y

xb, yb = get_batch('train')

print('Inputs:')
print(xb.shape)
print(xb)
print("Targets:")
print(yb.shape)
print(yb)
print('---')

for b in range(batch_size):
  for t in range(block_size):
    context = xb[b][:t+1]
    target = yb[b][t]
    print(f'when input is {context.tolist()}, target is {target}')


Inputs:
torch.Size([4, 8])
tensor([[43, 51,  1, 44, 39, 47, 56, 12],
        [46, 53, 53, 42,  1, 46, 43,  1],
        [52, 48, 59, 56, 47, 53, 59, 57],
        [47, 52, 45,  1, 47, 56, 53, 52]])
Targets:
torch.Size([4, 8])
tensor([[51,  1, 44, 39, 47, 56, 12,  0],
        [53, 53, 42,  1, 46, 43,  1, 57],
        [48, 59, 56, 47, 53, 59, 57,  1],
        [52, 45,  1, 47, 56, 53, 52,  8]])
---
when input is [43], target is 51
when input is [43, 51], target is 1
when input is [43, 51, 1], target is 44
when input is [43, 51, 1, 44], target is 39
when input is [43, 51, 1, 44, 39], target is 47
when input is [43, 51, 1, 44, 39, 47], target is 56
when input is [43, 51, 1, 44, 39, 47, 56], target is 12
when input is [43, 51, 1, 44, 39, 47, 56, 12], target is 0
when input is [46], target is 53
when input is [46, 53], target is 53
when input is [46, 53, 53], target is 42
when input is [46, 53, 53, 42], target is 1
when input is [46, 53, 53, 42, 1], target is 46
when input is [46, 53, 53, 42, 1

## Bigram Model

Let's start feeding our batches into neural networks. The simplest possible neural network, which in the case of natural language one could be the bigram model

In [40]:
torch.manual_seed(1337)

class BingramLanguageModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    # idx and targets are both (B, T) tensor of integers
    # in here we "translate" each character into it's embedding
    logits = self.token_embedding_table(idx) # (Batch, Time, Channel)

    if targets == None:
      loss = None
    else:
      # cross_entropy expects (B, C, T)
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits.view(B*T, C), targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      # get the predictions
      logits, loss = self(idx)
      # focus only on the last time step. Hisotry not used because so far it's a bigram model
      logits = logits[:, -1, :] # becomes (B, C)
      # apply softmax to get the probabilities
      probs = F.softmax(logits, dim=1) # (B, C)
      # sample from distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T1)

    return idx

# Create the model
m = BingramLanguageModel(vocab_size)
m = m.to(device)

# Generate from model using 1 batch starting with char 0 (new line)
out = m.generate(torch.tensor([[0]], device=device), max_new_tokens=100)

print(decode(out[0].tolist()))


SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [41]:
# Create Pytorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=0.001)

In [62]:
batch_size = 32
max_iters = 10000
eval_iters = 200
eval_interval = 2000

@torch.no_grad()
def estimate_loss():
  out = {}

  # put our model into evaluation phase
  m.eval()

  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = m(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()

  # resetting our model to training phase
  m.train()

  return out


for step in range(eval_interval):
  # every once in awhile evaluate the loss on train and val set using batches mean
  if step % eval_iters == 0:
    losses = estimate_loss()
    print(f"step {step}/{max_iters}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()


step 0/10000: train loss 2.4400, val loss 2.4826
step 200/10000: train loss 2.4550, val loss 2.4828
step 400/10000: train loss 2.4485, val loss 2.4927
step 600/10000: train loss 2.4527, val loss 2.4841
step 800/10000: train loss 2.4536, val loss 2.4784
step 1000/10000: train loss 2.4503, val loss 2.4848
step 1200/10000: train loss 2.4507, val loss 2.4898
step 1400/10000: train loss 2.4535, val loss 2.4843
step 1600/10000: train loss 2.4592, val loss 2.4899
step 1800/10000: train loss 2.4554, val loss 2.4811


In [63]:
# fetch from model again

# Generate from model using 1 batch starting with char 0 (new line)
out = m.generate(torch.tensor([[0]], device=device), max_new_tokens=600)

print(decode(out[0].tolist()))


Wherorusoorerf by byothotht y go ie moserexis.

Wedatthierth?

NO: cake teerersomp'dve o theshe charher lay ht Is athanoutod, s ond my ah wak tosooupo,
CAUKELoul-be imanson pis!'boimours'tu ppous CHemy d ameakeairce
Clt r s t men MESTHAnoug, me
Whickeat wisouthapr ar l.
Ad laspay antar been then'coeoarer, AESond, froukint usord uy m, acheaid, y Cuse
I s twoudin keay athoully'sel lif tharery avorshe very hortho antelee f t ftof chefons ise, an lakefouitlat tadack ES: yo my Sirvedenor,
Wime i
DO:

IOUS S:

I iowhe thiveme thee n drro ofofecherent cu sa ceancedenof tithay me ng mpo thitiothavy ou


# Mathematical Trick on Self-Attention

Before jumping into transformers, it's handy to understand a trick that's crucial on the attention mechanism

In [64]:
# consider the following toy example:
torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B, T, C)

In [67]:
# efficient way to compute average using matrix multiplication
torch.manual_seed(1337)
a=torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3,2)).float()
c = a @ b
print('a=')
print(a)
print('b=')
print(b)
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b=
tensor([[5., 7.],
        [2., 0.],
        [5., 3.]])
c=
tensor([[5.0000, 7.0000],
        [3.5000, 3.5000],
        [4.0000, 3.3333]])


In [72]:
# using the latter
wei = torch.tril(torch.ones(T, T))
wei = wei / torch.sum(wei, 1, keepdim=True)
xbox = wei @ x # (T, T) @ (B, T, C) --> (B, T, C)

# which is the same
tril = torch.tril(torch.ones(T, T))
wei2 = torch.zeros(T, T)
wei2 = wei2.masked_fill(tril == 0, float('-inf'))
wei2 = F.softmax(wei2, dim=0)
xbox2 = wei2 @ x

In [77]:
torch.mean(torch.tensor([[1,2,3], [4,5,6]], dtype=torch.float), 0)

tensor([2.5000, 3.5000, 4.5000])

## Enhancing Our Model

We can see that our model improved, but obviously it's not good. In the Bigram model the tokens are not talking to each other. One char is only being redicted based on the previous one. We need to start making talk and use the whole context!