<a href="https://colab.research.google.com/github/gut-puncture/gpt-2-shakespeare/blob/main/gpt_2_shakespeare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-07-29 17:33:42--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-07-29 17:33:42 (39.9 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [227]:
#training params
batch_size = 32 #8 inputs given to the model in one training step
block_size = 8 #maximum context length for prediction or chunk size
max_iters = 10000
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
torch.manual_seed(1337)

<torch._C.Generator at 0x79d9803ab230>

In [2]:
#read text to inspect
with open('input.txt', 'r', encoding='utf-8') as f:
  text = f.read()

In [3]:
len(text)

1115394

In [9]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(len(chars))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [19]:
#char and integer token id lookup dicts
s_to_i = {}
i_to_s = {}
for pair in enumerate(chars):
  pair = list(pair)
  s_to_i[pair[1]] = pair[0]
  i_to_s[pair[0]] = pair[1]

In [29]:
def encode(string):
  return [s_to_i[char] for char in string]

def decode(integer_list):
  return ''.join([i_to_s[integer] for integer in integer_list])

In [35]:
#creating tensor for entire text
import torch
data = torch.tensor(encode(text), dtype=torch.long)
data.shape, data.dtype
data[:100]

In [51]:
#train-test split
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [119]:
torch.manual_seed(1337)
block_size = 8 #max size of chunk for training
batch_size = 4

def get_batch(split):
  #generate a small batch of inputs x and targets y
  data = train_data if split == 'train' else val_data
  index = torch.randint(len(data) - block_size, (batch_size,))
  x = []
  y = []
  for i in index:
    x.append(data[i:i+block_size])
    y.append(data[i+1:i+block_size+1])
  x = torch.stack(x)
  y = torch.stack(y)
  return x, y

#FOR UNDERSTANDING: training example
for b in range(batch_size):
  for t in range(block_size):
    context = xb[b][:t+1]
    target = yb[b,t]
    print(context, target)

In [232]:
@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X,Y)
      losses[k] = loss.item() #.item simply gets the loss value out of the loss object which is cross_entropy object
    out[split] = losses.mean() #diff losses for train and validation set
  model.train()
  return out



In [228]:
#putting the input in a Bigram Language Model


class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    #idx and targets are are both (B,T) tensor of integers
    logits = self.token_embedding_table(idx) #shape is (Batch, block/chunk size, vocab size);

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self, idx, max_new_tokens):
  #idx is of the shape (B,T) but the T is not necessarily equal to the batch size (8).
  #this is just a way of saying that the input taken for generating text by the model is exactly like the input taken for training i.e.
  #a bunch of tokens in a sequence
    for i in range(max_new_tokens):
      logits, loss = self(idx)

      logits = logits[:, -1, :] #becomes [B,C] since we only extract the logits of the last token in the input
      #apply softmax
      probs = F.softmax(logits, dim=-1) #becomes [B, C]
      #sample from the distribution
      idx_next = torch.multinomial(probs, num_samples =1) #(B,1) as one of the token ids is selected as per the probs
      idx = torch.cat((idx, idx_next), dim=1) #(B, T+1)
    return idx


m = BigramLanguageModel(vocab_size)
logits, loss = m(xb,yb)
print(logits.shape)

torch.Size([256, 65])


In [217]:
#optimizer object
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [221]:
batch_size = 32
max_iters = 10000
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
for iter in range(max_iters):

  #sample text
  xb, yb = get_batch('train')

  #evaluate the loss
  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

print(loss.item())

2.474918842315674


In [None]:
idx = torch.zeros((1,1), dtype=torch.long) #one batch with one token long batch size and the token is \n since that is token id 0
print(decode(m.generate(idx, max_new_tokens=500)[0].tolist()))

#Self Attention

In [233]:
#self_attention mask understanding example
torch.manual_seed(1337)
B, T, C = 4,8,2 #Batches, Context/Block/Chunk length or the time dimension, Channels or information stored for each token
x = torch.randn(B,T,C)
x

tensor([[[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]],

        [[ 1.3488, -0.1396],
         [ 0.2858,  0.9651],
         [-2.0371,  0.4931],
         [ 1.4870,  0.5910],
         [ 0.1260, -1.5627],
         [-1.1601, -0.3348],
         [ 0.4478, -0.8016],
         [ 1.5236,  2.5086]],

        [[-0.6631, -0.2513],
         [ 1.0101,  0.1215],
         [ 0.1584,  1.1340],
         [-1.1539, -0.2984],
         [-0.5075, -0.9239],
         [ 0.5467, -1.4948],
         [-1.2057,  0.5718],
         [-0.5974, -0.6937]],

        [[ 1.6455, -0.8030],
         [ 1.3514, -0.2759],
         [-1.5108,  2.1048],
         [ 2.7630, -1.7465],
         [ 1.4516, -1.5103],
         [ 0.8212, -0.2115],
         [ 0.7789,  1.5333],
         [ 1.6097, -0.4032]]])

In [240]:
xbow = torch.zeros(B,T,C)
for b in range(B):
  for t in range(T):
    xprev = x[b,:t+1] #(t,C) #this is a rudimentary "bag of words" example where we average out the logits for each token before a token. real attention blocks will not average
    xbow[b,t] = torch.mean(xprev,0)

print(xbow)

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])


In [None]:
#we want to ensure that the tokens can only look at the tokens which came before them to get context.
#When we're serving a string of text to the model, we will ensure that each token can only look at the tokens befor it.
#This is done using some mathematical insights:
#1. we can take a triangular matrix with half of the matrix being zeroes and the other half filled.
#   Then we take the weights of each token and create a mask where we mark each element in the upper triangle as -inf
#   Applying softmax to this

In [250]:
tril = torch.tril(torch.ones((T,T)))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = -1)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])