In [3]:
# Load the data set
# The encoding is passed explicitly: without it open() falls back to the platform's
# locale encoding, so the same file could decode differently (or fail) on another machine.
# NOTE: `input` shadows the Python builtin of the same name; the name is kept because
# later cells read the corpus through it.
with open('input.txt', 'r', encoding='utf-8') as f:
  input = f.read()
# every distinct character of the corpus, in sorted order, is one vocabulary entry
chars = sorted(set(input))
vocab_size = len(chars)
print(chars)
print(vocab_size)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65


In [4]:
# simple tokenizer: character <-> integer id lookup tables plus encode/decode helpers
itos = dict(enumerate(chars))
stoi = {ch: ix for ix, ch in itos.items()}

def encode(s):
  """Turn a string into the list of integer ids of its characters."""
  return [stoi[c] for c in s]

def decode(l):
  """Turn a list of integer ids back into the string they spell."""
  return ''.join(itos[i] for i in l)

# round trip: the second line must print the original text again
print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [5]:
# turn the whole corpus into a single 1-D tensor of token ids
import torch
# pick the GPU when one is visible to torch, otherwise stay on the CPU
device = 'cpu'
if torch.cuda.is_available():
  device = 'cuda'
print(device)
data = torch.tensor(encode(input))
print(data.shape, data.dtype)
# only the first 1000 ids are shown; the full tensor is far too long to print
print(data[:1000])

cpu
torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
  

In [6]:
# split into train and val sets: the first 90% of the ids train, the last 10% validate
# learning: this split is simple, the encoded corpus is just cut in two at one index.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [7]:
# data loader
"""
learning: notice the difference between the organization of this data and that of the makemore (i.e., name) data.
The makemore data is simpler in that it consists of names, which gives the data structure: each name has a start and an end.
A block, defined by block_size, operates within a name. In contrast, this Shakespeare data is a blob of
text; there is no inherent structure to it, and hence it allows arbitrary slicing. A block in this context operates
on the entire text blob, and its position is determined by a random process (i.e., randint()) since there are
no natural starting points.
WAIT, the above analysis doesn't seem to be complete. Even though a name has a natural start and end point,
the `def build_dataset(words):` function slides a block through a word and repeats this for all words, so the
X in the data contains examples of blocks instead of words. I could use the same technique (i.e., block sliding)
on the entire text blob in this data, producing a dataset of size (len(text) - block_size).
So I guess they are different options for setting up the data.

An input block is reused (see the `when input is... the target: ...` logs) to maximize training opportunities.

learning: torch.stack() turns
[tensor([24, 43, 58,  5, 57,  1, 46, 43]),
  tensor([44, 53, 56,  1, 58, 46, 39, 58]),
  tensor([52, 58,  1, 58, 46, 39, 58,  1]),
  tensor([25, 17, 27, 10,  0, 21,  1, 54])]
into
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
         [44, 53, 56,  1, 58, 46, 39, 58],
         [52, 58,  1, 58, 46, 39, 58,  1],
         [25, 17, 27, 10,  0, 21,  1, 54]])
"""
# fixed seed so the sampled batches are reproducible from a fresh kernel
torch.manual_seed(1337)
batch_size, block_size = 4, 8  # sequences per batch, tokens per sequence

def get_batch(split):
  """Draw one random batch of (input, target) windows.

  `split` selects the source: 'train' reads train_data, anything else reads
  val_data. Both returned tensors are (batch_size, block_size); the targets are
  the inputs shifted one position to the right in the source.
  """
  source = train_data if split == 'train' else val_data
  # one random start offset per sequence; the upper bound leaves room for the shifted targets
  starts = torch.randint(len(source) - block_size, (batch_size,))
  # learning: torch.stack over per-row slices gives the same result; here the
  # windows are gathered in one go from a (batch_size, block_size) grid of positions
  positions = starts.unsqueeze(1) + torch.arange(block_size)
  return source[positions], source[positions + 1]

xb, yb = get_batch('train')
print('inputs:')
for shown in (xb.shape, xb):
  print(shown)
# learning: each entry of yb is the id of the character that follows the matching entry of xb
print('targets')
for shown in (yb.shape, yb):
  print(shown)

print('----')
# note: the bigram model does not need this growing-context structure in the training data
# note: it only matters for the attention layers when the transformer model is trained
for b in range(batch_size):
  inputs_row, targets_row = xb[b], yb[b]
  for t in range(block_size):
    # learning: the context runs up to and including position t, hence `:t+1`
    context = inputs_row[:t+1]
    target = targets_row[t]
    print(f'when input is {context.tolist()} the target: {target}')

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 53,

# Bigram definition

In [19]:
import torch.nn as nn
from torch.nn import functional as F
class BigramLanguageModel(nn.Module):
  """Bigram model: the logits for the next token are looked up from the current token alone.

  Args:
    vocab_size: number of distinct tokens. It is used both as the number of rows and
      as the width of the embedding table, so each row holds one score per vocabulary entry.
  """
  def __init__(self, vocab_size):
    super().__init__()
    # learning: this token_embedding_table is bound to a nn.Embedding object
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    # TODO: I need a positional embedding table

  def forward(self, idx, targets=None):
    """Return `(logits, loss)` for a (B, T) tensor of token ids.

    Without `targets` the logits keep their (B, T, C) shape and the loss is None.
    With `targets` (same shape as `idx`) the logits are returned flattened to
    (B*T, C) and the loss is the cross entropy against the flattened targets.
    """
    # learning: nn.Embedding is a module and must be called, not subscripted
    # (`self.token_embedding_table[idx]` raises TypeError: 'Embedding' object is not subscriptable),
    # see https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
    logits = self.token_embedding_table(idx)
    if targets is None:
      return logits, None

    # F.cross_entropy(logits, targets) on the 3-D logits raised
    # "RuntimeError: Expected target size [4, 65], got [4, 8]",
    # so the B and T dimensions are merged with `.view` first
    B, T, C = logits.shape
    logits = logits.view(B*T, C)  # B*T shows T is useless outside of the attention module
    targets = targets.view(B*T)
    loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self, idx, max_new_tokens):
    """Extend the prompt `idx` (B, T) by `max_new_tokens` sampled tokens.

    Returns the decoded text of the first sequence in the batch only.
    """
    # learning: the form of generation is that of ChatGPT: a prompt (i.e., idx) goes in
    # and the model continues it for max_new_tokens tokens
    # sampling never calls backward(), so autograd bookkeeping is switched off for the loop
    with torch.no_grad():
      for _step in range(max_new_tokens):
        logits, _loss = self(idx)
        logits = logits[:, -1, :]  # only the last position predicts the next token
        probs = F.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
        # learning: idx.append(idx_next) -> AttributeError: 'Tensor' object has no attribute 'append'
        idx = torch.cat((idx, idx_next), dim=1)
    return decode(idx[0].tolist())

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
# learning: because targets are passed, forward() hands the logits back flattened to
# (B*T, vocab_size) = [32, 65]; without targets they would be [4, 8, 65]. The last dimension
# is still 65 because no token has been sampled (multinomial) from the logits yet.
print(logits.shape)
print(loss)
# The prompt is a single token with id 0. `dtype=torch.long` is necessary because
# `zeros` produces floats by default.
start_prompt = torch.zeros((1,1), dtype=torch.long)
print(m.generate(start_prompt, 100))

torch.Size([32, 65])
tensor(4.7319, grad_fn=<NllLossBackward0>)

wftE:fU
-&p?L3HdvPQSQ.FNFW:OqzUC:llUpuNY3p&HBho3pRjU
iwSsMMRKvqYedh.3p;EdWxcAifQeEqXU'JVNYPWjc
z,-iN


# Bigram training

In [10]:
# train the bigram model with torch optimizer
# Only the parameters are passed to AdamW, so every other optimizer setting is the library default.
optimizer = torch.optim.AdamW(m.parameters())

# Each iteration draws a fresh random training batch and takes one optimizer step.
# The loss is computed but never printed, so this cell shows no training progress.
# NOTE: the loop rebinds the notebook-level names xb, yb, logits and loss.
for steps in range(10000):
  xb, yb = get_batch('train')
  logits, loss = m(xb, yb)

  # fill each parameter's .grad with the gradient of this batch's loss
  loss.backward()
  # learning: I put zero_grad() before step() and no learning occurred. `.data -= lr * .grad`, nothing happens
  # when `.grad` is 0
  optimizer.step()
  # clear the gradients only after the update has consumed them
  optimizer.zero_grad()

# sample 500 new tokens from the trained model, starting from a single token with id 0
print(m.generate(torch.zeros((1,1), dtype=torch.long), 500))


A jS,
fr'k tcaant ngandechoutonicomyby n, dis Gorllllin dr, t ke zhowa te the 'Then perar Ay hw
CO:
CHChygurs-
S r a t lk:
Q olta cu
Ca, s ARDelids GS fe:
YY&cavefed re; sche,d ss! IZqSDXve pr s'Coyoomo?
CIqAll: u:

Wh mb, wesapiQero' giver.
I br th g JONyom whem tro ceriese s it purfr,t d,'mef cir l itheay
and'OKfand 'e wis toshurkichis'l a ss Binor,' d ngh or t s fondZTESMinccan,
The she notomy he pe f d ave wnING hecby dr h me u.
MNORINous ngof ayo may;d th Abe l lou pthepatur,
Ane? pouthit c


# Implementation of attention

In [11]:
# Aggregate past context: each position becomes the average of itself and every earlier
# position. The input and output share the same shape: (B, T, C).
# note: besides the softmax approach used below there is a nested loop approach, and the
# row-normalised lower-triangular matrix approach kept here for reference:
# B,T,C = 4,8,2
# x = torch.randn(B,T,C)
# xbow = torch.zeros((B,T,C))
# wei = torch.tril(torch.ones((T,T)))
# aver = wei / torch.sum(wei, 1, keepdim=True)
# xbow = aver @ x
# print(x[0])
# print(xbow[0])

# the softmax approach
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
tril = torch.ones(T, T).tril()
# zeros where looking back is allowed, -inf on every future position; this wei is 2d
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
# softmax turns each row into uniform weights over the allowed positions;
# this is the "attention pattern" described in 1B3B video
aver = wei.softmax(dim=-1)
xbow = torch.matmul(aver, x)  # (T T) @ (T 2), applied to every batch element
print(x[0])
print(xbow[0])

tensor([[-1.3794,  0.2624],
        [-1.2309,  0.4991],
        [-0.3531, -0.6193],
        [-0.8199, -1.8553],
        [-1.2193,  0.2188],
        [-1.8425,  0.2470],
        [ 1.3761, -0.6989],
        [ 0.8018,  0.1684]])
tensor([[-1.3794,  0.2624],
        [-1.3051,  0.3808],
        [-0.9878,  0.0474],
        [-0.9458, -0.4282],
        [-1.0005, -0.2988],
        [-1.1408, -0.2079],
        [-0.7813, -0.2780],
        [-0.5834, -0.2222]])


In [12]:
# Replace the uniform averaging with one head of self attention: the weights now come
# from query/key dot products instead of being the same for every earlier position.
B, T, C = 4, 6, 8
head_size = 8
x = torch.randn(B, T, C)
tril = torch.ones(T, T).tril()
query = nn.Linear(C, head_size, bias=False)
key = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
q, k, v = query(x), key(x), value(x)  # each is B, T, head_size
# this wei is 3d (B, T, T), which is why the softmax runs over dim -1 rather than dim 1
wei = torch.matmul(q, k.transpose(1, 2))
wei = wei.masked_fill(tril == 0, float('-inf')).softmax(dim=-1)
out = torch.matmul(wei, v)  # B, T, head_size
print('attention pattern: ')
print(wei[0])
print('value matrix before weighted sum (i.e., wei @ v):')
print(v[0])
print('output of the self attention head: ')
# each output row is the attention-weighted sum of the value rows above
print(out[0])

attention pattern: 
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.7843, 0.2157, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2240, 0.3344, 0.4416, 0.0000, 0.0000, 0.0000],
        [0.0302, 0.6503, 0.1961, 0.1235, 0.0000, 0.0000],
        [0.1168, 0.7052, 0.0851, 0.0739, 0.0189, 0.0000],
        [0.1463, 0.2580, 0.1649, 0.1042, 0.1172, 0.2093]],
       grad_fn=<SelectBackward0>)
value matrix before weighted sum (i.e., wei @ v):
tensor([[-0.7059,  0.4846,  0.5991, -0.2759,  0.0818,  0.0714, -0.0364,  0.1166],
        [-0.0129, -0.5899,  0.6649,  0.6830, -1.0855,  0.3631,  0.4322, -0.4728],
        [ 0.4290, -0.2209,  0.6159,  0.1923,  0.6443, -0.4568,  0.5005, -0.0281],
        [-0.6370, -1.1575, -0.3756,  0.4416, -0.6450, -0.0261,  0.0154,  1.1953],
        [ 1.1061,  0.5586, -0.1329, -0.5069,  0.5895,  0.0889, -0.0968, -0.3687],
        [-0.2447, -1.0041, -0.5983,  0.1493, -0.6762, -0.6977,  0.9403,  1.0576]],
       grad_fn=<SelectBackward0>)
output of the self a

In [13]:
# next steps for the bigram model: adding a linear layer and a positional embedding
# Check for the positional embedding: looking up every position 0..7 in order
# returns exactly the embedding's weight matrix.
pos_embedding_table = nn.Embedding(8, 32)
all_positions = torch.arange(8)
print(torch.allclose(pos_embedding_table.weight, pos_embedding_table(all_positions)))

True


In [14]:
# TODO: complete Block class - DONE
# TODO: add residual connections - DONE (see Block.forward)
  # TODO: add projections - DONE for attention (see MultiHeadAttention.proj)
# TODO: add dropouts
# TODO: add layernorm

B,T,C = 4,6,32

class Head(nn.Module):
  """ One head of causal self attention

  Args:
    head_size: output width of the key, query and value projections.
    n_embd: width of the incoming embeddings; defaults to the notebook-level `C`.
    block_size: longest sequence the causal mask has to cover; defaults to the
      notebook-level `T`.
  """
  def __init__(self, head_size, n_embd=None, block_size=None):
    super().__init__()
    # fall back to the notebook-level sizes so that `Head(head_size)` keeps working
    n_embd = C if n_embd is None else n_embd
    block_size = T if block_size is None else block_size
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    # learning: I forgot to use register_buffer. The mask is a constant, not a weight:
    # as a buffer it stays out of parameters() and so is never updated by the optimizer
    self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))

  def forward(self, x):
    """Map x of shape (B, T, n_embd) to (B, T, head_size); T may be at most block_size."""
    # learning: what I wrote vs. what is enough: B, T, C = x.shape[0], x.shape[1], x.shape[2]
    seq_len = x.shape[-2]
    q = self.query(x)
    k = self.key(x)
    v = self.value(x)
    # Scaled dot-product attention divides by the square root of the key width
    # (head_size), not of the embedding width; the two differ once several heads
    # share the embedding.
    wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
    # crop the mask to the current sequence so inputs shorter than block_size work too
    wei = wei.masked_fill(self.tril[:seq_len, :seq_len] == 0, float('-inf'))
    wei = torch.softmax(wei, dim=-1)
    out = wei @ v
    return out

class MultiHeadAttention(nn.Module):
  """ Several self attention heads run side by side, then concatenated and projected back to width C """
  def __init__(self, n_head):
    super().__init__()
    per_head = C // n_head
    # learning: my first version kept a single `self.h = Head(head_size)`; the heads
    # belong in a ModuleList so that each one is a separate registered module
    self.heads = nn.ModuleList()
    for _ in range(n_head):
      self.heads.append(Head(per_head))
    self.proj = nn.Linear(per_head * n_head, C, bias=False)

  def forward(self, x):
    # learning: my first version was torch.cat((self.h(x), self.h(x), self.h(x), self.h(x)), dim=-1),
    # which forwards one head several times; this class stands for MULTIPLE distinct heads
    head_outputs = tuple(head(x) for head in self.heads)
    return self.proj(torch.cat(head_outputs, dim=-1))

class FeedFoward(nn.Module):
  """ Per-position MLP: every position of a block is transformed on its own, with no mixing across positions """
  def __init__(self, n_embd):
    super().__init__()
    lin_in = nn.Linear(n_embd, n_embd, bias=False)
    lin_out = nn.Linear(n_embd, n_embd, bias=False)  # projection layer
    # learning: I forgot the non-linearity. With the non-linearity in between,
    # the layers are chained with nn.Sequential
    self.net = nn.Sequential(lin_in, nn.ReLU(), lin_out)

  def forward(self, x):
    return self.net(x)
  
class Block(nn.Module):
  """ Transformer block: Communicate-then-compute, each step added back onto its input """

  def __init__(self, n_head):
    super().__init__()
    self.sa = MultiHeadAttention(n_head=n_head)  # communicate: positions exchange information
    self.ffwd = FeedFoward(n_embd=C)  # compute: each position is processed on its own

  def forward(self, x):
    # residual connection around each sub-layer
    attended = x + self.sa(x)
    return attended + self.ffwd(attended)

tensor([[[-5.1128e-01,  7.1639e-01, -6.5479e-01,  1.0583e-01, -8.2697e-02,
           5.3774e-01, -1.0094e+00, -8.5511e-01, -2.8455e-01, -4.4945e-02,
           5.6408e-01,  4.6272e-01,  5.2987e-01, -1.2838e+00,  1.9325e-01,
           1.0372e-01,  1.3753e+00, -1.1843e+00, -3.5781e-01,  6.6724e-01,
          -3.3983e-01,  4.4522e-01, -5.3159e-01,  7.8568e-01,  1.3475e-01,
          -7.8972e-02,  5.0798e-01,  9.0163e-01,  1.0944e+00, -5.9593e-01,
          -4.7291e-01, -9.6337e-02],
         [ 3.2456e-01,  4.3658e-01, -6.0482e-01,  3.3561e-02,  2.2846e-01,
           1.0905e+00, -6.0211e-01, -4.3757e-01, -1.3601e-01,  4.0826e-02,
           8.7570e-01,  1.0827e-01,  3.8885e-01, -7.7548e-01,  4.3872e-01,
          -3.7804e-01,  2.6918e-01, -1.4338e+00, -1.5597e+00, -1.3550e-01,
           4.3272e-01, -2.7150e-01, -2.5645e-02,  6.5949e-02,  1.4966e-01,
          -3.2941e-01,  6.5958e-01,  3.3163e-01,  8.2996e-01, -7.9754e-01,
          -3.7881e-01, -2.2242e-01],
         [ 6.7855e-01,  5.

# GPT definition using the layers defined above

In [None]:
# hyperparameters
# ...

class GPTLanguageModel(nn.Module):
  """ Placeholder for the GPT model: no layers or forward pass are defined yet """

In [None]:
# Training
# TODO: use CPU to train
# TODO: use GPU to train
# TODO: review nanoGPT training script