In [58]:
# Load the data set
with open('input.txt', 'r') as f:
  input = f.read()
chars = sorted(list(set(input)))
vocab_size = len(chars)
print(chars)
print(vocab_size)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65


In [59]:
# simple tokenizor: mappings and encode and decode functions
stoi = {s: i for i, s in enumerate(chars)}
itos = {i: s for i, s in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [60]:
# encode the entire text dataset
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
data = torch.tensor(encode(input))
print(data.shape, data.dtype)
print(data[:1000])

cuda
torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
 

In [61]:
# split into train and val sets
# learning: this split is simple, the entire encoded set is simply split into two.
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [62]:
# data loader
"""
learning: notice the difference between the organization of this data and that of makemore (i.e., name) data.
The makemore data is simpler in that it consists of names, which structures the data with each name having an start and an end. 
A block, defined by the block_size, operates within a name. On the contrary, this shakespera data is a blob of 
text, there is no inherent structure to it and hence allows abitrary slicing. A block in this context operates 
in the entire text blob and the positioning of it is determined by a random process (i.e., randint()) since there is 
no natural start points.
WAIT, the above analysis doesn't seem to be complete. Even though there is a natural start and end point of a name, 
the `def build_dataset(words):` function uses a block to slide through a word and repeats for all words. And the 
X in data contains examples of blocks instead of words. And then, I can use the same technique (i.e., block sliding)
for the entire text blob in this data, producing a dataset of a size of (len(text) - block_size).
So I guess they are different options for setting up the data.

An input block is reused (see `when input is... the target: ...` logs) to max training opportunities.

learning: torch.stack() turns
[tensor([24, 43, 58,  5, 57,  1, 46, 43]),
  tensor([44, 53, 56,  1, 58, 46, 39, 58]),
  tensor([52, 58,  1, 58, 46, 39, 58,  1]),
  tensor([25, 17, 27, 10,  0, 21,  1, 54])]
into
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
         [44, 53, 56,  1, 58, 46, 39, 58],
         [52, 58,  1, 58, 46, 39, 58,  1],
         [25, 17, 27, 10,  0, 21,  1, 54]])
"""
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split, batch_size, block_size):
  data = train_data if split == 'train' else val_data
  idx = torch.randint(len(data)-block_size, (batch_size,))
  # learning: I didn't use torch.stack
  x = torch.stack([data[i:i+block_size] for i in idx])
  y = torch.stack([data[i+1:i+block_size+1] for i in idx])
  x, y = x.to(device), y.to(device)
  return x, y

xb, yb = get_batch('train', batch_size, block_size)
print('inputs:')
print(xb.shape)
print(xb)
# learning: the targets in yb are index into the next char
print('targets')
print(yb.shape)
print(yb)

print('----')
# note: this structure in the training data isn't needed for the bigram model training
# note: this structure in the training data is only applied to the attention layers during the training of transformer model
for b in range(batch_size):
  for t in range(block_size):
    # learning: I forgot the indexing of time dimension is `:t+1`
    context = xb[b, :t+1]
    target = yb[b, t]
    print(f'when input is {context.tolist()} the target: {target}')

inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]], device='cuda:0')
targets
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]], device='cuda:0')
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the

# Bigram definition

In [63]:
import torch.nn as nn
from torch.nn import functional as F
class BigramLanguageModel(nn.Module):
  def __init__(self, vocab_size):
    super().__init__()
    # learning: this token_embedding_table is bound to a nn.Embedding object
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    # TODO: I need a positional embedding table

  def forward(self, idx, targets=None):
    # learning: I didn't know this guy returns logits
    # learning: self.token_embedding_table[idx] and got TypeError: 'Embedding' object is not subscriptable
    # I forgot that nn.Embedding is a class and it takes inputs as described in https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
    logits = self.token_embedding_table(idx)

    # got RuntimeError: Expected target size [4, 65], got [4, 8] when `loss = F.cross_entropy(logits, targets)`
    # to fix that, we transform them into 2d tensors by merging the B and T dimensions using `.view`
    if targets is not None:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)  # B*T shows T is useless outside of the attention module
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    else:
      loss = None
    
    return logits, loss
  
  def generate(self, idx, max_new_tokens):
    # learning: I need to start with the form of generation. In this case, the form is that of ChatGPT
    # I provide a prompt (i.e., idx) and the model use that to generate texts of length max_new_tokens
    for _ in range(max_new_tokens):
      logits, _ = self(idx)
      logits = logits[:, -1, :]
      probs = F.softmax(logits, dim=-1)
      idx_next = torch.multinomial(probs, num_samples=1)
      # learning: idx.append(idx_next) -> AttributeError: 'Tensor' object has no attribute 'append'
      idx = torch.cat((idx, idx_next), dim=1)
    return decode(idx[0].tolist())

m = BigramLanguageModel(vocab_size)
m = m.to(device)
logits, loss = m(xb, yb)
# learning: the out.shape is [4,8,65] because we haven't done multinomial on the 65 logits yet. Therefore,
# we have the last dimension of 65
print(logits.shape)
print(loss)
# Use `torch.zeros((1,1), dtype=torch.long)` as the prompt. `torch.long` is necessary because the default 
# dtype is float with `zeros`
print(m.generate(torch.zeros((1,1), dtype=torch.long, device=device), 100))

torch.Size([32, 65])
tensor(5.0364, device='cuda:0', grad_fn=<NllLossBackward0>)

yq$;tfBfROkNdcuwdZZTkOMl;,ertK
w:!PLCkMBbeA$3:XaSGJO-3p&M-c?KL3auhpFYVXJFhNNNuhq$OMxv.tbVFYdXlrFZaAe


# Bigram training

In [64]:
# train the bigram model with torch optimizer
optimizer = torch.optim.AdamW(m.parameters())

for steps in range(10000):
  xb, yb = get_batch('train', batch_size, block_size)
  logits, loss = m(xb, yb)

  loss.backward()
  # learning: I put zero_grad() before step() and no learning occurred. `.data -= lr * .grad`, nothing happens
  # when `.grad` is 0
  optimizer.step()
  optimizer.zero_grad()

print(m.generate(torch.zeros((1,1), dtype=torch.long, device=device), 500))


Wawice my.

HDEdaromzy mug
Yowhthmoof isth ble mil;KI ll!,

W:

Ye sengmin lat HNGEdrovDEs, and Win nghir.
TWjomesel lind me l.
HAshe ce hiry ptug; aisspllw y.
Hllin's noroopetelives
MPOFGll, d mothakleo Windo whthCoisb3MI'Tham dourive ce higend t so mower; te

ANk d nterurt f s ar igr Wam:

Enge maleronth,
Mf Pre?

WISo myr f-NLLERar,

b&hak
ardsal thes ghesthiuin ccuk?
araney Iry ts I&fr y c!NGJknge tonok, mary.
Yor 'Wour me?m sora anghy t-senomes twe men.
Wand tho-z; cin s th llugy od,OThourc


# Attention explained

In [65]:
# aggregate past context. The input and output share the same shape: (B, T, C)
# note: there is a nested loop approach and a softmax approach to this
# B,T,C = 4,8,2
# x = torch.randn(B,T,C)
# xbow = torch.zeros((B,T,C))
# wei = torch.tril(torch.ones((T,T)))
# aver = wei / torch.sum(wei, 1, keepdim=True)
# xbow = aver @ x
# print(x[0])
# print(xbow[0])

# the softmax approach
B,T,C = 4,8,2
x = torch.randn(B,T,C)
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))  # this wei is 2d
aver = torch.softmax(wei, dim=-1)  # this is the "attention pattern" described in 1B3B video
xbow = aver @ x # (T T) @ (T 2)
print(x[0])
print(xbow[0])

tensor([[-0.9407,  1.6751],
        [ 1.0907,  0.1475],
        [-0.6920, -0.7834],
        [-0.0246,  0.3170],
        [ 0.8250,  0.4557],
        [-0.8341, -0.8359],
        [-0.8077,  0.9350],
        [ 0.9412,  1.1077]])
tensor([[-0.9407,  1.6751],
        [ 0.0750,  0.9113],
        [-0.1807,  0.3464],
        [-0.1416,  0.3391],
        [ 0.0517,  0.3624],
        [-0.0959,  0.1627],
        [-0.1976,  0.2730],
        [-0.0553,  0.3773]])


In [66]:
# turn the dumb aggregate method into a single head of self attention
B,T,C = 4,6,8
head_size = 8
x = torch.randn(B,T,C)
tril = torch.tril(torch.ones(T, T))
query = nn.Linear(C, head_size, bias=False)
key = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
q = query(x) # B, T, head_size
k = key(x) # B, T, head_size
v = value(x) # B, T, head_size
wei = q @ k.transpose(-2, -1)  # this wei is 3d. that is why softmax's dim can't be 1, but is -1
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = torch.softmax(wei, dim=-1) # B, T, T
print('attention pattern: ')
print(wei[0])
print('value matrix before weighted sum (i.e., wei @ v):')
print(v[0])
out = wei @ v  # B, T, head_size
print('output of the self attention head: ')
print(out[0])  # I can see how the weighted sum affects the output of the attention head

attention pattern: 
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2946, 0.7054, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1027, 0.0692, 0.8281, 0.0000, 0.0000, 0.0000],
        [0.1805, 0.6161, 0.1524, 0.0510, 0.0000, 0.0000],
        [0.3649, 0.2724, 0.0512, 0.2387, 0.0728, 0.0000],
        [0.3559, 0.1576, 0.0735, 0.0998, 0.1328, 0.1805]],
       grad_fn=<SelectBackward0>)
value matrix before weighted sum (i.e., wei @ v):
tensor([[ 4.3790e-01, -7.2814e-01,  7.8591e-01, -5.4940e-01,  5.5663e-01,
         -4.8415e-01,  6.0563e-01, -2.2115e-01],
        [ 2.4389e-01, -9.4100e-01, -6.5698e-02,  1.0840e+00, -3.3134e-01,
          7.0538e-01,  1.6110e+00, -2.4243e-01],
        [-1.7131e-01,  6.7616e-01, -4.2472e-01, -4.2271e-01,  5.0947e-02,
         -4.3690e-01, -9.3460e-01, -5.0428e-02],
        [ 8.2210e-01, -1.8570e-01, -7.9552e-01,  4.3803e-02, -9.9740e-01,
         -4.9402e-01, -7.5507e-01,  1.0097e+00],
        [ 1.9484e-01,  3.3190e-01,  3.3583e-02,  4.0934

In [67]:
# adding a linear layer to the bigram model
# adding a positional embedding to the model
pos_embedding_table = nn.Embedding(8, 32)
pos_embedding_table(torch.arange(8))
print(torch.allclose(pos_embedding_table.weight, pos_embedding_table(torch.arange(8))))

True


# Attention definition and model definition

In [68]:
# hyperparameters
batch_size = 64
block_size = 256
n_embd = 384
n_head = 6
n_layer = 6
max_iters = 8000
eval_iters = 200
eval_interval = 500
learning_rate = 3e-4
dropout = 0.2

class Head(nn.Module):
  """ One head of self attention """
  def __init__(self, head_size, n_embd):
    super().__init__()
    self.n_embd = n_embd
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    # learning: I forgot to use register_buffer. I use it because I don't want it to be in the computational graph
    self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    B,T,C = x.shape
    # learning: what I wrote vs. what is enough: B, T, C = x.shape[0], x.shape[1], x.shape[2]
    q = self.query(x)
    k = self.key(x)
    v = self.value(x)
    # learning: the normalization base is C, I used 2
    wei = q @ k.transpose(-2, -1) * self.n_embd**-0.5
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = torch.softmax(wei, dim=-1)
    wei = self.dropout(wei)
    out = wei @ v
    return out

class MultiHeadAttention(nn.Module):
  """ Multiple heads of self attention in parallel """
  def __init__(self, n_head, n_embd):
    super().__init__()
    head_size = n_embd // n_head
    # self.h = Head(head_size)
    # learning: the above was what I did. I should instead put these Heads in a ModuleList
    self.heads = nn.ModuleList([Head(head_size, n_embd) for _ in range(n_head)])
    self.proj = nn.Linear(head_size * n_head, n_embd, bias=False)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    # return torch.cat((self.h(x), self.h(x), self.h(x), self.h(x)), dim=-1)
    # learning: the above was what I did. I should instead put these Heads in a ModuleList
    # Because this class is a representation of MULTI heads instead of a single head being forwarded multiple times
    x = torch.cat([head(x) for head in self.heads], dim=-1)
    x = self.proj(x)
    x = self.dropout(x)
    return x

class FeedForward(nn.Module):
  """ This layer is applied to each position in a block locally and independently """
  def __init__(self, n_embd):
    super().__init__()
    # learning: I forgot the non-linearity. With the non-linearity, I also need nn.Sequential
    self.net = nn.Sequential(
      nn.Linear(n_embd, n_embd, bias=False),
      nn.ReLU(),
      nn.Linear(n_embd, n_embd, bias=False),  # projection layer
      nn.Dropout(dropout)
    )

  def forward(self, x):
    return self.net(x)
  
class Block(nn.Module):
  """ Transformer block: Communicate-then-compute"""

  def __init__(self, n_head, n_embd):
    super().__init__()
    self.sa = MultiHeadAttention(n_head, n_embd)
    self.ffwd = FeedForward(n_embd=n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)    
  
  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x

In [69]:
class GPTLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embd_table = nn.Embedding(vocab_size, n_embd)
    self.pos_embd_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[Block(n_head, n_embd) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd)
    self.fully_connected = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    token_embd = self.token_embd_table(idx) # B T C
    # learning: I wrote self.pos_embd_table(idx). But the pos_embd_table takes POSITION as input instead of idx
    # learning 2: note that torch.arange(T) uses T instead of block_size because idx might be shorter than block_size
    pos_embd = self.pos_embd_table(torch.arange(T, device=device)) # B T C
    x = token_embd + pos_embd
    x = self.blocks(x) # B T C
    x = self.ln_f(x)
    logits = self.fully_connected(x) # B T vocab_size

    if targets is not None: # training mode
      logits = logits.view(B*T, -1)
      # learning: I wrote targets.view(B*T, -1), which returns a 2d tensor. I need 1d for targets instead
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    else:
      loss = None

    return logits, loss


  def generate(self, idx, max_token):
    for _ in range(max_token):
      # learning: forgot to crop idx
      idx_cond = idx[:, -block_size:]
      logits, _ = self(idx_cond)
      # learning: forgot the line below. Logits are generated for each and every position in a sequence and 
      # I only need the last set of logits
      logits = logits[:, -1, :]
      probs = F.softmax(logits, dim=-1)
      next_token = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, next_token), dim=1)
    return idx

In [70]:
# Training
# learning: parameters can be extracted simply using m
m = GPTLanguageModel()
m = m.to(device)
optimizer = torch.optim.AdamW(m.parameters(), learning_rate)
# learning: m.parameters() returns an iterable of params, each of which are defined 
# using basic layer constructs provided in torch.nn
print(sum(p.numel() for p in m.parameters()), 'parameters')

5466689 parameters


### 1st training run attempt - failed
```
OUTPUT
      0/   5000: 4.7011
    200/   5000: 4.7024
    400/   5000: 4.5861
    600/   5000: 4.8061
    800/   5000: 4.7599
   1000/   5000: 4.8209
   1200/   5000: 4.9178
   1400/   5000: 4.7532
   1600/   5000: 4.8195
   1800/   5000: 4.6576
   2000/   5000: 4.8024
   2200/   5000: 4.6016
   2400/   5000: 4.6853
   2600/   5000: 4.7858
   2800/   5000: 4.8272
   3000/   5000: 4.8139
   3200/   5000: 4.7915
   3400/   5000: 4.6659
   3600/   5000: 4.7886
   3800/   5000: 4.7374
   4000/   5000: 4.7108
   4200/   5000: 4.8204
   4400/   5000: 4.8655
   4600/   5000: 4.7810
   4800/   5000: 4.6445

bA&XUjnT$TZ?Nhe$Rz:bhYKcdQgNbYF?GOajb'pw?M-
;w?SnYwPqKLQw?ydxodwcXcGbIYmNT,Y
Kyp$VX'DgaNY ziDbSOiaX,aTq mKRs!y&'d
I'etfyGaH
XjJSGKYO,vQ&iUjch&mvmmZkaTTKWzXr
bYF?BnIBOacdhWTYKVTSjmmuZj ;XVbYKeqzSYCWoYNYReOO$NOA.j

qhahWXTYFxN!.Ya'kzq$qvmiqfoGnstMMcaOFf3SSaF;fRaKAOaGYFRGsWCFY?Iq!qfamN-WBy!AF!PHgXa$JUT;:&bSjOmKMKRYT3Cj RKWIaUWzeW$FOah&mxUATVmROtqh!&uKRgNdTXb;Tbs
;MuT?&aAb;,YKRaXKmxc:eTX;GaHhWz,Oa$OOa
Wc!UObGOGUXOTSrTTYUGMFYm;jWzYjaqOBNUJ&dFbWC-Tm
WDTTjA
isiOb;.WhqA
TXON!wv
IqeNIPUiXhinYsf.UN
T, DmP
```

Two questions For the above training run:
1. I don't think I need to Carefully initialize the Parameters like the Make more series anymore Because The definition of parameters in a layer is not done by hand. But what is the range of initial loss? I got above four.
  AN: Andrej's: `step 0: train loss 4.4116, val loss 4.4022`. Same range.
2. The training is not working Given that the loss is not decreasing. Why?
  AN: I have the this in the previous cell..
```
optimizer = torch.optim.AdamW(m.parameters())
m = GPTLanguageModel()
```

In [71]:
@torch.no_grad()
def estimate_loss():
    out = {}
    m.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            # learning: using sampling over evaluating against entire datasets for efficiency
            X, Y = get_batch(split, batch_size, block_size)
            logits, loss = m(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out

for i in range(max_iters):
  xb, yb = get_batch('train', batch_size, block_size)
  _, loss = m(xb, yb)
  loss.backward()

  if i % eval_interval == 0 or i == max_iters-1:
    losses = estimate_loss()
    print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # learning: forgot optimizer.step(). That is used to update the params
  optimizer.step()
  optimizer.zero_grad()

context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, 500)[0].tolist()))

step 0: train loss 4.2937, val loss 4.2903
step 500: train loss 2.1114, val loss 2.1642
step 1000: train loss 1.7357, val loss 1.8782
step 1500: train loss 1.5621, val loss 1.7387
step 2000: train loss 1.4625, val loss 1.6551
step 2500: train loss 1.3975, val loss 1.6061
step 3000: train loss 1.3481, val loss 1.5707
step 3500: train loss 1.3106, val loss 1.5493
step 4000: train loss 1.2786, val loss 1.5267
step 4500: train loss 1.2490, val loss 1.5211
step 5000: train loss 1.2283, val loss 1.5038
step 5500: train loss 1.2082, val loss 1.4982
step 6000: train loss 1.1897, val loss 1.4985
step 6500: train loss 1.1680, val loss 1.4862
step 7000: train loss 1.1521, val loss 1.4851
step 7500: train loss 1.1355, val loss 1.4855
step 7999: train loss 1.1198, val loss 1.4897

Thingsmen, dark, with us: therefore word thou, dead tale.

CORIOLANUS:
What, pestime to speak--
Untilly must cleave, unshap of nine lest,
But unfrighties partience how it would for right
Make divied. Farewell; we have no 

# Results analysis
### experiment 1
```
batch_size = 32
block_size = 6
n_embd = 32
n_head = 4
n_layer = 3
max_iters = 5000
```
Arch: 
1. Multi headed attention
2. residual connections

22849 parameters

Results:
- Training time: 25.1s
- Performance: train loss 2.1236, val loss 2.1695

### experiment 2
```
batch_size = 32
block_size = 6
n_embd = 32
n_head = 4
n_layer = 3
max_iters = 10000 # increased
```
Arch: 
1. Multi headed attention
2. residual connections

22849 parameters

Results
- Training time: 62.5s
- Performance: train loss 2.0394, val loss 2.1237

### experiment 3
```
batch_size = 32
block_size = 32 # increased
n_embd = 32
n_head = 4
n_layer = 3
max_iters = 10000
```
Arch: 
1. Multi headed attention
2. residual connections

23681 parameters

Results:
- Training time: 2m 36.3s
- Performance: train loss 1.8381, val loss 1.9865

### experiment 4
```
batch_size = 32
block_size = 32
n_embd = 32
n_head = 4
n_layer = 3
max_iters = 10000
learning_rate = 3e-4 # decreased from 1e-3
```
Arch: 
1. Multi headed attention
2. residual connections

23681 parameters

Results
- Training time: 2m 36.3s
- Performance: train loss 2.0735, val loss 2.1210

Observation: This learning rate is too slow for the majority of training

### experiment 5
```
batch_size = 32
block_size = 32
n_embd = 32
n_head = 4
n_layer = 6 # increased
max_iters = 10000
learning_rate = 1e-3
```
Arch: 
1. Multi headed attention
2. residual connections

42113 parameters

Results
- Training time: 4m 58s
- Performance: train loss 1.7351, val loss 1.8810

Observation: The losses got below 2.0 with half of 10000 iterations

### experiment 6
```
batch_size = 32
block_size = 32
n_embd = 128 # increased
n_head = 4
n_layer = 6
max_iters = 10000
learning_rate = 1e-3
```
Arch: 
1. Multi headed attention
2. residual connections

610625 parameters

Results
- Training time: 8m 4.5s
- Performance: train loss 1.4459, val loss 1.6770

### experiment 7 (too long to train)
```
batch_size = 32
block_size = 256 # increased
n_embd = 128
n_head = 4
n_layer = 6
max_iters = 10000
learning_rate = 1e-3
```
Arch: 
1. Multi headed attention
2. residual connections

639297 parameters

Results
- Training time: too long to train
- Performance: too long to train

Observation: this only increased the size of `pos_embd_table`. The parameters count didn't increase by much relatively speaking. However, Training takes significantly longer. It took 6 minutes to complete 500 iterations, that is 2hr estimated for 10000 iterations.

### experiment 8: see if this is faster than set 7
```
batch_size = 32
block_size = 32
n_embd = 256 # increased
n_head = 4
n_layer = 6
max_iters = 10000
learning_rate = 1e-3
```
Arch: 
1. Multi headed attention
2. residual connections

2,400,833 parameters (GPT3 has 175b params)

Results
- Training time: 14m 45s
- Performance: train loss 1.4654, val loss 1.6836

Observation: 
1. This is much faster than set 7. 500 iterations took about a minute. Seems like these params aren't created equal when it comes to impacts on training speed. Interestingly, the doubling of `n_embd` increased the parameter count by 4 folds while the increase of `block_size` only added about 30000 params to the count. 
2. There is no perf improvement over set 6. The second half of 10000 iterations didn't do much in terms of reducing the losses

**Question: Why is the training time not proportional to the increase of param count?**
AN (Claude): 
- The large increase in block_size (set 7) significantly slowed down training due to the quadratic complexity of attention mechanisms.
- Increasing n_embd (set 8) increased the parameter count more but had a smaller impact on training time. This could be due to a combination of factors including better cache utilization and the M2's ability to handle larger matrix operations relatively efficiently.
- the key point is the impact on training time depends more on how the parameters affect the computational structure of the model rather than just the raw number of parameters. 

### experiment 9: add layernorm to see if the second half of training works better
```
batch_size = 32
block_size = 32
n_embd = 256
n_head = 4
n_layer = 6
max_iters = 10000
learning_rate = 1e-3
```
Arch: 
1. Multi headed attention
2. residual connections
3. layernorm

2,407,489 parameters (GPT3 has 175b params)

Results
- Training time: 14m
- Performance: train loss 1.3485, val loss 1.5908

Observations: 
- I'm not sure if the second half of the training worked better but the training performance improved over experiment 8. Layernorm contributed to the overall training effectiveness.
- overfitting is significant given the gap between train loss and val loss

### experiment 10: add dropout to reduce overfitting
```
batch_size = 32
block_size = 32
n_embd = 256
n_head = 4
n_layer = 6
max_iters = 10000
learning_rate = 1e-3
dropout = 0.2 # added
```
Arch: 
1. Multi headed attention
2. residual connections
3. layernorm
4. dropout

2,407,489 parameters (GPT3 has 175b params)

Results
- Training time: 17m 40s
- Performance: train loss 1.4242, val loss 1.6229

Observation: dropout seemed to maintain the gap between train loss and val loss. But it also seemed to hamper overall training performance. This run is worse than the previous even though overfitting is less severe.

## GPU training

### experiment 11: same set up as experiment 10, see how much faster the GPU can make possible
GPU instance: 
gpu_1x_a100_sxm4, Lambda Labs

Results
- training time: 5m
- Performance: train loss 1.4252, val loss 1.6392
Observation: 1/3 the time of CPU training.

### experiment 12: same set up but use an instance of multiple gpus
GPU instance: 
gpu_1x_a100_sxm4, Lambda Labs

Results
- training time: 4m
- Performance: train loss 1.4305, val loss 1.6353
Observation:
- no improvement over previous. I probably need to set up compute distribution to utilize the multiple GPUs.
- and I did. I attempted to use `nn.DataParallel` for simpler multi-GPU set up but it doesn't work with the custom definition of transformer. DDP is used in https://github.com/karpathy/nanoGPT/blob/master/train.py#L8 and is recommended. But since it is more complex, I will do this when I study the nanoGPT train code.

### experiment 13: same set up but use a better GPU
GPU instance: 
gpu_1x_h100_pcie, Lambda Labs

Results
- training time: 9m 30s
- Performance: train loss 1.4297, val loss 1.6355
Observation: this is so much slower than gpu_1x_a100_sxm4. Why is that? I guess it is not that important to have an answer to this now

### experiment 14: scaling up the model further
GPU instance: 
gpu_1x_a100_sxm4, Lambda Labs

```
batch_size = 64 # increased
block_size = 256 # increased
n_embd = 384 # increased
n_head = 6 # increased
n_layer = 6
max_iters = 10000
learning_rate = 1e-3
dropout = 0.2
```
Arch: 
1. Multi headed attention
2. residual connections
3. layernorm
4. dropout

5,466,689 parameters (GPT3 has 175b params)

Results
- Training time: 16m
- Performance: train loss 0.8349, val loss 1.5687
observation: this training run massively overfit the model in the second half of the training. let me reduce the learning rate and see if overfitting gets any better

```
step 0: train loss 4.2937, val loss 4.2903
step 500: train loss 1.6452, val loss 1.8075
step 1000: train loss 1.3973, val loss 1.6076
step 1500: train loss 1.2997, val loss 1.5436
step 2000: train loss 1.2340, val loss 1.5108
step 2500: train loss 1.1880, val loss 1.4952
step 3000: train loss 1.1508, val loss 1.4896
step 3500: train loss 1.1157, val loss 1.4844
step 4000: train loss 1.0861, val loss 1.4781
step 4500: train loss 1.0569, val loss 1.4957
step 5000: train loss 1.0328, val loss 1.5006
step 5500: train loss 1.0064, val loss 1.5027
step 6000: train loss 0.9837, val loss 1.5126
step 6500: train loss 0.9603, val loss 1.5160
step 7000: train loss 0.9385, val loss 1.5198
step 7500: train loss 0.9175, val loss 1.5382
step 8000: train loss 0.8999, val loss 1.5411
step 8500: train loss 0.8814, val loss 1.5569
step 9000: train loss 0.8643, val loss 1.5571
step 9500: train loss 0.8486, val loss 1.5714
step 9999: train loss 0.8349, val loss 1.5687
```

### experiment 15: use a smaller learning rate
```
batch_size = 64 # increased
block_size = 256 # increased
n_embd = 384 # increased
n_head = 6 # increased
n_layer = 6
max_iters = 8000 # reduced too so it doesn't run for too long
learning_rate = 3e-4 # reduced
dropout = 0.2
```

Results
- Training time: 10m 40s
- Performance: step 7999: train loss 1.1198, val loss 1.4897
Observation: 5000 steps are enough. The rest didn't do anything useful