copied from [gpt.py](https://github.com/karpathy/nanochat/blob/master/nanochat/gpt.py)

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../my_nanochat')

In [2]:
from my_nanochat.my_gpt import GPTConfig, GPT
from my_nanochat.my_common import get_dist_info
import torch
import torch.nn as nn
import torch.nn.functional as F
from functools import partial

### Understanding optimizers in general

First try to remember/reconstruct what an optimizer does. I think it decides what the learning rate should be at each training step. For example, in early training steps we want to multiply the gradient by a higher learning rate and in later training steps by a lower one, but there are countless ways to adjust the learning rate over time, ranging from a predetermined linear schedule to one that takes into account "clues" such as how fast the loss is decreasing, whether we're converging, etc. I believe the optimizer encapsulates all that.

Let's see what ChatGPT says: In deep learning, an optimizer is the algorithm that adjusts a model’s parameters (weights and biases) during training to minimize the loss function. It decides how to update the parameters based on gradients computed from backpropagation.

So maybe it's even a bit more general than what I thought. Its job is not only to decide the learning rate, but in fact to update the weights.

How do you use a torch optimizer? What's the contract? Let's try a simple example.

In [3]:
# Toy regression data: 5 random 2-feature inputs whose targets follow
# y = 5 * x0 + 7 * x1, so a bias-free linear model should recover weights [5, 7].
x = torch.randn(5, 2)
x0, x1 = x.unbind(dim=1)
y = (x0 * 5 + x1 * 7).unsqueeze(-1)  # column vector, shape (5, 1)

In [4]:
x, y

(tensor([[ 0.2985,  2.5948],
         [-0.1912,  0.6394],
         [-1.4373,  1.1328],
         [ 0.3734,  1.1152],
         [ 1.0200,  0.6448]]),
 tensor([[19.6562],
         [ 3.5200],
         [ 0.7432],
         [ 9.6734],
         [ 9.6139]]))

In [5]:
# Bias-free linear layer mapping the 2 input features to 1 output; the
# untrained predictions for x are shown as the cell's result.
model = torch.nn.Linear(in_features=2, out_features=1, bias=False)
model(x)

tensor([[-1.3072],
        [-0.4432],
        [-1.2876],
        [-0.4497],
        [ 0.1077]], grad_fn=<MmBackward0>)

In [6]:
model.weight

Parameter containing:
tensor([[ 0.4573, -0.5564]], requires_grad=True)

In [7]:
# Minimal optimizer contract: zero the gradients, backprop the loss, then step.
# lr is spelled out (1e-3, the value the log reports) rather than relying on
# SGD's default, which older PyTorch releases do not provide.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
for i in range(10_000):
    optimizer.zero_grad()
    # One forward pass per step: the same loss tensor is used for backprop and
    # for logging (the weights do not change between backward() and the print).
    loss = F.mse_loss(model(x), y)
    loss.backward()
    if i % 1000 == 0:
        print(f"step: {i}, loss: {loss.item()}, learning rate: {optimizer.param_groups[0]['lr']}")
    optimizer.step()

step: 0, loss: 130.42807006835938, learning rate: 0.001
step: 1000, loss: 0.9276749491691589, learning rate: 0.001
step: 2000, loss: 0.06006918102502823, learning rate: 0.001
step: 3000, loss: 0.004045575857162476, learning rate: 0.001
step: 4000, loss: 0.0002725429367274046, learning rate: 0.001
step: 5000, loss: 1.8373380953562446e-05, learning rate: 0.001
step: 6000, loss: 1.2555952935144887e-06, learning rate: 0.001
step: 7000, loss: 1.0440614062190434e-07, learning rate: 0.001
step: 8000, loss: 2.836850399035029e-08, learning rate: 0.001
step: 9000, loss: 2.836850399035029e-08, learning rate: 0.001


In [8]:
model.weight

Parameter containing:
tensor([[4.9998, 7.0001]], requires_grad=True)

So judging by that at least, the contract seems to be that you give it the parameters and you tell it to take steps. It's responsible for reading the gradient, multiplying it by the learning rate, and subtracting it from the weights, or whatever it chooses to do to update the weights.

### Step through GPT.setup_optimizers()

In [9]:
# Deliberately tiny GPT (2 layers, 2 heads, 16-dim embeddings, 10-token vocab)
# so the module tree and the optimizer parameter groups are easy to inspect.
tiny_config_kwargs = {
    "sequence_len": 7,
    "vocab_size": 10,
    "n_layer": 2,
    "n_head": 2,
    "n_kv_head": 2,
    "n_embd": 16,
}
my_config = GPTConfig(**tiny_config_kwargs)
gpt = GPT(my_config)
gpt.init_weights()
gpt

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(10, 16)
    (h): ModuleList(
      (0-1): 2 x Block(
        (attn): CausalSelfAttention(
          (c_q): Linear(in_features=16, out_features=16, bias=False)
          (c_k): Linear(in_features=16, out_features=16, bias=False)
          (c_v): Linear(in_features=16, out_features=16, bias=False)
          (c_proj): Linear(in_features=16, out_features=16, bias=False)
        )
        (mlp): MLP(
          (c_fc): Linear(in_features=16, out_features=64, bias=False)
          (c_proj): Linear(in_features=64, out_features=16, bias=False)
        )
      )
    )
  )
  (lm_head): Linear(in_features=16, out_features=10, bias=False)
)

In [10]:
# Parameters passed into setup_optimizers(), set here to their defaults.
weight_decay = 0.0      # forwarded to AdamW through adamw_kwargs
matrix_lr = 0.2         # Muon LR for the transformer-block matrices
embedding_lr = 0.2      # AdamW LR for the token embedding (wte)
unembedding_lr = 0.004  # AdamW LR for lm_head, i.e. the "unembedding" back to the vocab

In [11]:
# to make it easier to later copy and paste this code into gpt.py, I'm going to:
self = gpt

In [12]:
model_dim = self.config.n_embd; model_dim

16

In [13]:
ddp, rank, local_rank, world_size = get_dist_info()
ddp, rank, local_rank, world_size

(False, 0, 0, 1)

In [14]:
# Separate the params into 3 groups: transformer-block matrices, the token
# embedding, and the output head ("matrix" = everything except wte and lm_head).
matrix_params = [*self.transformer.h.parameters()]
embedding_params = [*self.transformer.wte.parameters()]
lm_head_params = [*self.lm_head.parameters()]
# expect 6 linear layers per block * 2 blocks = 12, then 1 and 1
tuple(len(params) for params in (matrix_params, embedding_params, lm_head_params))

(12, 1, 1)

In [15]:
# Sanity check: the three groups together account for every model parameter.
grouped_param_count = sum(map(len, (matrix_params, embedding_params, lm_head_params)))
assert len(list(self.parameters())) == grouped_param_count

In [16]:
# Prepare the LR scaling for the AdamW groups (embedding and lm_head).
# From his note, the passed-in / default LRs seem to assume a 768-dim model, so
# they are rescaled in proportion to 1/sqrt(model_dim): the factor below equals
# sqrt(768 / model_dim), i.e. exactly 1.0 at model_dim == 768 and > 1 for
# narrower models. (The original "scale the LR for by proportional to" wording
# is presumably just a slip for "scale the LR by a factor proportional to".)
dmodel_lr_scale = (model_dim / 768) ** -0.5
dmodel_lr_scale

6.92820323027551

In [17]:
if rank == 0:
    print(f"Scaling the LR for the AdamW parameters proportional to 1/sqrt({model_dim}/768) = {dmodel_lr_scale}")

Scaling the LR for the AdamW parameters proportional to 1/sqrt(16/768) = 6.92820323027551


In [18]:
# One AdamW param group per tensor family (lm_head first, then the embedding),
# each with its base LR multiplied by the width-dependent scale.
adam_groups = [
    {"params": group_params, "lr": base_lr * dmodel_lr_scale}
    for group_params, base_lr in (
        (lm_head_params, unembedding_lr),
        (embedding_params, embedding_lr),
    )
]

In [19]:
# Hyperparameters shared by both AdamW param groups. The betas pair is
# (beta1, beta2): the decay rates of the running averages of the gradient and
# of its square. They are NOT one value per param group; both groups get the
# same (0.8, 0.95) pair.
adamw_kwargs = {"betas": (0.8, 0.95), "eps": 1e-10, "weight_decay": weight_decay}

In [20]:
# DistAdamW comes from his adamw.py and is used for distributed training, set to None for now
# so we'll fail if we try to do distributed training and haven't yet "copied" it
DistAdamW = None

In [21]:
# Pick the AdamW constructor: the distributed variant under DDP, otherwise
# torch's own AdamW with the fused kernel requested.
if ddp:
    AdamWFactory = DistAdamW
else:
    AdamWFactory = partial(torch.optim.AdamW, fused=True)

What is fused? ChatGPT: multiple operations have been combined into a single, lower-level kernel (often a CUDA kernel) instead of running as many separate operations

In [22]:
adamw_optimizer = AdamWFactory(adam_groups, **adamw_kwargs)
adamw_optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.8, 0.95)
    capturable: False
    decoupled_weight_decay: True
    differentiable: False
    eps: 1e-10
    foreach: None
    fused: True
    lr: 0.027712812921102038
    maximize: False
    weight_decay: 0.0

Parameter Group 1
    amsgrad: False
    betas: (0.8, 0.95)
    capturable: False
    decoupled_weight_decay: True
    differentiable: False
    eps: 1e-10
    foreach: None
    fused: True
    lr: 1.385640646055102
    maximize: False
    weight_decay: 0.0
)

In [23]:
# Keyword arguments for the Muon optimizer that will handle the linear layers.
muon_kwargs = {"lr": matrix_lr, "momentum": 0.95}

Hmm, he implements Muon (not just DistMuon) in `muon.py`. This doesn't seem like the right time to go deep into that. FOR NOW, I'm going to break the no copy / paste rule and copy his muon.py wholesale so I can use it here.

In [24]:
from my_nanochat.muon import Muon, DistMuon

In [25]:
# Choose the distributed Muon under DDP, otherwise the single-process Muon, and
# build it over the transformer-block matrices. The imported class is DistMuon;
# the previous spelling "DistMoon" would raise NameError as soon as ddp was True.
MuonFactory = DistMuon if ddp else Muon
muon_optimizer = MuonFactory(matrix_params, **muon_kwargs)
muon_optimizer

Muon (
Parameter Group 0
    lr: 0.2
    momentum: 0.95
    nesterov: True
    ns_steps: 5

Parameter Group 1
    lr: 0.2
    momentum: 0.95
    nesterov: True
    ns_steps: 5
)

In [26]:
# combine the two optimizers into one list
optimizers = [adamw_optimizer, muon_optimizer]

In [27]:
# Remember each param group's starting LR under "initial_lr". Originally guessed
# to be for reporting only; it is not only that (realized in challenge 13).
for opt in optimizers:
    for group in opt.param_groups:
        group.update(initial_lr=group["lr"])

In [28]:
muon_optimizer

Muon (
Parameter Group 0
    initial_lr: 0.2
    lr: 0.2
    momentum: 0.95
    nesterov: True
    ns_steps: 5

Parameter Group 1
    initial_lr: 0.2
    lr: 0.2
    momentum: 0.95
    nesterov: True
    ns_steps: 5
)

Now add `setup_optimizers()` to `my_gpt.py`

In [29]:
gpt = GPT(my_config)

In [30]:
gpt.setup_optimizers()

Scaling the LR for the AdamW parameters proportional to 1/sqrt(16/768) = 6.92820323027551


[AdamW (
 Parameter Group 0
     amsgrad: False
     betas: (0.8, 0.95)
     capturable: False
     decoupled_weight_decay: True
     differentiable: False
     eps: 1e-10
     foreach: None
     fused: True
     initial_lr: 0.027712812921102038
     lr: 0.027712812921102038
     maximize: False
     weight_decay: 0.0
 
 Parameter Group 1
     amsgrad: False
     betas: (0.8, 0.95)
     capturable: False
     decoupled_weight_decay: True
     differentiable: False
     eps: 1e-10
     foreach: None
     fused: True
     initial_lr: 1.385640646055102
     lr: 1.385640646055102
     maximize: False
     weight_decay: 0.0
 ),
 Muon (
 Parameter Group 0
     initial_lr: 0.2
     lr: 0.2
     momentum: 0.95
     nesterov: True
     ns_steps: 5
 
 Parameter Group 1
     initial_lr: 0.2
     lr: 0.2
     momentum: 0.95
     nesterov: True
     ns_steps: 5
 )]

### Understand AdamW optimizer

Now that the code is in place, understand a little about AdamW. Ok, after asking ChatGPT and reading the [torch docs](https://docs.pytorch.org/docs/stable/generated/torch.optim.AdamW.html) I get the idea. Wondering where it stores the first and second moments and if I can see those in the param groups once we start training. Try the simple example from above but using AdamW.

In [31]:
# Same toy problem as the SGD example (targets y = 5*x0 + 7*x1), now paired with
# AdamW at its default hyperparameters so its internal state can be inspected.
x = torch.randn(5, 2)
x0, x1 = x.unbind(dim=1)
y = (x0 * 5 + x1 * 7).unsqueeze(-1)
model = torch.nn.Linear(in_features=2, out_features=1, bias=False)
optimizer = torch.optim.AdamW(model.parameters())

In [32]:
model.weight

Parameter containing:
tensor([[-0.4048, -0.1024]], requires_grad=True)

In [33]:
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    decoupled_weight_decay: True
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)

In [34]:
# Same zero_grad / backward / step loop as the SGD example, now driven by AdamW.
for i in range(10_000):
    optimizer.zero_grad()
    # One forward pass per step: the same loss tensor is used for backprop and
    # for logging (the weights do not change between backward() and the print).
    loss = F.mse_loss(model(x), y)
    loss.backward()
    if i % 1000 == 0:
        print(f"step: {i}, loss: {loss.item()}, learning rate: {optimizer.param_groups[0]['lr']}")
    optimizer.step()

step: 0, loss: 60.29609298706055, learning rate: 0.001
step: 1000, loss: 44.197731018066406, learning rate: 0.001
step: 2000, loss: 31.580692291259766, learning rate: 0.001
step: 3000, loss: 21.744022369384766, learning rate: 0.001
step: 4000, loss: 14.211898803710938, learning rate: 0.001
step: 5000, loss: 8.635725975036621, learning rate: 0.001
step: 6000, loss: 4.728221893310547, learning rate: 0.001
step: 7000, loss: 2.219465732574463, learning rate: 0.001
step: 8000, loss: 0.8228629231452942, learning rate: 0.001
step: 9000, loss: 0.21109020709991455, learning rate: 0.001


In [35]:
model.weight

Parameter containing:
tensor([[4.9032, 6.8311]], requires_grad=True)

Funny, at least on some runs (like the one above ^) it ends up in a very good local minimum

In [36]:
optimizer.state_dict()

{'state': {0: {'step': tensor(10000.),
   'exp_avg': tensor([[-0.0249, -0.3682]]),
   'exp_avg_sq': tensor([[0.0260, 2.1372]])}},
 'param_groups': [{'lr': 0.001,
   'betas': (0.9, 0.999),
   'eps': 1e-08,
   'weight_decay': 0.01,
   'amsgrad': False,
   'maximize': False,
   'foreach': None,
   'capturable': False,
   'differentiable': False,
   'fused': None,
   'decoupled_weight_decay': True,
   'params': [0]}]}

Yes, looks like the optimizer's state holds/maintains the moving averages (`exp_avg` and `exp_avg_sq`)

In [38]:
x = torch.randn(100, 2) # more data points, less likely to find such a good local minimum ?
y = (x[:,0] * 5 + x[:,1] * 7).unsqueeze(-1)
model = torch.nn.Linear(2,1, bias=False)
optimizer = torch.optim.AdamW(model.parameters())
for i in range(20_000):
    optimizer.zero_grad()
    # One forward pass per step; the loss tensor is reused for the log line.
    loss = F.mse_loss(model(x), y)
    loss.backward()
    # Log at i % 1000 == 1 (not 0): AdamW creates its per-parameter state on the
    # first step(), so there is no exp_avg to read yet at i == 0.
    if i % 1000 == 1:
        # Read the first-moment estimate straight from the optimizer's state
        # instead of building a whole state_dict() just to index into it.
        exp_avg = optimizer.state[model.weight]['exp_avg']
        print(f"step: {i}, loss: {loss.item():.2f}, grad: {model.weight.grad}, exp_avg: {exp_avg}")
    optimizer.step()

step: 1, loss: 83.30, grad: tensor([[-14.5306, -14.7523]]), exp_avg: tensor([[-1.4533, -1.4755]])
step: 1001, loss: 57.78, grad: tensor([[-11.8233, -12.4782]]), exp_avg: tensor([[-11.8491, -12.5000]])
step: 2001, loss: 38.56, grad: tensor([[ -9.3578, -10.3839]]), exp_avg: tensor([[ -9.3814, -10.4041]])
step: 3001, loss: 24.34, grad: tensor([[-7.1029, -8.4409]]), exp_avg: tensor([[-7.1245, -8.4596]])
step: 4001, loss: 14.24, grad: tensor([[-5.0566, -6.6397]]), exp_avg: tensor([[-5.0760, -6.6570]])
step: 5001, loss: 7.53, grad: tensor([[-3.2504, -4.9908]]), exp_avg: tensor([[-3.2671, -5.0064]])
step: 6001, loss: 3.51, grad: tensor([[-1.7556, -3.5249]]), exp_avg: tensor([[-1.7687, -3.5385]])
step: 7001, loss: 1.44, grad: tensor([[-0.6828, -2.2918]]), exp_avg: tensor([[-0.6911, -2.3028]])
step: 8001, loss: 0.51, grad: tensor([[-0.1229, -1.3413]]), exp_avg: tensor([[-0.1260, -1.3493]])
step: 9001, loss: 0.14, grad: tensor([[ 0.0154, -0.6806]]), exp_avg: tensor([[ 0.0152, -0.6859]])
step: 10

In [39]:
model.weight

Parameter containing:
tensor([[5.0000, 6.9997]], requires_grad=True)