This notebook started as a copy of the `baby-pretrain` notebook from challenge 13, but here it is run on a GPU. Read `getting-ready.ipynb` in this folder first; then, once you are on the GPU machine, run `train-tokenizer.ipynb` before running this notebook.

In [1]:
import sys
sys.path.append('../my_nanochat')
from my_nanochat.my_gpt import GPTConfig, GPT
import my_nanochat.my_tokenizer
from my_nanochat.my_dataset import text_iterator
from my_nanochat.my_dataloader import tokenizing_distributed_data_loader
from my_nanochat.my_tokenizer import MyTokenizer
from my_nanochat.my_common import get_base_dir
import torch
import math
import os
from contextlib import nullcontext

In [2]:
# Sanity check: confirm PyTorch can see a CUDA device before anything is placed on the GPU.
torch.cuda.is_available()

True

In [3]:
# Number of CUDA devices visible to PyTorch.
torch.cuda.device_count()

1

In [4]:
# Run on the GPU when one is visible; otherwise fall back to CPU so the
# non-CUDA branch of the autocast setup is actually reachable.
device = "cuda" if torch.cuda.is_available() else "cpu"

# model architecture
depth = 4  # number of transformer layers; model width and head count are derived from it
max_seq_len = 128  # tokens per training sequence

# training horizon
num_iterations = 1000  # number of optimizer steps

# batch sizing
device_batch_size = 1  # sequences per forward/backward pass
total_batch_size = 128  # tokens per optimizer step (here device_batch_size x max_seq_len)

# learning rates / weight decay passed to model.setup_optimizers()
embedding_lr = 0.2
unembedding_lr = 0.004
weight_decay = 0.0
matrix_lr = 0.02

grad_clip = 1.0  # max gradient norm; a value <= 0.0 disables clipping in the training loop

# shape of the learning-rate schedule (read by get_lr_multiplier)
warmup_ratio = 0.0  # fraction of num_iterations spent ramping the LR up
warmdown_ratio = 0.2  # fraction of num_iterations spent ramping the LR down
final_lr_fraction = 0.0  # LR multiplier the warmdown decays toward

In [5]:
# Mixed precision: run forward passes under bfloat16 autocast on CUDA; on any
# other device use a do-nothing context manager so `with autocast_ctx:` still works.
device_type = device
if device_type == "cuda":
    autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16)
else:
    autocast_ctx = nullcontext()

In [6]:
# Load the tokenizer through the project's get_tokenizer() helper (presumably the one
# produced by train-tokenizer.ipynb -- confirm) and record its vocabulary size,
# which sizes the model's embedding and output layers.
tokenizer = my_nanochat.my_tokenizer.get_tokenizer()
vocab_size = tokenizer.get_vocab_size()
vocab_size

65537

In [7]:
# model kwargs are derived from the desired depth of the model
num_layers = depth
model_dim = depth * 64 # e.g. the GPTConfig default corresponds to 12 * 64 = 768
num_heads = max(1, (model_dim + 127) // 128) # ceil(model_dim / 128), but never fewer than one head
num_kv_heads = num_heads # n_kv_head is set equal to n_head
num_layers, model_dim, num_heads, num_kv_heads

(4, 256, 2, 2)

In [8]:
# figure out the needed gradient accumulation to reach the desired total batch size
tokens_per_fwdbwd = device_batch_size * max_seq_len  # tokens processed by one forward/backward pass
# The floor division below would silently shrink the effective batch -- or give
# zero accumulation steps, so the training loop would never compute a loss --
# if the sizes do not line up, so fail loudly instead.
assert total_batch_size > 0 and total_batch_size % tokens_per_fwdbwd == 0, (
    f"total_batch_size ({total_batch_size}) must be a positive multiple of "
    f"device_batch_size * max_seq_len ({tokens_per_fwdbwd})"
)
grad_accum_steps = total_batch_size // tokens_per_fwdbwd
tokens_per_fwdbwd, grad_accum_steps

(128, 1)

In [9]:
# Architecture settings handed to GPTConfig.
model_config_kwargs = dict(
    sequence_len=max_seq_len,
    vocab_size=vocab_size, 
    n_layer=num_layers,
    n_head=num_heads,
    n_kv_head=num_kv_heads,
    n_embd=model_dim,
)
# Build the model on the "meta" device first, so constructing it allocates no
# real parameter storage.
with torch.device("meta"):
    model_config = GPTConfig(**model_config_kwargs)
    model = GPT(model_config)
# to_empty() gives the parameters storage on `device` without initializing it;
# the weights only receive values from the later model.init_weights() call.
model.to_empty(device=device)

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(65537, 256)
    (h): ModuleList(
      (0-3): 4 x Block(
        (attn): CausalSelfAttention(
          (c_q): Linear(in_features=256, out_features=256, bias=False)
          (c_k): Linear(in_features=256, out_features=256, bias=False)
          (c_v): Linear(in_features=256, out_features=256, bias=False)
          (c_proj): Linear(in_features=256, out_features=256, bias=False)
        )
        (mlp): MLP(
          (c_fc): Linear(in_features=256, out_features=1024, bias=False)
          (c_proj): Linear(in_features=1024, out_features=256, bias=False)
        )
      )
    )
  )
  (lm_head): Linear(in_features=256, out_features=65537, bias=False)
)

In [10]:
# The parameters were created without values (meta device, then to_empty()), so
# they must be initialized before training; presumably this is what init_weights() does -- confirm.
model.init_weights()

In [11]:
# Confirm which device the model reports being on.
model.get_device()

device(type='cuda', index=0)

In [12]:
# Keep a handle to the original, uncompiled model: it is used later for gradient
# clipping, for saving the state dict, and for the next-token demo.
orig_model = model

In [13]:
# Compile from orig_model rather than from `model` itself, so that re-running this
# cell does not wrap an already-compiled module a second time.
# dynamic=False asks torch.compile to specialize on the input shapes it sees
# instead of generating dynamic-shape kernels (every batch here has the same shape).
model = torch.compile(orig_model, dynamic=False)
model

OptimizedModule(
  (_orig_mod): GPT(
    (transformer): ModuleDict(
      (wte): Embedding(65537, 256)
      (h): ModuleList(
        (0-3): 4 x Block(
          (attn): CausalSelfAttention(
            (c_q): Linear(in_features=256, out_features=256, bias=False)
            (c_k): Linear(in_features=256, out_features=256, bias=False)
            (c_v): Linear(in_features=256, out_features=256, bias=False)
            (c_proj): Linear(in_features=256, out_features=256, bias=False)
          )
          (mlp): MLP(
            (c_fc): Linear(in_features=256, out_features=1024, bias=False)
            (c_proj): Linear(in_features=1024, out_features=256, bias=False)
          )
        )
      )
    )
    (lm_head): Linear(in_features=256, out_features=65537, bias=False)
  )
)

In [14]:
# Total number of scalar parameters in the model.
num_params = sum(p.numel() for p in model.parameters())
num_params

36700672

In [15]:
# Training budget in tokens: tokens per optimizer step times the number of steps.
total_tokens = total_batch_size * num_iterations
total_tokens # total number of training tokens

128000

In [16]:
# initialize optimizers
# setup_optimizers() receives the learning rates and weight decay from the config cell.
optimizers = model.setup_optimizers(
    unembedding_lr=unembedding_lr,
    embedding_lr=embedding_lr,
    matrix_lr=matrix_lr,
    weight_decay=weight_decay,
)
# It returns a pair; judging by the names it is (AdamW optimizer, Muon optimizer) -- confirm
# against setup_optimizers(). The Muon one is needed separately for the momentum schedule.
adamw_optimizer, muon_optimizer = optimizers

Scaling the LR for the AdamW parameters proportional to 1/sqrt(256/768) = 1.7320508075688774


In [17]:
# initialize DataLoader
# The loader is consumed with next(); each item is unpacked as an (x, y) pair
# that is later passed to the model as model(x, y).
train_loader = tokenizing_distributed_data_loader(device_batch_size, max_seq_len, split="train", device=device)
x, y = next(train_loader) # pre-fetch the first batch; the training loop uses it on its first step
x.shape, y.shape

(torch.Size([1, 128]), torch.Size([1, 128]))

In [18]:
# set up hyperparameter schedulers

In [19]:
# learning rate scheduler
def get_lr_multiplier(it):
    """Return the learning-rate multiplier for iteration `it`.

    The schedule is read from the notebook-level settings `num_iterations`,
    `warmup_ratio`, `warmdown_ratio` and `final_lr_fraction`: a linear ramp-up
    during the warmup iterations, a plateau at 1.0, then a linear ramp from
    1.0 towards `final_lr_fraction` during the warmdown iterations.
    """
    n_warmup = round(warmup_ratio * num_iterations)
    if it < n_warmup:
        # (it + 1) so that the very first step already gets a non-zero multiplier
        return (it + 1) / n_warmup

    n_warmdown = round(warmdown_ratio * num_iterations)
    plateau_end = num_iterations - n_warmdown
    if it <= plateau_end:
        return 1.0

    # `remaining` falls from 1 towards 0 as `it` approaches num_iterations
    remaining = (num_iterations - it) / n_warmdown
    return remaining * 1.0 + (1 - remaining) * final_lr_fraction

def get_muon_momentum(it, warmup_steps=300, start_momentum=0.85, end_momentum=0.95):
    """Return the Muon momentum to use at iteration `it`.

    The momentum is linearly interpolated from `start_momentum` at iteration 0
    to `end_momentum` at iteration `warmup_steps`, and stays at `end_momentum`
    afterwards. The defaults reproduce the previously hard-coded schedule
    (0.85 -> 0.95 over 300 steps).

    Args:
        it: zero-based iteration index; negative values are treated as 0.
        warmup_steps: number of iterations over which the momentum ramps.
            A value <= 0 disables the ramp and always returns `end_momentum`.
        start_momentum: momentum at iteration 0.
        end_momentum: momentum from iteration `warmup_steps` onwards.

    Returns:
        The momentum as a float.
    """
    if warmup_steps <= 0:
        # no ramp requested; this also avoids dividing by zero below
        return end_momentum
    # clamp to [0, 1] so out-of-range iterations never extrapolate past either end
    frac = min(max(it / warmup_steps, 0), 1)
    momentum = (1 - frac) * start_momentum + frac * end_momentum
    return momentum

### the training loop!

In [20]:
# One optimizer step per iteration: accumulate gradients over the micro-batches,
# clip them, apply the scheduled LR / momentum, step both optimizers, reset the grads.
for step in range(num_iterations):
    for micro_step in range(grad_accum_steps):
        with autocast_ctx: # without autocast the compiled model failed with: BackendCompilerFailed: backend='inductor' raised: RuntimeError: expected mat1 and mat2 to have the same dtype, but got: c10::BFloat16 != float
            loss = model(x, y)
        train_loss = loss.detach() # unscaled loss of the latest micro-batch, kept only for logging
        loss = loss / grad_accum_steps # backward() adds into .grad, so scaling makes the accumulated gradient an average over micro-batches (a no-op here since grad_accum_steps is 1)
        loss.backward()
        x, y = next(train_loader) # fetch the batch for the next forward pass
    # gradient clipping
    if grad_clip > 0.0:
        torch.nn.utils.clip_grad_norm_(orig_model.parameters(), grad_clip) # not an element-wise clamp: rescales all gradients together so that their combined norm is at most grad_clip
    # step optimizers
    lrm = get_lr_multiplier(step)
    for opt in optimizers:
        for group in opt.param_groups:
            group["lr"] = group["initial_lr"] * lrm # NOTE(review): "initial_lr" is not set anywhere in this notebook; presumably setup_optimizers() stores it -- confirm
    muon_momentum = get_muon_momentum(step)
    for group in muon_optimizer.param_groups:
        group["momentum"] = muon_momentum
    for opt in optimizers:
        opt.step()
    model.zero_grad(set_to_none=True) # drop the gradients so the next step starts accumulating from scratch

    # log the loss of the latest micro-batch every 10 steps
    if step % 10 == 0:
        print(f"step: {step}, loss: {train_loss}")

  return torch._C._get_cublas_allow_tf32()
/tmp/tmpnn2muj26/cuda_utils.c:6:10: fatal error: Python.h: No such file or directory
    6 | #include <Python.h>
      |          ^~~~~~~~~~
compilation terminated.
/tmp/tmp8927zd2g/cuda_utils.c:6:10: fatal error: Python.h: No such file or directory
    6 | #include <Python.h>
      |          ^~~~~~~~~~
compilation terminated.
W1101 13:18:45.481000 34730 torch/_inductor/utils.py:1558] [0/0] Not enough SMs to use max_autotune_gemm mode


step: 0, loss: 11.090370178222656
step: 10, loss: 7.793054103851318
step: 20, loss: 7.423770427703857
step: 30, loss: 8.758064270019531
step: 40, loss: 6.388361930847168
step: 50, loss: 6.340607643127441
step: 60, loss: 8.319405555725098
step: 70, loss: 8.77592658996582
step: 80, loss: 8.11382007598877
step: 90, loss: 9.916312217712402
step: 100, loss: 5.970464706420898
step: 110, loss: 7.699770927429199
step: 120, loss: 8.744855880737305
step: 130, loss: 8.410645484924316
step: 140, loss: 6.649989128112793
step: 150, loss: 7.96115255355835
step: 160, loss: 4.203775405883789
step: 170, loss: 8.52189826965332
step: 180, loss: 7.574532508850098
step: 190, loss: 7.510982990264893
step: 200, loss: 8.312272071838379
step: 210, loss: 7.491954326629639
step: 220, loss: 7.4583659172058105
step: 230, loss: 7.543668746948242
step: 240, loss: 8.850008010864258
step: 250, loss: 7.53331184387207
step: 260, loss: 7.910400390625
step: 270, loss: 8.707399368286133
step: 280, loss: 7.886048316955566
st

In [23]:
# Save the weights to model.pth in the current working directory. The state dict is
# taken from orig_model, presumably to keep its keys free of the compile wrapper's
# `_orig_mod` prefix -- confirm before loading it elsewhere.
torch.save(orig_model.state_dict(), "model.pth")

In [24]:
!ls -lh model.pth

-rw-rw-r-- 1 paperspace paperspace 109M Nov  1 13:23 model.pth


In [25]:
# show top 3 next tokens for a few prompts
for prompt in ['The person', 'He went to', '1 + 2 = ', 'first of', '3 cats and 2', 'mom and', 'the red', 'She']:
    prompt_ids = torch.tensor([tokenizer.encode(prompt)], device=device)
    # Inference only: no_grad() avoids building an autograd graph for the forward
    # pass, so there is nothing to detach afterwards.
    with torch.no_grad(), autocast_ctx:
        logits = orig_model(prompt_ids)
    # logits[0, -1, :] are the scores for the token following the prompt;
    # tolist() hands the tokenizer plain Python ints instead of 0-dim tensors.
    top_3_next_tokens = torch.topk(logits[0,-1,:], k=3).indices.tolist()
    print(f"{prompt}{'|'.join([tokenizer.decode([token]) for token in top_3_next_tokens])}")

The person,|.| and
He went to the| be| a
1 + 2 = 19|20|16
first of the| a|,
3 cats and 2,|.| and
mom and the| a| 
the red,| and|.
She,| of|.
