# Setup

In [1]:
# Jupyter kernels are often not started from the project's virtual environment, so this cell
# makes the packages installed in ./venv importable from the notebook.
# You probably won't need this cell, but running it won't hurt anything either.
import sys
import os

current_dir = os.getcwd()  # assumes the notebook's working directory is the one containing venv/
venv_dir = os.path.join(current_dir, 'venv')
python_version = f'{sys.version_info.major}.{sys.version_info.minor}'
if os.name == 'nt':
    # Windows virtual environments use venv\Lib\site-packages (no pythonX.Y directory)
    site_packages_path = os.path.join(venv_dir, 'Lib', 'site-packages')
else:
    site_packages_path = os.path.join(venv_dir, 'lib', 'python' + python_version, 'site-packages')

# Register the directory only when it really exists, and only once: without this guard every
# re-run of the cell appended another copy of the path (or a path that does not exist at all).
if os.path.isdir(site_packages_path) and site_packages_path not in sys.path:
    sys.path.append(site_packages_path)

# Instantiate a brand new model

In [2]:
# Everything this cell needs, gathered up front
import torch
from config import ModelConfig, TrainConfig
from modules.model import Model
from tools import import_from_nested_path

# Build the model and training configurations and display both
cfg, tcfg = ModelConfig(), TrainConfig()
print(cfg, '\n\n', tcfg)

# Fetch get_tokenizer for the tokenizer named in cfg.tokenizer, then build the tokenizer
get_tokenizer = import_from_nested_path(
    ['tokenizers', cfg.tokenizer], 'tokenizer', ['get_tokenizer']
).get('get_tokenizer')
tokenizer = get_tokenizer(size=cfg.vocab_len)

# Create a brand new model and move it to the configured device
model = Model(cfg).to(cfg.device)

#model = torch.compile(model)
# ^ uncomment for full training runs if you've got CUDA; while debugging, leave it commented
#   out if compilation takes too long

# Report the parameter count in millions, followed by the module structure
param_count_millions = model.get_num_params() / 1e6
print("\nnumber of parameters: %.2fM\n" % (param_count_millions,))
print(model)

ModelConfig(dim=32, device='cpu', out_weight_share=True, linear_bias=False, tokenizer='bpe_v2_tinyStories', vocab_len=512, num_layers=2, second_resid_norm=False, mlp_hidden_mult=4, mlp_nonlinearity='SiLU', mlp_gated=True, num_q_heads=2, num_kv_heads=1, head_dim=16, theta=10000, max_seq_len=64, scale_first_resid=True, norm_type='RMSNorm', norm_affine=True, norm_bias=True, eps=1e-06) 

 TrainConfig(weight_decay=0.05, batch_size=32, max_iters=10, eval_interval=2, eval_samples=1, checkpoint_interval=None, beta1=0.9, beta2=0.95, epsilon=1e-08, lr_init=1e-06, lr_max=0.1, lr_min=0.001, warmup_iters=0, final_flat_iters=1, anneal_type='cos', num_restarts=3, T_mult=2)

number of parameters: 0.04M

Model(
  (token_embedder): Embedding(640, 32)
  (layers): ModuleList(
    (0-1): 2 x Layer(
      (pre_attn_norm): Norm()
      (attn): MQA(
        (Wq): Linear(in_features=32, out_features=32, bias=False)
        (Wk): Linear(in_features=32, out_features=16, bias=False)
        (Wv): Linear(in_featur

# Training

In [3]:
from tools import get_data_loader
from train import scheduler_lambda, get_optimizer, train

# Optimizer built from the model and training config, with a LambdaLR schedule
# whose multiplier comes from scheduler_lambda
optimizer = get_optimizer(model, tcfg)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=scheduler_lambda)

# One data loader per split (train first, then validation), both at the configured batch size
train_data_loader, test_data_loader = [
    get_data_loader(batch_size=tcfg.batch_size, split=split_name)
    for split_name in ('train', 'validation')
]

num decayed parameter tensors: 15, with 42,944 parameters
num non-decayed parameter tensors: 10, with 320 parameters
using fused AdamW: False


Found cached dataset json (/Users/tunadorable/.cache/huggingface/datasets/noanabeshima___json/noanabeshima--TinyStoriesV2-40971520ba3bacdf/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
Found cached dataset json (/Users/tunadorable/.cache/huggingface/datasets/noanabeshima___json/noanabeshima--TinyStoriesV2-40971520ba3bacdf/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


In [4]:
if False: # set to true if you'd like to see a graph of the learning rate schedule
    import matplotlib.pyplot as plt

    # Evaluate the schedule function once per training iteration
    schedule_values = list(map(scheduler_lambda, range(tcfg.max_iters)))

    # Draw the curve through the explicit figure/axes interface
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(schedule_values, label='Learning Rate')
    ax.set_title('Learning Rate Schedule')
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Learning Rate')
    ax.grid(True)
    ax.legend()
    plt.show()

In [5]:
# Run training. train() hands back the model and optimizer together with log_data,
# which the final cell of this notebook passes on to save_model.
# All arguments are positional, so their order has to match train()'s signature.
model, optimizer, log_data = train(
    model, 
    tokenizer, 
    cfg, 
    optimizer,
    scheduler,
    tcfg, 
    train_data_loader,
    test_data_loader,
    #log_data: list = None,  # optional argument; presumably an existing log to continue from (confirm in train.py)
    #detect_anomoly = False # use if you're getting crazy errors about the gradient being broken
)

 10%|███▊                                  | 1/10 [00:00<00:05,  1.63it/s]

step 0000: lr 0.010000, train loss 6.3042, val loss 6.3048, ppl 547, grad norm 0.4558, time elapsed: 0.49 seconds


 30%|███████████▍                          | 3/10 [00:01<00:03,  2.02it/s]

step 0002: lr 0.009831, train loss 5.8510, val loss 5.8374, ppl 343, grad norm 0.6575, time elapsed: 1.38 seconds


 50%|███████████████████                   | 5/10 [00:02<00:02,  2.14it/s]

step 0004: lr 0.000269, train loss 5.7458, val loss 5.7082, ppl 301, grad norm 0.5919, time elapsed: 2.26 seconds


 70%|██████████████████████████▌           | 7/10 [00:03<00:01,  2.18it/s]

step 0006: lr 0.006944, train loss 5.4193, val loss 5.3550, ppl 212, grad norm 0.5948, time elapsed: 3.14 seconds


 90%|██████████████████████████████████▏   | 9/10 [00:04<00:00,  2.21it/s]

step 0008: lr 0.001123, train loss 5.2815, val loss 5.2791, ppl 196, grad norm 0.5617, time elapsed: 4.01 seconds


100%|█████████████████████████████████████| 10/10 [00:04<00:00,  2.16it/s]

step 0009: lr 0.000100, train loss 5.3043, val loss 5.3293, ppl 206, grad norm 0.5209, time elapsed: 4.51 seconds





# Inference test before you decide to save it

In [6]:
from inference import generate

prompt = "Once upon a time"

# Switch the model to evaluation mode for generation. The try/finally guarantees it is
# switched back to training mode even when generate() raises or the cell is interrupted;
# otherwise a later training cell would silently run on a model still in eval mode.
model.eval()
try:
    output = generate(
        prompt,
        model,
        tokenizer,
        # optional sampling / generation settings, uncomment to override generate()'s defaults:
        #temperature = 0.9,
        #top_k = 32,
        #top_p = 0.9,
        #max_gen_len = 512,
        #memory_saver_div = 4,
    )
finally:
    model.train()

# generate() returns an indexable result; show its first entry
print(output[0])

Once upon a time timeunnmat,Onceay likece like uponny inkebleret saw One momll play Heckreing in aning a me. li of an to time wasar thom itenl sty sakrckkeel Maxainve me h to


# Saving your final model
If `tcfg.checkpoint_interval` is not `None`, then checkpoints have already been saved.

You DO still need to run this cell even if you had been saving checkpoints; the final state has not yet been saved.

In [7]:
from tools import save_model
# Hand the trained model, both config objects and the training log to save_model.
# Where and in what format it writes is decided inside tools.save_model (not shown in this notebook).
save_model(model, cfg, tcfg, log_data)