# Setup

In [1]:
# My virtual environments are rarely properly connected to Jupyter, so this cell puts the
# local ./venv site-packages directory on sys.path.
# You probably won't need this cell, but running it won't hurt anything either.
import sys
import os

current_dir = os.getcwd()  # the venv is looked up relative to the current working directory
venv_dir = os.path.join(current_dir, 'venv')
python_version = '%d.%d' % (sys.version_info.major, sys.version_info.minor)
if os.name == 'nt':
    # Windows venvs keep packages in venv\Lib\site-packages (no pythonX.Y level)
    site_packages_path = os.path.join(venv_dir, 'Lib', 'site-packages')
else:
    site_packages_path = os.path.join(venv_dir, 'lib', 'python' + python_version, 'site-packages')
# Only append once: without this guard every re-run of the cell adds a duplicate sys.path entry.
if site_packages_path not in sys.path:
    sys.path.append(site_packages_path)

# Instantiate a brand new model

In [2]:
import torch

# config file: cfg holds the model hyperparameters, tcfg the training hyperparameters
from config import ModelConfig, TrainConfig
cfg = ModelConfig()
tcfg = TrainConfig()
print(cfg, '\n\n', tcfg)

# import the tokenizer specified by cfg
from tools import import_from_nested_path
imported_objects = import_from_nested_path(['custom_tokenizers', cfg.tokenizer], 'tokenizer', ['get_tokenizer'])
get_tokenizer = imported_objects.get('get_tokenizer')
tokenizer = get_tokenizer(size = cfg.vocab_len)

# the actual model modules (MLP, attention mechanism, norm, layer, etc)
from modules.model import Model
model = Model(cfg).to(cfg.device)

# torch.compile makes the model way more efficient.
# The device is normalised through str() so that indexed devices such as 'cuda:0' are
# recognised as well; an exact == 'cuda' comparison only matches the bare string and
# would silently skip compilation for anything else.
if str(model.device).startswith('cuda'):
    model = torch.compile(model)
    # ^if this takes too long & you're debugging you can comment it out, but def leave it on for full training runs

# print the number of parameters in the model (in millions), then the module tree
print("\nnumber of parameters: %.2fM\n" % (model.get_num_params()/1e6,))
print(model)

ModelConfig(dim=16, device='cpu', out_weight_share=True, linear_bias=False, tokenizer='bpe_tinyStories', vocab_len=1024, num_layers=2, second_resid_norm=False, mlp_hidden_mult=4, mlp_nonlinearity='SiLU', mlp_gated=True, num_q_heads=2, num_kv_heads=1, head_dim=8, theta=10000, max_seq_len=32, scale_first_resid=True, norm_type='RMSNorm', norm_affine=True, norm_bias=True, eps=1e-06) 

 TrainConfig(model_name='2024-06-25|02-08-33', dataset_name='noanabeshima/TinyStoriesV2', data_subset=None, streaming=True, micro_batch_size=8, grad_accum_steps=2, max_iters=100, eval_interval=5, eval_samples=1, checkpoint_interval=None, beta1=0.9, beta2=0.95, epsilon=1e-08, weight_decay=0.05, grad_clip=1.0, lr_init=1e-06, lr_max=0.1, lr_min=0.001, warmup_iters=10, final_flat_iters=10, anneal_type='cos', num_restarts=0, T_mult=2)

number of parameters: 0.02M

Model(
  (token_embedder): Embedding(1024, 16)
  (layers): ModuleList(
    (0-1): 2 x Layer(
      (pre_attn_norm): Norm()
      (attn): SelfAttention(


In [3]:
from tools import get_data_loaders

# Every loader setting comes from the training config; collect the keyword options first
# so the call itself stays on one line.
loader_options = dict(
    batch_size=tcfg.micro_batch_size,
    streaming=tcfg.streaming,
    subset_name=tcfg.data_subset,
)
train_data_loader, test_data_loader = get_data_loaders(tcfg.dataset_name, **loader_options)

In [4]:
from train import get_optimizer, scheduler_lambda, train

# the optimizer is built from the model and the training config
optimizer = get_optimizer(model, tcfg)

# wrap the project's schedule function in a LambdaLR scheduler attached to that optimizer
lambda_lr_cls = torch.optim.lr_scheduler.LambdaLR
scheduler = lambda_lr_cls(optimizer, lr_lambda=scheduler_lambda)

num decayed parameter tensors: 15, with 21,952 parameters
num non-decayed parameter tensors: 10, with 160 parameters
using fused AdamW: False


In [5]:
if False: # set to true if you'd like to see a graph of the learning rate schedule
    import matplotlib.pyplot as plt

    # evaluate the schedule function once per training iteration
    lrs = list(map(scheduler_lambda, range(tcfg.max_iters)))

    # draw the curve on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(lrs, label='Learning Rate')
    ax.set_title('Learning Rate Schedule')
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Learning Rate')
    ax.grid(True)
    ax.legend()
    plt.show()

# Training

In [6]:
# Run the training loop. train() hands back the model, the optimizer and the log data,
# which are rebound here so the later cells (inference, saving) see the trained state.
# NOTE: because `model` and `optimizer` are overwritten, re-running this cell starts from
# the already-trained objects rather than from a fresh model.
model, optimizer, log_data = train(
    model, 
    tokenizer, 
    cfg, 
    optimizer,
    scheduler,
    tcfg, 
    train_data_loader,
    test_data_loader,
    #log_data: list = None, # optional: pass previous log data when picking training back up from a checkpoint
    #detect_anomoly = False # optional: enable if you're getting strange errors about the gradient being broken
)

step: 0000, time elapsed: 0.00s, tokens/s: 00000000, train loss: 6.9228, val loss: 6.9284, ppl: 1021, lr: 0.00000100, grad norm: 0.0000
step: 0005, time elapsed: 3.81s, tokens/s: 00000671, train loss: 5.9325, val loss: 5.8296, ppl: 340, lr: 0.05000050, grad norm: 0.6055
step: 0010, time elapsed: 6.31s, tokens/s: 00001024, train loss: 5.6440, val loss: 5.4848, ppl: 241, lr: 0.10000000, grad norm: 3.3160
step: 0015, time elapsed: 9.00s, tokens/s: 00000952, train loss: 5.1938, val loss: 5.2965, ppl: 200, lr: 0.09904887, grad norm: 4.3311
step: 0020, time elapsed: 11.28s, tokens/s: 00001120, train loss: 4.7176, val loss: 5.4959, ppl: 244, lr: 0.09623203, grad norm: 0.5750
step: 0025, time elapsed: 13.72s, tokens/s: 00001051, train loss: 4.9000, val loss: 4.7746, ppl: 118, lr: 0.09165774, grad norm: 0.3688
step: 0030, time elapsed: 16.29s, tokens/s: 00000993, train loss: 4.7444, val loss: 4.7900, ppl: 120, lr: 0.08550179, grad norm: 0.5170
step: 0035, time elapsed: 18.71s, tokens/s: 0000105

# Inference test before you decide to save it

In [7]:
from inference import generate

prompt = "Once upon a time"

# Generate in eval mode. The try/finally guarantees the model is switched back to train
# mode even if generate() raises, so a failed run of this cell cannot leave the model
# stuck in eval mode for any training that follows.
model.eval()
try:
    output = generate(
        prompt, 
        model, 
        tokenizer,
        # optional sampling settings; uncomment to override generate()'s defaults
        #temperature = 0.9,
        #top_k = 32,
        #top_p = 0.9,
        #max_gen_len = 512,
        #memory_saver_div = 4,
    )
finally:
    model.train()
print(output[0])

Once upon a time, a small little named Lily. a was to a a had a small so and little big It was cat. The her to the


# Saving your final model
If `tcfg.checkpoint_interval` is not `None`, then checkpoints have already been saved during training.

You DO still need to run this cell even if you have been saving checkpoints, because the final state has not been saved yet.

In [8]:
from tools import save_model
# Save the final model state; the model config, training config and log data are passed
# to save_model alongside it (see tools.save_model for what exactly gets written).
save_model(model, cfg, tcfg, log_data)