# Setup

In [1]:
# Jupyter kernels are often not launched from the project's virtual environment, so this cell
# makes the venv's packages importable by putting its site-packages directory on sys.path.
# You probably won't need this cell, but running it won't hurt anything either.
import sys
import os
current_dir = os.getcwd()  # the venv is expected at ./venv relative to the working directory
venv_dir = os.path.join(current_dir, 'venv')
python_version = str(sys.version_info.major) + '.' + str(sys.version_info.minor)
if os.name == 'nt':
    # Windows venvs keep packages in venv\Lib\site-packages (no pythonX.Y directory level)
    site_packages_path = os.path.join(venv_dir, 'Lib', 'site-packages')
else:
    site_packages_path = os.path.join(venv_dir, 'lib', 'python' + python_version, 'site-packages')
# Re-running this cell must not stack duplicate entries on sys.path
if site_packages_path not in sys.path:
    sys.path.append(site_packages_path)

# Instantiate a brand new model

In [2]:
# everything this cell needs: torch, the two config classes, the dynamic-import helper,
# and the actual model modules (MLP, attention mechanism, norm, layer, etc)
import torch
from config import ModelConfig, TrainConfig
from tools import import_from_nested_path
from modules.model import Model

# config objects for the model and for the training run
cfg = ModelConfig()
tcfg = TrainConfig()
print(cfg, '\n\n', tcfg)

# import the tokenizer specified by cfg, then build it at the configured vocabulary size
imported_objects = import_from_nested_path(
    ['custom_tokenizers', cfg.tokenizer],
    'tokenizer',
    ['get_tokenizer'],
)
get_tokenizer = imported_objects.get('get_tokenizer')
tokenizer = get_tokenizer(size=cfg.vocab_len)

# build the model and move it onto the configured device
model = Model(cfg)
model = model.to(cfg.device)

# compiling makes the model way more efficient; it is only attempted on GPU
# NOTE(review): this is an exact comparison against 'cuda'; a value such as 'cuda:0' would
# skip compilation -- confirm that matches what Model.device holds
on_cuda = model.device == 'cuda'
if on_cuda:
    # if this takes too long & you're debugging you can comment it out,
    # but def leave it on for full training runs
    model = torch.compile(model)

# print the number of parameters in the model (in millions), then the module tree
num_params_millions = model.get_num_params() / 1e6
print("\nnumber of parameters: %.2fM\n" % (num_params_millions,))
print(model)

ModelConfig(dim=128, device='cpu', out_weight_share=True, linear_bias=False, tokenizer='bpe_tinyStories', vocab_len=2048, num_layers=6, second_resid_norm=False, mlp_hidden_mult=4, mlp_nonlinearity='SiLU', mlp_gated=True, num_q_heads=4, num_kv_heads=1, head_dim=32, theta=10000, max_seq_len=512, scale_first_resid=True, norm_type='RMSNorm', norm_affine=True, norm_bias=True, eps=1e-06) 

 TrainConfig(model_name='templateGPT_1m', dataset_name='noanabeshima/TinyStoriesV2', data_subset=None, streaming=True, micro_batch_size=24, grad_accum_steps=1, max_iters=4, eval_interval=1, eval_samples=1, checkpoint_interval=None, beta1=0.9, beta2=0.95, epsilon=1e-08, weight_decay=0.05, grad_clip=1.0, lr_init=0.0001, lr_max=0.1, lr_min=0.01, warmup_iters=0, final_flat_iters=0, anneal_type='cos', num_restarts=0, T_mult=2)

number of parameters: 1.30M

Model(
  (token_embedder): Embedding(2048, 128)
  (layers): ModuleList(
    (0-5): 6 x Layer(
      (pre_attn_norm): Norm()
      (attn): SelfAttention(
    

In [3]:
from tools import get_data_loaders

# every loader setting is taken from the training config
loaders = get_data_loaders(
    tcfg.dataset_name,
    batch_size=tcfg.micro_batch_size,
    streaming=tcfg.streaming,
    subset_name=tcfg.data_subset,
)
train_data_loader, test_data_loader = loaders

In [4]:
from train import scheduler_lambda, get_optimizer, train

# the optimizer is built from the training config
optimizer = get_optimizer(model, tcfg)

# the learning rate schedule is a LambdaLR driven by scheduler_lambda
LambdaLR = torch.optim.lr_scheduler.LambdaLR
scheduler = LambdaLR(optimizer, lr_lambda=scheduler_lambda)

num decayed parameter tensors: 43, with 1,293,568 parameters
num non-decayed parameter tensors: 26, with 3,328 parameters
using fused AdamW: False


In [5]:
# set to True if you'd like to see a graph of the learning rate schedule
SHOW_LR_SCHEDULE = False

if SHOW_LR_SCHEDULE:
    import matplotlib.pyplot as plt

    # one scheduler_lambda value per training iteration
    lrs = list(map(scheduler_lambda, range(tcfg.max_iters)))

    # draw the schedule on a single 10x5 axes
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(lrs, label='Learning Rate')
    ax.set_title('Learning Rate Schedule')
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Learning Rate')
    ax.grid(True)
    ax.legend()
    plt.show()

# Training

In [None]:
# positional arguments for train(), in the order it expects them
train_args = (
    model,
    tokenizer,
    cfg,
    optimizer,
    scheduler,
    tcfg,
    train_data_loader,
    test_data_loader,
)
# optional extras that are left at their defaults here:
#   log_data: list = None   -> for picking up training from a checkpoint
#   detect_anomoly = False  -> use if you're getting crazy errors about the gradient being broken

# train() hands back the model, the optimizer and the log data
model, optimizer, log_data = train(*train_args)

step: 0000, time elapsed: 0.00s, tokens/s: 00000000, train loss: 7.6550, val loss: 7.6505, ppl: 2.10e+03, lr: 0.10000000, grad norm: 0.0000
step: 0001, time elapsed: 142.52s, tokens/s: 00000086, train loss: 7.8351, val loss: 7.9547, ppl: 2.85e+03, lr: 0.08681981, grad norm: 0.5202


# Inference test before you decide to save the model

In [7]:
from inference import generate

prompt = "Once upon a time"

# Generate in eval mode. The try/finally guarantees the model is switched back to training
# mode even if generate() raises, so a failed or interrupted generation cannot leave the
# model in eval mode for a later training run.
model.eval()
try:
    output = generate(
        prompt,
        model,
        tokenizer,
        # optional sampling settings, left at generate()'s defaults:
        #temperature = 0.9,
        #top_k = 32,
        #top_p = 0.9,
        #max_gen_len = 512,
        #memory_saver_div = 4,
    )
finally:
    model.train()

# show the first element of whatever generate() returned
print(output[0])

Once upon a time a a a a a the a. named time was aOne. a a big a a. was a a a a, little a was a a big was a boyOne a a a was little with a a was Tim. to was day He and,. a to the. They big. very the The and dog a was a,. little They he to know to wanted They of are for But ", They and The will the Sue a to. ran in It.. He. to the the She. and. help big see was and Max was They boy,,. a tried and her a all his mean the. are.ill was I. The F veryill She He. him her was, to, She was a very He even I went.' and help." a One he the her saw can. of with She have the he did He when po.
 day had you was.,,, of wasg., n to He Hev, and a. new to.


# Saving your final model
If `tcfg.checkpoint_interval` is not `None`, then checkpoints have already been saved during training.

You DO still need to run the cell below even if you have been saving checkpoints; the final state has not yet been saved.

In [8]:
# Save the final model state; this is still required even when checkpoints were written during training.
# `log_data` is the value returned by train() in the training cell, so that cell must have run first.
from tools import save_model
save_model(model, cfg, tcfg, log_data)