# Setup

In [1]:
# My virtual environments are rarely properly connected to Jupyter, so this cell
# puts the project-local venv's site-packages directory on sys.path.
# You won't need this cell, but running it won't hurt anything either.
import sys
import os

current_dir = os.getcwd()  # Get the current working directory
venv_dir = os.path.join(current_dir, 'venv')  # no './' prefix, so the joined path has no redundant '/./' segment
python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
# This is the POSIX venv layout (venv/lib/pythonX.Y/site-packages).
# NOTE(review): Windows venvs use venv\Lib\site-packages, which this path does not cover.
site_packages_path = os.path.join(venv_dir, 'lib', 'python' + python_version, 'site-packages')

# Only touch sys.path when the directory really exists, and never add it twice:
# re-running this cell would otherwise append a duplicate entry every time.
if os.path.isdir(site_packages_path) and site_packages_path not in sys.path:
    sys.path.append(site_packages_path)

# Instantiate a brand new model

In [2]:
# Project-local modules used by this cell.
from tokenizer import get_tokenizer
from config import ModelConfig, TrainConfig
from model import customGPT

# Tokenizer: `size` options are 95 (character-wise), 128, 256, 512, 1024, 2048 & 4096.
tokenizer = get_tokenizer(size=2048)

# Configs: both are built without overrides; the model's vocabulary length is
# then copied from the tokenizer so the two stay in agreement.
cfg = ModelConfig()
cfg.vocab_len = tokenizer.vocab_len
tcfg = TrainConfig()
print(cfg, '\n\n', tcfg)

# Build the model and move it to the device named in the model config.
model = customGPT(cfg).to(cfg.device)

# Report the number of parameters (in thousands), followed by the module tree.
num_params = sum(weight.numel() for weight in model.parameters())
print(num_params / 1e3, 'K parameters\n')
print(model)

ModelConfig(dim=64, vocab_len=2051, device='cpu', num_layers=4, second_resid_norm=False, mlp_hidden_mult=4, mlp_bias=False, mlp_nonlinearity='SiLU', mlp_gated=True, num_q_heads=16, num_kv_heads=4, head_dim=16, theta=10000, max_seq_len=256, scale_first_resid=True, norm_type='RMSNorm', norm_affine=True, norm_bias=True, eps=1e-06, max_batch_size=1) 

 TrainConfig(weight_decay=0.02, batch_size=32, max_iters=5, eval_interval=2, eval_samples=1, checkpoint_interval=None, lr_max=0.1, lr_min=1e-05, warmup_iters=0, final_flat_iters=0, anneal_type='cos', num_restarts=3, T_mult=2)
492.864 K parameters

customGPT(
  (token_embedder): Embedding(2051, 64)
  (layers): ModuleList(
    (0-3): 4 x ResidualLayer(
      (pre_attn_norm): Norm()
      (attn): MQSA(
        (Wq): Linear(in_features=64, out_features=256, bias=False)
        (Wk): Linear(in_features=64, out_features=64, bias=False)
        (Wv): Linear(in_features=64, out_features=64, bias=False)
        (Wo): Linear(in_features=256, out_featur

# Training

In [3]:
import torch
from tools import get_data_loader
from train import scheduler_lambda, train

# The optimizer's base learning rate is tcfg.lr_max; LambdaLR multiplies that
# base rate by the factor scheduler_lambda returns for the current step.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=tcfg.lr_max,
    weight_decay=tcfg.weight_decay,
)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=scheduler_lambda)

# One loader per dataset split, sharing the configured batch size.
# Note that the "test" loader reads the 'validation' split.
loader_kwargs = dict(batch_size=tcfg.batch_size)
train_data_loader = get_data_loader(split='train', **loader_kwargs)
test_data_loader = get_data_loader(split='validation', **loader_kwargs)

Found cached dataset json (/Users/tunadorable/.cache/huggingface/datasets/noanabeshima___json/noanabeshima--TinyStoriesV2-226173b7dd235c68/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
Found cached dataset json (/Users/tunadorable/.cache/huggingface/datasets/noanabeshima___json/noanabeshima--TinyStoriesV2-226173b7dd235c68/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


In [4]:
# Set this flag to True if you'd like to see a graph of the learning rate schedule.
show_lr_schedule = False

if show_lr_schedule:
    import matplotlib.pyplot as plt

    # Evaluate the schedule function once per training iteration.
    # NOTE(review): scheduler_lambda is passed to LambdaLR as lr_lambda, so these
    # values may be multipliers of the base lr rather than absolute rates — confirm.
    lrs = list(map(scheduler_lambda, range(tcfg.max_iters)))

    # Draw the schedule on an explicit figure/axes pair.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(lrs, label='Learning Rate')
    ax.set_title('Learning Rate Schedule')
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Learning Rate')
    ax.grid(True)
    ax.legend()
    plt.show()

In [5]:
# Run the training loop. train() hands back the model and optimizer together
# with log_data, which is passed to save_model() at the end of the notebook.
# All arguments are positional, so their order must match train()'s signature.
model, optimizer, log_data = train(
    model, 
    tokenizer, 
    cfg, 
    optimizer,
    scheduler,
    tcfg, 
    train_data_loader,
    test_data_loader,
    # Optional arguments, left commented out:
    #log_data: list = None, 
    #detect_anomoly = False # use if you're getting crazy errors about the gradient being broken
    # NOTE(review): "anomoly" is presumably the spelling used by train()'s keyword — confirm before "fixing" it.
)

 20%|██████                        | 1/5 [00:02<00:08,  2.11s/it]

step 0000: lr 0.010000, train loss 59.7845, val loss 59.6585, ppl 81163229730732260594810880, time elapsed: 1.25 seconds


 60%|██████████████████            | 3/5 [00:04<00:03,  1.61s/it]

step 0002: lr 0.001465, train loss 45.5886, val loss 44.9978, ppl 34857197010824462336, time elapsed: 3.94 seconds


100%|██████████████████████████████| 5/5 [00:07<00:00,  1.52s/it]

step 0004: lr 0.003087, train loss 23.0972, val loss 23.1613, ppl 11450800128, time elapsed: 6.68 seconds





# Inference test before you decide to save it
If `tcfg.checkpoint_interval` is not `None`, then checkpoints have already been saved.

In [6]:
from inference import generate

prompt = "Once upon a time"

# Generate in eval mode. The try/finally guarantees the model is switched back
# to training mode even if generate() raises; otherwise a failed generation
# would silently leave the model in eval mode for any later training cell.
model.eval()
try:
    output = generate(
        prompt,
        model,
        tokenizer,
        # Optional sampling arguments, left commented out:
        #max_gen_len = 512,
        temperature = 0.7,
        #memory_saver_div = 8,
        #top_p = 0.9,
        #top_k = 32,
    )
finally:
    model.train()
print(output)

Once upon a timeuse use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use use 

# Saving your final model
You DO still need to do this even if you have been saving checkpoints; the final state has not yet been saved.

In [7]:
from tools import save_model
# Save the final state: the trained model, both config objects, and the
# log_data returned by train(). Where and in what format this is written is
# decided inside tools.save_model, which is not visible from this notebook.
save_model(model, cfg, tcfg, log_data)