# Setup

In [3]:
# Jupyter kernels are often not attached to the project's virtual environment,
# so this makes the packages installed in ./venv importable from the notebook.
# Running it is harmless when there is no ./venv directory.
import sys
import os

current_dir = os.getcwd()  # current working directory
venv_dir = os.path.join(current_dir, 'venv')
python_version = f'{sys.version_info.major}.{sys.version_info.minor}'
# NOTE: this is the POSIX venv layout (venv/lib/pythonX.Y/site-packages);
# Windows venvs keep their packages under venv\Lib\site-packages instead.
site_packages_path = os.path.join(venv_dir, 'lib', 'python' + python_version, 'site-packages')

# Register the directory only when it exists and only once, so re-running this
# cell does not keep growing sys.path with duplicate or dangling entries.
if os.path.isdir(site_packages_path) and site_packages_path not in sys.path:
    sys.path.append(site_packages_path)

In [4]:
# importing the model config
from config import *

# importing N-GPT
from model import cosine_norm, Model

# imports for the tokenizer
import pickle
from tokenizer.tokenizer import BPE_Tokenizer

# used in the training loop
import time
import math

# used to save & load models
import json
from dataclasses import asdict

# Instantiate a brand new model

In [6]:
# Build both configs and echo them so the hyperparameters of this run are recorded in the output.
cfg = ModelConfig()
print(cfg)
tcfg = TrainConfig()
print(tcfg)

# The tokenizer file is looked up by `cfg.vocab_len - 3`
# (size options are 512, 1024 and 2048).
# NOTE(review): pickle can execute arbitrary code while loading -- only open tokenizer files you trust.
tokenizer_path = f'tokenizer/models/{cfg.vocab_len - 3}.model'
with open(tokenizer_path, 'rb') as f:
    tokenizer_data = pickle.load(f)

# The tokenizer is rebuilt from the stored 'merges' entry.
merges = tokenizer_data['merges']
tokenizer = BPE_Tokenizer(merges)

ModelConfig(dim=128, device=None, max_seq_len=384, theta=10000, vocab_len=2048, num_layers=8, num_heads=4, mlp_hidden_mult=4)
TrainConfig(model_name='N-GPT_2m', micro_batch_size=4, grad_accum_steps=16, max_iters=1000, eval_interval=100, beta1=0.9, beta2=0.95, epsilon=1e-08, lr_init=0.0005, lr_final=1e-08)


In [7]:
# Build the network from the model config, then move it to the configured device.
model = Model(cfg)
model = model.to(cfg.device)

# Report the parameter count first, followed by the full module tree.
num_params = model.get_num_params()
print(f'{num_params} parameters')
print(model)

1844560 parameters
Model(
  (precompute_freqs): PrecomputeRotaryFrequencies()
  (token_embedder): Embedding(2048, 128)
  (layers): ModuleList(
    (0-7): 8 x Layer(
      (attn): SelfAttention(
        (Wq): Linear(in_features=128, out_features=128, bias=False)
        (Wk): Linear(in_features=128, out_features=128, bias=False)
        (Wv): Linear(in_features=128, out_features=128, bias=False)
        (s_qk): Scale()
        (Wo): Linear(in_features=128, out_features=128, bias=False)
      )
      (alpha_A): Scale()
      (mlp): MLP(
        (Wup): Linear(in_features=128, out_features=341, bias=False)
        (Wgate): Linear(in_features=128, out_features=341, bias=False)
        (Wdown): Linear(in_features=341, out_features=128, bias=False)
        (s_u): Scale()
        (s_v): Scale()
      )
      (alpha_M): Scale()
    )
  )
  (output): Linear(in_features=128, out_features=2048, bias=False)
  (s_z): Scale()
  (criterion): CrossEntropyLoss()
)


# Training

In [9]:
# Read the whole corpus into memory as one string.
with open('input.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

# Preview the first 200 characters. The file is one continuous text document
# with all of the works of Shakespeare back-to-back.
print(text[:200])

# Tokenize the full text once and keep the token ids as a long tensor.
token_ids = tokenizer.encode(text)
data = torch.tensor(token_ids, dtype=torch.long)

# Train / validation split: the first 90% of the tokens are used for training,
# the remaining tail for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [10]:
# data loading for training: draws a random mini-batch of inputs x and next-token targets y
def get_batch(split, batch_size):
    """Sample `batch_size` random windows of `cfg.max_seq_len` tokens.

    `split == 'train'` reads from `train_data`; any other value reads from `val_data`.
    Returns `(x, y)`, both moved to `cfg.device`, where each row of `y` is the
    matching row of `x` shifted one position forward in the data.
    """
    source = val_data if split != 'train' else train_data
    seq_len = cfg.max_seq_len

    # one random start offset per sequence; the upper bound leaves room for the shifted target window
    starts = torch.randint(len(source) - seq_len, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + seq_len])
        targets.append(source[start + 1:start + seq_len + 1])

    x = torch.stack(inputs).to(cfg.device)
    y = torch.stack(targets).to(cfg.device)
    return x, y

In [11]:
@torch.no_grad()
def estimate_loss(model, batch_size, eval_iters = 3): # to estimate loss during the training loop
    """Estimate the mean loss on the train and validation splits.

    Args:
        model: module called as `model(X, target_token_ids=Y)`, returning `(logits, loss)`.
        batch_size: number of sequences drawn per evaluation batch.
        eval_iters: number of random batches averaged for each split.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.

    The model is switched to eval mode for the measurement and afterwards put
    back into whichever mode (train or eval) it was in when this was called,
    even if evaluation raises.
    """
    was_training = model.training  # remember the caller's mode so it can be restored
    out = {}
    model.eval() # sets model to eval mode
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split, batch_size)
                _, loss = model(X, target_token_ids=Y)  # logits are not needed here
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # restore the previous mode instead of unconditionally forcing train mode
        model.train(was_training)
    return out

In [12]:
# Optimizer
# Weight decay is disabled (0.0): no weight decay to keep vectors on the unit hypersphere.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=tcfg.lr_init,
    weight_decay=0.0,
)

# Learning rate schedule without warmup
def lr_lambda(current_iter):
    """Return the learning-rate multiplier (relative to `tcfg.lr_init`) for step `current_iter`.

    The multiplier follows half a cosine wave from 1 at step 0 down towards 0 at
    `tcfg.max_iters`, and is never allowed below `tcfg.lr_final / tcfg.lr_init`.
    Steps beyond `tcfg.max_iters` are clamped, so the rate stays at its floor
    instead of following the cosine back up (e.g. when the training cell is
    re-run without rebuilding the scheduler).
    """
    # Cosine decay phase only; clamp so progress never exceeds the end of the schedule
    clamped_iter = min(current_iter, tcfg.max_iters)
    cosine_decay = 0.5 * (1 + math.cos(math.pi * clamped_iter / tcfg.max_iters))
    return max(cosine_decay, tcfg.lr_final / tcfg.lr_init)
        
# Scheduler using cosine decay: LambdaLR scales the optimizer's initial lr by lr_lambda(step)
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=lr_lambda,
)

In [13]:
start_time = time.time()
model.train()

# The loop variable is called `step` (not `iter`) so the builtin iter() is not
# shadowed for the rest of the notebook session.
for step in range(tcfg.max_iters):
    # every once in a while (and on the very last step) evaluate the loss on train and val sets
    if step % tcfg.eval_interval == 0 or step == tcfg.max_iters - 1:
        elapsed_time = time.time() - start_time
        losses = estimate_loss(model, tcfg.micro_batch_size)
        current_lr = optimizer.param_groups[0]['lr']
        print(f"step {step:04d}: lr {current_lr:.6f}, train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, time elapsed: {elapsed_time:.2f} seconds")

    # setup for training
    optimizer.zero_grad()
    # running sum of the scaled micro-batch losses, i.e. the mean loss of this step;
    # the loop itself does not consume it, it is only kept for inspection
    loss_accum = 0.0

    # we can simulate a larger batch size by accumulating gradients over many micro batches
    for micro_step in range(tcfg.grad_accum_steps):
        # sample a batch of data
        xb, yb = get_batch('train', tcfg.micro_batch_size)

        # forward pass; only the loss is used for training
        logits, loss = model(input_token_ids = xb, target_token_ids = yb)

        # scale by the number of micro batches so the accumulated gradient is an average
        loss = loss / tcfg.grad_accum_steps
        # adding the micro batch's loss to the total loss
        loss_accum += loss.detach()
        loss.backward()

    # update the parameters
    optimizer.step()

    # Apply cosine normalization & absolute value constraints after optimization step
    model.enforce_constraints()

    # Update the learning rate
    scheduler.step()

step 0000: lr 0.000500, train loss 39.0102, val loss 38.8883, time elapsed: 0.00 seconds
step 0100: lr 0.000488, train loss 7.0910, val loss 7.0731, time elapsed: 164.32 seconds
step 0200: lr 0.000452, train loss 6.5016, val loss 6.5262, time elapsed: 324.85 seconds
step 0300: lr 0.000397, train loss 6.1771, val loss 6.2195, time elapsed: 524.57 seconds
step 0400: lr 0.000327, train loss 5.9875, val loss 6.0033, time elapsed: 800.14 seconds
step 0500: lr 0.000250, train loss 5.8212, val loss 5.8767, time elapsed: 1073.34 seconds
step 0600: lr 0.000173, train loss 5.7313, val loss 5.8072, time elapsed: 1345.06 seconds
step 0700: lr 0.000103, train loss 5.6668, val loss 5.7730, time elapsed: 1617.06 seconds
step 0800: lr 0.000048, train loss 5.5971, val loss 5.7359, time elapsed: 1889.12 seconds
step 0900: lr 0.000012, train loss 5.6274, val loss 5.6946, time elapsed: 2161.36 seconds
step 0999: lr 0.000000, train loss 5.6317, val loss 5.7475, time elapsed: 2437.86 seconds


In [14]:
# Sanity checks on the first layer after training, printed in this order:
#   1. alpha_A scale values   -- checking that the absolute value-ing worked
#   2. row norms of mlp.Wup   -- checking that the cosine normalization worked
#   3. row norms of the token embedding table -- same check
first_layer = model.layers[0]
checks = (
    first_layer.alpha_A.s.data,
    first_layer.mlp.Wup.weight.norm(dim=1),
    model.token_embedder.weight.norm(dim=1),
)
for tensor in checks:
    print(tensor)

tensor([[0.1167, 0.1951, 0.1287, 0.1083, 0.1109, 0.1502, 0.1045, 0.1445, 0.1062,
         0.1298, 0.1070, 0.1279, 0.1209, 0.1164, 0.1321, 0.1170, 0.0900, 0.1096,
         0.1366, 0.1098, 0.1176, 0.1111, 0.1199, 0.1226, 0.1073, 0.1137, 0.1615,
         0.2416, 0.1228, 0.1286, 0.1177, 0.1306, 0.1100, 0.1014, 0.2246, 0.1477,
         0.1544, 0.1089, 0.1097, 0.1163, 0.2003, 0.1139, 0.2482, 0.1566, 0.1577,
         0.1065, 0.0985, 0.1066, 0.1257, 0.2109, 0.1540, 0.1266, 0.0994, 0.1061,
         0.1442, 0.1269, 0.1469, 0.0940, 0.1620, 0.1511, 0.1205, 0.1216, 0.2291,
         0.1227, 0.1070, 0.1037, 0.1172, 0.1163, 0.2006, 0.1323, 0.1226, 0.1115,
         0.1220, 0.1071, 0.1381, 0.1233, 0.1568, 0.0963, 0.2137, 0.0963, 0.1283,
         0.1736, 0.1019, 0.1969, 0.1122, 0.1587, 0.1104, 0.1092, 0.1429, 0.1119,
         0.1074, 0.1081, 0.1091, 0.1165, 0.1104, 0.1822, 0.1078, 0.1122, 0.1100,
         0.1046, 0.1160, 0.2186, 0.1992, 0.1038, 0.1265, 0.1115, 0.1284, 0.1844,
         0.1335, 0.0992, 0.1

# Inference test before you decide to save it

In [16]:
from inference import generate

# Prompt the freshly trained model with the start of a well-known line.
prompt = "JULIET:\nO Romeo, Romeo! wherefore art thou"

# Oddly, a very low temperature seems to be required here.
output = generate(
    prompt,
    model,
    tokenizer,
    temperature=0.01,
    max_gen_len=128,
)
print(output)

                                                                                  

JULIET:
O Romeo, Romeo! wherefore art thou art not in your parged.

KING RICHARD II:
I will be feton, that we will be damn'd,
For my lord, that now feton to our bavour.

GLOUCESTER:
My lord to the barget with his prages,
I will be davestes, I will be feton of your parging.

QUEEN MARGARET:
I will be davour'd the pavour,
That now in your got'd a bay'd in his birps.

KING RICHARD II:
Tavest of




# Saving your model

In [18]:
# Everything for this run is stored under models/<model_name>/.
save_dir = f'models/{tcfg.model_name}'
os.makedirs(save_dir, exist_ok=True)

# saving model weights (state dict only)
torch.save(model.state_dict(), f'{save_dir}/model.pth')

# saving configs: each dataclass is converted to a plain dict and written as JSON
for filename, config in (('model_config.json', cfg), ('train_config.json', tcfg)):
    with open(f'{save_dir}/{filename}', 'w') as f:
        json.dump(asdict(config), f)

print(f'model successfully saved to {save_dir}/')

model successfully saved to models/N-GPT_2m/
