In [2]:
# My virtual environments are rarely properly connected to Jupyter, so this cell
# puts the site-packages directory of a local ./venv on sys.path by hand.
# Running it shouldn't break anything for you: it only touches sys.path, and
# re-running the cell does not add the same entry a second time.
import sys
import os
current_dir = os.getcwd()  # the venv is looked up relative to the kernel's working directory
venv_dir = os.path.join(current_dir, 'venv')
python_version = f'{sys.version_info.major}.{sys.version_info.minor}'
if os.name == 'nt':
    # Windows venvs keep packages in venv\Lib\site-packages (no pythonX.Y level)
    site_packages_path = os.path.join(venv_dir, 'Lib', 'site-packages')
else:
    site_packages_path = os.path.join(venv_dir, 'lib', 'python' + python_version, 'site-packages')
# Guard against duplicates so that re-running the cell leaves sys.path unchanged
if site_packages_path not in sys.path:
    sys.path.append(site_packages_path)

In [3]:
# importing the model config
# (kept first so that the explicit imports below win over anything the star import brings in)
from config import *

# standard library: pickle reads the tokenizer file, json + asdict are used to save & load models
import json
import pickle
from dataclasses import asdict

# torch is used directly in this notebook, so import it explicitly
# instead of relying on `from config import *` to expose it
import torch

# the config
from config import ModelConfig

# importing N-GPT
from model import Model

# imports for the tokenizer
from tokenizer.tokenizer import BPE_Tokenizer

# does the actual inference
from inference import generate

# Load a Pretrained Model

In [5]:
model_name = 'N-GPT_2m'

# Deserialize the JSON file back to a dictionary
with open(f'models/{model_name}/model_config.json', 'r') as f:
    config_dict = json.load(f)

# Convert the dictionary back to a Config object, then point it at the best
# backend available on this machine: CUDA first, then Apple MPS, then CPU
cfg = ModelConfig(**config_dict)
cfg.device = ('cuda' if torch.cuda.is_available()
              else 'mps' if torch.backends.mps.is_available()
              else 'cpu')

# The tokenizer file name is derived from the model's vocabulary length minus 3
# (presumably 3 ids are reserved for special tokens -- TODO confirm against the tokenizer).
# NOTE(review): pickle.load can execute arbitrary code; only open tokenizer files you trust.
with open(f'tokenizer/models/{cfg.vocab_len - 3}.model', 'rb') as f:
    tokenizer_data = pickle.load(f)
tokenizer = BPE_Tokenizer(tokenizer_data['merges'])

# Initialize a blank model
model = Model(cfg).to(cfg.device)

# Load the saved state dictionary.
# The checkpoint is deserialized on the CPU and load_state_dict copies the values into
# the model's existing parameters. weights_only=True restricts unpickling to
# tensors and plain containers, which is all a state dict needs, instead of running
# the unrestricted (and unsafe) pickle loader.
path = f'models/{model_name}/model.pth'
model.load_state_dict(torch.load(path, map_location="cpu", weights_only=True))

# print the number of parameters in the model
print(f'{model.get_num_params()} parameters')

# If you only plan to do inference, switch to evaluation mode
# (as the last expression of the cell this also displays the module tree)
model.eval()

1844560 parameters


  model.load_state_dict(torch.load(path, map_location="cpu"))


Model(
  (precompute_freqs): PrecomputeRotaryFrequencies()
  (token_embedder): Embedding(2048, 128)
  (layers): ModuleList(
    (0-7): 8 x Layer(
      (attn): SelfAttention(
        (Wq): Linear(in_features=128, out_features=128, bias=False)
        (Wk): Linear(in_features=128, out_features=128, bias=False)
        (Wv): Linear(in_features=128, out_features=128, bias=False)
        (s_qk): Scale()
        (Wo): Linear(in_features=128, out_features=128, bias=False)
      )
      (alpha_A): Scale()
      (mlp): MLP(
        (Wup): Linear(in_features=128, out_features=341, bias=False)
        (Wgate): Linear(in_features=128, out_features=341, bias=False)
        (Wdown): Linear(in_features=341, out_features=128, bias=False)
        (s_u): Scale()
        (s_v): Scale()
      )
      (alpha_M): Scale()
    )
  )
  (output): Linear(in_features=128, out_features=2048, bias=False)
  (s_z): Scale()
  (criterion): CrossEntropyLoss()
)

# Inference

In [7]:
# Continue a Shakespeare-style prompt with the loaded model.
romeo_prompt = "JULIET:\nO Romeo, Romeo! wherefore art thou"
sampling_kwargs = {
    # surprisingly, a very low temperature is needed to get readable text out of this model
    'temperature': 0.01,
    'max_gen_len': 128,
}
output = generate(romeo_prompt, model, tokenizer, **sampling_kwargs)
print(output)

                                                                                  

JULIET:
O Romeo, Romeo! wherefore art thou art we will not in his parging.

QUEEN MARGARET:
My lord, I will be fot of this cuts,
For my father is evestes to thy barn,
And they have stow to our prace, that now
That he shall be starl,
For my lord to his bay'd in your vow,
But that we shall be star'd the penes; I have in a man?

KING RICHARD II:
The garg'd a man's crow, as my lord
The birp'ding in his parged




In [9]:
from collections import defaultdict
from tabulate import tabulate

# Dictionary to store per-layer statistics, grouped by scale-parameter name
params = defaultdict(list)
scale_names = ['alpha_A.s', 'alpha_M.s', 's_qk.s', 's_u.s', 's_v.s', 's_z.s']


def _layer_sort_key(entry):
    """Order rows by numeric layer index; non-numeric labels such as 'output' come last.

    Sorting on the plain string would be lexicographic and put layer '10' before '2'.
    """
    layer = str(entry['layer'])
    if layer.isdecimal():
        return (0, int(layer), '')
    return (1, 0, layer)


# Collect all parameters
for name, param in model.named_parameters():
    # Only keep parameters whose name ends with one of the target scale names
    if not name.endswith(tuple(scale_names)):
        continue

    parts = name.split('.')
    base_name = '.'.join(parts[-2:])  # e.g. 'alpha_A.s'
    # The layer index is the path component right after 'layers'; parameters that
    # do not live under 'layers' are labelled 'output'
    layer_num = parts[parts.index('layers') + 1] if 'layers' in parts else 'output'

    values = param.detach()  # statistics only, no autograd tracking needed
    params[base_name].append({
        'layer': layer_num,
        'shape': tuple(values.shape),
        'mean': torch.mean(values).item(),
        'std': torch.std(values).item(),
        'min': torch.min(values).item(),
        'max': torch.max(values).item()
    })

# Print results for each parameter type
for param_name in sorted(params.keys()):
    print(f"\n=== {param_name} Parameters ===")
    table_data = [[
        p['layer'],
        str(p['shape']),
        f"{p['mean']:.4f}",
        f"{p['std']:.4f}",
        f"{p['min']:.4f}",
        f"{p['max']:.4f}"
    ] for p in sorted(params[param_name], key=_layer_sort_key)]

    print(tabulate(
        table_data,
        headers=['Layer', 'Shape', 'Mean', 'Std', 'Min', 'Max'],
        tablefmt='simple',
        floatfmt='.4f'
    ))


=== alpha_A.s Parameters ===
  Layer  Shape       Mean     Std     Min     Max
-------  --------  ------  ------  ------  ------
      0  (1, 128)  0.1314  0.0335  0.0900  0.2482
      1  (1, 128)  0.1291  0.0399  0.0907  0.2531
      2  (1, 128)  0.1323  0.0391  0.0935  0.2985
      3  (1, 128)  0.1343  0.0418  0.0886  0.2879
      4  (1, 128)  0.1329  0.0419  0.0918  0.2612
      5  (1, 128)  0.1328  0.0439  0.0925  0.3031
      6  (1, 128)  0.1355  0.0407  0.0958  0.3079
      7  (1, 128)  0.1334  0.0382  0.0922  0.2873

=== alpha_M.s Parameters ===
  Layer  Shape       Mean     Std     Min     Max
-------  --------  ------  ------  ------  ------
      0  (1, 128)  0.1271  0.0310  0.0934  0.2501
      1  (1, 128)  0.1320  0.0434  0.0924  0.2908
      2  (1, 128)  0.1414  0.0499  0.0945  0.3001
      3  (1, 128)  0.1425  0.0445  0.0947  0.2871
      4  (1, 128)  0.1511  0.0504  0.0979  0.2980
      5  (1, 128)  0.1463  0.0464  0.0961  0.2909
      6  (1, 128)  0.1534  0.0533  0.095