In [2]:
# my virtual environments are rarely properly connected to jupyter so this fixes that
# running it shouldn't break anything for u
import sys
import os
current_dir = os.getcwd()  # Get the current working directory
venv_dir = os.path.join(current_dir, 'venv') 
python_version = str(sys.version_info.major) + '.' + str(sys.version_info.minor)
site_packages_path = os.path.join(venv_dir, 'lib', 'python' + python_version, 'site-packages')
sys.path.append(site_packages_path) 

In [4]:
# importing the model config
from config import *

# importing N-GPT
from model import Model

# the config
from config import ModelConfig

# imports for the tokenizer
import pickle
from tokenizer.tokenizer import BPE_Tokenizer

# does the actual inference
from inference import generate

# used to save & load models
import json
from dataclasses import asdict

# Load a Pretrained Model

In [7]:
model_name = 'N-GPT_2m'

# Deserialize the JSON file back to a dictionary
with open(f'models/{model_name}/model_config.json', 'r') as f:
    config_dict = json.load(f)

# Convert the dictionary back to a Config object
cfg = ModelConfig(**config_dict)
cfg.device = ('cuda' if torch.cuda.is_available() 
              else 'mps' if torch.backends.mps.is_available() 
              else 'cpu')

with open(f'tokenizer/models/{cfg.vocab_len - 3}.model', 'rb') as f:
        tokenizer_data = pickle.load(f)
tokenizer = BPE_Tokenizer(tokenizer_data['merges']) 

# Initialize a blank model
model = Model(cfg).to(cfg.device) 

# Load the saved state dictionary
path = f'models/{model_name}/model.pth'
model.load_state_dict(torch.load(path, map_location="cpu"))

# print the number of parameters in the model
print(f'{model.get_num_params()} parameters')

# If you only plan to do inference, switch to evaluation mode
model.eval()

2042106 parameters


  model.load_state_dict(torch.load(path, map_location="cpu"))


Model(
  (precompute_freqs): PrecomputeRotaryFrequencies()
  (token_embedder): Embedding(2048, 128)
  (layers): ModuleList(
    (0-8): 9 x Layer(
      (attn): SelfAttention(
        (Wq): Linear(in_features=128, out_features=128, bias=False)
        (Wk): Linear(in_features=128, out_features=128, bias=False)
        (Wv): Linear(in_features=128, out_features=128, bias=False)
        (Wo): Linear(in_features=128, out_features=128, bias=False)
      )
      (mlp): MLP(
        (Wup): Linear(in_features=128, out_features=341, bias=False)
        (Wgate): Linear(in_features=128, out_features=341, bias=False)
        (Wdown): Linear(in_features=341, out_features=128, bias=False)
      )
    )
  )
  (output): Linear(in_features=128, out_features=2048, bias=False)
  (criterion): CrossEntropyLoss()
)

# Inference

In [17]:
output = generate(
    "JULIET:\nO Romeo, Romeo! wherefore art thou", 
    model, 
    tokenizer, 
    temperature=0.01, # really weird that we've gotta use a pretty damn low temperature
    max_gen_len = 128
)
print(output)

                                                                                  

JULIET:
O Romeo, Romeo! wherefore art thou hast hurdlest
And hurf at his vowly stared to the man.

DUKE VINCENTIO:
You had all a man! 'tis stir of mine.

First Gentleman:
What is our brother! what thou hast well, sir!

DUKE VINCENTIO:
Then is the vengel of mine man; sir?

DUKE VINCENTIO:
O to thy honour, sir! 'tis so o'er?

KING RICHARD III:
O to the vengelly in his man!

ISABELLA:
Then is you, when thou art well to be hurdow.

ROMEO:
Your honour were towic




In [10]:
from collections import defaultdict
from tabulate import tabulate

# Dictionary to store parameters grouped by name
params = defaultdict(list)
scale_names = ['a_A.s', 'a_M.s', 's_qk.s', 's_u.s', 's_v.s', 's_z.s']

# Collect all parameters
for name, param in model.named_parameters():
    # Check if the parameter name ends with any of our target scale names
    if any(name.endswith(scale_name) for scale_name in scale_names):
        base_name = name.split('.')[-2] + '.' + name.split('.')[-1]  # Get the base parameter name
        layer_num = name.split('.')[1] if 'layers' in name else 'output'
        
        params[base_name].append({
            'layer': layer_num,
            'shape': tuple(param.shape),
            'mean': torch.mean(param).item(),
            'std': torch.std(param).item(),
            'min': torch.min(param).item(),
            'max': torch.max(param).item()
        })

# Print results for each parameter type
for param_name in sorted(params.keys()):
    print(f"\n=== {param_name} Parameters ===")
    table_data = [[
        p['layer'],
        str(p['shape']),
        f"{p['mean']:.4f}",
        f"{p['std']:.4f}",
        f"{p['min']:.4f}",
        f"{p['max']:.4f}"
    ] for p in sorted(params[param_name], key=lambda x: str(x['layer']))]
    
    print(tabulate(
        table_data,
        headers=['Layer', 'Shape', 'Mean', 'Std', 'Min', 'Max'],
        tablefmt='simple',
        floatfmt='.4f'
    ))


=== a_A Parameters ===
  #  Shape      Mean     Std
---  -------  ------  ------
  1  (128,)   0.0865  0.0208
  2  (128,)   0.1271  0.0533
  3  (128,)   0.1219  0.0382
  4  (128,)   0.1242  0.0436
  5  (128,)   0.1300  0.0481
  6  (128,)   0.1226  0.0308
  7  (128,)   0.1418  0.0579
  8  (128,)   0.1408  0.0554
  9  (128,)   0.1450  0.0518

=== a_M Parameters ===
  #  Shape      Mean     Std
---  -------  ------  ------
  1  (128,)   0.1422  0.0579
  2  (128,)   0.1437  0.0586
  3  (128,)   0.1453  0.0657
  4  (128,)   0.1548  0.0723
  5  (128,)   0.1507  0.0679
  6  (128,)   0.1578  0.0691
  7  (128,)   0.1711  0.0730
  8  (128,)   0.1835  0.0705
  9  (128,)   0.2169  0.0900

=== s_qk Parameters ===
  #  Shape      Mean     Std
---  -------  ------  ------
  1  (4, 32)  0.1353  0.0170
  2  (4, 32)  0.1214  0.0162
  3  (4, 32)  0.1287  0.0189
  4  (4, 32)  0.1176  0.0208
  5  (4, 32)  0.1230  0.0219
  6  (4, 32)  0.1250  0.0241
  7  (4, 32)  0.1253  0.0278
  8  (4, 32)  0.1186  0.0236