## Train openwebtext and shakespeare GPT2 models with either gelu or relu and layernorm or batchnorm and run inference on them
### For openwebtext, 4 heads and 4 transformer blocks and for shakespeare, half are used
### Tiktoken gpt2 Tokenization is used, currently gradient accumulation is unimplemented

## Based on the training and inference results, the overall loss for every shakespeare model trained with batchnorm was very low (under 1.0 at least), while the inference yielded poor results. Strangely however, the network did not seem to overfit as much with the checkpoints used for inference (eval loss was always under training loss)

In [1]:
import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
import qtransform
import torch
from brevitas import nn as qnn

In [2]:
# Manually load some logging conf
config_path = qtransform.get_module_config_path()
print(config_path)
import logging
import yaml

with open(os.path.join(config_path, 'hydra','job_logging', 'custom.yaml'), 'r') as stream:
    config = yaml.load(stream, Loader=yaml.FullLoader)

logging.config.dictConfig(config)
logging.getLogger().setLevel(logging.INFO)

/home/mabot004/eki-transformer-dev/qtransform/eki/lib/python3.10/site-packages/qtransform-0.0.2.dev0-py3.10.egg/qtransform/conf


### Train GPT2 with Shakespeare GELU BatchNorm, custom_ln is Identity layer
### Params similiar to nanoGPT (https://github.com/karpathy/nanoGPT/blob/master/config/train_shakespeare_char.py) except for gpt model params

In [8]:

args = [
        "run=train", 
        "model=gpt_2_h2l2e256b64_GeBN",
        "dataset=huggingface", 
        "dataset/tokenizer=tiktoken",
        "dataset.tokenizer.encoding=gpt2",
        "dataset.dataloader.batch_size=64",
        "dataset.name=tiny_shakespeare",
        "run.export=True",
        "run.epochs=100",
        "run.max_iters=5000",
        "run.eval_epoch_interval=1", 
        "run.eval_iters=200",
        "device=cuda"
    ]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 64}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 2, 'n_head': 2, 'n_embd': 256, 'dropout': 0.0, 'bias': True, 'block_size': 64, 'vocab_size': 50304, 'transformer_active_func': 'GELU', 'norm_layer': 'BatchNorm', 'flash':

2024-02-13 13:44:11.648799: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[ [36m2024-02-13 13:44:13,308 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNote: detected 128 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.[0m
[ [36m2024-02-13 13:44:13,313 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNote: NumExpr detected 128 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.[0m
[ [36m2024-02-13 13:44:13,316 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNumExpr defaulting to 8 threads.[0m
[ [36m2024-02-13 13:44:13,566 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mRunning Training[0m
[ [36m2024-02-13 13:44:13,573 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mtime is: 2024-02-13_13:44:13[0m
[ [36m2024-02-13 13:44:13,577 [0m][[2;37mqtransform[0m][[32mINFO[0m] - [32mDevice specified: cuda. Using device: cuda[0m
[ [36m2024-02-13 13:44:13,588 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mnumber of torch dataloader

KeyboardInterrupt: 

## Inference using karpathy's script
### To check if our script is faulty

In [9]:
"""
sample.py from nanoGPT, adjusted for our models
"""
import os
import pickle
from contextlib import nullcontext
import torch
import tiktoken
from qtransform.model.gpt import GPTConfig, GPT
import omegaconf

# -----------------------------------------------------------------------------
init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
start = "\n" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples = 10 # number of samples to draw
max_new_tokens = 500 # number of tokens generated in each sample
temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
seed = 1337
device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.



dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
compile = False # use PyTorch 2.0 to compile the model to be faster
#exec(open('configurator.py').read()) # overrides from command line or config file
# -----------------------------------------------------------------------------

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

def inference_karpathy(ckpt_path: str, start: str = "\n"):

    # model
    if init_from == 'resume':
        checkpoint = torch.load(ckpt_path, map_location=device)
        print(checkpoint.keys())
        gptconf = GPTConfig(**checkpoint['model_cfg']["args"])
        model = GPT(gptconf)
        state_dict = checkpoint['model_state_dict']
        unwanted_prefix = '_orig_mod.'
        for k,v in list(state_dict.items()):
            if k.startswith(unwanted_prefix):
                state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
        model.load_state_dict(state_dict)
    elif init_from.startswith('gpt2'):
        # init from a given GPT-2 model
        model = GPT.from_pretrained(init_from, dict(dropout=0.0))

    model.eval()
    model.to(device)
    if compile:
        model = torch.compile(model) # requires PyTorch 2.0 (optional)

    # look for the meta pickle in case it is available in the dataset folder
    load_meta = False
    if init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these...
        meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
        load_meta = os.path.exists(meta_path)
    if load_meta:
        print(f"Loading meta from {meta_path}...")
        with open(meta_path, 'rb') as f:
            meta = pickle.load(f)
        # TODO want to make this more general to arbitrary encoder/decoder schemes
        stoi, itos = meta['stoi'], meta['itos']
        encode = lambda s: [stoi[c] for c in s]
        decode = lambda l: ''.join([itos[i] for i in l])
    else:
        # ok let's assume gpt-2 encodings by default
        print("No meta.pkl found, assuming GPT-2 encodings...")
        enc = tiktoken.get_encoding("gpt2")
        encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
        decode = lambda l: enc.decode(l)

    # encode the beginning of the prompt
    if start.startswith('FILE:'):
        with open(start[5:], 'r', encoding='utf-8') as f:
            start = f.read()
    start_ids = encode(start)
    x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

    # run generation
    with torch.no_grad():
        with ctx:
            for k in range(num_samples):
                y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
                print(decode(y[0].tolist()))
                print('---------------')


### Write inference of Shakespeare GELU BatchNorm, custom_ln is Identity layer to file

In [12]:
CHECKPOINT_PATH="/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/outputs/models/GPT_2024-02-13_13:44:13__epoch:1"

args = [
        "run=infer",
        "run.from_checkpoint="+CHECKPOINT_PATH,
        "device=cuda", 
        "run.out_dir=out_infer",
        "run.num_samples=10", 
        "run.max_new_tokens=500",
        "run.temperature=0.8",
        "run.top_k=200",
        "run.start='\n'",
        "debug=True"
    ]
qtransform.notebook_run(args)


[ [36m2024-02-13 13:52:43,381 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 10, 'max_new_tokens': 500, 'temperature': 0.8, 'top_k': 200, 'start': '\n', 'out_dir': 'out_infer', 'onnx_model': {'path': None, 'tokenizer': {'name': 'tiktoken', 'encoding': 'gpt2', 'meta_path': None

In [20]:
test = torch.load("/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/outputs/models/GPT_2024-02-13_13:44:13__epoch:1")

[ [36m2024-02-14 08:44:40,317 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNote: detected 128 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.[0m
[ [36m2024-02-14 08:44:40,322 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNote: NumExpr detected 128 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.[0m
[ [36m2024-02-14 08:44:40,326 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNumExpr defaulting to 8 threads.[0m


## Load a shakespeare model from nanoGPT to check if inference script is faulty
#### Model had a loss of around 0.7 after training and predicted words that resembled shakespeare

In [7]:
CHECKPOINT_NANOGPT_PATH = "/home/mabot004/nanoGPT/out-shakespeare/ckpt.pt"
checkpoint = torch.load(CHECKPOINT_NANOGPT_PATH)

In [8]:
#model_cfg instead of model_args, model_state_dict instead of model
#no tokenizer config -> specify in hydra config
checkpoint.keys()

dict_keys(['model', 'optimizer', 'model_args', 'iter_num', 'best_val_loss', 'config'])

In [23]:
checkpoint["epoch"] = checkpoint["iter_num"]
checkpoint["model_state_dict"] = checkpoint["model"]
checkpoint["model_cfg"] = checkpoint["model_args"]
del checkpoint["iter_num"]
del checkpoint["model"]
del checkpoint["model_args"]
checkpoint["tokenizer_cfg"] = {'dtype': 'float32', 
                               'meta_file': 'meta.pkl', 
                               'wrapper': 'TikTokenizer', 
                               'encoding': 'gpt2', 
                               'module': 'tiktoken', 
                               'meta': {
                                   'max_token_value': 50256, 
                                   'encoding': 'gpt2', 
                                   'dtype': 'float32', 
                                   'num_tokens': 338027, 
                                   'module': 'tiktoken'
                                }
                              }
checkpoint["model_cfg"] = {
    "cls": "GPT",
    "calc_loss_in_model": True,
    "args": {
      "n_layer" : checkpoint["model_cfg"]["n_layer"],
      "n_head" : checkpoint["model_cfg"]["n_head"],
      "n_embd" : checkpoint["model_cfg"]["n_embd"],
      "dropout" : checkpoint["model_cfg"]["dropout"],
      "bias" :  checkpoint["model_cfg"]["bias"],
      "block_size" : checkpoint["model_cfg"]["block_size"],
      "vocab_size" : checkpoint["model_cfg"]["vocab_size"],
      "transformer_active_func": "GELU",
      "norm_layer": "LayerNorm",
      "flash": False 
    }}

KeyError: 'iter_num'

In [36]:
torch.save(checkpoint, "karpathy_shakespeare")

#### Since karpathy used a larger vocabulary than the tokenizer, some tokens could not be encoded
#### Even though karpathy's inference generated good sentences, ours does not

In [5]:
args = [
        "run=infer",
        "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/outputs/models/karpathy_shakespeare",
        "run.out_dir=out_infer",
        "run.num_samples=10", 
        "run.max_new_tokens=500",
        "run.temperature=0.8",
        "run.top_k=200",
        "run.start='\n'",
        "device=cuda",
        "debug=True"
    ]
qtransform.notebook_run(args)

[ [36m2024-02-14 09:17:24,190 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 10, 'max_new_tokens': 500, 'temperature': 0.8, 'top_k': 200, 'start': '\n', 'out_dir': 'out_infer', 'onnx_model': {'path': None, 'tokenizer': {'name': 'tiktoken', 'encoding': 'gpt2', 'meta_path': None

thread '<unnamed>' panicked at src/lib.rs:201:64:
no entry found for key
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace


PanicException: no entry found for key

#### Make our checkpoint compatible with karpathy's inference script and see if inference is bettereki-transformer-dev/shakespeare_owt_benchmarking/outputs/models/GPT_2024-02-13_13:44:13__epoch:1

In [35]:
SHAKESPEARE_QTRANSFORM_PATH = "/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/outputs/models/GPT_2024-02-13_13:44:13__epoch:1"
checkpoint_qtransform = torch.load(SHAKESPEARE_QTRANSFORM_PATH)

In [16]:
checkpoint_qtransform["model_cfg"]

{'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 2, 'n_head': 2, 'n_embd': 256, 'dropout': 0.0, 'bias': True, 'block_size': 64, 'vocab_size': 50256, 'transformer_active_func': 'GELU', 'norm_layer': 'BatchNorm', 'flash': False}}

In [36]:
checkpoint_qtransform["model_args"] = dict(checkpoint_qtransform["model_cfg"]["args"])
del checkpoint_qtransform["model_args"]["transformer_active_func"]
del checkpoint_qtransform["model_args"]["norm_layer"]
del checkpoint_qtransform["model_args"]["flash"]
checkpoint_qtransform["iter_num"] = checkpoint_qtransform["epoch"]
checkpoint_qtransform["optimizer"] = checkpoint_qtransform["optimizer_state_dict"]
checkpoint_qtransform["model"] = checkpoint_qtransform["model_state_dict"]
checkpoint_qtransform["best_val_loss"] = checkpoint_qtransform["metrics"]
checkpoint_qtransform["config"] = {
    'out_dir': 'out-shakespeare',
    'eval_interval': 250,
     'log_interval': 10,
     'eval_iters': 200,
     'eval_only': False,
     'always_save_checkpoint': False,
     'init_from': 'scratch',
     'wandb_log': False,
     'wandb_project': 'shakespeare',
     'wandb_run_name': 'mini-gpt',
     'dataset': 'shakespeare',
     'gradient_accumulation_steps': 1,
     'batch_size': 64,
     'block_size': 256,
     'n_layer': 2,
     'n_head': 2,
     'n_embd': 256,
     'dropout': 0.0,
     'bias': True,
     'learning_rate': 0.001,
     'max_iters': 5000,
     'weight_decay': 0.1,
     'beta1': 0.9,
     'beta2': 0.99,
     'grad_clip': 1.0,
     'decay_lr': True,
     'warmup_iters': 100,
     'lr_decay_iters': 5000,
     'min_lr': 0.0001,
     'backend': 'nccl',
     'device': 'cuda',
     'dtype': 'bfloat16',
     'compile': True}
del checkpoint_qtransform["model_state_dict"]
del checkpoint_qtransform["optimizer_state_dict"]
del checkpoint_qtransform["epoch"]
del checkpoint_qtransform["model_cfg"]
del checkpoint_qtransform["tokenizer_cfg"]
del checkpoint_qtransform["metrics"]
del checkpoint_qtransform["quant_cfg"]
del checkpoint_qtransform["quantized"]


In [37]:
checkpoint.keys()

dict_keys(['model', 'optimizer', 'model_args', 'iter_num', 'best_val_loss', 'config'])

In [38]:
checkpoint_qtransform.keys()

dict_keys(['model_args', 'iter_num', 'optimizer', 'model', 'best_val_loss', 'config'])

In [40]:
torch.save(checkpoint_qtransform, "/home/mabot004/nanoGPT/out-shakespeare/qtransform_shakespeare_karpathy.pt")

### Despite using karpathy's inference script, our model still generates nonsense sentences

### Train with karpathy's params for Shakespeare (https://github.com/karpathy/nanoGPT/blob/master/config/train_shakespeare_char.py)
#### Eval loss is lower than training loss, so it does not seem to overfit
#### Loss does not seem to lower significantly after the 3rd/4th epoch

In [3]:
#from: https://github.com/karpathy/nanoGPT/blob/master/config/train_shakespeare_char.py
#karpathy evaluates after 250 iterations, we implemented eval to do so after every epoch -> max_iters = 5000 / 200
eval_epoch_interval = 1 # keep frequent because we'll overfit
eval_iters = 200
max_iters = 250
epochs = 10 #eval after every epoch, karpathy has 5000 max_iters in total -> epoch = max_iters / eval_interval 
gradient_accumulation_steps = 1 #one large batch, potentially do gradient_accumulation_steps = 8 and batch_size = 8
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher

#not implemented currently
lr_decay_iters = 5000 # make equal to max_iters usually

#not used currently
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

args = [
        "run=train", 
        "model=gpt_2_h2l2e256b64_GeBN",
        "model.args.n_layer="+str(n_layer),
        "model.args.n_head="+str(n_head),
        "model.args.n_embd="+str(n_embd),
        "model.args.dropout="+str(dropout),
        "dataset=huggingface", 
        "dataset/tokenizer=tiktoken",
        "dataset.tokenizer.encoding=gpt2",
        "dataset.dataloader.batch_size="+str(batch_size),
        "dataset.name=tiny_shakespeare",
        "optim.args.learning_rate="+str(learning_rate),
        "run.export=True",
        "run.epochs="+str(epochs),
        "run.max_iters="+str(max_iters),
        "run.eval_epoch_interval=1", 
        "run.eval_iters="+str(eval_iters),
        "run.grad_clip=1.0",
        "device=cuda"
    ]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'bench': 0.3}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 64}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 6, 'n_head': 6, 'n_embd': 384, 'dropout': 0.2, 'bias': True, 'block_size': 64, 'vocab_size': 50304, 'transformer_active_func': 'GELU', 'norm_layer': 'BatchNorm', 'flash': False}}, 'qu

2024-02-20 13:11:43.581820: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[ [36m2024-02-20 13:11:45,001 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mRunning Training[0m
[ [36m2024-02-20 13:11:45,005 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mtime is: 2024-02-20_13:11:45[0m
[ [36m2024-02-20 13:11:45,008 [0m][[2;37mqtransform[0m][[32mINFO[0m] - [32mDevice specified: cuda. Using device: cuda[0m
[ [36m2024-02-20 13:11:45,014 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mnumber of torch dataloader: 2[0m
[ [36m2024-02-20 13:11:45,304 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mLoading dataset: tiny_shakespeare, with encoding: gpt2 and dtype: float32[0m
[ [36m2024-02-20 13:11:45,311 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mAttempting to retrieve tokenized dataset under "/home/mabot004/.qtransform/datasets/huggingface/tiny_shakespeare/tokenized/gpt2/tiny_shakespeare-float32.bin"[0m
[ [36m2024-02-20 13:11:45,315 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [

UnboundLocalError: local variable 'data' referenced before assignment

In [49]:
#check if custom_ln layers had their params back propagated
import re
ckpt_shakespeare = torch.load("/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-14_11:20:19__epoch:6")
#custom_ln are identity layers in this case
list(filter(lambda x: re.search(r'custom_ln[1-2]', x), ckpt_shakespeare["model_state_dict"].keys()))

[]

In [4]:
#run inference again, this time with karpathy's params
#generated file: /home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/out_infer/INFER_2024-02-14_11:29:32_CHECKPOINT.out
args = [
        "run=infer",
        #"run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-14_11:20:19__epoch:6",
        "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-20_09:03:43__epoch:3",
        "device=cuda", 
        "run.out_dir=out_infer",
        "run.num_samples=3", 
        "run.max_new_tokens=500",
        "run.temperature=0.8",
        "run.top_k=200",
        "run.start='\n'",
        "debug=True"
    ]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 3, 'max_new_tokens': 500, 'temperature': 0.8, 'top_k': 200, 'start': '\n', 'compile': False, 'out_dir': 'out_infer', 'onnx_model': {'path': None, 'tokenizer': {'name': 'tiktoken', 'encoding': 'gpt2', 'meta_path': None}}, 'from_checkpoint': '/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-20_0

### Run perplexity benchmark
#### Inference generates nonsense, perplexity very high

In [6]:


args = [ 
    "run=bench",
    "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-20_09:03:43__epoch:3",
    "run.num_samples=100",
    "dataset=huggingface",
    "dataset.name=tiny_shakespeare",
    "dataset/tokenizer=tiktoken",
    "dataset.tokenizer.encoding=gpt2",
    "+model.args.block_size=64",
    "dataset.sizes.bench=0.4"
]
qtransform.notebook_run(args)

[ [36m2024-02-20 09:23:50,763 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'test': 0.0, 'bench': 0.4}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 12}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False, 'args': {'block_size': 64}}, 'quantization': {'quantize': False}, 'pipe': '/dev/nu

### Use karpathy's inference script

In [10]:
inference_karpathy('/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-20_09:03:43__epoch:3')

dict_keys(['model_state_dict', 'optimizer_state_dict', 'epoch', 'model_cfg', 'tokenizer_cfg', 'metrics', 'quant_cfg', 'quantized'])
[ [36m2024-02-20 09:36:30,098 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mModel config: GPTConfig(block_size=64, vocab_size=50256, n_layer=6, n_head=6, n_embd=384, dropout=0.2, bias=True, flash=False, transformer_active_func='GELU', norm_layer='BatchNorm', single_output=False, custom_ln=False)[0m
[ [36m2024-02-20 09:36:31,222 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mnumber of parameters: 33.51M[0m
No meta.pkl found, assuming GPT-2 encodings...
















 for


















































 man hands













































 war


rea












 and














,



















































 be you
--










 lips










 for















:





































 though


 and



::
 of or: child

D all


 and














KeyboardInterrupt: 

### Load checkpoint on our own and forward pass it some data

In [20]:
CHECKPOINT_PATH = '/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-20_09:03:43__epoch:3'
checkpoint = torch.load(CHECKPOINT_PATH)
gptconf = GPTConfig(**checkpoint['model_cfg']["args"])
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = GPT(gptconf).to(device=device)
model.eval()
start = "\n"
from tiktoken import get_encoding
gpt2_encoding = get_encoding("gpt2")
input = torch.Tensor(gpt2_encoding.encode_ordinary(start)).to(device=device, dtype=torch.long).unsqueeze(dim=0)
logits = model(input)


[ [36m2024-02-20 09:43:16,617 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mModel config: GPTConfig(block_size=64, vocab_size=50256, n_layer=6, n_head=6, n_embd=384, dropout=0.2, bias=True, flash=False, transformer_active_func='GELU', norm_layer='BatchNorm', single_output=False, custom_ln=False)[0m
[ [36m2024-02-20 09:43:17,586 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mnumber of parameters: 33.51M[0m
torch.Size([1, 1])


RuntimeError: The shape of the 2D attn_mask is torch.Size([64, 64]), but should be (1, 1).

### Train with LayerNorm and check if inference is going better
#### The loss after the first epoch is way higher than with batchnorm (4.5 with layernorm compared to 1.7 with batchnorm)
#### Using layernorm for inference did not help either

In [50]:
args = [
        "run=train", 
        "model=gpt_2_h2l2e256b64_GeLN",
        "model.args.n_layer="+str(n_layer),
        "model.args.n_head="+str(n_head),
        "model.args.n_embd="+str(n_embd),
        "model.args.dropout="+str(dropout),
        "dataset=huggingface", 
        "dataset/tokenizer=tiktoken",
        "dataset.tokenizer.encoding=gpt2",
        "dataset.dataloader.batch_size="+str(batch_size),
        "dataset.name=tiny_shakespeare",
        "optim.args.learning_rate="+str(learning_rate),
        "run.export=True",
        "run.epochs="+str(epochs),
        "run.max_iters="+str(max_iters),
        "run.eval_epoch_interval=1", 
        "run.eval_iters="+str(eval_iters),
        "run.grad_clip=1.0",
        "device=cuda"
    ]
qtransform.notebook_run(args)

[ [36m2024-02-14 11:45:55,161 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 64}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 6, 'n_head': 6, 'n_embd': 384, 'dropout': 0.2, 'b

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f5dc88ad3f0>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1443, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/opt/conda/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
  File "/opt/conda/lib/python3.10/multiprocessing/connection.py", line 931, in wait
    ready = selector.select(timeout)
  File "/opt/conda/lib/python3.10/selectors.py", line 416, in select
    fd_event_list = self._selector.poll(timeout)
KeyboardInterrupt: 
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x

[ [36m2024-02-14 11:50:25,041 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mAVERAGE EVAL LOSS FOR EPOCH 4/200: 4.248138904571533[0m
[ [36m2024-02-14 11:50:25,044 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32m4.154836225509643[0m
[ [36m2024-02-14 11:50:25,648 [0m][[2;37mqtransform.utils.helper[0m][[32mINFO[0m] - [32mModel checkpoint saved to /home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-14_11:45:55__epoch:4[0m
[ [36m2024-02-14 11:50:25,651 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mEPOCH: 5/200[0m
[ [36m2024-02-14 11:50:25,875 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32m  batch 0 loss: 0.42351655960083007[0m
[ [36m2024-02-14 11:50:26,157 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32m  batch 10 loss: 4.157746601104736[0m
[ [36m2024-02-14 11:50:26,437 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32m  batch 20 loss: 4.187510585784912[0m
[ [36m2024-

KeyboardInterrupt: 

In [51]:
#layernorm did not help either
args = [
        "run=infer",
        "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-14_11:45:55__epoch:4",
        "device=cuda", 
        "run.out_dir=out_infer",
        "run.num_samples=3", 
        "run.max_new_tokens=500",
        "run.temperature=0.8",
        "run.top_k=200",
        "run.start='\n'",
        "debug=True"
    ]
qtransform.notebook_run(args)

[ [36m2024-02-14 11:51:24,747 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 3, 'max_new_tokens': 500, 'temperature': 0.8, 'top_k': 200, 'start': '\n', 'out_dir': 'None', 'onnx_model': {'path': None, 'tokenizer': {'name': 'tiktoken', 'encoding': 'gpt2', 'meta_path': None}}, 'f

## Test if inference changes when we use karpathy's script

#### Strangely, it complains that the padding tokens used in batchnorm arent on the gpu, but it did not complain in our script
#### The text also contains a large amount of newline tokens, but semantically the words make more sense than in our infer script

In [4]:
inference_karpathy('/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-14_11:20:19__epoch:6')

[ [36m2024-02-14 13:04:20,749 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNote: detected 128 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.[0m
[ [36m2024-02-14 13:04:20,753 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNote: NumExpr detected 128 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.[0m
[ [36m2024-02-14 13:04:20,757 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNumExpr defaulting to 8 threads.[0m
dict_keys(['model_state_dict', 'optimizer_state_dict', 'epoch', 'model_cfg', 'tokenizer_cfg', 'metrics', 'quant_cfg', 'quantized'])
[ [36m2024-02-14 13:04:21,465 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mModel config: GPTConfig(block_size=64, vocab_size=50256, n_layer=6, n_head=6, n_embd=384, dropout=0.2, bias=True, flash=False, transformer_active_func='GELU', norm_layer='BatchNorm', single_output=False, custom_ln=False)[0m
[ [36m2024-02-14 13:04:22,808 [0m]

In [56]:
chkpt = torch.load("/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-14_11:20:19__epoch:6")

In [60]:
chkpt.keys()

dict_keys(['model_state_dict', 'optimizer_state_dict', 'epoch', 'model_cfg', 'tokenizer_cfg', 'metrics', 'quant_cfg', 'quantized'])

#### Assuming that the model was only trained with a third of the dataset, it should have a much better inference if trained with the entire dataset

In [3]:
#generated checkpoint has a ridiculously high perplexity (50.000)
args = [
        "run=infer",
        "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-15_12:28:12__epoch:1",
        "device=cuda", 
        "run.out_dir=out_infer",
        "run.num_samples=3", 
        "run.max_new_tokens=500",
        "run.temperature=0.8",
        "run.top_k=200",
        "run.start='\n'",
        "debug=True"
    ]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 3, 'max_new_tokens': 500, 'temperature': 0.8, 'top_k': 200, 'start': '\n', 'compile': False, 'out_dir': 'out_infer', 'onnx_model': {'path': None, 'tokenizer': {'name': 'tiktoken', 'encoding': 'gpt2', 'meta_path': None}}, 'from_checkpoint': '/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-15_1

### Benchmark trained model which has a loss of 0.1 during training
#### Perplexity is about 50.000 which is ridiculously high

In [4]:
args = [ 
    "run=bench",
    "run.from_checkpoint=GPT_2024-02-15_12:28:12__epoch:1",
    "run.num_samples=100",
    "dataset=huggingface",
    "dataset.name=tiny_shakespeare",
    "dataset/tokenizer=tiktoken",
    "dataset.tokenizer.encoding=gpt2",
    "+model.args.block_size=64",
    "dataset.sizes.bench=0.4"
]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'test': 0.0, 'bench': 0.4}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 12}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False, 'args': {'block_size': 64}}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}}, 'run

### For some reason, karpathy's model has an even higher perplexity than ours while still generating better text

In [3]:
args = [ 
    "run=bench",
    "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/outputs/models/karpathy_shakespeare",
    "run.num_samples=100",
    "dataset=huggingface",
    "dataset.name=tiny_shakespeare",
    "dataset/tokenizer=tiktoken",
    "dataset.tokenizer.encoding=gpt2",
    "+model.args.block_size=64",
    "dataset.sizes.bench=0.4"
]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'test': 0.0, 'bench': 0.4}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 12}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False, 'args': {'block_size': 64}}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}}, 'run

### Manually load a checkpoint and feed it with some data to see what the actual perplexity is

In [19]:
#generated config from loading karpathy's shakespeare model for benchmarking
from qtransform.run import load_model, forward_pass, InferType
from qtransform.run.bench import measure_perplexity
from omegaconf import DictConfig
import torch.nn.functional as F
CHECKPOINT_KARPATHY = "/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/outputs/models/karpathy_shakespeare"
CHECKPOINT = "/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-15_12:28:12__epoch:1"
cfg = DictConfig({'data': {'dtype': 'float32'}, 
 'device': 'cuda', 'debug': False, 
 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 
             'module': 'huggingface', 
             'name': 'tiny_shakespeare', 
             'root_path': '~/.qtransform/datasets', 
             'dataset_dir': ['${dataset.root_path}','${dataset.module}', '${dataset.name}'], 
             'sizes': {'train': 0.3, 'eval': 0.05, 'test': 0.0, 'bench': 0.4}, 
             'tokenizer': {'dtype': '${data.dtype}', 
                           'meta_file': 'meta.pkl', 
                           'wrapper': 'TikTokenizer', 
                           'encoding': 'gpt2', 
                           'module': 'tiktoken'}, 
             'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 12}, 
             'type': 'huggingface', 
             'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 
 'seed': 1234567890, 
 'model': {'calc_loss_in_model': False, 
           'args': {'block_size': 64}}, 
 'quantization': {'quantize': False}, 
 'pipe': '/dev/null', 
 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}}, 
 'run': {'command': 'bench', 
         'el': 2, 
         'num_samples': 100, 
         'out_dir': '', 
         'checkpoint_dir': 'models', 
         'from_checkpoint': CHECKPOINT, 
         'onnx_model': {'path': None, 'tokenizer': {'name': 'tiktoken', 'encoding': 'gpt2', 'meta_path': None}}}})
from qtransform.dataset import get_data, get_loader, DatasetWrapper
data_wrapper: DatasetWrapper = get_data(cfg.dataset)

#dataset hydra config expects block size, currently set in command line. TODO: infer from onnx metadata or checkpoint metadata
data_wrapper.load_dataset()
dataset_bench = data_wrapper.dataset_info.bench
dataloader = get_loader(dataloader_cfg = cfg.dataset.dataloader, data = dataset_bench)
device = device=torch.device('cuda')
models = load_model(cfg, device=device)
model = models[0].model
model.eval()
tokenizer = models[0].tokenizer
inputs,labels = next(iter(dataloader))
inputs = inputs.to(device=device)
labels = labels.to(device=device)
logits = forward_pass(InferType.CHECKPOINT, model, inputs)
#print(measure_perplexity(logits, labels))



#### The model is quite uncertain the next token as the highest softmaxed prediction for the first word is 0.00008
#### That would explain the ludicruous perplexity

In [76]:
logits[0][0].sort().values[-10:]

tensor([0.0328, 0.0329, 0.0339, 0.0339, 0.0339, 0.0348, 0.0352, 0.0374, 0.0381,
        0.0975], device='cuda:0', grad_fn=<SliceBackward0>)

In [79]:
#step between the highest prediction (0.09) and second highest (0.03) is not that high when softmaxed
softmaxed = F.softmax(logits, dim=-1)
softmaxed[0][0].sort().values[-10:]

tensor([2.0560e-05, 2.0561e-05, 2.0582e-05, 2.0582e-05, 2.0583e-05, 2.0600e-05,
        2.0610e-05, 2.0656e-05, 2.0669e-05, 2.1933e-05], device='cuda:0',
       grad_fn=<SliceBackward0>)

### Manual implementation of cross entropy loss has way less perplexity than the one from pytorch
### that is due to softmaxing along the wrong dimension (dim=1 instead of dim=2)

In [44]:
#softmaxed cross entropy
block_size = logits.size()[1]
loss_softmaxed = torch.zeros(block_size).to(device=device)
for word in range(block_size):
    softmaxed_word = softmaxed[0][word]
    loss_softmaxed[word] = -1 * torch.log(softmaxed_word[labels[0][word]])
loss_softmaxed.mean()

tensor(10.8245, device='cuda:0', grad_fn=<MeanBackward0>)

In [None]:
logits[0][0]

In [25]:
#that is the loss when softmaxing along the block_size dimension (dim=1) instead of the vocab dimension (dim=2)
loss_softmaxed

tensor([4.1674, 4.1772, 4.1843, 4.1687, 4.1681, 4.1566, 4.1753, 4.1671, 4.1730,
        4.1766, 4.1443, 4.1577, 4.1743, 4.1747, 4.0866, 4.1656, 4.1658, 4.1785,
        4.1569, 4.1370, 4.1582, 4.1536, 4.1606, 4.1764, 4.0813, 4.1711, 4.1604,
        4.1514, 4.1851, 4.1588, 4.1673, 4.1773, 4.1634, 4.1400, 4.1794, 4.1625,
        4.1651, 4.1787, 4.1669, 4.1546, 4.1550, 4.1693, 4.0746, 4.1546, 4.1659,
        4.1807, 4.1696, 4.1445, 4.1617, 4.1490, 4.1536, 4.1835, 4.0742, 4.1566,
        4.1614, 4.1836, 4.1829, 4.1541, 4.1517, 4.1672, 4.1561, 4.1467, 4.1519,
        4.1597], device='cuda:0', grad_fn=<CopySlices>)

In [74]:
#non-softmaxed cross entropy
#logits need to be softmaxed in order to remove negative values
block_size = logits.size()[1]
loss = torch.zeros(block_size).to(device=device)
for word in range(block_size):
    logits_word = logits[0][word]
    loss[word] = -1 * torch.log(logits_word[labels[0][word]])
loss

tensor([0.6963, 0.4673,    nan, 0.1722, 2.2465, 2.5382, 1.1921, 0.8894,    nan,
           nan, 1.9743,    nan,    nan,    nan,    nan,    nan, 0.5171,    nan,
           nan, 3.8941,    nan,    nan,    nan,    nan,    nan, 1.6858, 0.7186,
        0.5291,    nan, 2.5193,    nan, 1.2689,    nan, 1.4735, 0.7614,    nan,
           nan, 0.8097, 1.6700, 3.3430, 3.8243,    nan,    nan,    nan, 0.5673,
           nan,    nan, 1.8583,    nan,    nan,    nan,    nan,    nan, 1.7914,
        0.7474, 0.5505,    nan, 1.8451,    nan,    nan, 1.4996,    nan, 2.8847,
           nan], device='cuda:0', grad_fn=<CopySlices>)

In [51]:
torch.exp(loss.mean())

tensor(57170.2422, device='cuda:0', grad_fn=<ExpBackward0>)

In [29]:
input = torch.randn(3, 5, requires_grad=True)
target = torch.randint(5, (3,), dtype=torch.int64)
loss = F.cross_entropy(input, target)
print(f'{input}\n\n{target}\n\n{loss}')

tensor([[-0.4150, -0.7571, -0.0677,  1.1603, -1.5602],
        [-0.5520,  0.4000,  1.0496,  1.3627,  1.4628],
        [ 0.6792, -0.5768,  0.1862,  0.8596,  0.3433]], requires_grad=True)

tensor([2, 0, 4])

2.1891212463378906


In [60]:
logits_viewed = logits.view(-1, logits.size(-1))
labels_viewed = labels.view(-1)
block_size = logits_viewed.size()[0]
loss_manual = torch.zeros(block_size).to(device=device)
input_softmaxed = F.softmax(logits_viewed, dim=1)
for word in range(block_size):
    softmaxed_word = input_softmaxed[word]
    input_softmaxed[word] = -1 * torch.log(softmaxed_word[labels_viewed[word]])
input_softmaxed.mean()

tensor(10.8225, device='cuda:0', grad_fn=<MeanBackward0>)

In [56]:
logits_viewed.size()

torch.Size([768, 50256])

# Compare ReLU and GELU

## Both GeLU and GELU reached a loss of around 1.0 after approx. 450 batches
## ReLU loss after 1000 batches: 0.10457251816987992, GeLU loss after 1000 batches: 0.0994626946747303

In [6]:

TRAIN_SIZE = 0.8
EVAL_SIZE = 0.2
args = [
        "run=train", 
        "model=gpt_2_h2l2e256b64_GeBN",
        "dataset=huggingface", 
        "dataset/tokenizer=tiktoken",
        "dataset.tokenizer.encoding=gpt2",
        "dataset.dataloader.batch_size=64",
        "dataset.name=tiny_shakespeare",
        "dataset.sizes.train=0.8",
        "dataset.sizes.eval=0.1",
        "run.export=True",
        "run.epochs=1",
        "run.max_iters=1000",
        "run.eval_epoch_interval=1", 
        "run.eval_iters=200",
        "run.gradient_accumulation_steps=1",
        "device=cuda"
    ]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.8, 'eval': 0.1, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 64}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 2, 'n_head': 2, 'n_embd': 256, 'dropout': 0.0, 'bias': True, 'block_size': 64, 'vocab_size': 50304, 'transformer_active_func': 'GELU', 'norm_layer': 'BatchNorm', 'flash': 

RuntimeError: 

### Test loss with ReLU

In [5]:
# GELU loss after 1000 batches: 0.10986531674861907
TRAIN_SIZE = 0.8
EVAL_SIZE = 0.2
args = [
        "run=train", 
        "model=gpt_2_h2l2e256b64_ReBN",
        "dataset=huggingface", 
        "dataset/tokenizer=tiktoken",
        "dataset.tokenizer.encoding=gpt2",
        "dataset.dataloader.batch_size=64",
        "dataset.name=tiny_shakespeare",
        "dataset.sizes.train=0.8",
        "dataset.sizes.eval=0.1",
        "run.export=True",
        "run.epochs=1",
        "run.max_iters=1000",
        "run.eval_epoch_interval=1", 
        "run.eval_iters=200",
        "run.gradient_accumulation_steps=1",
        "device=cuda"
    ]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.8, 'eval': 0.1, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 64}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 2, 'n_head': 2, 'n_embd': 256, 'dropout': 0.0, 'bias': True, 'block_size': 64, 'vocab_size': 50304, 'transformer_active_func': 'ReLU', 'norm_layer': 'BatchNorm', 'flash': 

2024-02-16 11:48:08.863247: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[ [36m2024-02-16 11:48:10,382 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mRunning Training[0m
[ [36m2024-02-16 11:48:10,390 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mtime is: 2024-02-16_11:48:10[0m
[ [36m2024-02-16 11:48:10,394 [0m][[2;37mqtransform[0m][[32mINFO[0m] - [32mDevice specified: cuda. Using device: cuda[0m
[ [36m2024-02-16 11:48:10,848 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mnumber of torch dataloader: 2[0m
[ [36m2024-02-16 11:48:10,869 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mLoading dataset: tiny_shakespeare, with encoding: gpt2 and dtype: float32[0m
[ [36m2024-02-16 11:48:10,875 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mAttempting to retrieve tokenized dataset under "/home/mabot004/.qtransform/datasets/huggingface/tiny_shakespeare/tokenized/gpt2/tiny_shakespeare-float32.bin"[0m
[ [36m2024-02-16 11:48:10,879 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [

RuntimeError: 

## LayerNorm, GELU with batch size of 64 and no gradient accumulation
### Model overfitted after the first epoch already, loss did not change much after the second epoch

In [3]:
TRAIN_SIZE = "0.8"
EVAL_SIZE = "0.2"
eval_epoch_interval = 1 # keep frequent because we'll overfit
eval_iters = 200
max_iters = 250
epochs = 10 #eval after every epoch, karpathy has 5000 max_iters in total -> epoch = max_iters / eval_interval 
gradient_accumulation_steps = 1 #one large batch, potentially do gradient_accumulation_steps = 8 and batch_size = 8
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher

#not implemented currently
lr_decay_iters = 5000 # make equal to max_iters usually

#not used currently
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

args = [
        "run=train", 
        "model=gpt_2_h2l2e256b64_GeLN",
        "model.args.n_layer="+str(n_layer),
        "model.args.n_head="+str(n_head),
        "model.args.n_embd="+str(n_embd),
        "model.args.dropout="+str(dropout),
        "dataset=huggingface", 
        "dataset/tokenizer=tiktoken",
        "dataset.tokenizer.encoding=gpt2",
        "dataset.dataloader.batch_size="+str(batch_size),
        "dataset.sizes.train="+TRAIN_SIZE,
        "dataset.sizes.eval="+EVAL_SIZE,
        "dataset.name=tiny_shakespeare",
        "optim.args.learning_rate="+str(learning_rate),
        "run.export=True",
        "run.epochs="+str(epochs),
        "run.max_iters="+str(max_iters),
        "run.eval_epoch_interval=1", 
        "run.eval_iters="+str(eval_iters),
        "run.grad_clip=1.0",
        "device=cuda"
    ]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.8, 'eval': 0.2, 'bench': 0.3}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 64}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 6, 'n_head': 6, 'n_embd': 384, 'dropout': 0.2, 'bias': True, 'block_size': 64, 'vocab_size': 50304, 'transformer_active_func': 'GELU', 'norm_layer': 'LayerNorm', 'flash': False}}, 'qua

2024-02-20 11:43:52.576049: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[ [36m2024-02-20 11:43:54,018 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mRunning Training[0m
[ [36m2024-02-20 11:43:54,022 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mtime is: 2024-02-20_11:43:54[0m
[ [36m2024-02-20 11:43:54,024 [0m][[2;37mqtransform[0m][[32mINFO[0m] - [32mDevice specified: cuda. Using device: cuda[0m
[ [36m2024-02-20 11:43:54,031 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mnumber of torch dataloader: 2[0m
[ [36m2024-02-20 11:43:57,584 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mLoading dataset: tiny_shakespeare, with encoding: gpt2 and dtype: float32[0m
[ [36m2024-02-20 11:43:57,593 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mAttempting to retrieve tokenized dataset under "/home/mabot004/.qtransform/datasets/huggingface/tiny_shakespeare/tokenized/gpt2/tiny_shakespeare-float32.bin"[0m
[ [36m2024-02-20 11:43:57,597 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [

KeyboardInterrupt: 

In [4]:
#inference generated nonsense
args = [
        "run=infer",
        "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-20_11:43:54__epoch:2",
        "run.out_dir=out_infer",
        "run.num_samples=10", 
        "run.max_new_tokens=500",
        "run.temperature=0.8",
        "run.top_k=200",
        "run.start='\n'",
        "device=cuda",
        "debug=True"
    ]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}, 'scheduler': {'decay_lr': True, 'schedulers': [{'name': 'StepLR', 'args': {'step_size': 1, 'gamma': 0.1}}], 'milestones': None, 'warmup_iters': 100, 'min_lr': 6e-05}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 10, 'max_new_tokens': 500, 'temperature': 0.8, 'top_k': 200, 'start': '\n', 'compile': False, 'out_dir': 'out_infer', 'onnx_model': {'path': None, 'tokenizer': {'name'

KeyboardInterrupt: 

In [5]:
args = [ 
    "run=bench",
    "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-20_11:43:54__epoch:2",
    "run.num_samples=100",
    "dataset=huggingface",
    "dataset.name=tiny_shakespeare",
    "dataset/tokenizer=tiktoken",
    "dataset.tokenizer.encoding=gpt2",
    "+model.args.block_size=64",
    "dataset.sizes.bench=0.4"
]
qtransform.notebook_run(args)

[ [36m2024-02-20 11:57:57,445 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'bench': 0.4}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 12}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False, 'args': {'block_size': 64}}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim':

### Make console cmd from args

In [5]:
def get_cmd_from_args(args: list[str]):
    return "python -m qtransform " + ' '.join(args)