## Train openwebtext and shakespeare GPT2 models with either gelu or relu and layernorm or batchnorm and run inference on them
### For openwebtext, 4 heads and 4 transformer blocks and for shakespeare, half are used
### Tiktoken gpt2 Tokenization is used, we do not currently have gradient accumulation implemented

In [1]:
import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
import qtransform
import torch
from brevitas import nn as qnn

In [2]:
# Manually load some logging conf
config_path = qtransform.get_module_config_path()
print(config_path)
import logging
import yaml

with open(os.path.join(config_path, 'hydra','job_logging', 'custom.yaml'), 'r') as stream:
    config = yaml.load(stream, Loader=yaml.FullLoader)

logging.config.dictConfig(config)
logging.getLogger().setLevel(logging.INFO)

/home/mabot004/eki-transformer-dev/qtransform/eki/lib/python3.10/site-packages/qtransform-0.0.2.dev0-py3.10.egg/qtransform/conf


### Train GPT2 with Shakespeare GELU BatchNorm, custom_ln is Identity layer
### Params similiar to nanoGPT (https://github.com/karpathy/nanoGPT/blob/master/config/train_shakespeare_char.py) except for gpt model params

In [8]:

args = [
        "run=train", 
        "model=gpt_2_h2l2e256b64_GeBN",
        "dataset=huggingface", 
        "dataset/tokenizer=tiktoken",
        "dataset.tokenizer.encoding=gpt2",
        "dataset.dataloader.batch_size=64",
        "dataset.name=tiny_shakespeare",
        "run.export=True",
        "run.epochs=100",
        "run.max_iters=5000",
        "run.eval_epoch_interval=1", 
        "run.eval_iters=200",
        "device=cuda"
    ]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 64}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 2, 'n_head': 2, 'n_embd': 256, 'dropout': 0.0, 'bias': True, 'block_size': 64, 'vocab_size': 50304, 'transformer_active_func': 'GELU', 'norm_layer': 'BatchNorm', 'flash':

2024-02-13 13:44:11.648799: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[ [36m2024-02-13 13:44:13,308 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNote: detected 128 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.[0m
[ [36m2024-02-13 13:44:13,313 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNote: NumExpr detected 128 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.[0m
[ [36m2024-02-13 13:44:13,316 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNumExpr defaulting to 8 threads.[0m
[ [36m2024-02-13 13:44:13,566 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mRunning Training[0m
[ [36m2024-02-13 13:44:13,573 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mtime is: 2024-02-13_13:44:13[0m
[ [36m2024-02-13 13:44:13,577 [0m][[2;37mqtransform[0m][[32mINFO[0m] - [32mDevice specified: cuda. Using device: cuda[0m
[ [36m2024-02-13 13:44:13,588 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mnumber of torch dataloader

KeyboardInterrupt: 

### Write inference of Shakespeare GELU BatchNorm, custom_ln is Identity layer to file

In [12]:
CHECKPOINT_PATH="/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/outputs/models/GPT_2024-02-13_13:44:13__epoch:1"

args = [
        "run=infer",
        "run.from_checkpoint="+CHECKPOINT_PATH,
        "device=cuda", 
        "run.out_dir=out_infer",
        "run.num_samples=10", 
        "run.max_new_tokens=500",
        "run.temperature=0.8",
        "run.top_k=200",
        "run.start='\n'",
        "debug=True"
    ]
qtransform.notebook_run(args)


[ [36m2024-02-13 13:52:43,381 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 10, 'max_new_tokens': 500, 'temperature': 0.8, 'top_k': 200, 'start': '\n', 'out_dir': 'out_infer', 'onnx_model': {'path': None, 'tokenizer': {'name': 'tiktoken', 'encoding': 'gpt2', 'meta_path': None

In [20]:
test = torch.load("/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/outputs/models/GPT_2024-02-13_13:44:13__epoch:1")

[ [36m2024-02-14 08:44:40,317 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNote: detected 128 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.[0m
[ [36m2024-02-14 08:44:40,322 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNote: NumExpr detected 128 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.[0m
[ [36m2024-02-14 08:44:40,326 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNumExpr defaulting to 8 threads.[0m


## Load a shakespeare model from nanoGPT to check if inference script is faulty
#### Model had a loss of around 0.7 after training and predicted words that resembled shakespeare

In [7]:
CHECKPOINT_NANOGPT_PATH = "/home/mabot004/nanoGPT/out-shakespeare/ckpt.pt"
checkpoint = torch.load(CHECKPOINT_NANOGPT_PATH)

In [8]:
#model_cfg instead of model_args, model_state_dict instead of model
#no tokenizer config -> specify in hydra config
checkpoint.keys()

dict_keys(['model', 'optimizer', 'model_args', 'iter_num', 'best_val_loss', 'config'])

In [23]:
checkpoint["epoch"] = checkpoint["iter_num"]
checkpoint["model_state_dict"] = checkpoint["model"]
checkpoint["model_cfg"] = checkpoint["model_args"]
del checkpoint["iter_num"]
del checkpoint["model"]
del checkpoint["model_args"]
checkpoint["tokenizer_cfg"] = {'dtype': 'float32', 
                               'meta_file': 'meta.pkl', 
                               'wrapper': 'TikTokenizer', 
                               'encoding': 'gpt2', 
                               'module': 'tiktoken', 
                               'meta': {
                                   'max_token_value': 50256, 
                                   'encoding': 'gpt2', 
                                   'dtype': 'float32', 
                                   'num_tokens': 338027, 
                                   'module': 'tiktoken'
                                }
                              }
checkpoint["model_cfg"] = {
    "cls": "GPT",
    "calc_loss_in_model": True,
    "args": {
      "n_layer" : checkpoint["model_cfg"]["n_layer"],
      "n_head" : checkpoint["model_cfg"]["n_head"],
      "n_embd" : checkpoint["model_cfg"]["n_embd"],
      "dropout" : checkpoint["model_cfg"]["dropout"],
      "bias" :  checkpoint["model_cfg"]["bias"],
      "block_size" : checkpoint["model_cfg"]["block_size"],
      "vocab_size" : checkpoint["model_cfg"]["vocab_size"],
      "transformer_active_func": "GELU",
      "norm_layer": "LayerNorm",
      "flash": False 
    }}

KeyError: 'iter_num'

In [36]:
torch.save(checkpoint, "karpathy_shakespeare")

#### Since karpathy used a larger vocabulary than the tokenizer, some tokens could not be encoded
#### Even though karpathy's inference generated good sentences, ours does not

In [5]:
args = [
        "run=infer",
        "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/outputs/models/karpathy_shakespeare",
        "run.out_dir=out_infer",
        "run.num_samples=10", 
        "run.max_new_tokens=500",
        "run.temperature=0.8",
        "run.top_k=200",
        "run.start='\n'",
        "device=cuda",
        "debug=True"
    ]
qtransform.notebook_run(args)

[ [36m2024-02-14 09:17:24,190 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 10, 'max_new_tokens': 500, 'temperature': 0.8, 'top_k': 200, 'start': '\n', 'out_dir': 'out_infer', 'onnx_model': {'path': None, 'tokenizer': {'name': 'tiktoken', 'encoding': 'gpt2', 'meta_path': None

thread '<unnamed>' panicked at src/lib.rs:201:64:
no entry found for key
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace


PanicException: no entry found for key

#### Make our checkpoint compatible with karpathy's inference script and see if inference is bettereki-transformer-dev/shakespeare_owt_benchmarking/outputs/models/GPT_2024-02-13_13:44:13__epoch:1

In [35]:
SHAKESPEARE_QTRANSFORM_PATH = "/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/outputs/models/GPT_2024-02-13_13:44:13__epoch:1"
checkpoint_qtransform = torch.load(SHAKESPEARE_QTRANSFORM_PATH)

In [16]:
checkpoint_qtransform["model_cfg"]

{'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 2, 'n_head': 2, 'n_embd': 256, 'dropout': 0.0, 'bias': True, 'block_size': 64, 'vocab_size': 50256, 'transformer_active_func': 'GELU', 'norm_layer': 'BatchNorm', 'flash': False}}

In [36]:
checkpoint_qtransform["model_args"] = dict(checkpoint_qtransform["model_cfg"]["args"])
del checkpoint_qtransform["model_args"]["transformer_active_func"]
del checkpoint_qtransform["model_args"]["norm_layer"]
del checkpoint_qtransform["model_args"]["flash"]
checkpoint_qtransform["iter_num"] = checkpoint_qtransform["epoch"]
checkpoint_qtransform["optimizer"] = checkpoint_qtransform["optimizer_state_dict"]
checkpoint_qtransform["model"] = checkpoint_qtransform["model_state_dict"]
checkpoint_qtransform["best_val_loss"] = checkpoint_qtransform["metrics"]
checkpoint_qtransform["config"] = {
    'out_dir': 'out-shakespeare',
    'eval_interval': 250,
     'log_interval': 10,
     'eval_iters': 200,
     'eval_only': False,
     'always_save_checkpoint': False,
     'init_from': 'scratch',
     'wandb_log': False,
     'wandb_project': 'shakespeare',
     'wandb_run_name': 'mini-gpt',
     'dataset': 'shakespeare',
     'gradient_accumulation_steps': 1,
     'batch_size': 64,
     'block_size': 256,
     'n_layer': 2,
     'n_head': 2,
     'n_embd': 256,
     'dropout': 0.0,
     'bias': True,
     'learning_rate': 0.001,
     'max_iters': 5000,
     'weight_decay': 0.1,
     'beta1': 0.9,
     'beta2': 0.99,
     'grad_clip': 1.0,
     'decay_lr': True,
     'warmup_iters': 100,
     'lr_decay_iters': 5000,
     'min_lr': 0.0001,
     'backend': 'nccl',
     'device': 'cuda',
     'dtype': 'bfloat16',
     'compile': True}
del checkpoint_qtransform["model_state_dict"]
del checkpoint_qtransform["optimizer_state_dict"]
del checkpoint_qtransform["epoch"]
del checkpoint_qtransform["model_cfg"]
del checkpoint_qtransform["tokenizer_cfg"]
del checkpoint_qtransform["metrics"]
del checkpoint_qtransform["quant_cfg"]
del checkpoint_qtransform["quantized"]


In [37]:
checkpoint.keys()

dict_keys(['model', 'optimizer', 'model_args', 'iter_num', 'best_val_loss', 'config'])

In [38]:
checkpoint_qtransform.keys()

dict_keys(['model_args', 'iter_num', 'optimizer', 'model', 'best_val_loss', 'config'])

In [40]:
torch.save(checkpoint_qtransform, "/home/mabot004/nanoGPT/out-shakespeare/qtransform_shakespeare_karpathy.pt")

### Despite using karpathy's inference script, our model still generates garbage

#### Copy karpathy's params one by one and see if that fixes the issue, it probably won't though
##### Update: it did not. Hopefully that is due to using BatchNorm

In [47]:
#from: https://github.com/karpathy/nanoGPT/blob/master/config/train_shakespeare_char.py
#karpathy evaluates after 250 iterations, we implemented eval to do so after every epoch -> max_iters = 5000 / 200
eval_epoch_interval = 1 # keep frequent because we'll overfit
eval_iters = 200
max_iters = 250
epochs = 200 #eval after every epoch, karpathy has 5000 max_iters in total -> epoch = max_iters / eval_interval 
gradient_accumulation_steps = 1 #one large batch, potentially do gradient_accumulation_steps = 8 and batch_size = 8
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher

#not implemented currently
lr_decay_iters = 5000 # make equal to max_iters usually

#not used currently
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

args = [
        "run=train", 
        "model=gpt_2_h2l2e256b64_GeBN",
        "model.args.n_layer="+str(n_layer),
        "model.args.n_head="+str(n_head),
        "model.args.n_embd="+str(n_embd),
        "model.args.dropout="+str(dropout),
        "dataset=huggingface", 
        "dataset/tokenizer=tiktoken",
        "dataset.tokenizer.encoding=gpt2",
        "dataset.dataloader.batch_size="+str(batch_size),
        "dataset.name=tiny_shakespeare",
        "optim.args.learning_rate="+str(learning_rate),
        "run.export=True",
        "run.epochs="+str(epochs),
        "run.max_iters="+str(max_iters),
        "run.eval_epoch_interval=1", 
        "run.eval_iters="+str(eval_iters),
        "run.grad_clip=1.0",
        "device=cuda"
    ]
qtransform.notebook_run(args)

[ [36m2024-02-14 11:20:19,566 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 64}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 6, 'n_head': 6, 'n_embd': 384, 'dropout': 0.2, 'b

Exception ignored in: <function _releaseLock at 0x7f5f126b1750>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/logging/__init__.py", line 228, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 


RuntimeError: DataLoader worker (pid(s) 49086) exited unexpectedly

Exception ignored in: <function Socket.__del__ at 0x7f5f12bf0ee0>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/zmq/sugar/socket.py", line 178, in __del__
    def __del__(self):
KeyboardInterrupt: 


In [49]:
#check if custom_ln layers had their params back propagated
import re
ckpt_shakespeare = torch.load("/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-14_11:20:19__epoch:6")
#custom_ln are identity layers in this case
list(filter(lambda x: re.search(r'custom_ln[1-2]', x), ckpt_shakespeare["model_state_dict"].keys()))

[]

In [48]:
#run inference again, this time with karpathy's params
#generated file: /home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/out_infer/INFER_2024-02-14_11:29:32_CHECKPOINT.out
args = [
        "run=infer",
        "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-14_11:20:19__epoch:6",
        "device=cuda", 
        "run.out_dir=out_infer",
        "run.num_samples=3", 
        "run.max_new_tokens=500",
        "run.temperature=0.8",
        "run.top_k=200",
        "run.start='\n'",
        "debug=True"
    ]
qtransform.notebook_run(args)

[ [36m2024-02-14 11:29:31,267 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 3, 'max_new_tokens': 500, 'temperature': 0.8, 'top_k': 200, 'start': '\n', 'out_dir': 'out_infer', 'onnx_model': {'path': None, 'tokenizer': {'name': 'tiktoken', 'encoding': 'gpt2', 'meta_path': None}

### Train with LayerNorm and check if inference is going better
#### The loss after the first epoch is way higher than with batchnorm (4.5 with layernorm compared to 1.7 with batchnorm)

In [50]:
args = [
        "run=train", 
        "model=gpt_2_h2l2e256b64_GeLN",
        "model.args.n_layer="+str(n_layer),
        "model.args.n_head="+str(n_head),
        "model.args.n_embd="+str(n_embd),
        "model.args.dropout="+str(dropout),
        "dataset=huggingface", 
        "dataset/tokenizer=tiktoken",
        "dataset.tokenizer.encoding=gpt2",
        "dataset.dataloader.batch_size="+str(batch_size),
        "dataset.name=tiny_shakespeare",
        "optim.args.learning_rate="+str(learning_rate),
        "run.export=True",
        "run.epochs="+str(epochs),
        "run.max_iters="+str(max_iters),
        "run.eval_epoch_interval=1", 
        "run.eval_iters="+str(eval_iters),
        "run.grad_clip=1.0",
        "device=cuda"
    ]
qtransform.notebook_run(args)

[ [36m2024-02-14 11:45:55,161 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 64}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 6, 'n_head': 6, 'n_embd': 384, 'dropout': 0.2, 'b

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f5dc88ad3f0>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1443, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/opt/conda/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
  File "/opt/conda/lib/python3.10/multiprocessing/connection.py", line 931, in wait
    ready = selector.select(timeout)
  File "/opt/conda/lib/python3.10/selectors.py", line 416, in select
    fd_event_list = self._selector.poll(timeout)
KeyboardInterrupt: 
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x

[ [36m2024-02-14 11:50:25,041 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mAVERAGE EVAL LOSS FOR EPOCH 4/200: 4.248138904571533[0m
[ [36m2024-02-14 11:50:25,044 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32m4.154836225509643[0m
[ [36m2024-02-14 11:50:25,648 [0m][[2;37mqtransform.utils.helper[0m][[32mINFO[0m] - [32mModel checkpoint saved to /home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-14_11:45:55__epoch:4[0m
[ [36m2024-02-14 11:50:25,651 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mEPOCH: 5/200[0m
[ [36m2024-02-14 11:50:25,875 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32m  batch 0 loss: 0.42351655960083007[0m
[ [36m2024-02-14 11:50:26,157 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32m  batch 10 loss: 4.157746601104736[0m
[ [36m2024-02-14 11:50:26,437 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32m  batch 20 loss: 4.187510585784912[0m
[ [36m2024-

KeyboardInterrupt: 

In [51]:
#layernorm did not help either
args = [
        "run=infer",
        "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-14_11:45:55__epoch:4",
        "device=cuda", 
        "run.out_dir=out_infer",
        "run.num_samples=3", 
        "run.max_new_tokens=500",
        "run.temperature=0.8",
        "run.top_k=200",
        "run.start='\n'",
        "debug=True"
    ]
qtransform.notebook_run(args)

[ [36m2024-02-14 11:51:24,747 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 3, 'max_new_tokens': 500, 'temperature': 0.8, 'top_k': 200, 'start': '\n', 'out_dir': 'None', 'onnx_model': {'path': None, 'tokenizer': {'name': 'tiktoken', 'encoding': 'gpt2', 'meta_path': None}}, 'f

In [44]:
"""
sample.py from nanoGPT, adjusted for our models
"""
import os
import pickle
from contextlib import nullcontext
import torch
import tiktoken
from qtransform.model.gpt import GPTConfig, GPT
import omegaconf

# -----------------------------------------------------------------------------
init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
out_dir = 'out' # ignored if init_from is not 'resume'
start = "\n" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
num_samples = 10 # number of samples to draw
max_new_tokens = 500 # number of tokens generated in each sample
temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
seed = 1337
#device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.

device = 'cpu'


dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
compile = False # use PyTorch 2.0 to compile the model to be faster
#exec(open('configurator.py').read()) # overrides from command line or config file
# -----------------------------------------------------------------------------

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)


MODEL_PATH = "/home/mabot004/nanoGPT/out-shakespeare/ckpt.pt"

# model
if init_from == 'resume':
    # init from a model saved in a specific directory
    ckpt_path = os.path.join(MODEL_PATH)
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = GPTConfig(**checkpoint['model_args'])
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    unwanted_prefix = '_orig_mod.'
    for k,v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
elif init_from.startswith('gpt2'):
    # init from a given GPT-2 model
    model = GPT.from_pretrained(init_from, dict(dropout=0.0))

model.eval()
model.to(device)
if compile:
    model = torch.compile(model) # requires PyTorch 2.0 (optional)

# look for the meta pickle in case it is available in the dataset folder
load_meta = False
if init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these...
    meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
    load_meta = os.path.exists(meta_path)
if load_meta:
    print(f"Loading meta from {meta_path}...")
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    # TODO want to make this more general to arbitrary encoder/decoder schemes
    stoi, itos = meta['stoi'], meta['itos']
    encode = lambda s: [stoi[c] for c in s]
    decode = lambda l: ''.join([itos[i] for i in l])
else:
    # ok let's assume gpt-2 encodings by default
    print("No meta.pkl found, assuming GPT-2 encodings...")
    enc = tiktoken.get_encoding("gpt2")
    encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
    decode = lambda l: enc.decode(l)

# encode the beginning of the prompt
if start.startswith('FILE:'):
    with open(start[5:], 'r', encoding='utf-8') as f:
        start = f.read()
start_ids = encode(start)
x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

# run generation
with torch.no_grad():
    with ctx:
        for k in range(num_samples):
            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
            print(decode(y[0].tolist()))
            print('---------------')


[ [36m2024-02-14 10:05:50,939 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mModel config: GPTConfig(block_size=64, vocab_size=50256, n_layer=2, n_head=2, n_embd=256, dropout=0.0, bias=True, flash=False, transformer_active_func='ReLU', norm_layer='BatchNorm', single_output=False, custom_ln=False)[0m
[ [36m2024-02-14 10:05:51,331 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mnumber of parameters: 14.98M[0m
No meta.pkl found, assuming GPT-2 encodings...






































ENB
ENENENENENENENENENENENENENENENENENENENENEN
DENENENENENENENENENENENENEN prosper are are Here HereENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENENEN nasalE

KeyboardInterrupt: 

### Make console cmd from args

In [5]:
def get_cmd_from_args(args: list[str]):
    return "python -m qtransform " + ' '.join(args)