## Train a GPT-2 Transformer model on wikitext using BatchNorm/LayerNorm and ReLU/GELU

In [1]:
# Standard library
import logging
import logging.config  # dictConfig lives in this submodule; a bare `import logging` does not load it
import os
import re

# Third-party / project packages (original relative order kept)
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
import qtransform
import torch
from brevitas import nn as qnn
import yaml

# Manually load the logging configuration found in qtransform's config directory.
config_path = qtransform.get_module_config_path()
print(config_path)

with open(os.path.join(config_path, 'hydra', 'job_logging', 'custom.yaml'), 'r') as stream:
    # The YAML is read from qtransform's own config directory (see config_path above).
    config = yaml.load(stream, Loader=yaml.FullLoader)

logging.config.dictConfig(config)
# Set the root logger to INFO after the dict config has been applied.
logging.getLogger().setLevel(logging.INFO)

/home/mabot004/eki-transformer-dev/qtransform/eki/lib/python3.10/site-packages/qtransform-0.0.2.dev0-py3.10.egg/qtransform/conf


### Hyperparameters
#### 10.000 Iterations in total with a small GPT model

#### Using tiktoken gpt2 Tokenizer

In [2]:
#from: https://github.com/karpathy/nanoGPT/blob/master/config/train_shakespeare_char.py and https://github.com/karpathy/nanoGPT/blob/master/config/train_gpt2.py
#karpathy used a larger transformer model for openwebtext alongside more epochs

# All hyperparameters are kept as strings because they are concatenated into
# "key=value" override strings for qtransform.notebook_run below.

DATASET = "wikitext"
SUBSET = "wikitext-103-raw-v1"

eval_epoch_interval = str(1) # every epoch, meaning after max_iters
eval_iters = str(50)
max_iters = str(200)
epochs = "20" #eval after every epoch, karpathy has 5000 max_iters in total -> epoch = max_iters / eval_interval 
gradient_accumulation_steps = "2"
batch_size = "32"

grad_clip="1.0"

block_size = "256"
n_layer = "6"
n_head = "6"
n_embd = "384"
dropout = "0.2"

learning_rate = str(1e-3) # with baby networks can afford to go a bit higher
seed = "1337"

# The StepLR step size is tied to the number of epochs.
step_size = epochs

# Overrides for the training run itself.
run_args = [
    'run=train',
    'run.export=False',
    'run.epochs='+epochs,
    'run.max_iters='+max_iters,
    'run.eval_epoch_interval='+eval_epoch_interval,
    'run.eval_iters='+eval_iters,
    'run.grad_clip='+grad_clip,
    'run.gradient_accumulation_steps='+gradient_accumulation_steps,
]
# Overrides for the model architecture. The concrete model config ("model=...")
# is chosen by the individual training cells.
model_args = [
    'model.args.dropout='+dropout,
    'model.args.n_layer='+n_layer,
    'model.args.n_head='+n_head,
    'model.args.n_embd='+n_embd,
    # Fixed: this previously passed n_embd (384) as the block size.
    'model.args.block_size='+block_size,
]
# Overrides for dataset, tokenizer and dataloader.
dataset_args = [
    'dataset=huggingface',
    'dataset.name='+DATASET,
    'dataset.subset='+SUBSET,
    'dataset/tokenizer=tiktoken',
    'dataset.tokenizer.encoding=gpt2',
    'dataset.dataloader.shuffle=False',
    'dataset.dataloader.batch_size='+batch_size,
]
# Remaining overrides: seed, optimizer/scheduler, device and debug flag.
other_args = [
    'seed='+seed,
    'optim.args.learning_rate='+learning_rate,
    'optim.scheduler.schedulers.1.args.step_size='+step_size,
    'device=cuda',
    'debug=True'
]

### Training was performed with ones added to the attention matrix due to a bug

#### ReLU BatchNorm

In [3]:
model = "gpt_2_h2l2e256b64_ReBN"
# Build the model overrides for this run without mutating the shared model_args
# list, so that re-running the cell does not stack duplicate "model=" entries.
model_overrides = [arg for arg in model_args if not arg.startswith("model=")] + ["model=" + model]
qtransform.notebook_run(run_args + model_overrides + dataset_args + other_args, logging.INFO)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'wikitext', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': False, 'num_workers': 2, 'batch_size': 32}, 'subset': 'wikitext-103-raw-v1', 'type': 'huggingface', 'splits': {'names': {'train': 'train', 'eval': 'validation', 'bench': 'test'}, 'sizes': {'train': 0.9, 'eval': 0.05, 'bench': 0.05}}, 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': True, 'chunk_size': 100}}, 'seed': 1337, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 6, 'n_head': 6, 'n_embd': 384, '

2024-03-06 09:55:11.258833: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[ [36m2024-03-06 09:55:11,951 [0m][[2;37mtensorflow[0m][[34mDEBUG[0m] - [34mFalling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.[0m
[ [36m2024-03-06 09:55:12,051 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 7 to 5[0m
[ [36m2024-03-06 09:55:12,054 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 5 to 7[0m
[ [36m2024-03-06 09:55:12,056 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 7 to 5[0m
[ [36m2024-03-06 09:55:12,059 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 5 to 7[0m
[ [36m2024-03-06 09:55:12,674 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mRunning Training[0m
[ [36m2024-03-06 09:55:12,678 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mtime is: 2024-03-06_09:55:12[0m
[ [36m2024-03-06 09:55:12,681 [0m][[2;37mqtransform[0m][[32mINFO[0m] -

KeyboardInterrupt: 

#### ReLU LayerNorm

In [None]:
model = "gpt_2_h2l2e256b64_ReLN"
# `args` was never defined in this notebook; assemble the overrides from the
# four lists of the hyperparameter cell, as the ReLU/BatchNorm cell does.
# Any "model=" entry left in model_args by an earlier cell is dropped first.
model_overrides = [arg for arg in model_args if not arg.startswith("model=")] + ["model=" + model]
qtransform.notebook_run(run_args + model_overrides + dataset_args + other_args, logging.INFO)

#### GELU BatchNorm

In [None]:
model = "gpt_2_h2l2e256b64_GeBN"
# `args` was never defined in this notebook; assemble the overrides from the
# four lists of the hyperparameter cell, as the ReLU/BatchNorm cell does.
# Any "model=" entry left in model_args by an earlier cell is dropped first.
model_overrides = [arg for arg in model_args if not arg.startswith("model=")] + ["model=" + model]
qtransform.notebook_run(run_args + model_overrides + dataset_args + other_args, logging.INFO)

#### GELU LayerNorm

In [None]:
model = "gpt_2_h2l2e256b64_GeLN"
# `args` was never defined in this notebook; assemble the overrides from the
# four lists of the hyperparameter cell, as the ReLU/BatchNorm cell does.
# Any "model=" entry left in model_args by an earlier cell is dropped first.
model_overrides = [arg for arg in model_args if not arg.startswith("model=")] + ["model=" + model]
qtransform.notebook_run(run_args + model_overrides + dataset_args + other_args, logging.INFO)

### Inference

In [2]:
# Base overrides shared by the inference cells, written as (key, value) pairs
# and joined into "key=value" strings. The run.start value is a quoted newline.
args_infer = [
    f"{key}={value}"
    for key, value in (
        ("run", "infer"),
        ("device", "cuda"),
        ("run.num_samples", "20"),
        ("run.max_new_tokens", "100"),
        ("run.temperature", "0.8"),
        ("run.top_k", "200"),
        ("run.start", "'\n'"),
        ("debug", "True"),
    )
]

#### ReLU BatchNorm

In [10]:
CHECKPOINT="/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/wikitext/GPT_wikitext_2024-03-06_09:55:12__epoch:5"
# Pass the checkpoint as a per-call override instead of appending it to the
# shared args_infer list; otherwise every inference cell (and every re-run)
# adds another "run.from_checkpoint=" entry to the same list.
infer_overrides = [arg for arg in args_infer if not arg.startswith("run.from_checkpoint=")]
qtransform.notebook_run(infer_overrides + ["run.from_checkpoint=" + CHECKPOINT], logging.INFO)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}, 'scheduler': {'decay_lr': True, 'schedulers': {'1': {'name': 'StepLR', 'args': {'step_size': 1, 'gamma': 0.1}}}, 'milestones': None, 'warmup_epochs': 2}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 20, 'max_new_tokens': 100, 'temperature': 0.8, 'top_k': 200, 'start': '\n', 'compile': False, 'out_dir': None, 'onnx_model': {'path': None, 'tokenizer': {'name': 'tiktoken', 'encod

### Using specific wikitext tokenizer from https://huggingface.co/Kristijan/wikitext-103-tokenizer

In [5]:
# Dataset overrides for the wikitext-specific tokenizer, loaded through the
# "transformers" tokenizer config with GPT2TokenizerFast.
dataset_args = [
    'dataset=huggingface',
    'dataset.name=wikitext',
    'dataset.subset=wikitext-103-raw-v1',
    'dataset/tokenizer=transformers',
    'dataset.tokenizer.pretrained_tokenizer=GPT2TokenizerFast',
    # Fixed: the original string carried a stray trailing space after the name.
    'dataset.tokenizer.encoding=Kristijan/wikitext-103-tokenizer',
    'dataset.dataloader.shuffle=False',
    'dataset.dataloader.batch_size=32'
]

In [16]:
def del_keyword(liste, regex):
    """Remove, in place, every entry of ``liste`` that matches ``regex``.

    Args:
        liste: list of strings (here: "key=value" override strings); it is
            modified in place.
        regex: pattern handed to ``re.search`` for each entry.

    Returns:
        None.
    """
    # Slice assignment keeps the same list object, so callers still see the
    # removal. The previous index-based deletion removed the wrong elements
    # (or raised IndexError) as soon as more than one entry matched, because
    # each `del` shifted the remaining indices.
    liste[:] = [item for item in liste if not re.search(regex, item)]

#### ReLU BatchNorm

In [6]:
# ReLU/BatchNorm model, this time with the dataset_args defined for the
# wikitext tokenizer.
model = "gpt_2_h2l2e256b64_ReBN"
# Drop the previously selected model before adding the new one
# (from stackoverflow post: https://stackoverflow.com/a/4146090).
del_keyword(model_args, r'^model=')
model_args.append(f"model={model}")
qtransform.notebook_run([*run_args, *model_args, *dataset_args, *other_args], logging.INFO)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'wikitext', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TransformersTokenizer', 'pretrained_tokenizer': 'GPT2TokenizerFast', 'encoding': 'Kristijan/wikitext-103-tokenizer', 'module': 'transformers', 'fast': True}, 'dataloader': {'shuffle': False, 'num_workers': 2, 'batch_size': 32}, 'subset': 'wikitext-103-raw-v1', 'type': 'huggingface', 'splits': {'names': {'train': 'train', 'eval': 'validation', 'bench': 'test'}, 'sizes': {'train': 0.9, 'eval': 0.05, 'bench': 0.05}}, 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': True, 'chunk_size': 100}}, 'seed': 1337, 'mod

2024-03-06 13:07:38.596044: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[ [36m2024-03-06 13:07:39,175 [0m][[2;37mtensorflow[0m][[34mDEBUG[0m] - [34mFalling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.[0m
[ [36m2024-03-06 13:07:39,282 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 7 to 5[0m
[ [36m2024-03-06 13:07:39,286 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 5 to 7[0m
[ [36m2024-03-06 13:07:39,290 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 7 to 5[0m
[ [36m2024-03-06 13:07:39,293 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 5 to 7[0m
[ [36m2024-03-06 13:07:40,003 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mRunning Training[0m
[ [36m2024-03-06 13:07:40,009 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mtime is: 2024-03-06_13:07:40[0m
[ [36m2024-03-06 13:07:40,012 [0m][[2;37mqtransform[0m][[32mINFO[0m] -

loading file vocab.json from cache at /home/mabot004/.cache/huggingface/hub/models--Kristijan--wikitext-103-tokenizer/snapshots/347b90366a52a49e5071ab18cf4bb06dabfc6f82/vocab.json
loading file merges.txt from cache at /home/mabot004/.cache/huggingface/hub/models--Kristijan--wikitext-103-tokenizer/snapshots/347b90366a52a49e5071ab18cf4bb06dabfc6f82/merges.txt
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None


[ [36m2024-03-06 13:07:44,818 [0m][[2;37mqtransform.dataset.tokenizer.transformers[0m][[34mDEBUG[0m] - [34mUsing tokenizer class: GPT2TokenizerFast with encoding: Kristijan/wikitext-103-tokenizer[0m
[ [36m2024-03-06 13:07:44,823 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mLoading dataset: wikitext, with encoding: Kristijan/wikitext-103-tokenizer and dtype: float32[0m
[ [36m2024-03-06 13:07:44,833 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mAttempting to retrieve tokenized dataset under "/home/mabot004/.qtransform/datasets/huggingface/wikitext/tokenized/Kristijan/wikitext-103-tokenizer/train-wikitext-103-raw-v1-float32.bin"[0m
[ [36m2024-03-06 13:07:44,837 [0m][[2;37mqtransform.dataset[0m][[34mDEBUG[0m] - [34mOffset is 0, start is 0.0, end is 1.0[0m
[ [36m2024-03-06 13:07:44,839 [0m][[2;37mqtransform.dataset[0m][[34mDEBUG[0m] - [34mTokenized file has 121454329.0 tokens of datatype: float32. Attempting to start at token: 0[0m
[ [

KeyboardInterrupt: 

#### Loss seemed to stagnate after 200 batches in each epoch; maybe reduce the iterations per epoch and increase the number of epochs

In [None]:
# Resume training from the epoch-1 checkpoint of the 2024-03-06_13:07:40 run
# with 200 iterations per epoch.
# The overrides are assembled in a separate list: appending them to the shared
# run_args would make every later training cell resume from this checkpoint
# too, and would stack duplicate entries whenever this cell is re-run.
replaced_keys = ('run.max_iters=', 'run.from_checkpoint=')
resume_run_args = [arg for arg in run_args if not arg.startswith(replaced_keys)]
resume_run_args += [
    'run.max_iters=200',
    r'run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/wikitext/GPT_wikitext_2024-03-06_13:07:40__epoch:1',
]
qtransform.notebook_run(resume_run_args + model_args + dataset_args + other_args, logging.INFO)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'wikitext', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TransformersTokenizer', 'pretrained_tokenizer': 'GPT2TokenizerFast', 'encoding': 'Kristijan/wikitext-103-tokenizer', 'module': 'transformers', 'fast': True}, 'dataloader': {'shuffle': False, 'num_workers': 2, 'batch_size': 32}, 'subset': 'wikitext-103-raw-v1', 'type': 'huggingface', 'splits': {'names': {'train': 'train', 'eval': 'validation', 'bench': 'test'}, 'sizes': {'train': 0.9, 'eval': 0.05, 'bench': 0.05}}, 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': True, 'chunk_size': 100}}, 'seed': 1337, 'mod

loading file vocab.json from cache at /home/mabot004/.cache/huggingface/hub/models--Kristijan--wikitext-103-tokenizer/snapshots/347b90366a52a49e5071ab18cf4bb06dabfc6f82/vocab.json
loading file merges.txt from cache at /home/mabot004/.cache/huggingface/hub/models--Kristijan--wikitext-103-tokenizer/snapshots/347b90366a52a49e5071ab18cf4bb06dabfc6f82/merges.txt
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None


[ [36m2024-03-06 13:26:10,138 [0m][[2;37mqtransform.dataset.tokenizer.transformers[0m][[34mDEBUG[0m] - [34mUsing tokenizer class: GPT2TokenizerFast with encoding: Kristijan/wikitext-103-tokenizer[0m
[ [36m2024-03-06 13:26:10,144 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mLoading dataset: wikitext, with encoding: Kristijan/wikitext-103-tokenizer and dtype: float32[0m
[ [36m2024-03-06 13:26:10,151 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mAttempting to retrieve tokenized dataset under "/home/mabot004/.qtransform/datasets/huggingface/wikitext/tokenized/Kristijan/wikitext-103-tokenizer/train-wikitext-103-raw-v1-float32.bin"[0m
[ [36m2024-03-06 13:26:10,155 [0m][[2;37mqtransform.dataset[0m][[34mDEBUG[0m] - [34mOffset is 0, start is 0.0, end is 1.0[0m
[ [36m2024-03-06 13:26:10,159 [0m][[2;37mqtransform.dataset[0m][[34mDEBUG[0m] - [34mTokenized file has 121454329.0 tokens of datatype: float32. Attempting to start at token: 0[0m
[ [

In [3]:
CHECKPOINT="/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/wikitext/GPT_wikitext_2024-03-06_13:26:09__epoch:9"
# Pass the checkpoint as a per-call override instead of appending it to the
# shared args_infer list. Any "run.from_checkpoint=" entry left there by an
# earlier inference cell is dropped so that only this checkpoint is requested.
infer_overrides = [arg for arg in args_infer if not arg.startswith("run.from_checkpoint=")]
qtransform.notebook_run(infer_overrides + ["run.from_checkpoint=" + CHECKPOINT], logging.INFO)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}, 'scheduler': {'decay_lr': True, 'schedulers': {'1': {'name': 'StepLR', 'args': {'step_size': 1, 'gamma': 0.1}}}, 'milestones': None, 'warmup_epochs': 2}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 20, 'max_new_tokens': 100, 'temperature': 0.8, 'top_k': 200, 'start': '\n', 'compile': False, 'out_dir': None, 'onnx_model': {'path': None, 'tokenizer': {'module': 'tiktoken', 'enc

2024-03-07 09:10:45.573430: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[ [36m2024-03-07 09:10:46,274 [0m][[2;37mtensorflow[0m][[34mDEBUG[0m] - [34mFalling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.[0m
[ [36m2024-03-07 09:10:46,379 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 7 to 5[0m
[ [36m2024-03-07 09:10:46,383 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 5 to 7[0m
[ [36m2024-03-07 09:10:46,386 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 7 to 5[0m
[ [36m2024-03-07 09:10:46,389 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 5 to 7[0m
[0m
[ [36m2024-03-07 09:10:50,474 [0m][[2;37murllib3.connectionpool[0m][[34mDEBUG[0m] - [34mStarting new HTTPS connection (1): huggingface.co:443[0m
[ [36m2024-03-07 09:10:50,693 [0m][[2;37murllib3.connectionpool[0m][[34mDEBUG[0m] - [34mhttps://huggingface.co:443 "HEAD /Kristijan/wikitext-103-toke

loading file vocab.json from cache at /home/mabot004/.cache/huggingface/hub/models--Kristijan--wikitext-103-tokenizer/snapshots/347b90366a52a49e5071ab18cf4bb06dabfc6f82/vocab.json
loading file merges.txt from cache at /home/mabot004/.cache/huggingface/hub/models--Kristijan--wikitext-103-tokenizer/snapshots/347b90366a52a49e5071ab18cf4bb06dabfc6f82/merges.txt
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None


[ [36m2024-03-07 09:10:50,907 [0m][[2;37mqtransform.dataset.tokenizer.transformers[0m][[34mDEBUG[0m] - [34mUsing tokenizer class: GPT2TokenizerFast with encoding: Kristijan/wikitext-103-tokenizer[0m
[ [36m2024-03-07 09:10:50,911 [0m][[2;37mqtransform.run[0m][[34mDEBUG[0m] - [34m{'max_token_value': 28439, 'encoding': 'Kristijan/wikitext-103-tokenizer', 'dtype': 'float32', 'num_tokens': 121999113, 'module': 'transformers', 'fast': True}[0m
[ [36m2024-03-07 09:10:51,443 [0m][[2;37mqtransform.run.infer[0m][[32mINFO[0m] - [32mRunning inference from CHECKPOINT.[0m
[ [36m2024-03-07 09:10:55,789 [0m][[2;37mqtransform.run.infer[0m][[34mDEBUG[0m] - [34mHighest predicted token: 26491[0m
[ [36m2024-03-07 09:10:55,796 [0m][[2;37mqtransform.run.infer[0m][[32mINFO[0m] - [32mGenerating sample: 1/20[0m

 when November @-@ combatants, Burton said of universities. 
ays in graphic exaggerated detail. He is conventionally considered the cannibalism in the cannibalism

#### Benchmarking

In [None]:
# Benchmark the epoch-9 checkpoint of the resumed run on wikitext with the
# wikitext tokenizer. block_size comes from the hyperparameter cell.
args_benchmarking = [ 
    "run=bench",
    "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/wikitext/GPT_wikitext_2024-03-06_13:26:09__epoch:9",
    "run.num_samples=50",
    'dataset=huggingface',
    'dataset.name=wikitext',
    'dataset.subset=wikitext-103-raw-v1',
    'dataset/tokenizer=transformers',
    'dataset.tokenizer.encoding=Kristijan/wikitext-103-tokenizer',
    'dataset.tokenizer.pretrained_tokenizer=GPT2TokenizerFast',
    # The shuffle override was listed twice in the original; once is enough.
    'dataset.dataloader.shuffle=False',
    "+model.args.block_size="+block_size,
]
qtransform.notebook_run(args_benchmarking, logging.INFO)

### Another attempt after fixing attention mask bug

In [3]:
model = "gpt_2_h2l2e256b64_ReBN"
# Model overrides for this attempt: 2 layers and dropout 0.1.
# The original cell first appended "model=..." to the old model_args and then
# immediately rebound model_args, so the append had no effect; the list is now
# defined once and the model name comes from `model`.
model_args = [
    'model.args.dropout=0.1',
    'model.args.n_layer=2',
    'model.args.n_head=6',
    'model.args.n_embd=384',
    'model.args.block_size=256',
    'model=' + model,
]
qtransform.notebook_run(run_args+model_args+dataset_args+other_args, logging.INFO)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'wikitext', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': False, 'num_workers': 2, 'batch_size': 32}, 'subset': 'wikitext-103-raw-v1', 'type': 'huggingface', 'splits': {'names': {'train': 'train', 'eval': 'validation', 'bench': 'test'}, 'sizes': {'train': 0.9, 'eval': 0.05, 'bench': 0.05}}, 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': True, 'chunk_size': 100}}, 'seed': 1337, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 2, 'n_head': 6, 'n_embd': 384, '

2024-03-08 11:37:30.974117: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[ [36m2024-03-08 11:37:31,683 [0m][[2;37mtensorflow[0m][[34mDEBUG[0m] - [34mFalling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.[0m
[ [36m2024-03-08 11:37:31,790 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 7 to 5[0m
[ [36m2024-03-08 11:37:31,794 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 5 to 7[0m
[ [36m2024-03-08 11:37:31,797 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 7 to 5[0m
[ [36m2024-03-08 11:37:31,800 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 5 to 7[0m
[ [36m2024-03-08 11:37:32,447 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mRunning Training[0m
[ [36m2024-03-08 11:37:32,454 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mtime is: 2024-03-08_11:37:32[0m
[ [36m2024-03-08 11:37:32,458 [0m][[2;37mqtransform[0m][[32mINFO[0m] -

KeyboardInterrupt: 

In [3]:
# Inference from the epoch-7 checkpoint of the 2024-03-08_11:37:32 run.
# No run.start override is given here.
qtransform.notebook_run(
    [
        "run=infer",
        "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/wikitext/GPT_wikitext_2024-03-08_11:37:32__epoch:7",
        "run.max_new_tokens=100",
    ],
    logging.INFO,
)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}, 'scheduler': {'decay_lr': True, 'schedulers': {'1': {'name': 'StepLR', 'args': {'step_size': 1, 'gamma': 0.1}}}, 'milestones': None, 'warmup_epochs': 2}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 10, 'max_new_tokens': 100, 'temperature': 0.8, 'top_k': 200, 'start': 'My name is Mariama, my favorite', 'compile': False, 'out_dir': None, 'onnx_model': {'path': None, 'tokenizer

loading configuration file config.json from cache at /home/mabot004/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.37.0",
  

[ [36m2024-03-08 12:15:14,070 [0m][[2;37mqtransform.run.infer[0m][[32mINFO[0m] - [32mRunning inference from CHECKPOINT.[0m
[ [36m2024-03-08 12:15:15,228 [0m][[2;37mqtransform.run.infer[0m][[32mINFO[0m] - [32mGenerating sample: 1/10[0m
My name is Mariama, my favorite person.

I'm my own worst enemy.

It's one of those things I keep forgetting about every day: I love you, I have love for you, I'd like to be your friend.

But I was told I was a complete retard. I'm just a kid.

And when I was 10 years old, I decided to try and leave the world of metal and metalcore to my parents. After that, I enrolled in a band called Black Music
---------------

[ [36m2024-03-08 12:15:16,224 [0m][[2;37mqtransform.run.infer[0m][[32mINFO[0m] - [32mGenerating sample: 2/10[0m
My name is Mariama, my favorite character in the manga. But since I'm the only one who can name him, I do not think it is appropriate to use Mariama.

Mariama

You see, I'm also known for being a bit of a hard t

In [4]:
# Same checkpoint as the previous inference cell, with an explicit prompt
# passed through run.start.
start = "due east from Gould City to"
qtransform.notebook_run(
    [
        "run=infer",
        "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/wikitext/GPT_wikitext_2024-03-08_11:37:32__epoch:7",
        "run.max_new_tokens=100",
        f"run.start={start}",
    ],
    logging.INFO,
)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}, 'scheduler': {'decay_lr': True, 'schedulers': {'1': {'name': 'StepLR', 'args': {'step_size': 1, 'gamma': 0.1}}}, 'milestones': None, 'warmup_epochs': 2}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 10, 'max_new_tokens': 100, 'temperature': 0.8, 'top_k': 200, 'start': 'due east from Gould City to', 'compile': False, 'out_dir': None, 'onnx_model': {'path': None, 'tokenizer': {

loading configuration file config.json from cache at /home/mabot004/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.37.0",
  

[ [36m2024-03-08 12:17:14,752 [0m][[2;37mqtransform.run.infer[0m][[32mINFO[0m] - [32mRunning inference from CHECKPOINT.[0m
[ [36m2024-03-08 12:17:15,740 [0m][[2;37mqtransform.run.infer[0m][[32mINFO[0m] - [32mGenerating sample: 1/10[0m
due east from Gould City to the shore of Lake Superior. The city's parking garage is among the most significant in Europe, with more than four-thousand parking spaces across Europe.

The city's parking lots are at least 50 percent more valuable than the entire West Side, according to the 2010 GSA report.

With a median value of about $40 million, the city has the second-largest share in parking in cities like London, Chicago, New York, and Philadelphia. In Chicago, the city's
---------------

[ [36m2024-03-08 12:17:16,721 [0m][[2;37mqtransform.run.infer[0m][[32mINFO[0m] - [32mGenerating sample: 2/10[0m
due east from Gould City to the city entrance.

From there, the car's rear brake would be used to do the same thing as the steering

In [2]:
# Benchmark the epoch-7 checkpoint of the 2024-03-08_11:37:32 run on wikitext
# with the wikitext tokenizer.
args_benchmarking = [ 
    "run=bench",
    "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/wikitext/GPT_wikitext_2024-03-08_11:37:32__epoch:7",
    "run.num_samples=50",
    'dataset=huggingface',
    'dataset.name=wikitext',
    'dataset.subset=wikitext-103-raw-v1',
    'dataset/tokenizer=transformers',
    'dataset.tokenizer.encoding=Kristijan/wikitext-103-tokenizer',
    'dataset.tokenizer.pretrained_tokenizer=GPT2TokenizerFast',
    # The shuffle override was listed twice in the original; once is enough.
    'dataset.dataloader.shuffle=False',
    "+model.args.block_size=256",
]
qtransform.notebook_run(args_benchmarking, logging.INFO)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'wikitext', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TransformersTokenizer', 'pretrained_tokenizer': 'GPT2TokenizerFast', 'encoding': 'Kristijan/wikitext-103-tokenizer', 'module': 'transformers', 'fast': True}, 'dataloader': {'shuffle': False, 'num_workers': 2, 'batch_size': 12}, 'subset': 'wikitext-103-raw-v1', 'type': 'huggingface', 'splits': {'names': {'train': 'train', 'eval': 'validation', 'bench': 'test'}, 'sizes': {'train': 0.9, 'eval': 0.05, 'bench': 0.05}}, 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': True, 'chunk_size': 100}}, 'seed': 123456789

2024-03-08 12:26:56.317427: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[0m


loading file vocab.json from cache at /home/mabot004/.cache/huggingface/hub/models--Kristijan--wikitext-103-tokenizer/snapshots/347b90366a52a49e5071ab18cf4bb06dabfc6f82/vocab.json
loading file merges.txt from cache at /home/mabot004/.cache/huggingface/hub/models--Kristijan--wikitext-103-tokenizer/snapshots/347b90366a52a49e5071ab18cf4bb06dabfc6f82/merges.txt
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None


[ [36m2024-03-08 12:27:01,795 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mLoading dataset: wikitext, with encoding: Kristijan/wikitext-103-tokenizer and dtype: float32[0m
[ [36m2024-03-08 12:27:01,804 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mAttempting to retrieve tokenized dataset under "/home/mabot004/.qtransform/datasets/huggingface/wikitext/tokenized/Kristijan/wikitext-103-tokenizer/train-wikitext-103-raw-v1-float32.bin"[0m
[ [36m2024-03-08 12:27:01,808 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mLoaded data has 121454329 tokens.[0m
[ [36m2024-03-08 12:27:01,811 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mAttempting to retrieve tokenized dataset under "/home/mabot004/.qtransform/datasets/huggingface/wikitext/tokenized/Kristijan/wikitext-103-tokenizer/eval-wikitext-103-raw-v1-float32.bin"[0m
[ [36m2024-03-08 12:27:01,814 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mLoaded data has 254799 tokens.[0

STAGE:2024-03-08 12:27:05 10884:10884 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2024-03-08 12:27:06 10884:10884 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2024-03-08 12:27:06 10884:10884 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


TypeError: 'torch.device' object is not callable