## Train openwebtext and shakespeare GPT2 models with either gelu or relu and layernorm or batchnorm and run inference on them
### For openwebtext, 4 heads and 4 transformer blocks are used; for shakespeare, half as many (2 heads and 2 blocks)
### Tiktoken gpt2 Tokenization is used

In [1]:
import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
import qtransform

In [2]:
# Manually load the logging configuration from the qtransform config directory
# and apply it, so that log records emitted in this notebook are formatted.
config_path = qtransform.get_module_config_path()
print(config_path)
import logging
# `logging.config` is a submodule that `import logging` alone does not load;
# import it explicitly so dictConfig() below does not rely on some other
# package having imported it first.
import logging.config
import yaml

# NOTE(review): yaml.load with FullLoader is kept as-is; the file comes from the
# package's own config directory. Use yaml.safe_load if that ever stops being
# a trusted source.
with open(os.path.join(config_path, 'hydra','job_logging', 'custom.yaml'), 'r', encoding='utf-8') as stream:
    config = yaml.load(stream, Loader=yaml.FullLoader)

logging.config.dictConfig(config)
# Set the root logger to INFO after the dict config has been applied.
logging.getLogger().setLevel(logging.INFO)

/home/mabot004/eki-transformer-dev/qtransform/eki/lib/python3.10/site-packages/qtransform-0.0.2.dev0-py3.10.egg/qtransform/conf


### Train GPT2 with Shakespeare GELU BatchNorm, custom_ln is Identity layer
### Params similar to nanoGPT (https://github.com/karpathy/nanoGPT/blob/master/config/train_shakespeare_char.py) except for the GPT model params

In [8]:

# Hydra-style overrides for a training run on tiny_shakespeare.
args = [
    # Run mode and model config (GeBN: GELU activation, BatchNorm).
    "run=train",
    "model=gpt_2_h2l2e256b64_GeBN",
    # Dataset and tokenizer selection.
    "dataset=huggingface",
    "dataset/tokenizer=tiktoken",
    "dataset.tokenizer.encoding=gpt2",
    "dataset.dataloader.batch_size=64",
    "dataset.name=tiny_shakespeare",
    # Training schedule, evaluation cadence and export switch.
    "run.export=True",
    "run.epochs=100",
    "run.max_iters=5000",
    "run.eval_epoch_interval=1",
    "run.eval_iters=200",
    # Target device.
    "device=cuda",
]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 64}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 2, 'n_head': 2, 'n_embd': 256, 'dropout': 0.0, 'bias': True, 'block_size': 64, 'vocab_size': 50304, 'transformer_active_func': 'GELU', 'norm_layer': 'BatchNorm', 'flash':

2024-02-13 13:44:11.648799: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[ [36m2024-02-13 13:44:13,308 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNote: detected 128 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.[0m
[ [36m2024-02-13 13:44:13,313 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNote: NumExpr detected 128 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.[0m
[ [36m2024-02-13 13:44:13,316 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNumExpr defaulting to 8 threads.[0m
[ [36m2024-02-13 13:44:13,566 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mRunning Training[0m
[ [36m2024-02-13 13:44:13,573 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mtime is: 2024-02-13_13:44:13[0m
[ [36m2024-02-13 13:44:13,577 [0m][[2;37mqtransform[0m][[32mINFO[0m] - [32mDevice specified: cuda. Using device: cuda[0m
[ [36m2024-02-13 13:44:13,588 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mnumber of torch dataloader

KeyboardInterrupt: 

### Write inference of Shakespeare GELU BatchNorm, custom_ln is Identity layer to file

In [12]:
# Checkpoint passed to the inference run via run.from_checkpoint.
CHECKPOINT_PATH = "/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/outputs/models/GPT_2024-02-13_13:44:13__epoch:1"

# Hydra-style overrides for an inference run from the checkpoint above.
args = [
    "run=infer",
    f"run.from_checkpoint={CHECKPOINT_PATH}",
    "device=cuda",
    "run.out_dir=out_infer",
    # Sampling settings.
    "run.num_samples=10",
    "run.max_new_tokens=500",
    "run.temperature=0.8",
    "run.top_k=200",
    # The start value is a literal newline character wrapped in single quotes.
    "run.start='\n'",
    "debug=True",
]
qtransform.notebook_run(args)


[ [36m2024-02-13 13:52:43,381 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'test': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 10, 'max_new_tokens': 500, 'temperature': 0.8, 'top_k': 200, 'start': '\n', 'out_dir': 'out_infer', 'onnx_model': {'path': None, 'tokenizer': {'name': 'tiktoken', 'encoding': 'gpt2', 'meta_path': None

### Make console cmd from args

In [5]:
def get_cmd_from_args(args: list[str]) -> str:
    """Build the console command equivalent to running qtransform with *args*.

    Each argument is shell-quoted, so overrides containing whitespace, quotes
    or newlines (e.g. ``run.start='\\n'``) reach the program as one unchanged
    argument when the command is pasted into a POSIX shell. Arguments made up
    only of shell-safe characters are emitted unchanged.

    Args:
        args: Command-line overrides, one list item per argument.

    Returns:
        The command line as a single string, starting with
        ``python -m qtransform ``.
    """
    import shlex  # local import: the notebook's import cell is not touched

    return "python -m qtransform " + shlex.join(args)