# Train OpenWebText GPT-2 models with either GELU or ReLU and either LayerNorm or BatchNorm, and run inference on them

In [3]:
import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
import qtransform
import torch
from brevitas import nn as qnn
# Manually load the logging configuration that ships with qtransform
# (<config_path>/hydra/job_logging/custom.yaml) and apply it to this kernel.
config_path = qtransform.get_module_config_path()
print(config_path)
import logging
# `import logging` alone does not load the `logging.config` submodule; without
# this explicit import, `logging.config.dictConfig` below only works if some
# other package happened to import it first.
import logging.config
import yaml
with open(os.path.join(config_path, 'hydra','job_logging', 'custom.yaml'), 'r') as stream:
    # NOTE(review): FullLoader is kept as-is; the file is read from the installed
    # package's own config directory, not from user-supplied input.
    config = yaml.load(stream, Loader=yaml.FullLoader)

logging.config.dictConfig(config)
# Force the root logger to INFO regardless of the level set in the YAML file.
logging.getLogger().setLevel(logging.INFO)

/home/mabot004/eki-transformer-dev/qtransform/eki/lib/python3.10/site-packages/qtransform-0.0.2.dev0-py3.10.egg/qtransform/conf


####################
# TODO: it seems that shuffling with large datasets causes the entire dataset to be loaded into memory
####################

## Train ReLU and BatchNorm with karpathy's parameters
### Our tokenization of OpenWebText does not add padding after each sample; that could be implemented to keep context within samples.
### Eval is NaN due to an error in the torch version currently installed on the cluster. Eval is computed with torch version 2.2; however, that disables GPU training due to old NVIDIA drivers.

In [5]:
# Hyperparameters adapted from karpathy's nanoGPT configs:
#   https://github.com/karpathy/nanoGPT/blob/master/config/train_shakespeare_char.py
#   https://github.com/karpathy/nanoGPT/blob/master/config/train_gpt2.py
# Every value is kept as a string because it is concatenated into a
# "key=value" override entry of `args` below.
eval_epoch_interval = str(1) # every epoch, meaning after max_iters
eval_iters = str(200)
max_iters = str(500)
epochs = "20" #eval after every epoch, karpathy has 5000 max_iters in total -> epoch = max_iters / eval_interval
gradient_accumulation_steps = "2"
batch_size = "32"
block_size = "256"

grad_clip = "1.0"

n_layer = "6"
n_head = "6"
n_embd = "384"
dropout = "0.2"

learning_rate = str(1e-3) # with baby networks can afford to go a bit higher
seed = "1337"

# The scheduler at index 1 receives a step size equal to the number of epochs.
step_size = epochs

model = "gpt_2_h2l2e256b64_ReBN" #RELU BatchNorm
args = [
    'seed='+seed,
    'run=train',
    'run.export=False',
    'run.epochs='+epochs,
    'run.max_iters='+max_iters,
    'run.eval_epoch_interval='+eval_epoch_interval,
    'run.eval_iters='+eval_iters,
    'run.grad_clip='+grad_clip,
    'run.gradient_accumulation_steps='+gradient_accumulation_steps,
    'model='+model,
    'model.args.dropout='+dropout,
    'model.args.n_layer='+n_layer,
    'model.args.n_head='+n_head,
    'model.args.n_embd='+n_embd,
    # Context length comes from `block_size`; it previously reused `n_embd`
    # by mistake, which set the block size to 384 instead of 256.
    'model.args.block_size='+block_size,
    'dataset=huggingface',
    'dataset/tokenizer=tiktoken',
    'dataset.tokenizer.encoding=gpt2',
    'dataset.name=openwebtext',
    'dataset.dataloader.shuffle=False',
    'dataset.dataloader.batch_size='+batch_size,
    'optim.args.learning_rate='+learning_rate,
    'optim.scheduler.schedulers.1.args.step_size='+step_size,
    'device=cuda',
    'debug=True',
]
# Start the training run (`run=train`) with the overrides collected in `args`.
qtransform.notebook_run(args)

[ [36m2024-02-28 12:04:26,290 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'openwebtext', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'bench': 0.3}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': False, 'num_workers': 2, 'batch_size': 32}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1337, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 6, 'n_head': 6, 'n_embd': 384, 'dropout': 0.2, 'bias': True, 'block_size'

KeyboardInterrupt: 

## Inference

In [None]:
# Path of the checkpoint to sample from; left empty here and must be filled in
# before this cell is useful.
CHECKPOINT = ""

# Each (key, value) pair is rendered as one "key=value" override string.
args = [
    f"{key}={value}"
    for key, value in (
        ("run", "infer"),
        # Example checkpoint from an earlier run:
        # ("run.from_checkpoint", "/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-14_11:20:19__epoch:6"),
        ("run.from_checkpoint", CHECKPOINT),
        ("device", "cuda"),
        ("run.out_dir", "out_infer"),
        ("run.num_samples", "3"),
        ("run.max_new_tokens", "500"),
        ("run.temperature", "0.8"),
        ("run.top_k", "200"),
        # The start prompt is a single newline character wrapped in single quotes.
        ("run.start", "'\n'"),
        ("debug", "True"),
    )
]
# Run inference (`run=infer`) with the overrides collected in `args`.
qtransform.notebook_run(args)

## Benchmarking

In [None]:
# Benchmark overrides: each (key, value) pair is rendered as one "key=value"
# string, in the order listed.
args = [
    f"{key}={value}"
    for key, value in (
        ("run", "bench"),
        ("run.from_checkpoint", "/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-20_09:03:43__epoch:3"),
        ("run.num_samples", "100"),
        ("dataset", "huggingface"),
        ("dataset.name", "tiny_shakespeare"),
        ("dataset/tokenizer", "tiktoken"),
        ("dataset.tokenizer.encoding", "gpt2"),
        # The leading "+" is part of the override key and is emitted verbatim.
        ("+model.args.block_size", "64"),
        ("dataset.sizes.bench", "0.4"),
    )
]
# Run the benchmark (`run=bench`) with the overrides collected in `args`.
qtransform.notebook_run(args)

In [None]:
# Prints a fixed string; presumably a leftover smoke-test cell — TODO confirm whether it can be removed.
print("hallo")