# Train openwebtext GPT2 models with either gelu or relu and layernorm or batchnorm and run inference on them

In [1]:
import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
import qtransform
import torch
from brevitas import nn as qnn
# Manually load the logging configuration that ships with qtransform, because the
# notebook does not go through hydra's own job-logging setup.
config_path = qtransform.get_module_config_path()
print(config_path)
import logging
# `dictConfig` lives in the `logging.config` submodule, which `import logging` alone
# does not load; import it explicitly so this cell works on a fresh kernel.
import logging.config
import yaml

# Read <config_path>/hydra/job_logging/custom.yaml into a plain dict.
with open(os.path.join(config_path, 'hydra','job_logging', 'custom.yaml'), 'r') as stream:
    config = yaml.load(stream, Loader=yaml.FullLoader)

# Apply the loaded configuration, then force the root logger to INFO.
logging.config.dictConfig(config)
logging.getLogger().setLevel(logging.INFO)

/home/mabot004/eki-transformer-dev/qtransform/eki/lib/python3.10/site-packages/qtransform-0.0.2.dev0-py3.10.egg/qtransform/conf


####################
# TODO 1: it seems that shuffling with large datasets causes the entire dataset to be loaded into memory
# TODO 2: iterating through the dataloader always returns the same values
####################

## Train GELU and BatchNorm with karpathy's parameters
### Our tokenization of OpenWebText does not add padding after each sample; padding could be implemented to keep context within samples

In [3]:
# Hyperparameters adapted from:
# https://github.com/karpathy/nanoGPT/blob/master/config/train_shakespeare_char.py
# karpathy evaluates after 250 iterations; our eval runs after every epoch instead,
# hence max_iters = 250 (presumably iterations per epoch -- TODO confirm).
# The loss generally stays the same after at most 7 epochs, so a much larger epoch
# count would be excessive.
eval_epoch_interval = 1 # keep frequent because we'll overfit
eval_iters = 200
max_iters = 250
epochs = 10 # eval after every epoch; karpathy has 5000 max_iters in total -> epoch = max_iters / eval_interval
batch_size = 64

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher

# --- Values below are kept for reference only; they are NOT forwarded to the run ---
gradient_accumulation_steps = 1 # one large batch, potentially do gradient_accumulation_steps = 8 and batch_size = 8
# NOTE(review): block_size is not passed as an override; the config dump printed by
# this cell shows model.args.block_size = 64 -- confirm which value is intended.
block_size = 256 # context of up to 256 previous characters

# not implemented currently
lr_decay_iters = 5000 # make equal to max_iters usually

# not used currently
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

# Hydra overrides for the training run. Every tunable defined above that is forwarded
# is interpolated here, so changing the variable is enough to change the run.
args = [
    "run=train",
    "model=gpt_2_h2l2e256b64_GeBN",
    f"model.args.n_layer={n_layer}",
    f"model.args.n_head={n_head}",
    f"model.args.n_embd={n_embd}",
    f"model.args.dropout={dropout}",
    "dataset=huggingface",
    "dataset/tokenizer=tiktoken",
    "dataset.tokenizer.encoding=gpt2",
    f"dataset.dataloader.batch_size={batch_size}",
    "dataset.name=openwebtext",
    f"optim.args.learning_rate={learning_rate}",
    "run.export=True",
    f"run.epochs={epochs}",
    f"run.max_iters={max_iters}",
    # Previously hard-coded to 1, which left `eval_epoch_interval` above without effect.
    f"run.eval_epoch_interval={eval_epoch_interval}",
    f"run.eval_iters={eval_iters}",
    "run.grad_clip=1.0",
    "device=cuda",
    "debug=True",
]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'openwebtext', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'bench': 0.3}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 64}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 6, 'n_head': 6, 'n_embd': 384, 'dropout': 0.2, 'bias': True, 'block_size': 64, 'vocab_size': 50304, 'transformer_active_func': 'GELU', 'norm_layer': 'BatchNorm', 'flash': False}}, 'quantiza

KeyboardInterrupt: 

## Inference

In [None]:
# Path of the checkpoint to sample from. Left empty on purpose; set it before running
# this cell. Example of a previously used value:
#   /home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-14_11:20:19__epoch:6
CHECKPOINT = ""

# Hydra overrides for the inference run.
args = [
    "run=infer",
    f"run.from_checkpoint={CHECKPOINT}",
    "device=cuda",
    "run.out_dir=out_infer",
    # Sampling settings forwarded to the infer run.
    "run.num_samples=3",
    "run.max_new_tokens=500",
    "run.temperature=0.8",
    "run.top_k=200",
    # The prompt is a single newline character, wrapped in single quotes for hydra.
    "run.start='\n'",
    "debug=True",
]
qtransform.notebook_run(args)

## Benchmarking

In [None]:
# Checkpoint to benchmark. This is an absolute path on the original author's machine;
# adjust it before running the cell elsewhere.
BENCH_CHECKPOINT = "/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-20_09:03:43__epoch:3"

# Hydra overrides for the benchmark run.
args = [
    "run=bench",
    "run.from_checkpoint=" + BENCH_CHECKPOINT,
    "run.num_samples=100",
    # Benchmark data: tiny_shakespeare from huggingface, tokenized with tiktoken/gpt2.
    "dataset=huggingface",
    "dataset.name=tiny_shakespeare",
    "dataset/tokenizer=tiktoken",
    "dataset.tokenizer.encoding=gpt2",
    # The leading "+" is hydra's syntax for adding a key that is not in the composed config.
    "+model.args.block_size=64",
    "dataset.sizes.bench=0.4",
]
qtransform.notebook_run(args)