## Train a GPT-2 Transformer model on WikiText using BatchNorm/LayerNorm and ReLU/GELU

In [1]:
import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
import qtransform
import torch
from brevitas import nn as qnn
# Manually load the logging configuration that ships with qtransform and apply it
# to the notebook's root logger.
import logging
# `import logging` alone does not guarantee that the `logging.config` submodule is
# loaded; import it explicitly instead of relying on another package having done so.
import logging.config
import yaml

config_path = qtransform.get_module_config_path()
print(config_path)

# The YAML file is part of the installed package (trusted input), so FullLoader is kept.
with open(os.path.join(config_path, 'hydra','job_logging', 'custom.yaml'), 'r') as stream:
    config = yaml.load(stream, Loader=yaml.FullLoader)

logging.config.dictConfig(config)
# Force INFO on the root logger regardless of the level configured in the YAML file.
logging.getLogger().setLevel(logging.INFO)

/home/mabot004/eki-transformer-dev/qtransform/eki/lib/python3.10/site-packages/qtransform-0.0.2.dev0-py3.10.egg/qtransform/conf


### Hyperparameters
#### 10,000 iterations in total (20 epochs × 500 iterations) with a small GPT model

In [2]:
# Hyperparameters adapted from
# https://github.com/karpathy/nanoGPT/blob/master/config/train_shakespeare_char.py and
# https://github.com/karpathy/nanoGPT/blob/master/config/train_gpt2.py
# karpathy used a larger transformer model for openwebtext alongside more epochs.
#
# All values are kept as strings because they are concatenated into "key=value"
# override strings in `args` below.

DATASET = "wikitext"
SUBSET = "wikitext-103-raw-v1"

eval_epoch_interval = str(1) # every epoch, meaning after max_iters
eval_iters = str(200)
max_iters = str(500)
epochs = "20" # eval after every epoch, karpathy has 5000 max_iters in total -> epoch = max_iters / eval_interval
gradient_accumulation_steps = "2"
batch_size = "32"
block_size = "256"

grad_clip="1.0"

n_layer = "6"
n_head = "6"
n_embd = "384"
dropout = "0.2"

learning_rate = str(1e-3) # with baby networks can afford to go a bit higher
seed = "1337"

# The scheduler step size is tied to the number of epochs.
step_size = epochs

# Overrides shared by every training run in this notebook; the model variant is
# added per run.
args = [
    'seed='+seed,
    'run=train',
    'run.export=False',
    'run.epochs='+epochs,
    'run.max_iters='+max_iters,
    'run.eval_epoch_interval='+eval_epoch_interval,
    'run.eval_iters='+eval_iters,
    'run.grad_clip='+grad_clip,
    'run.gradient_accumulation_steps='+gradient_accumulation_steps,
    'model.args.dropout='+dropout,
    'model.args.n_layer='+n_layer,
    'model.args.n_head='+n_head,
    'model.args.n_embd='+n_embd,
    # Use the context length defined above (previously n_embd was passed here by
    # mistake, which set the block size to 384 and left block_size unused).
    'model.args.block_size='+block_size,
    'dataset=huggingface',
    'dataset.name='+DATASET,
    'dataset.subset='+SUBSET,
    'dataset/tokenizer=tiktoken',
    'dataset.tokenizer.encoding=gpt2',
    'dataset.dataloader.shuffle=False',
    'dataset.dataloader.batch_size='+batch_size,
    'optim.args.learning_rate='+learning_rate,
    'optim.scheduler.schedulers.1.args.step_size='+step_size,
    'device=cuda',
    'debug=True',
]

#### ReLU BatchNorm

In [3]:
# Train the ReLU + BatchNorm variant.
model = "gpt_2_h2l2e256b64_ReBN"
# Pass a fresh list instead of appending to `args`: the shared overrides stay
# untouched, so re-running this cell or running another variant afterwards does
# not accumulate `model=` entries.
qtransform.notebook_run(args + ["model="+model], logging.INFO)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'wikitext', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': False, 'num_workers': 2, 'batch_size': 32}, 'subset': 'wikitext-103-raw-v1', 'type': 'huggingface', 'splits': {'names': {'train': 'train', 'eval': 'validation', 'bench': 'test'}, 'sizes': {'train': 0.9, 'eval': 0.05, 'bench': 0.05}}, 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': True, 'chunk_size': 100}}, 'seed': 1337, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 6, 'n_head': 6, 'n_embd': 384, '

2024-03-06 09:55:11.258833: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[ [36m2024-03-06 09:55:11,951 [0m][[2;37mtensorflow[0m][[34mDEBUG[0m] - [34mFalling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.[0m
[ [36m2024-03-06 09:55:12,051 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 7 to 5[0m
[ [36m2024-03-06 09:55:12,054 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 5 to 7[0m
[ [36m2024-03-06 09:55:12,056 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 7 to 5[0m
[ [36m2024-03-06 09:55:12,059 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 5 to 7[0m
[ [36m2024-03-06 09:55:12,674 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mRunning Training[0m
[ [36m2024-03-06 09:55:12,678 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mtime is: 2024-03-06_09:55:12[0m
[ [36m2024-03-06 09:55:12,681 [0m][[2;37mqtransform[0m][[32mINFO[0m] -

KeyboardInterrupt: 

#### ReLU LayerNorm

In [None]:
# Train the ReLU + LayerNorm variant.
model = "gpt_2_h2l2e256b64_ReLN"
# Pass a fresh list instead of appending to `args`: the shared overrides stay
# untouched, so re-running this cell or running another variant afterwards does
# not accumulate `model=` entries.
qtransform.notebook_run(args + ["model="+model])

#### GELU BatchNorm

In [None]:
# Train the GELU + BatchNorm variant.
model = "gpt_2_h2l2e256b64_GeBN"
# Pass a fresh list instead of appending to `args`: the shared overrides stay
# untouched, so re-running this cell or running another variant afterwards does
# not accumulate `model=` entries.
qtransform.notebook_run(args + ["model="+model])

#### GELU LayerNorm

In [None]:
# Train the GELU + LayerNorm variant.
model = "gpt_2_h2l2e256b64_GeLN"
# Pass a fresh list instead of appending to `args`: the shared overrides stay
# untouched, so re-running this cell or running another variant afterwards does
# not accumulate `model=` entries.
qtransform.notebook_run(args + ["model="+model])

### Inference

In [5]:
# Overrides for an inference run, written as (key, value) pairs and joined into
# "key=value" strings. The start prompt is a single newline wrapped in single quotes.
args_infer = [
    f"{key}={value}"
    for key, value in (
        ("run", "infer"),
        ("device", "cuda"),
        ("run.out_dir", "out_infer"),
        ("run.num_samples", "20"),
        ("run.max_new_tokens", "100"),
        ("run.temperature", "0.8"),
        ("run.top_k", "200"),
        ("run.start", "'\n'"),
        ("debug", "True"),
    )
]

In [None]:
# Checkpoint to run inference from. It is empty here, so the override below
# becomes "run.from_checkpoint=" — presumably a real checkpoint path has to be
# filled in before use (TODO confirm).
CHECKPOINT=""
# NOTE(review): this appends to args_infer in place, so re-running the cell adds
# another run.from_checkpoint entry to the list.
args_infer.append("run.from_checkpoint="+CHECKPOINT)