## Train a GPT-2 Transformer model on WikiText using BatchNorm/LayerNorm and ReLU/GELU

In [2]:
import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
import qtransform
import torch
from brevitas import nn as qnn
# Manually load the logging configuration that ships with qtransform's Hydra configs
# and install it for this notebook session.
import logging
# `import logging` alone does not load the `logging.config` submodule. Import it
# explicitly so that `logging.config.dictConfig` below does not depend on some other
# package having imported it as a side effect.
import logging.config
import yaml

config_path = qtransform.get_module_config_path()
print(config_path)

logging_conf_file = os.path.join(config_path, 'hydra', 'job_logging', 'custom.yaml')
with open(logging_conf_file, 'r') as stream:
    # NOTE(review): FullLoader is used on a package-owned file here; switch to
    # yaml.safe_load if this path can ever point at untrusted content.
    config = yaml.load(stream, Loader=yaml.FullLoader)

logging.config.dictConfig(config)
# Force the root logger to INFO regardless of the level set in the YAML file.
logging.getLogger().setLevel(logging.INFO)

/home/makuh001/eki/eki-transformer-dev/qtransform/qtransform/conf


### Hyperparameters
#### 10,000 iterations in total with a small GPT model

In [11]:
# Baseline values are taken from karpathy's nanoGPT configs:
#   https://github.com/karpathy/nanoGPT/blob/master/config/train_shakespeare_char.py
#   https://github.com/karpathy/nanoGPT/blob/master/config/train_gpt2.py
# karpathy used a larger transformer model for openwebtext alongside more epochs.

DATASET = "wikitext"
SUBSET = "wikitext-103-raw-v1"

# Every value is kept as a string because it is spliced into an override string below.
eval_epoch_interval = "1"  # evaluate every epoch, i.e. after max_iters iterations
eval_iters = "200"
max_iters = "500"
# Eval happens after every epoch; karpathy has 5000 max_iters in total,
# so epoch = max_iters / eval_interval.
epochs = "20"
gradient_accumulation_steps = "2"
batch_size = "32"

grad_clip = "1.0"

dropout = "0.1"

learning_rate = "0.001"  # with baby networks one can afford to go a bit higher
seed = "1337"

step_size = epochs

# Override list handed to qtransform.notebook_run by the training cells.
# Currently disabled overrides (not part of the list):
#   'run.gradient_accumulation_steps=' + gradient_accumulation_steps
#   'optim.args.learning_rate=' + learning_rate
#   'optim.scheduler.schedulers.1.args.step_size=' + step_size
#   'debug=True'
args = [
    f"seed={seed}",
    "run=train",
    "run.export=False",
    f"run.epochs={epochs}",
    f"run.max_iters={max_iters}",
    f"run.eval_epoch_interval={eval_epoch_interval}",
    f"run.eval_iters={eval_iters}",
    f"run.grad_clip={grad_clip}",
    f"model.args.dropout={dropout}",
    "dataset=huggingface",
    f"dataset.name={DATASET}",
    f"dataset.subset={SUBSET}",
    "dataset/tokenizer=tiktoken",
    "dataset.tokenizer.encoding=gpt2",
    "dataset.dataloader.shuffle=False",
    f"dataset.dataloader.batch_size={batch_size}",
    "device=cuda",
]

#### ReLU BatchNorm

In [12]:
# Train the ReLU + BatchNorm variant.
model = "BENCH_gpt2_ReBNP_smaller"
# Pass a fresh list instead of appending to the shared `args`: appending would leave
# this `model=` override in `args` for every later cell and add a duplicate each time
# the cell is re-run.
qtransform.notebook_run(args + ["model="+model], logging.INFO)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'wikitext', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': False, 'num_workers': 2, 'batch_size': 32}, 'subset': 'wikitext-103-raw-v1', 'type': 'huggingface', 'splits': {'names': {'train': 'train', 'eval': 'validation', 'bench': 'test'}, 'sizes': {'train': 0.9, 'eval': 0.05, 'bench': 0.05}}, 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': True, 'chunk_size': 100}}, 'seed': 1337, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 4, 'n_head': 4, 'n_embd': 256, 

KeyboardInterrupt: 

#### ReLU LayerNorm

In [None]:
# Train the ReLU + LayerNorm variant.
model = "gpt_2_h2l2e256b64_ReLN"
# Pass a fresh list instead of appending to the shared `args`, so this cell can be
# re-run (or run after other model cells) without stacking several `model=` overrides.
qtransform.notebook_run(args + ["model="+model])

#### GELU BatchNorm

In [None]:
# Train the GELU + BatchNorm variant.
model = "gpt_2_h2l2e256b64_GeBN"
# Pass a fresh list instead of appending to the shared `args`, so this cell can be
# re-run (or run after other model cells) without stacking several `model=` overrides.
qtransform.notebook_run(args + ["model="+model])

#### GELU LayerNorm

In [None]:
# Train the GELU + LayerNorm variant.
model = "gpt_2_h2l2e256b64_GeLN"
# Pass a fresh list instead of appending to the shared `args`, so this cell can be
# re-run (or run after other model cells) without stacking several `model=` overrides.
qtransform.notebook_run(args + ["model="+model])

### Inference

In [9]:
# Overrides shared by every inference run; the checkpoint is supplied per run.
sampling_overrides = {
    "num_samples": "20",
    "max_new_tokens": "100",
    "temperature": "0.8",
    "top_k": "200",
    "start": "'\n'",  # a quoted newline is used as the start text
}

args_infer = ["run=infer", "device=cuda"]
args_infer += [f"run.{key}={value}" for key, value in sampling_overrides.items()]
args_infer.append("debug=True")

#### ReLU BatchNorm

In [10]:
# Checkpoint to sample from (the ReLU + BatchNorm run).
CHECKPOINT = "GPT_wikitext_2024-03-06_09:55:12__epoch:5"
# Pass a fresh list instead of appending to the shared `args_infer`, so re-running this
# cell or sampling from another checkpoint never stacks several
# `run.from_checkpoint=` overrides.
qtransform.notebook_run(args_infer + ["run.from_checkpoint="+CHECKPOINT], logging.INFO)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}, 'scheduler': {'decay_lr': True, 'schedulers': {'1': {'name': 'StepLR', 'args': {'step_size': 1, 'gamma': 0.1}}}, 'milestones': None, 'warmup_epochs': 2}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 20, 'max_new_tokens': 100, 'temperature': 0.8, 'top_k': 200, 'start': '\n', 'compile': False, 'out_dir': None, 'onnx_model': {'path': None, 'tokenizer': {'name': 'tiktoken', 'encod