# Train openwebtext GPT2 models with either gelu or relu and layernorm or batchnorm and run inference on them

In [1]:
import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
import qtransform
import torch
from brevitas import nn as qnn
# Manually load some logging conf
config_path = qtransform.get_module_config_path()
print(config_path)
import logging
import yaml
with open(os.path.join(config_path, 'hydra','job_logging', 'custom.yaml'), 'r') as stream:
    config = yaml.load(stream, Loader=yaml.FullLoader)

logging.config.dictConfig(config)
logging.getLogger().setLevel(logging.INFO)

/home/mabot004/eki-transformer-dev/qtransform/eki/lib/python3.10/site-packages/qtransform-0.0.2.dev0-py3.10.egg/qtransform/conf


In [2]:
"""
Sample from a trained model
"""
import os
import pickle
from contextlib import nullcontext
import torch
import tiktoken
from qtransform.model import gpt as qtransform_gpt


seed = 1337
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn

def sample(ckpt_path, start: str = "\n", max_new_tokens: int = 500):
    # -----------------------------------------------------------------------------
    init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
    out_dir = 'out' # ignored if init_from is not 'resume'
    num_samples = 10 # number of samples to draw
    temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
    device = 'cuda' if torch.cuda.is_available() else 'cpu' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
    dtype = 'bfloat16' # 'float32' or 'bfloat16' or 'float16'
    compile = False # use PyTorch 2.0 to compile the model to be faster
    #exec(open('configurator.py').read()) # overrides from command line or config file
    # -----------------------------------------------------------------------------
    device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
    
    # model
    if init_from == 'resume':
        # init from a model saved in a specific directory
        checkpoint = torch.load(ckpt_path, map_location=device)
        gptconf = qtransform_gpt.GPTConfig(**checkpoint['model_cfg']["args"])
        state_dict = checkpoint['model_state_dict']
        model = qtransform_gpt.GPT(gptconf)
        unwanted_prefix = '_orig_mod.'
        for k,v in list(state_dict.items()):
            if k.startswith(unwanted_prefix):
                state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
        model.load_state_dict(state_dict)
    elif init_from.startswith('gpt2'):
        # init from a given GPT-2 model
        model = GPT.from_pretrained(init_from, dict(dropout=0.0))
    
    model.eval()
    model.to(device)
    if compile:
        model = torch.compile(model) # requires PyTorch 2.0 (optional)

    # look for the meta pickle in case it is available in the dataset folder
    load_meta = False
    if init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these...
        meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
        load_meta = os.path.exists(meta_path)
    if load_meta:
        print(f"Loading meta from {meta_path}...")
        with open(meta_path, 'rb') as f:
            meta = pickle.load(f)
        # TODO want to make this more general to arbitrary encoder/decoder schemes
        stoi, itos = meta['stoi'], meta['itos']
        encode = lambda s: [stoi[c] for c in s]
        decode = lambda l: ''.join([itos[i] for i in l])
    else:
        # ok let's assume gpt-2 encodings by default
        print("No meta.pkl found, assuming GPT-2 encodings...")
        enc = tiktoken.get_encoding("gpt2")
        encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
        decode = lambda l: enc.decode(l)

    # encode the beginning of the prompt
    if start.startswith('FILE:'):
        with open(start[5:], 'r', encoding='utf-8') as f:
            start = f.read()
    start_ids = encode(start)
    x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

    # run generation
    with torch.no_grad():
        with ctx:
            for k in range(num_samples):
                y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
                print(decode(y[0].tolist()))
                print('---------------')


####################
# TODO: it seems that shuffling with large datasets causes the entire dataset to be loaded into memory
####################

## Train ReLU and BatchNorm with karpathy's parameters
### Our tokeniization of Openwebtext does not add padding after each sample, that could be implemented to keep context within samples
### eval is nan due to an error in the currently installed torch version on the cluster, eval is computed with torch version 2.2, however that disables gpu training due to old nvidia drivers

In [3]:
#from: https://github.com/karpathy/nanoGPT/blob/master/config/train_shakespeare_char.py and https://github.com/karpathy/nanoGPT/blob/master/config/train_gpt2.py
#karpathy used a larger transformer model for openwebtext alongside more epochs
eval_epoch_interval = str(1) # every epoch, meaning after max_iters
eval_iters = str(200)
max_iters = str(500)
epochs = "20" #eval after every epoch, karpathy has 5000 max_iters in total -> epoch = max_iters / eval_interval 
gradient_accumulation_steps = "2"
batch_size = "32"
block_size = "256"

grad_clip="1.0"

n_layer = "6"
n_head = "6"
n_embd = "384"
dropout = "0.2"

learning_rate = str(1e-3) # with baby networks can afford to go a bit higher
seed = "1337"

step_size = epochs

model = "gpt_2_h2l2e256b64_ReBN" #RELU BatchNorm

In [5]:

args = [
    'seed='+seed,
     'run=train',
     'run.export=False',
     'run.epochs='+epochs,
     'run.max_iters='+max_iters,
     'run.eval_epoch_interval='+eval_epoch_interval,
     'run.eval_iters='+eval_iters,
     'run.grad_clip='+grad_clip,
    'run.gradient_accumulation_steps='+gradient_accumulation_steps,
     'model='+model,
     'model.args.dropout='+dropout,
    'model.args.n_layer='+n_layer,
    'model.args.n_head='+n_head,
    'model.args.n_embd='+n_embd,
    'model.args.block_size='+n_embd,
     'dataset=huggingface',
     'dataset/tokenizer=tiktoken',
     'dataset.tokenizer.encoding=gpt2',
     'dataset.name=openwebtext',
     'dataset.dataloader.shuffle=False',
     'dataset.dataloader.batch_size='+batch_size,
     'optim.args.learning_rate='+learning_rate,
     'optim.scheduler.schedulers.1.args.step_size='+step_size,
     'device=cuda',
     'debug=True',
]
qtransform.notebook_run(args)

[ [36m2024-02-28 12:04:26,290 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'openwebtext', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'bench': 0.3}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': False, 'num_workers': 2, 'batch_size': 32}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1337, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 6, 'n_head': 6, 'n_embd': 384, 'dropout': 0.2, 'bias': True, 'block_size'

KeyboardInterrupt: 

In [4]:
CHECKPOINT = "/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/owt/GPT_2024-02-28_12:04:26__epoch:4"
epochs = "16" #from 4th epoch, go until 20th
args = [
    'seed='+seed,
     'run=train',
     'run.export=False',
     'run.epochs='+epochs,
     'run.max_iters='+max_iters,
     'run.eval_epoch_interval='+eval_epoch_interval,
     'run.eval_iters='+eval_iters,
     'run.grad_clip='+grad_clip,
    'run.gradient_accumulation_steps='+gradient_accumulation_steps,
    'run.from_checkpoint='+CHECKPOINT,
     'model='+model,
     'model.args.dropout='+dropout,
    'model.args.n_layer='+n_layer,
    'model.args.n_head='+n_head,
    'model.args.n_embd='+n_embd,
    'model.args.block_size='+n_embd,
     'dataset=huggingface',
     'dataset/tokenizer=tiktoken',
     'dataset.tokenizer.encoding=gpt2',
     'dataset.name=openwebtext',
     'dataset.dataloader.shuffle=False',
     'dataset.dataloader.batch_size='+batch_size,
     'optim.args.learning_rate='+learning_rate,
     'optim.scheduler.schedulers.1.args.step_size='+step_size,
     'device=cuda',
     'debug=True',
]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'openwebtext', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'bench': 0.3}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': False, 'num_workers': 2, 'batch_size': 32}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1337, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 6, 'n_head': 6, 'n_embd': 384, 'dropout': 0.2, 'bias': True, 'block_size': 384, 'vocab_size': 50304, 'transformer_active_func': 'ReLU', 'norm_layer': 'BatchNorm', 'flash': False}}, 'quantization

2024-03-01 11:22:52.172185: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[ [36m2024-03-01 11:22:52,928 [0m][[2;37mtensorflow[0m][[34mDEBUG[0m] - [34mFalling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.[0m
[ [36m2024-03-01 11:22:53,036 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 7 to 5[0m
[ [36m2024-03-01 11:22:53,038 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 5 to 7[0m
[ [36m2024-03-01 11:22:53,040 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 7 to 5[0m
[ [36m2024-03-01 11:22:53,041 [0m][[2;37mh5py._conv[0m][[34mDEBUG[0m] - [34mCreating converter from 5 to 7[0m
[ [36m2024-03-01 11:22:53,727 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mRunning Training[0m
[ [36m2024-03-01 11:22:53,730 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mtime is: 2024-03-01_11:22:53[0m
[ [36m2024-03-01 11:22:53,732 [0m][[2;37mqtransform[0m][[32mINFO[0m] -

## Inference
### torch version 2.0 causes issues with inference during eval mode, version 2.2 does not. 
### for now, use karpathy's inference script

In [6]:
CHECKPOINT = "/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/owt/GPT_2024-02-28_12:04:26__epoch:4"
args = [
        "run=infer",
        #"run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/GPT_2024-02-14_11:20:19__epoch:6",
        "run.from_checkpoint="+CHECKPOINT,
        "device=cuda", 
        "run.out_dir=out_infer",
        "run.num_samples=3", 
        "run.max_new_tokens=500",
        "run.temperature=0.8",
        "run.top_k=200",
        "run.start='\n'",
        "debug=True"
    ]
qtransform.notebook_run(args)

[ [36m2024-02-28 13:03:49,810 [0m][[2;37mhydra.core.utils[0m][[34mDEBUG[0m] - [34mSetting JobRuntime:name=app[0m
{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': True, 'dataset': {'wrapper': '???', 'module': '???', 'name': '???', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.0, 'eval': 0.0, 'bench': 0.0}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl'}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}, 'scheduler': {'decay_lr': True, 'schedulers': {'1': {'name': 'StepLR', 'args': {'step_size': 1, 'gamma': 0.1}}}, 'milestones': None, 'warmup_epochs': 2}}, 'run': {'command': 'infer', 'checkpoint_dir': 'models', 'num_samples': 3, 'max_new_tokens': 500, 'temperature': 0.8, 'top_k': 200,

RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

In [10]:
#context between samples of dataset is very different, either learn longer or add padding between each sample
sample("/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/owt/GPT_2024-02-28_12:04:26__epoch:4")

[ [36m2024-02-28 13:06:19,783 [0m][[2;37mqtransform.model.gpt[0m][[34mDEBUG[0m] - [34mApplied config: 
GPTConfig(block_size=384, vocab_size=50256, n_layer=6, n_head=6, n_embd=384, dropout=0.2, bias=True, flash=False, transformer_active_func='ReLU', norm_layer='BatchNorm', single_output=False, custom_ln=False)[0m
[ [36m2024-02-28 13:06:19,786 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mModel config: GPTConfig(block_size=384, vocab_size=50256, n_layer=6, n_head=6, n_embd=384, dropout=0.2, bias=True, flash=False, transformer_active_func='ReLU', norm_layer='BatchNorm', single_output=False, custom_ln=False)[0m
[ [36m2024-02-28 13:06:20,694 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mnumber of parameters: 33.49M[0m
No meta.pkl found, assuming GPT-2 encodings...

 leaving any some,


 Brek.[ were beganJeffk was soldier and Alliance held to look, but the Nights Outerisa, under search to frozenoth, the supplies and Nights battle outpost was troops.


KeyboardInterrupt: 

In [None]:
sample("/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/owt/GPT_2024-03-01_11:22:53__epoch:20")

[ [36m2024-03-01 12:02:34,008 [0m][[2;37mqtransform.model.gpt[0m][[34mDEBUG[0m] - [34mApplied config: 
GPTConfig(block_size=384, vocab_size=50256, n_layer=6, n_head=6, n_embd=384, dropout=0.2, bias=True, flash=False, transformer_active_func='ReLU', norm_layer='BatchNorm', single_output=False, custom_ln=False)[0m
[ [36m2024-03-01 12:02:34,010 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mModel config: GPTConfig(block_size=384, vocab_size=50256, n_layer=6, n_head=6, n_embd=384, dropout=0.2, bias=True, flash=False, transformer_active_func='ReLU', norm_layer='BatchNorm', single_output=False, custom_ln=False)[0m
[ [36m2024-03-01 12:02:35,336 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mnumber of parameters: 33.49M[0m
No meta.pkl found, assuming GPT-2 encodings...

The the blizzard

G hang by snowtrooth.[11]











The incompetenttroopers, a AT-STJeffren Bureau, Breks requested racermi smuggling aboard the Gupta, Brek ordered the famous supplies

## Benchmarking

In [5]:
args = [ 
    "run=bench",
    "run.from_checkpoint=/home/mabot004/eki-transformer-dev/shakespeare_owt_benchmarking/owt/GPT_2024-02-28_12:04:26__epoch:4",
    "run.num_samples=50",
    "dataset=huggingface",
    "dataset.name=openwebtext",
    "dataset/tokenizer=tiktoken",
    "dataset.tokenizer.encoding=gpt2",
    "+model.args.block_size=256",
    "dataset.sizes.bench=0.4",
    "dataset.dataloader.shuffle=False",
]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'openwebtext', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'bench': 0.4}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': False, 'num_workers': 2, 'batch_size': 12}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1234567890, 'model': {'calc_loss_in_model': False, 'args': {'block_size': 256}}, 'quantization': {'quantize': False}, 'pipe': '/dev/null', 'optim': {'optimizer': 'AdamW', 'args': {'learning_rate': 0.00015, 'weight_decay': 0.1, 'betas': [0.9, 0.95]}, 'scheduler': {'decay_

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 39.25 GiB total capacity; 37.92 GiB already allocated; 12.88 MiB free; 38.12 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
print("hallo")