### Compare karpathy qtransform
This notebook aims to debug our qtransform application, as currently our models reach a low loss during training but generate poor results during inference

In [1]:
import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
import qtransform
import torch
from brevitas import nn as qnn
# Manually load some logging conf
config_path = qtransform.get_module_config_path()
print(config_path)
import logging
import yaml

with open(os.path.join(config_path, 'hydra','job_logging', 'custom.yaml'), 'r') as stream:
    config = yaml.load(stream, Loader=yaml.FullLoader)

logging.config.dictConfig(config)
logging.getLogger().setLevel(logging.INFO)


/home/mabot004/eki-transformer-dev/qtransform/eki/lib/python3.10/site-packages/qtransform-0.0.2.dev0-py3.10.egg/qtransform/conf


seed is different (1337 instead of 123456789)

In [7]:
"""
Sample from a trained model
"""
import os
import pickle
from contextlib import nullcontext
import torch
import tiktoken
from qtransform.model import gpt as qtransform_gpt #import GPTConfig, GPT
import model as karpathy_model


seed = 1337
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn

def sample(ckpt_path, karpathy: bool, start: str = "\n"):
    # -----------------------------------------------------------------------------
    init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
    out_dir = 'out' # ignored if init_from is not 'resume'
    num_samples = 10 # number of samples to draw
    max_new_tokens = 500 # number of tokens generated in each sample
    temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
    device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
    dtype = 'bfloat16' # 'float32' or 'bfloat16' or 'float16'
    compile = False # use PyTorch 2.0 to compile the model to be faster
    #exec(open('configurator.py').read()) # overrides from command line or config file
    # -----------------------------------------------------------------------------
    device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
    
    # model
    if init_from == 'resume':
        # init from a model saved in a specific directory
        checkpoint = torch.load(ckpt_path, map_location=device)
        if karpathy:
            gptconf = karpathy_model.GPTConfig(**checkpoint['model_args'])
            state_dict = checkpoint['model']
            model = karpathy_model.GPT(gptconf)
        else:
            gptconf = qtransform_gpt.GPTConfig(**checkpoint['model_cfg']["args"])
            state_dict = checkpoint['model_state_dict']
            model = qtransform_gpt.GPT(gptconf)
        unwanted_prefix = '_orig_mod.'
        for k,v in list(state_dict.items()):
            if k.startswith(unwanted_prefix):
                state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
        model.load_state_dict(state_dict)
    elif init_from.startswith('gpt2'):
        # init from a given GPT-2 model
        model = GPT.from_pretrained(init_from, dict(dropout=0.0))

    model.eval()
    model.to(device)
    if compile:
        model = torch.compile(model) # requires PyTorch 2.0 (optional)

    # look for the meta pickle in case it is available in the dataset folder
    load_meta = False
    if init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these...
        meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
        load_meta = os.path.exists(meta_path)
    if load_meta:
        print(f"Loading meta from {meta_path}...")
        with open(meta_path, 'rb') as f:
            meta = pickle.load(f)
        # TODO want to make this more general to arbitrary encoder/decoder schemes
        stoi, itos = meta['stoi'], meta['itos']
        encode = lambda s: [stoi[c] for c in s]
        decode = lambda l: ''.join([itos[i] for i in l])
    else:
        # ok let's assume gpt-2 encodings by default
        print("No meta.pkl found, assuming GPT-2 encodings...")
        enc = tiktoken.get_encoding("gpt2")
        encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
        decode = lambda l: enc.decode(l)

    # encode the beginning of the prompt
    if start.startswith('FILE:'):
        with open(start[5:], 'r', encoding='utf-8') as f:
            start = f.read()
    start_ids = encode(start)
    x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

    # run generation
    with torch.no_grad():
        with ctx:
            for k in range(num_samples):
                y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
                print(decode(y[0].tolist()))
                print('---------------')


In [12]:
#karpathy's model yields good results
sample("/home/mabot004/nanoGPT/out-shakespeare/ckpt.pt", karpathy = True)

number of parameters: 29.94M
No meta.pkl found, assuming GPT-2 encodings...

We are consul:

You we shall have so his power to bear and
To give away the whole bastard, or else he shall have been,
And tell them and they are not speak.

First Citizen:
Not here we be admitted to see him:
The gods of such power in the tribunes, and the rock to give us,
We'll have show him, and we march upon the Roman.
First Citizen:
Second Citizen:
Say, we are given to our general

The Volscian:
SICINIUS: 'IUS:

Not at Senator:
Wherein Marcius.
First Senator:
Since we're people, you have done
AUFIDIUS:
Unseparable, the people.

You are in this

BRUTUS:
Your voices?
SICINIUS:

Let me all the people,

Second Senator:
MENENIUS: I will not us us hear'd with tribunes can of thisICINIUS:
MENENIUS: for you say,

They are thus, what's their voices?
BRUTUS:

SICINIUS:
What we'll be so very consulch you, he's the people,
Threely Marcius?
Are you were a gods are worthy Marcius,
Here comes.
You are thus!
MENENIUS:
Sci

KeyboardInterrupt: 

In [3]:
#meanwhile ours generates nonsense
sample("/home/mabot004/nanoGPT/GPT_2024-02-21_11:08:06__epoch:1", karpathy = False)

[ [36m2024-02-21 12:09:11,346 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNote: detected 128 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.[0m
[ [36m2024-02-21 12:09:11,349 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNote: NumExpr detected 128 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.[0m
[ [36m2024-02-21 12:09:11,352 [0m][[2;37mnumexpr.utils[0m][[32mINFO[0m] - [32mNumExpr defaulting to 8 threads.[0m
[ [36m2024-02-21 12:09:11,949 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mModel config: GPTConfig(block_size=64, vocab_size=50256, n_layer=6, n_head=6, n_embd=384, dropout=0.2, bias=True, flash=False, transformer_active_func='GELU', norm_layer='BatchNorm', single_output=False, custom_ln=False)[0m
[ [36m2024-02-21 12:09:13,086 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mnumber of parameters: 33.51M[0m
No meta.pkl found, assuming GPT-2 encodings

KeyboardInterrupt: 

In [17]:
checkpoint = torch.load("/home/mabot004/nanoGPT/GPT_2024-02-21_11:08:06__epoch:1")
checkpoint["model_cfg"]

{'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 6, 'n_head': 6, 'n_embd': 384, 'dropout': 0.2, 'bias': True, 'block_size': 64, 'vocab_size': 50256, 'transformer_active_func': 'GELU', 'norm_layer': 'BatchNorm', 'flash': False}}

our model lowers its loss much faster than karpathy because the learning rate is divided by 10 after each epoch

In [6]:
with open('train_shakespeare.out', 'r') as karpathy_result:
    result = karpathy_result.read()
result.split("\n")

['Overriding config with config/train_shakespeare.py:',
 '# train a miniature character-level shakespeare model',
 '# good for debugging and playing on macbooks and such',
 '',
 "out_dir = 'out-shakespeare'",
 "eval_interval = 250 # keep frequent because we'll overfit",
 'eval_iters = 200',
 "log_interval = 10 # don't print too too often",
 '',
 '# we expect to overfit on this small dataset, so only save when val improves',
 'always_save_checkpoint = False',
 '',
 'wandb_log = False # override via command line if you like',
 "wandb_project = 'shakespeare'",
 "wandb_run_name = 'mini-gpt'",
 '',
 "dataset = 'shakespeare'",
 'gradient_accumulation_steps = 1',
 'batch_size = 64',
 'block_size = 256 # context of up to 256 previous characters',
 '',
 '# baby GPT model :)',
 'n_layer = 6',
 'n_head = 6',
 'n_embd = 384',
 'dropout = 0.2',
 '',
 'learning_rate = 1e-3 # with baby networks can afford to go a bit higher',
 'max_iters = 5000',
 'lr_decay_iters = 5000 # make equal to max_iters usua

In [2]:
def train_qtransform():
    #from: https://github.com/karpathy/nanoGPT/blob/master/config/train_shakespeare_char.py
    #karpathy evaluates after 250 iterations, we implemented eval to do so after every epoch -> max_iters = 5000 / 200
    eval_epoch_interval = 1 # keep frequent because we'll overfit
    eval_iters = 200
    max_iters = 250
    epochs = 10 #eval after every epoch, karpathy has 5000 max_iters in total -> epoch = max_iters / eval_interval 
    gradient_accumulation_steps = 1 #one large batch, potentially do gradient_accumulation_steps = 8 and batch_size = 8
    batch_size = 64
    block_size = 256 # context of up to 256 previous characters

    # baby GPT model :)
    n_layer = 6
    n_head = 6
    n_embd = 384
    dropout = 0.2

    learning_rate = 1e-3 # with baby networks can afford to go a bit higher

    #not implemented currently
    lr_decay_iters = 5000 # make equal to max_iters usually

    #not used currently
    min_lr = 1e-4 # learning_rate / 10 usually
    beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

    args = [
            "seed=1337", #same seed as karpathy
            "run=train", 
            "run.export=False",
            "run.epochs="+str(epochs),
            "run.max_iters="+str(max_iters),
            "run.eval_epoch_interval=1", 
            "run.eval_iters="+str(eval_iters),
            "run.grad_clip=1.0",
            "model=gpt_2_h2l2e256b64_GeBN",
            "model.args.n_layer="+str(n_layer),
            "model.args.n_head="+str(n_head),
            "model.args.n_embd="+str(n_embd),
            "model.args.dropout="+str(dropout),
            "dataset=huggingface", 
            "dataset/tokenizer=tiktoken",
            "dataset.tokenizer.encoding=gpt2",
            "dataset.dataloader.batch_size="+str(batch_size),
            "dataset.name=tiny_shakespeare",
            "optim.args.learning_rate="+str(learning_rate),
            "device=cuda"
        ]
    qtransform.notebook_run(args)
train_qtransform()

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'bench': 0.3}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 64}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1337, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 6, 'n_head': 6, 'n_embd': 384, 'dropout': 0.2, 'bias': True, 'block_size': 64, 'vocab_size': 50304, 'transformer_active_func': 'GELU', 'norm_layer': 'BatchNorm', 'flash': False}}, 'quantiza

2024-02-21 13:06:15.097201: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[ [36m2024-02-21 13:06:16,570 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mRunning Training[0m
[ [36m2024-02-21 13:06:16,577 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mtime is: 2024-02-21_13:06:16[0m
[ [36m2024-02-21 13:06:16,580 [0m][[2;37mqtransform[0m][[32mINFO[0m] - [32mDevice specified: cuda. Using device: cuda[0m
[ [36m2024-02-21 13:06:16,588 [0m][[2;37mqtransform.run.train[0m][[32mINFO[0m] - [32mnumber of torch dataloader: 2[0m
[ [36m2024-02-21 13:06:16,886 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mLoading dataset: tiny_shakespeare, with encoding: gpt2 and dtype: float32[0m
[ [36m2024-02-21 13:06:16,894 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [32mAttempting to retrieve tokenized dataset under "/home/mabot004/.qtransform/datasets/huggingface/tiny_shakespeare/tokenized/gpt2/tiny_shakespeare-float32.bin"[0m
[ [36m2024-02-21 13:06:16,899 [0m][[2;37mqtransform.dataset[0m][[32mINFO[0m] - [

KeyboardInterrupt: 

### LayerNorm and GELU, just like Karpathy's model. Learning rate is set to decay after 5 epochs out of 10
#### We overfit after the first epoch, however the eval losses are significantly lower compared to karpathy (karpathy had an eval loss about 10 times as high as his training loss)
#### Our eval loss stays roughly the same after the first epoch

In [3]:
#from: https://github.com/karpathy/nanoGPT/blob/master/config/train_shakespeare_char.py
#karpathy evaluates after 250 iterations, we implemented eval to do so after every epoch -> max_iters = 5000 / 200
eval_epoch_interval = 1 # keep frequent because we'll overfit
eval_iters = 200
max_iters = 250
epochs = 10 #eval after every epoch, karpathy has 5000 max_iters in total -> epoch = max_iters / eval_interval 
gradient_accumulation_steps = 1 #one large batch, potentially do gradient_accumulation_steps = 8 and batch_size = 8
batch_size = 64
block_size = 256 # context of up to 256 previous characters

# baby GPT model :)
n_layer = 6
n_head = 6
n_embd = 384
dropout = 0.2

learning_rate = 1e-3 # with baby networks can afford to go a bit higher

#not implemented currently
lr_decay_iters = 5000 # make equal to max_iters usually

#not used currently
min_lr = 1e-4 # learning_rate / 10 usually
beta2 = 0.99 # make a bit bigger because number of tokens per iter is small

args = [
        "seed=1337", #same seed as karpathy
        "run=train", 
        "run.export=False",
        "run.epochs="+str(epochs),
        "run.max_iters="+str(max_iters),
        "run.eval_epoch_interval=1", 
        "run.eval_iters="+str(eval_iters),
        "run.grad_clip=1.0",
        "model=gpt_2_h2l2e256b64_GeLN",
        "model.args.n_layer="+str(n_layer),
        "model.args.n_head="+str(n_head),
        "model.args.n_embd="+str(n_embd),
        "model.args.dropout="+str(dropout),
        "dataset=huggingface", 
        "dataset/tokenizer=tiktoken",
        "dataset.tokenizer.encoding=gpt2",
        "dataset.dataloader.batch_size="+str(batch_size),
        "dataset.name=tiny_shakespeare",
        "optim.args.learning_rate="+str(learning_rate),
        "optim.scheduler.schedulers.1.args.step_size="+str(epochs//2),
        "device=cuda"
    ]
qtransform.notebook_run(args)

{'data': {'dtype': 'float32'}, 'device': 'cuda', 'debug': False, 'dataset': {'wrapper': 'HuggingfaceDatasetWrapper', 'module': 'huggingface', 'name': 'tiny_shakespeare', 'root_path': '~/.qtransform/datasets', 'dataset_dir': ['${dataset.root_path}', '${dataset.module}', '${dataset.name}'], 'sizes': {'train': 0.3, 'eval': 0.05, 'bench': 0.3}, 'tokenizer': {'dtype': '${data.dtype}', 'meta_file': 'meta.pkl', 'wrapper': 'TikTokenizer', 'encoding': 'gpt2', 'module': 'tiktoken'}, 'dataloader': {'shuffle': True, 'num_workers': 2, 'batch_size': 64}, 'type': 'huggingface', 'args': {'block_size': '${model.args.block_size}', 'cache_dir': None, 'data_column_name': 'text', 'batches': 1000, 'chunking': False, 'chunk_size': 100}}, 'seed': 1337, 'model': {'calc_loss_in_model': True, 'cls': 'GPT', 'args': {'n_layer': 6, 'n_head': 6, 'n_embd': 384, 'dropout': 0.2, 'bias': True, 'block_size': 64, 'vocab_size': 50304, 'transformer_active_func': 'GELU', 'norm_layer': 'LayerNorm', 'flash': False}}, 'quantiza

Inference is a lot better, but not quite there yet due to the newlines and the repeated nonsense tokens (FOLOLOL...)

In [6]:
sample("/home/mabot004/nanoGPT/GPT_2024-02-21_13:08:16__epoch:10", karpathy = False)

[ [36m2024-02-21 13:16:23,498 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mModel config: GPTConfig(block_size=64, vocab_size=50256, n_layer=6, n_head=6, n_embd=384, dropout=0.2, bias=True, flash=False, transformer_active_func='GELU', norm_layer='LayerNorm', single_output=False, custom_ln=False)[0m
[ [36m2024-02-21 13:16:24,485 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mnumber of parameters: 33.52M[0m
No meta.pkl found, assuming GPT-2 encodings...































































OLANUS:
I must

CORFIDIUS:
Whatest me to do be I Conspirator:
How are speak, let me,
Say, we do you are our house Tribunes?

AUFIDIUS:
Herere 'IUS:
But!

CORIOLANUS:
Masters, my Aufidius,
And I have
And show'd,
Even since you!

CORIOLANUS:
I'll hear you answer'd, that thou Mars,
Have thou thy eye, nor theoth talk'd of all's are,
In time, there was, by's one,--

AUFIDIUS:
With the present Lord!

AUFIDIUS:
I have say,
Than I am a soldier,
I'll be

In [8]:
sample("/home/mabot004/nanoGPT/GPT_2024-02-21_13:08:16__epoch:10", karpathy = False, start="First citizen:")

[ [36m2024-02-21 13:19:28,063 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mModel config: GPTConfig(block_size=64, vocab_size=50256, n_layer=6, n_head=6, n_embd=384, dropout=0.2, bias=True, flash=False, transformer_active_func='GELU', norm_layer='LayerNorm', single_output=False, custom_ln=False)[0m
[ [36m2024-02-21 13:19:29,109 [0m][[2;37mqtransform.model.gpt[0m][[32mINFO[0m] - [32mnumber of parameters: 33.52M[0m
No meta.pkl found, assuming GPT-2 encodings...
First citizen::Before:Before citizen: citizen::::::::::::::::::::::: brace::::::::::::: citizen::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: cause:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

In [None]:
sample("/home/mabot004/nanoGPT/GPT_2024-02-21_13:08:16__epoch:10", karpathy = False, start="world")

In [2]:
#our learning rate stagnates after some time due to the scheduler adjusting the learning rate to be negligible (1e-10)
import torch
from torch.optim import lr_scheduler
linear = torch.nn.Linear(1,1)
optimizer = torch.optim.SGD(linear.parameters(), lr=1e-4)
min_lr = 1e-6
scheduler =lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
steps = 10
for step in range(steps):
    scheduler.step()
    print(scheduler.get_last_lr()[0], min_lr)

1e-05 1e-06
1.0000000000000002e-06 1e-06
1.0000000000000002e-07 1e-06
1.0000000000000004e-08 1e-06
1.0000000000000005e-09 1e-06
1.0000000000000006e-10 1e-06
1.0000000000000006e-11 1e-06
1.0000000000000006e-12 1e-06
1.0000000000000007e-13 1e-06
1.0000000000000008e-14 1e-06


