## qt Training Playground

In [1]:
from datasets import load_from_disk
from utils.torch_datasets import MiniPileDataset, ExampleCorpusDataset, RedditCommentsDataset
from utils.tokenizer import get_tokenizer
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch

tokenizer = get_tokenizer()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = RedditCommentsDataset(split='train', block_size=20)
loader = DataLoader(dataset, batch_size=1, shuffle=True)

# Print a few samples
for i, (x, y) in enumerate(loader):
    print(f"\nSample {i+1}")
    print("Input IDs:", x[0].tolist())
    print("Input Text:", tokenizer.decode(x[0].tolist()))
    print("Label IDs:", y[0].tolist())
    print("Label Text:", tokenizer.decode(y[0].tolist()))
    if i == 4:
        break

Indexing train split: 100%|██████████| 4/4 [00:06<00:00,  1.70s/it]



Sample 1
Input IDs: [14, 32838, 2413, 12, 36802, 6200, 12, 23195, 22296, 12, 1495, 12, 39390, 12, 5219, 12, 19472, 12, 310, 23195]
Input Text: /crucial-mx300-275gb-25-solid-state-drive-ct275
Label IDs: [32838, 2413, 12, 36802, 6200, 12, 23195, 22296, 12, 1495, 12, 39390, 12, 5219, 12, 19472, 12, 310, 23195, 36802]
Label Text: crucial-mx300-275gb-25-solid-state-drive-ct275mx

Sample 2
Input IDs: [2193, 617, 517, 290, 4043, 13, 309, 34369, 345, 743, 765, 284, 625, 15750, 262, 308, 19944, 2427, 13, 220]
Input Text:  learn some more and wait. Tbh you may want to overclock the gpu instead. 
Label IDs: [617, 517, 290, 4043, 13, 309, 34369, 345, 743, 765, 284, 625, 15750, 262, 308, 19944, 2427, 13, 220, 46]
Label Text:  some more and wait. Tbh you may want to overclock the gpu instead. O

Sample 3
Input IDs: [393, 18529, 44866, 18216, 41408, 30, 1058, 93, 8, 198, 198, 41156, 319, 703, 1468, 14, 4053, 12, 35927, 276]
Input Text:  or upturned nostrils? :~)

Depending on how old/well-traveled
L

In [7]:
ds = ExampleCorpusDataset(10, 1)
for i in range(4):
    ex_x, ex_y = ds[i]
    print(tokenizer.decode(ex_x), tokenizer.decode(ex_y), sep=' -> ')
    print('\n\n')

The cat sat on the mat.  
 ->  cat sat on the mat.  
The



 cat sat on the mat.  
The ->  sat on the mat.  
The dog



 sat on the mat.  
The dog ->  on the mat.  
The dog bark



 on the mat.  
The dog bark ->  the mat.  
The dog barked





In [12]:
ds.tokens.max()

tensor(48024)

In [9]:
train_ds = MiniPileDataset(split='train', block_size=100)
tokenizer = get_tokenizer()
for i in range(4):
    ex_x, ex_y = train_ds[i]
    print(tokenizer.decode(ex_x), tokenizer.decode(ex_y), sep=' -> ')
    print('*'*50)

HTC's Vive Pro headset is available to pre-order for $799

We've seen plenty of Beats-focused KIRFs in our time, some better than others. Few, however, play quite so directly on the name as OrigAudio's Beets. For $25, adopters get a set of headphones that bear little direct resemblance to Dr. Dre's audio gear of choice, but are no doubt bound to impress friends -- at least, up until they see a root vegetable -> TC's Vive Pro headset is available to pre-order for $799

We've seen plenty of Beats-focused KIRFs in our time, some better than others. Few, however, play quite so directly on the name as OrigAudio's Beets. For $25, adopters get a set of headphones that bear little direct resemblance to Dr. Dre's audio gear of choice, but are no doubt bound to impress friends -- at least, up until they see a root vegetable logo
**************************************************
 logo instead of a lower-case B. Thankfully, there's more to it than just amusing and confusing peers. Every purchase 

In [3]:
train_ds[0]

(tensor([   39,  4825,   338, 29237,  1041, 23492,   318,  1695,   284,   662,
            12,  2875,   329,   720, 45455,   198,   198,  1135,  1053,  1775,
          6088,   286, 40210,    12, 18143,   509,  4663, 42388,   287,   674,
           640,    11,   617,  1365,   621,  1854,    13, 20463,    11,  2158,
            11,   711,  2407,   523,  3264,   319,   262,  1438,   355,  6913,
         21206,   338,  1355,  1039,    13,  1114,   720,  1495,    11,  4344,
          1010,   651,   257,   900,   286, 22537,   326,  6842,  1310,  1277,
         28204,   284,  1583,    13, 30882,   338,  6597,  7733,   286,  3572,
            11,   475,   389,   645,  4719,  5421,   284, 14947,  2460,  1377,
           379,  1551,    11,   510,  1566,   484,   766,   257,  6808, 20236]),
 tensor([ 4825,   338, 29237,  1041, 23492,   318,  1695,   284,   662,    12,
          2875,   329,   720, 45455,   198,   198,  1135,  1053,  1775,  6088,
           286, 40210,    12, 18143,   509,  4663,

## Estimating Parameters based on Architecture Hyperparameters

In [None]:
# settings
d_model = 2048
num_heads = 16
assert d_model % num_heads == 0
d_ff = 4*d_model
num_layers = 14
# num_layers = 1 # NOTE for testing
vocab_size = 50300

seq_len = 2048
batch_size = 64


## parameter calcs
decoder_params = 2*4*d_model**2 + 2*d_model*d_ff

embedding_params = vocab_size * d_model
nonembedding_params = decoder_params * num_layers

total_params = embedding_params + nonembedding_params

## memory footprint calcs

model_footprint = 4 * total_params

# inference footprint in GBs
inference_footprint = 1.2 * model_footprint // 10**9

adam_footprint = 12 * total_params
gradients_footprint = 4 * total_params
activations_footprint = 2*seq_len*batch_size*d_model*num_layers

# training footprint in GBs
training_footprint = (model_footprint + \
    adam_footprint + \
    gradients_footprint + \
    activations_footprint) // 10**9

model_card_str = f'''
------------Settings-----------------
d_model: {d_model}
d_ff: {d_ff}
num_layers: {num_layers}
vocab_size: {vocab_size}
seq_len: {seq_len}
batch_size: {batch_size}
assumes fp32 params
------------Parameters---------------
params per decoder layer: {decoder_params:,}
--------------------------------------
total nonembedding params: {nonembedding_params:,}
total embedding params: {embedding_params:,}
--------------------------------------

total params: {total_params:,}

----------------Memory----------------

memory footprint of model during training: {training_footprint} GBs
memory footprint of model during inference: {inference_footprint} GBs

--------------------------------------

'''

print(model_card_str)



------------Settings-----------------
d_model: 2048
d_ff: 8192
num_layers: 1
vocab_size: 50300
seq_len: 2048
batch_size: 64
assumes fp32 params
------------Parameters---------------
params per decoder layer: 67,108,864
--------------------------------------
total nonembedding params: 67,108,864
total embedding params: 103,014,400
--------------------------------------

total params: 170,123,264

----------------Memory----------------

memory footprint of model during training: 3 GBs
memory footprint of model during inference: 0.0 GBs

--------------------------------------




## Trial Training Run

In [1]:
import os
import logging
import torch
from torchinfo import summary # TODO add this to requirements.txt
from torcheval.metrics.text import Perplexity
from utils.configs import load_configs
from transformers import GPT2TokenizerFast
from utils.torch_datasets import MiniPileDataset
from torch.utils.data import DataLoader, Dataset

from utils.transformer.model import QT
from utils.training import Trainer

## load configs, logger, and device
config = load_configs()
# logs saves to training.log in harm2d directory
logger = logging.getLogger(__name__)
logging.basicConfig(
    filename=config['training'].logging_dir,
    filemode='w',
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## get datasets
train = MiniPileDataset(
    path='data/tokenized/validation_tokens.pt', 
    block_size=config['transformer'].max_seq_length
)
valid = MiniPileDataset(
    path='data/tokenized/test_tokens.pt', 
    block_size=config['transformer'].max_seq_length
)

## get dataloaders
train_loader = DataLoader(train, batch_size=config['training'].batch_size, shuffle=True)
valid_loader = DataLoader(valid, batch_size=config['training'].batch_size, shuffle=False)

## get model
qt = QT(
    config=config['transformer'],
    tokenizer = GPT2TokenizerFast,
    device = device 
)

model_card_str = summary(qt)
logging.info('\n' + str(model_card_str))
logging.info(config)

# ## pretrain
trainer = Trainer(
    model = qt,
    train_loader=train_loader,
    val_loader=valid_loader,
    config=config['training'],
    criterion = torch.nn.CrossEntropyLoss(),
    metric = Perplexity(),
    device=device
)

trainer.train()



  0%|          | 0/48 [00:00<?, ?it/s]


AttributeError: type object 'GPT2TokenizerFast' has no attribute 'pad_token_id'