## qt Training Playground

In [13]:
from datasets import load_from_disk
from utils.torch_datasets import MiniPileDataset
from utils.tokenizer import get_tokenizer
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch

In [58]:
class SmallMiniPileDataset(Dataset):
    def __init__(self, corpa: list, block_size: int, stride: int = None, offset: int = 0):
        self.tokens = torch.tensor(corpa)
        self.offset = offset
        self.block_size = block_size
        self.stride = stride if stride is not None else block_size
        # self.indices = list(range(0, len(self.tokens) - block_size, self.stride))
        self.indices = list(range(0 + offset, len(self.tokens) - block_size, self.stride))

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        start = self.indices[idx]
        chunk = self.tokens[start : start + self.block_size + 1]
        input_tensor = chunk[:-1].clone().long()
        label_tensor = chunk[1:].clone().long()
        return {"input_ids": input_tensor, "labels": label_tensor}

In [59]:
corpa = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [64]:
ds = SmallMiniPileDataset(corpa=corpa, block_size=3, offset=2)
ds[-1]


{'input_ids': tensor([6, 7, 8]), 'labels': tensor([7, 8, 9])}

In [65]:
print(ds.indices)

[2, 5]


In [2]:
# from datasets import load_from_disk
# from tqdm import tqdm
# import torch

# ds = load_from_disk("data/tokenized/minipile")
# ds
# # all_ids = []
# # for example in tqdm(ds['train']):
# #     all_ids.extend(example["input_ids"])

# # torch.save(torch.tensor(all_ids, dtype=torch.long), "data/tokenized/flattened_ids.pt")

In [3]:
train = MiniPileDataset(path='data/tokenized/flattened_ids.pt', block_size=20)
tokenizer = get_tokenizer()

In [7]:
train_loader = DataLoader(train, batch_size=32, shuffle=True)

In [11]:
pbar = tqdm(enumerate(train_loader), total=len(train_loader))

for i, batch in pbar:
    torch.sum(batch['input_ids'])

  input_tensor = torch.tensor(chunk[:-1].clone(), dtype=torch.long)
  label_tensor = torch.tensor(chunk[1:].clone(), dtype=torch.long)
  0%|          | 11318/2649621 [00:33<2:09:03, 340.70it/s]


KeyboardInterrupt: 

In [5]:
for i in range(5):
    batch = train[i]
    input_ids = tokenizer.decode(batch["input_ids"])
    labels = tokenizer.decode(batch["labels"])
    print(f'Input Ids:\n{input_ids}\n')
    print(f"Labels:\n{labels}")
    print('\n\n')

Input Ids:
HTC's Vive Pro headset is available to pre-order for $799

We've seen plenty of Beats-focused KIRFs in our time, some better than others. Few, however, play quite so directly on the name as OrigAudio's Beets. For $25, adopters get a set of headphones that bear little direct resemblance to Dr. Dre's audio gear of choice, but are no doubt bound to impress friends -- at least, up until they see a root vegetable

Labels:
TC's Vive Pro headset is available to pre-order for $799

We've seen plenty of Beats-focused KIRFs in our time, some better than others. Few, however, play quite so directly on the name as OrigAudio's Beets. For $25, adopters get a set of headphones that bear little direct resemblance to Dr. Dre's audio gear of choice, but are no doubt bound to impress friends -- at least, up until they see a root vegetable logo



Input Ids:
 logo instead of a lower-case B. Thankfully, there's more to it than just amusing and confusing peers. Every purchase will lead to a donat

## Estimating Parameters based on Architecture Hyperparameters

In [17]:
#settings
d_model = 2048
num_heads = 16
assert d_model % num_heads == 0
d_ff = 4*d_model
num_layers = 14
vocab_size = 50300

seq_len = 2048
batch_size = 4


## parameter calcs
decoder_params = 2*4*d_model**2 + 2*d_model*d_ff

embedding_params = vocab_size * d_model
nonembedding_params = decoder_params * num_layers

total_params = embedding_params + nonembedding_params

## memory footprint calcs

model_footprint = 4 * total_params

# inference footprint in GBs
inference_footprint = 1.2 * model_footprint // 10**9

adam_footprint = 12 * total_params
gradients_footprint = 4 * total_params
activations_footprint = 2*seq_len*batch_size*d_model*num_layers

# training footprint in GBs
training_footprint = (model_footprint + \
    adam_footprint + \
    gradients_footprint + \
    activations_footprint) // 10**9

model_card_str = f'''
------------Settings-----------------
d_model: {d_model}
d_ff: {d_ff}
num_layers: {num_layers}
vocab_size: {vocab_size}
seq_len: {seq_len}
batch_size: {batch_size}
assumes fp32 params
------------Parameters---------------
params per decoder layer: {decoder_params:,}
--------------------------------------
total nonembedding params: {nonembedding_params:,}
total embedding params: {embedding_params:,}
--------------------------------------

total params: {total_params:,}

----------------Memory----------------

memory footprint of model during training: {training_footprint} GBs
memory footprint of model during inference: {inference_footprint} GBs

--------------------------------------

'''

print(model_card_str)



------------Settings-----------------
d_model: 2048
d_ff: 8192
num_layers: 14
vocab_size: 50300
seq_len: 2048
batch_size: 4
assumes fp32 params
------------Parameters---------------
params per decoder layer: 67,108,864
--------------------------------------
total nonembedding params: 939,524,096
total embedding params: 103,014,400
--------------------------------------

total params: 1,042,538,496

----------------Memory----------------

memory footprint of model during training: 21 GBs
memory footprint of model during inference: 5.0 GBs

--------------------------------------


