**Table of contents**<a id='toc0_'></a>    
- [Shakespeare](#toc1_)    
  - [Vocabulary and Tokenization](#toc1_1_)    
  - [Dataset and Dataloader](#toc1_2_)    
    - [Why is the length of the dataset defined as `len(self.corpus) - self.context_length`?](#toc1_2_1_)    
    - [Understanding the Dataset Length in the Context of Language Models:](#toc1_2_2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Shakespeare](#toc0_)



In [72]:
from __future__ import annotations

import os
import torch
from rich.pretty import pprint
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn
import requests
from typing import List, Tuple, Dict, Any, Union


In [73]:
import sys
from pathlib import Path

def find_root_dir(current_path: Path | None = None, marker: str = '.git') -> Path | None:
    """
    Find the root directory by searching upwards for a directory or file that
    serves as a marker.

    Parameters
    ----------
    current_path : Path or None
        The starting path to search from. When omitted, the current working
        directory at call time is used.
    marker : str
        The name of the file or directory that signifies the root.

    Returns
    -------
    Path or None
        The closest directory (the starting path itself or one of its
        ancestors) that contains the marker. Returns None if the marker is
        not found.
    """
    # Resolve the default lazily: a `Path.cwd()` default in the signature would
    # be evaluated only once, when the function is defined, and go stale if the
    # working directory changes afterwards.
    start = (Path.cwd() if current_path is None else current_path).resolve()

    # Check the starting directory too, so a search launched from the root
    # itself succeeds instead of skipping straight to its parents.
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    return None

# Locate the directory that contains the `omnivault` package, starting from the
# notebook's working directory, so the project modules become importable.
current_file_path = Path(os.getcwd())
root_dir          = find_root_dir(current_file_path, marker='omnivault')

# Fail fast when the package cannot be located; nothing below can work then.
if root_dir is None:
    raise ImportError("Root directory not found.")

sys.path.append(str(root_dir))

from omnivault.transformer.utils.reproducibility import seed_all
from omnivault.transformer.core.vocabulary import TextCharacterVocabulary
from omnivault.transformer.core.dataset import TextCharacterDataset
from omnivault.transformer.core.tokenizer import TextCharacterTokenizer
from omnivault.transformer.config.composer import (
    Composer,
    DataConfig,
)
from omnivault.transformer.config.optim import (
    OptimizerConfig,
    AdamConfig,
)
from omnivault.transformer.config.constants import MaybeConstant
from omnivault.transformer.config.global_ import MaybeGlobal
from omnivault.transformer.decoder.core import (
    GPTDecoder,
    GPTDecoderBlock,
)
# The star import supplies the *Config classes used when building the model.
from omnivault.transformer.config.decoder import *
from omnivault.transformer.modules.attention.core import (
    ScaledDotProductAttention,
    MultiHeadedAttention,
)
from omnivault.transformer.core.trainer import Trainer

## <a id='toc1_1_'></a>[Vocabulary and Tokenization](#toc0_)

See https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt.

Especially:

> The second step is to convert those tokens into numbers, so we can build a tensor out of them and feed them to the model. To do this, the tokenizer has a vocabulary, which is the part we download when we instantiate it with the from_pretrained() method. Again, we need to use the same vocabulary used when the model was pretrained.

In [74]:
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# sequence = "Using a Transformer network is simple"
# tokens = tokenizer.tokenize(sequence)

# print(tokens)

In [75]:
# Read the whole corpus into memory. The context manager closes the file
# deterministically, and the explicit encoding keeps the result independent of
# the platform's locale.
with open('../../../data/tinyshakespeare/input.txt', 'r', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

In [76]:
# Character-level vocabulary: every distinct character of the corpus, sorted so
# the character -> index assignment is deterministic.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print(f'data has {data_size} characters, {vocab_size} unique.')

# Lookup tables in both directions: character -> index and index -> character.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

data has 1115394 characters, 65 unique.


In [77]:
# Remote location of the corpus (presumably the upstream copy of the local
# tinyshakespeare file -- the assertion below checks that both agree).
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
# Build the vocabulary twice: once from the local file and once from the URL.
vocabulary = TextCharacterVocabulary.from_file('../../../data/tinyshakespeare/input.txt')
vocabulary_2 = TextCharacterVocabulary.from_url(url)
# Sanity check: both sources must produce the same index -> token mapping.
assert vocabulary.index_to_token == vocabulary_2.index_to_token

In [78]:
# Inspect the vocabulary size and the index assigned to the character 'h'.
vocabulary.vocab_size, vocabulary.token_to_index['h']

(65, 46)

In [79]:
# Character-level tokenizer that uses the vocabulary for its lookups.
tokenizer = TextCharacterTokenizer(vocabulary)

In [80]:
# Round trip on a sample word: split into tokens, encode to indices, and decode
# the indices back to text.
tokenizer.tokenize('hello'), tokenizer.encode('hello'), tokenizer.decode(tokenizer.encode('hello'))

(['h', 'e', 'l', 'l', 'o'], [46, 43, 50, 50, 53], 'hello')

## <a id='toc1_2_'></a>[Dataset and Dataloader](#toc0_)

In [81]:
# Load the corpus with a context manager so the file handle is closed right
# away, and with an explicit encoding so the text does not depend on the locale.
with open('../../../data/tinyshakespeare/input.txt', 'r', encoding='utf-8') as corpus_file:
    corpus = corpus_file.read()
dataset = TextCharacterDataset(corpus=corpus, context_length=64, tokenizer=tokenizer)

# Show the first sample: the raw (input, target) tensors and their decoded text.
dataset[0], tokenizer.decode(dataset[0][0].tolist()), tokenizer.decode(dataset[0][1].tolist())

((tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
          53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
           1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
          57, 54, 43, 39, 49,  8,  0,  0, 13, 50]),
  tensor([47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44, 53,
          56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,  1,
          44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1, 57,
          54, 43, 39, 49,  8,  0,  0, 13, 50, 50])),
 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAl',
 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll')

### <a id='toc1_2_1_'></a>[Why is the length of the dataset defined as `len(self.corpus) - self.context_length`?](#toc0_)

The length of the dataset is defined as
`len(self.corpus) - self.context_length`. This design is common in datasets used
for training language models, particularly autoregressive models like GPT. The
points below explain why:

### <a id='toc1_2_2_'></a>[Understanding the Dataset Length in the Context of Language Models:](#toc0_)

1. **Training Samples Formation**:

    - In an autoregressive model, each training sample typically consists of a
      sequence of tokens used as input and a subsequent token (or tokens) used
      as the target for prediction.
    - If `context_length` is the size of the input sequence, then for any
      starting point in the corpus, you need enough tokens following it to form
      a complete input sequence.

2. **Avoiding Out-of-Bounds Access**:

    - As you approach the end of the corpus, there are fewer tokens available to
      form a complete input sequence of `context_length`.
    - For example, if the corpus length is 1000 tokens and `context_length` is
      128, trying to form a sequence starting at token 900 would result in an
      out-of-bounds access, as you would need tokens up to index 1027 (which
      doesn't exist in the corpus).

3. **Dataset Length Calculation**:

    - To prevent this out-of-bounds issue, the length of the dataset is
      restricted to `len(self.corpus) - self.context_length`. This ensures that
      for any index `i` in the dataset, you can safely access both the input
      sequence `self.corpus[i:i + context_length]` and its target, which is the
      same window shifted one token to the right (see the sample output
      above), without exceeding the bounds of the corpus.
    - This adjustment means the dataset will not generate sequences that extend
      beyond the end of the corpus.

4. **Practical Example**:
    - If `self.corpus` has 1000 characters and `self.context_length` is 128, the
      dataset has `1000 - 128 = 872` samples, so the last valid start index is
      871. Its input runs from index 871 to 998 and its target (shifted by
      one) from index 872 to 999, each precisely 128 characters.

In summary, the length of the dataset is calculated as
`len(self.corpus) - self.context_length` to ensure that every training sample
has a complete input sequence of the desired context length, without attempting
to access data beyond the end of the corpus. This approach is a standard
practice in preparing datasets for training language models.


## Trainer

In [82]:
# Assemble the experiment configuration from its individual parts.
constants: MaybeConstant = MaybeConstant()
# The seed stored here is the one passed to seed_all via config.global_.seed.
global_: MaybeGlobal = MaybeGlobal(seed=42, dataset_size=2)
data_config: DataConfig = DataConfig()
# Adam settings; the optimizer itself is created from this via config.optimizer.build.
optimizer_config = AdamConfig(name="torch.optim.Adam", lr=5e-4, betas=(0.9, 0.98), eps=1e-9)

# Bundle everything into one object and pretty-print it for inspection.
config = Composer(constants=constants, global_=global_, data=data_config, optimizer=optimizer_config)
pprint(config)

In [83]:
# DEBUG selects the short 2-epoch run in the training cell instead of 30 epochs.
DEBUG  = True
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Seed with the value stored in the config; presumably this also switches torch
# into deterministic mode (set_torch_deterministic=True) -- confirm in seed_all.
seed_all(config.global_.seed, seed_torch=True, set_torch_deterministic=True)

  configure_deterministic_mode()


42

In [84]:
# Shuffled mini-batches of 32 samples drawn from the character dataset.
train_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

In [85]:
# Create individual component configurations
masked_self_attention_mha_config = MultiHeadedAttentionConfig(
     attention=ScaledDotProductAttention(),
    d_model=128, H=4, dropout=0.1
)

feed_forward_config = PositionwiseFeedForwardConfig(
    d_model=128, d_ff=256, activation=nn.GELU(approximate="tanh"), dropout=0.1, bias=True
)

add_norm_config_1 = AddNormConfig(feature_dim=128, dropout=0.1)
add_norm_config_2 = AddNormConfig(feature_dim=128, dropout=0.1)

# Create DecoderBlockConfig
decoder_block_config = DecoderBlockConfig(
    masked_self_attention_mha=masked_self_attention_mha_config,
    feed_forward=feed_forward_config,
    add_norm_1=add_norm_config_1,
    add_norm_2=add_norm_config_2,
)

# Create the overall DecoderConfig
model_config = DecoderConfig(
    d_model=128,
    vocab_size=vocab_size,
    context_length=64,
    num_decoder_blocks=2,
    dropout=0.1,
    decoder_block=decoder_block_config,
)

model = GPTDecoder(model_config).to(DEVICE)

In [86]:
# Number of warm-up steps: three times the number of batches in one pass over
# train_loader.
warmup_steps = 3 * len(train_loader)


def lr_fn(step: int) -> float:
    """
    Return the learning-rate multiplier used by ``LambdaLR`` at ``step``.

    The multiplier first increases linearly during the warm-up steps and then
    decreases proportionally to the inverse square root of the step number;
    the whole curve is scaled by ``model_config.d_model ** -0.5``.

    Parameters
    ----------
    step : int
        Zero-based step index supplied by the scheduler.

    Returns
    -------
    float
        The factor the scheduler multiplies the optimizer's base learning
        rate with.
    """
    step_number = step + 1  # shift by one so step 0 does not evaluate 0 ** -0.5
    return model_config.d_model ** -0.5 * min(
        step_number ** -0.5,
        step_number * warmup_steps ** -1.5,
    )


# Optimizer hyper-parameters (lr, betas, eps) come from `config.optimizer`.
optimizer = config.optimizer.build(params=model.parameters())

# NOTE(review): LambdaLR multiplies the optimizer's base lr by lr_fn(step), so
# the effective learning rate is the configured lr scaled by the factor above --
# confirm that the configured base lr is the intended one for this schedule.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_fn)
criterion = nn.CrossEntropyLoss(ignore_index=-1, reduction="mean")




In [87]:
# Wire the model, data loaders, loss, optimizer and scheduler into the Trainer.
# NOTE: the validation loader is the same object as the training loader.
# NOTE: passing `test_dataloader=test_loader` as well enables testing after each
# epoch, but seeding will affect.
# NOTE(review): the recorded run of this cell stops with
# `ValueError: not enough values to unpack (expected 4, got 2)`. The dataset
# yields (input, target) pairs, so presumably the Trainer expects four items
# per batch -- TODO confirm against the Trainer implementation.
trainer = Trainer(
    model=model,
    train_dataloader=train_loader,
    valid_dataloader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    grad_norm_clip=1.0,
    device=DEVICE,
)

# Short run in debug mode (2 epochs, or 15), the full 30 epochs otherwise.
# To compare runs for reproducibility, the weights can be saved afterwards,
# e.g. torch.save(model.state_dict(), 'model_debug.pt') for the debug run or
# torch.save(model.state_dict(), 'model_non_debug.pt') for the full one.
trained_model = trainer.fit(num_epochs=2 if DEBUG else 30)

    # torch.save(model.state_dict(), 'model_non_debug.pt')

Epoch 1/2
----------


  0%|          | 0/34855 [00:00<?, ?it/s]


ValueError: not enough values to unpack (expected 4, got 2)