# Dataset Testing Notebook

Testing notebook to ensure the PyTorch dataloaders are working as expected for the Shakespeare dataset.

In [4]:
import os
from torch.utils.data import DataLoader

from datetime import datetime
from src.datasets.shakespeare import Shakespeare
from src.tokenizers.character_level import CharacterLevelTokenizer

In [5]:
# The project root can be overridden with the GPT_PROJECT_ROOT environment
# variable so the notebook runs on other machines; the fallback keeps the
# original location, so existing runs are unaffected.
PROJECT_ROOT = os.environ.get(
    "GPT_PROJECT_ROOT",
    "/Users/endric.daues/workspace/projects/deep_learning/gpt",
)

# Directory holding the raw text splits (train.txt is read from here below).
data_dpath = os.path.join(PROJECT_ROOT, "data", "shakespeare")
# Directory handed to the tokenizer and dataset constructors as the tokenizer location.
tokenizer_dpath = os.path.join(PROJECT_ROOT, "tokenizers")

### Test Dataset

In [6]:
# Show Data
# Print the first 100 characters of the training split as a quick sanity check.
data_fpath = os.path.join(data_dpath, "train.txt")
# Explicit encoding so the text decodes the same way regardless of the
# platform's default locale.
with open(data_fpath, "r", encoding="utf-8") as f:
    data = f.read()
# The file is already closed here; only the in-memory text is needed.
print(data[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


### Test Tokenizer

In [7]:
# Test Tokenizer
# Build the character-level tokenizer from the tokenizer and data directories,
# then encode a short sample that starts with a newline.
sample_text = "\ntest"
tokenizer = CharacterLevelTokenizer(tokenizer_dpath, data_dpath)
tokenizer.encode(sample_text)

Tokenizer vocab size: 65


[0, 58, 43, 57, 58]

In [8]:
# Decode newline
# Token id 0 is what encoding "\ntest" produced for its leading newline, so
# decoding it should hand the newline character back.
newline_token_id = 0
tokenizer.decode([newline_token_id])

'\n'

### Load Dataset

In [9]:
# Training split of the Shakespeare dataset with block_size=10.
train_dataset = Shakespeare(
    data_dpath,
    tokenizer_dpath,
    "train",
    block_size=10,
)

# Loader settings gathered in one place so they are easy to scan and tweak.
# prefetch_factor is only accepted by DataLoader when num_workers > 0.
loader_options = dict(
    batch_size=2,
    shuffle=False,
    drop_last=True,
    num_workers=1,
    prefetch_factor=10,
)
training_dataloader = DataLoader(train_dataset, **loader_options)

instantiating dataset class
Tokenizer vocab size: 65


In [10]:
# Check direct getitem call
# Subscripting the dataset dispatches to its __getitem__, bypassing the
# DataLoader; the result displays as a pair of tensors.
sample_index = 1
train_dataset[sample_index]

(tensor([39, 57,  1, 47,  5,  1, 58, 46, 43,  1]),
 tensor([57,  1, 47,  5,  1, 58, 46, 43,  1, 53]))

In [14]:
# Compare to data loader call
# Fetch only the first batch. Unpacking it directly avoids rebinding `data`,
# which an earlier cell uses for the raw training text.
x, y = next(iter(training_dataloader))
print(f"x: {x}, shape: {x.shape}")
print(f"y: {y}, shape: {y.shape}")

# Turn the visual check into an enforced one: inputs and targets must have
# matching shapes, and each target row should be its input row shifted left
# by one token (the relationship visible in the recorded outputs).
assert x.shape == y.shape, "input/target batch shapes differ"
assert (x[:, 1:] == y[:, :-1]).all(), "targets are not inputs shifted by one token"

x: tensor([[21,  1, 50, 43, 44, 58,  1, 46, 47, 51],
        [ 1, 54, 50, 43, 39, 57, 43,  8,  0,  0]]), shape: torch.Size([2, 10])
y: tensor([[ 1, 50, 43, 44, 58,  1, 46, 47, 51,  8],
        [54, 50, 43, 39, 57, 43,  8,  0,  0, 23]]), shape: torch.Size([2, 10])
