In [1]:
from __future__ import annotations

import os
import torch
from torch.utils.data import Dataset
import torch
import requests
from typing import List, Tuple, Dict, Any, Union


In [2]:
import sys
from pathlib import Path

def find_root_dir(current_path: Path = Path.cwd(), marker: str = '.git') -> Path | None:
    """
    Find the root directory by searching for a directory or file that serves as a
    marker.

    Parameters
    ----------
    current_path : Path
        The starting path to search from.
    marker : str
        The name of the file or directory that signifies the root.

    Returns
    -------
    Path or None
        The path to the root directory. Returns None if the marker is not found.
    """
    current_path = current_path.resolve()
    for parent in current_path.parents:
        if (parent / marker).exists():
            return parent
    return None

# Search upwards from the notebook's working directory for the `omnivault`
# marker that identifies the project root.
current_file_path = Path(os.getcwd())
root_dir = find_root_dir(current_file_path, marker='omnivault')

# Fail fast: without the project root the imports below cannot resolve.
if root_dir is None:
    raise ImportError("Root directory not found.")

# Put the project root on the import path, then pull in the project modules.
sys.path.append(str(root_dir))
from omnivault.transformer.utils.reproducibility import seed_all
from omnivault.transformer.core.vocabulary import TextCharacterVocabulary
# from omnivault.transformer.core.dataset import TextCharacterDataset
from omnivault.transformer.core.tokenizer import TextCharacterTokenizer

In [3]:
# class TinyShakespeareDataset(Dataset):
#     def __init__(self, root_dir='data/tinyshakespeare'):
#         self.root_dir = root_dir
#         self.file_path = os.path.join(root_dir, 'input.txt')
#         self._download()

#         # Load and preprocess data
#         self.data = self._load_data()

#     def _download(self):
#         """Download the dataset if it's not already present."""
#         if not os.path.exists(self.root_dir):
#             os.makedirs(self.root_dir)

#         if not os.path.isfile(self.file_path):
#             url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
#             response = requests.get(url)
#             with open(self.file_path, 'wb') as f:
#                 f.write(response.content)

#     def _load_data(self):
#         """Load and preprocess the data."""
#         with open(self.file_path, 'r') as file:
#             data = file.read()
#         # Implement any specific preprocessing here
#         return data

#     def __len__(self):
#         # Define the length of the dataset
#         return len(self.data)

#     def __getitem__(self, idx):
#         # Implement how to get a single item (e.g., a character or a sequence)
#         return self.data[idx]

# # Example usage
# dataset = TinyShakespeareDataset()
# print(dataset[0])  # Access the first character/sequence


In [4]:
class CharDataset(Dataset):
    """
    Character-level dataset that yields (input, target) pairs for
    next-character prediction.

    Each item is a window of ``config.block_size + 1`` consecutive characters
    of ``data`` encoded as integer ids: the input is the first ``block_size``
    ids and the target is the same window shifted one position to the right.
    Batching is left to whatever consumes the dataset.
    """
    def __init__(self, config, data):
        # `config` only needs a `block_size` attribute; `data` is the raw
        # corpus (a string in this notebook).
        self.config = config

        chars = sorted(set(data))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))

        # character <-> integer id lookup tables; ids follow sorted order
        self.stoi = { ch:i for i,ch in enumerate(chars) }
        self.itos = { i:ch for i,ch in enumerate(chars) }
        self.vocab_size = vocab_size
        self.data = data

    def get_vocab_size(self):
        """Return the number of unique characters in the corpus."""
        return self.vocab_size

    def get_block_size(self):
        """Return the number of input characters per sample."""
        return self.config.block_size

    def __len__(self):
        # Each sample consumes block_size + 1 characters, so the last valid
        # start is len(data) - block_size - 1. Clamp at zero: a corpus shorter
        # than one window yields an empty dataset instead of making len()
        # raise ValueError on a negative value.
        return max(0, len(self.data) - self.config.block_size)

    def __getitem__(self, idx):
        """
        Return the ``(x, y)`` pair of ``torch.long`` tensors for sample ``idx``.

        Negative indices count from the end. Raises ``IndexError`` when ``idx``
        is out of range; without that, plain iteration over the dataset would
        never terminate because slicing past the end just returns empty chunks.
        """
        num_samples = len(self)
        position = idx + num_samples if idx < 0 else idx
        if not 0 <= position < num_samples:
            raise IndexError(
                f"index {idx} is out of range for a dataset of {num_samples} samples"
            )

        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[position:position + self.config.block_size + 1]
        # encode every character to an integer
        dix = [self.stoi[s] for s in chunk]
        # inputs drop the last id, targets drop the first (shift by one)
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y

In [5]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it to the garbage
# collector.
with open('../../../data/tinyshakespeare/input.txt', 'r') as corpus_file:
    data = corpus_file.read()


In [6]:
# Hand-built character vocabulary: the sorted unique characters of the corpus.
chars = sorted(set(data))
data_size = len(data)
vocab_size = len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))

# Lookup tables in both directions; ids follow the sorted character order.
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))

data has 1115389 characters, 65 unique.


In [7]:
# class TextCharacterTokenizer:
#     def __init__(self, vocabulary: List[str]):
#         self.vocabulary = vocabulary
#         self.vocab_size = len(self.vocabulary)
#         self.str_to_int = {s: i for i, s in enumerate(self.vocabulary)}
#         self.int_to_str = {i: s for i, s in enumerate(self.vocabulary)}

#     @classmethod
#     def from_corpus(cls, text_corpus: str):
#         vocabulary = sorted(set(text_corpus))
#         return cls(vocabulary)

#     @classmethod
#     def from_file(cls, file_path: Union[str, Path]):
#         with open(file_path, "r") as f:
#             text_corpus = f.read()
#         return cls.from_corpus(text_corpus)

#     def encode(self, text: str):
#         return [self.str_to_int[s] for s in text]

#     def decode(self, tokens: List[int]):
#         return "".join([self.int_to_str[t] for t in tokens])

In [8]:
# Build the project vocabulary from the same corpus file that was read into `data` above.
# NOTE(review): `from_file` presumably collects the file's unique characters — confirm against TextCharacterVocabulary.
vocabulary = TextCharacterVocabulary.from_file('../../../data/tinyshakespeare/input.txt')

In [14]:
# Inspect the vocabulary size and the id assigned to the character 'h'.
vocabulary.vocab_size, vocabulary.token_to_index['h']

(65, 46)

In [10]:
# Wrap the vocabulary in the project's character tokenizer.
tokenizer = TextCharacterTokenizer(vocabulary)

In [11]:
# Encode a sample string; the displayed result holds one integer id per character.
tokenizer.encode('hello')

[46, 43, 50, 50, 53]

## Why is the length of the dataset defined as `len(self.corpus) - self.context_length`?

The length of the dataset is defined as
`len(self.corpus) - self.context_length`. This design is common in datasets used
for training language models, particularly autoregressive models like GPT. The
reasoning is as follows:

### Understanding the Dataset Length in the Context of Language Models:

1. **Training Samples Formation**:

    - In an autoregressive model, each training sample typically consists of a
      sequence of tokens used as input and a subsequent token (or tokens) used
      as the target for prediction.
    - If `context_length` is the size of the input sequence, then for any
      starting point in the corpus, you need enough tokens following it to form
      a complete input sequence.

2. **Avoiding Out-of-Bounds Access**:

    - As you approach the end of the corpus, there are fewer tokens available to
      form a complete input sequence of `context_length`.
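    - For example, if the corpus length is 1000 tokens and `context_length` is
      128, a sequence starting at token 900 would need tokens up to index 1027
      for the input alone (and index 1028 for the shifted target), none of which
      exist in the corpus. Note that plain Python slicing would not raise an
      error here; it would silently return a truncated sample, which is why the
      dataset length has to be capped up front.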

3. **Dataset Length Calculation**:

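    - To prevent this out-of-bounds issue, the length of the dataset is
      restricted to `len(self.corpus) - self.context_length`. Each sample needs
      `context_length + 1` consecutive tokens: the first `context_length` form
      the input and the last `context_length` (the same window shifted by one)
      form the target. This is what `CharDataset.__getitem__` above does with
      `block_size`. The restriction ensures that for any valid index `i`, the
      slice `self.corpus[i:i + context_length + 1]` stays within the bounds of
      the corpus.
    - This adjustment means the dataset will not generate sequences that extend
      beyond the end of the corpus.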

4. **Practical Example**:
    - If `self.corpus` has 1000 characters and `self.context_length` is 128, the
      dataset has `1000 - 128 = 872` samples, so the last valid start index is
      871 (indices run from 0 to 871). That sample covers indices 871 to 999,
      which is 129 characters: the input is indices 871 to 998 and the target
      is indices 872 to 999, each precisely 128 characters.

In summary, the length of the dataset is calculated as
`len(self.corpus) - self.context_length` to ensure that every training sample
has a complete input sequence of the desired context length, together with its
one-token-shifted target, without attempting to access data beyond the end of
the corpus. This approach is a standard practice in preparing datasets for
training language models.
