In [1]:
from pathlib import Path

text = Path('tiny-shakespeare.txt').read_text()

In [2]:
text[0:1000]

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak this in hunger 

In [3]:
import torch

class CharTokenizer:
  def __init__(self, vocabulary):
    self.id_for_char = {chr: id for id, chr in enumerate(vocabulary)}
    self.char_for_id = {id: chr for id, chr in enumerate(vocabulary)}

  @staticmethod
  def train_from_text(text):
    vocabulary = set(text)
    return CharTokenizer(vocabulary)

  def encode(self, text):
    ids = []
    for chr in text:
      ids.append(self.id_for_char[chr])
    return torch.tensor(ids, dtype=torch.long)

  def decode(self, ids):
    chars = []
    for id in ids.tolist():
      chars.append(self.char_for_id[id])
    return ''.join(chars)

  def vocabulary_size(self):
    return len(self.id_for_char)

In [4]:
tokenizer = CharTokenizer.train_from_text(text)

In [5]:
print(tokenizer.encode("Hello world"))
print(tokenizer.decode(tokenizer.encode("Hello world")))

tensor([15, 41, 44, 44, 20, 40, 21, 20,  8, 44, 14])
Hello world


In [6]:
print(f"Vocabulary size: {tokenizer.vocabulary_size()}")

Vocabulary size: 65


In [7]:
from torch.utils.data import Dataset

class IndexesDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # Ensure all sequences generated are complete by reducing length
        return len(self.data) - self.block_size

    def __getitem__(self, pos):
        assert pos < len(self.data) - self.block_size

        x = self.data[pos:pos + self.block_size]
        y = self.data[pos + 1:pos + 1 + self.block_size]
        return x, y

In [8]:
tokenized_text = tokenizer.encode(text)
dataset = IndexesDataset(tokenized_text, 64)

In [9]:
from torch.utils.data import DataLoader, RandomSampler

sampler = RandomSampler(dataset, replacement=True)
dataloader = DataLoader(dataset, batch_size=2, sampler=sampler)

In [10]:
x, y = next(iter(dataloader))

In [11]:
x

tensor([[20, 40, 48, 44, 31,  7, 14, 59, 20, 40, 31, 38, 40, 12, 59, 46, 38, 41,
         44, 49, 32, 18, 18,  2,  6, 37, 55, 40, 62, 25, 50, 48, 55, 50, 30, 25,
         51, 19, 18, 18, 10,  6, 48, 25, 51, 19, 18,  9, 60, 31, 59, 23, 12,  4,
         40, 46, 54, 40, 44, 20,  8, 14, 32, 40],
        [38, 12, 41,  9, 44, 44, 40, 53, 41, 40, 31, 38, 12, 31, 46, 41, 14, 32,
         18, 18, 55, 52, 48, 39, 10,  6, 52, 19, 18, 25, 40, 21, 59, 44, 44, 40,
         61, 20, 40, 14, 31,  8,  5, 44, 54, 40, 23, 20, 40, 21, 20,  8,  5, 40,
         21, 59, 23, 12, 40, 12, 41,  8, 32, 18]])

In [12]:
tokenizer.decode(x[0])

"o Claudio as himself.\n\nDUKE VINCENTIO:\n\nLUCIO:\n'Faith, my lord. "

In [13]:
tokenizer.decode(y[0])

" Claudio as himself.\n\nDUKE VINCENTIO:\n\nLUCIO:\n'Faith, my lord. I"