In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Run on the GPU when PyTorch reports a usable CUDA device; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [2]:
from pathlib import Path

# Load the whole corpus into memory as one string. The encoding is pinned to
# UTF-8 so the result does not depend on the platform's default locale encoding.
text = Path('../../../data/tiny-shakespeare.txt').read_text(encoding='utf-8')

In [3]:
# Preview the first 1000 characters of the corpus.
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [4]:

class CharTokenizer:
  """Character-level tokenizer that maps each distinct character to an integer id.

  Ids follow the order of the vocabulary: the first character gets id 0, the
  second gets id 1, and so on.
  """

  def __init__(self, vocabulary):
    """Build the character <-> id lookup tables.

    Args:
      vocabulary: Iterable of characters. A character that appears more than
        once is kept only at its first position, so ids stay contiguous and
        `vocabulary_size()` always equals the largest id plus one.
    """
    # dict.fromkeys drops repeats while preserving first-seen order. Without
    # this, a repeated character would leave the two tables disagreeing.
    unique_chars = list(dict.fromkeys(vocabulary))
    self.token_id_for_char = {char: token_id for token_id, char in enumerate(unique_chars)}
    self.char_for_token_id = dict(enumerate(unique_chars))

  @classmethod
  def train_from_text(cls, text):
    """Create a tokenizer whose vocabulary is the sorted set of characters in `text`."""
    return cls(sorted(set(text)))

  def encode(self, text):
    """Convert a string into a 1-D `torch.long` tensor of token ids.

    Raises:
      KeyError: If `text` contains a character that is not in the vocabulary.
    """
    return torch.tensor([self.token_id_for_char[char] for char in text], dtype=torch.long)

  def decode(self, token_ids):
    """Convert token ids back into a string.

    Args:
      token_ids: A 1-D tensor of ids, a 0-dim tensor holding a single id, or
        any plain iterable of integer ids.

    Raises:
      KeyError: If an id is not in the vocabulary.
    """
    ids = token_ids.tolist() if hasattr(token_ids, 'tolist') else list(token_ids)
    # A 0-dim tensor's tolist() yields a bare int rather than a list.
    if isinstance(ids, int):
      ids = [ids]
    return ''.join(self.char_for_token_id[token_id] for token_id in ids)

  def vocabulary_size(self):
    """Return the number of distinct characters known to the tokenizer."""
    return len(self.token_id_for_char)

In [5]:
# Build the tokenizer from the corpus: its vocabulary is every distinct character in `text`.
tokenizer = CharTokenizer.train_from_text(text)

In [6]:
# Sanity check: show the token ids for a sample string, then confirm that
# decoding those ids gives the original string back.
print(tokenizer.encode("Hello world"))
print(tokenizer.decode(tokenizer.encode("Hello world")))

tensor([20, 43, 50, 50, 53,  1, 61, 53, 56, 50, 42])
Hello world


In [7]:
# Report how many distinct characters the tokenizer knows about.
print(f"Vocabulary size: {tokenizer.vocabulary_size()}")

Vocabulary size: 65


In [8]:
from torch.utils.data import Dataset

class TokenIdsDataset(Dataset):
  """Sliding-window dataset over a sequence of token ids.

  Sample `pos` is the pair `(x, y)`, where `x` is the window of `block_size`
  items starting at `pos` and `y` is the same window shifted one item ahead.
  """

  def __init__(self, data, block_size):
    """Store the token sequence and the window length.

    Args:
      data: Sliceable sequence of token ids that supports `len()`.
      block_size: Number of items in each `x` / `y` window.
    """
    self.data = data
    self.block_size = block_size

  def __len__(self):
    """Return the number of full windows that still have a shifted target.

    Clamped at zero so `len()` stays valid when `data` is shorter than
    `block_size`.
    """
    return max(0, len(self.data) - self.block_size)

  def __getitem__(self, pos):
    """Return the `(x, y)` pair for sample `pos`.

    Negative positions count from the end, as with built-in sequences.

    Raises:
      IndexError: If `pos` is outside the dataset. Raising IndexError rather
        than failing an assert lets plain `for sample in dataset` iteration
        stop cleanly, and the check still runs under `python -O`.
    """
    size = len(self)
    requested = pos
    if pos < 0:
      pos += size
    if not 0 <= pos < size:
      raise IndexError(f"index {requested} is out of range for dataset of size {size}")

    x = self.data[pos:pos + self.block_size]
    y = self.data[pos + 1:pos + 1 + self.block_size]
    return x, y

In [9]:
# Encode the whole corpus once, then expose it as (input, target) windows of
# 64 tokens each.
tokenized_text = tokenizer.encode(text)
dataset = TokenIdsDataset(tokenized_text, block_size=64)

In [10]:
# Fetch the first sample: x is the input window, y is the same window shifted one token ahead.
x, y = dataset[0]

In [11]:
# Display the raw token ids of the input window.
x

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50])

In [12]:
# Decode the input window back to text to confirm it matches the start of the corpus.
tokenizer.decode(x)

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAl'

In [13]:
from torch.utils.data import DataLoader, RandomSampler

# Draw window positions at random, with replacement, and group them into batches of 2.
# NOTE(review): no random seed is set in the cells shown here, so the sampled
# batches will differ between runs.
sampler = RandomSampler(dataset, replacement=True)
dataloader = DataLoader(dataset, batch_size=2, sampler=sampler)

In [14]:
# Pull a single batch from the loader; x and y are now batched inputs and targets.
x, y = next(iter(dataloader))

In [15]:
# Check the batch shape: (batch_size, block_size).
x.shape

torch.Size([2, 64])

In [16]:
# Display the raw token ids of the batched inputs.
x

tensor([[56, 53,  1, 58, 46, 43,  1, 54, 56, 47, 51, 43,  1, 42, 59, 49, 43,  6,
          1, 40, 43, 47, 52, 45,  1, 57, 53,  1, 56, 43, 54, 59, 58, 43, 42,  0,
         21, 52,  1, 42, 47, 45, 52, 47, 58, 63,  6,  1, 39, 52, 42,  1, 44, 53,
         56,  1, 58, 46, 43,  1, 50, 47, 40, 43],
        [52, 41, 46,  1, 57, 50, 53, 54,  8,  1, 37, 53, 59,  1, 45, 39, 60, 43,
          1, 59, 57,  1, 58, 46, 43,  1, 41, 53, 59, 52, 58, 43, 56, 44, 43, 47,
         58,  0, 44, 39, 47, 56, 50, 63,  1, 50, 39, 57, 58,  1, 52, 47, 45, 46,
         58,  8,  0,  0, 30, 27, 25, 17, 27, 10]])

In [17]:
# Decode the first input window of the batch.
tokenizer.decode(x[0])

'ro the prime duke, being so reputed\nIn dignity, and for the libe'

In [18]:
# Decode the matching target window; it should be the input shifted one character ahead.
tokenizer.decode(y[0])

'o the prime duke, being so reputed\nIn dignity, and for the liber'