In [1]:
from tqdm import tqdm
import numpy as np
import os
import torch
import urllib.request

## Download the data

The best place to access books that are no longer under copyright is [Project Gutenberg](https://www.gutenberg.org/). Today we recommend using [Alice’s Adventures in Wonderland by Lewis Carroll](https://www.gutenberg.org/files/11/11-0.txt) for consistency. Of course, you can experiment with other books as well; the cells below download *Heart of Darkness* (Gutenberg file 219).

In [2]:
# Local cache file name and the Project Gutenberg URL it is fetched from
fname = 'heart_of_darkness.txt'
data_url = 'https://www.gutenberg.org/files/219/219-0.txt'

# Download only when the working directory holds no entry with that name yet
if os.listdir().count(fname) == 0:
    urllib.request.urlretrieve(data_url, fname)

## Load data and create character to integer mappings

- Open the text file, read the data then convert it to lowercase letters.
- Map each character to a unique integer. Keep two dictionaries (character → integer and integer → character) so the mapping can easily be looked up in both directions.
- Transform the data from a list of characters to a list of integers

In [3]:
# Load data.
# The encoding is stated explicitly: without it open() falls back to the
# platform's locale encoding, which is not UTF-8 on every system.
with open(fname, 'r', encoding='utf-8') as f:
    data = f.read()

# Preprocess data: lowercase the text, then split it into single characters
data = list(data.lower())

# Build char-to-int and int-to-char dictionaries
c2i = {x: i for i, x in enumerate(set(data))}
i2c = {i: x for x, i in c2i.items()}

# Transform the data from chars to integers.
# Direct indexing (instead of .get) fails loudly rather than inserting None.
data = [c2i[c] for c in data]

# Preview: the first 10 ids and the characters they decode to
# (only those 10 ids are decoded, not the whole corpus)
data[:10], [i2c[i] for i in data[:10]]

([58, 55, 57, 20, 12, 1, 29, 10, 48, 20],
 ['\ufeff', 't', 'h', 'e', ' ', 'p', 'r', 'o', 'j', 'e'])

In [4]:
# Peek at the first 10 entries of each mapping: char -> int, then int -> char
tuple(list(mapping.items())[:10] for mapping in (c2i, i2c))

([('*', 0),
  ('p', 1),
  ('n', 2),
  ('.', 3),
  ('x', 4),
  ('6', 5),
  ('(', 6),
  ('#', 7),
  ('’', 8),
  ('%', 9)],
 [(0, '*'),
  (1, 'p'),
  (2, 'n'),
  (3, '.'),
  (4, 'x'),
  (5, '6'),
  (6, '('),
  (7, '#'),
  (8, '’'),
  (9, '%')])

## Define the datasets and dataloaders
- We are "thinking" in sequences of 100 characters: 99 characters in the input and 1 in the output.  
E.g. for the sequence *\['h', 'e', 'l', 'l'\]* as input, we will have *\['o'\]* as the expected output.
- Each (sample, label) pair in the dataset consists of a sequence of 99 integers and a single integer label
- We will randomly assign 85% of the sequences to the training set and use the remaining 15% for validation

In [5]:
import torch.utils.data as data
import typing as t

# Define datasets
class SequenceDataset(data.Dataset):
    """Sliding-window, character-level dataset built from a single text file.

    The text is lower-cased and every distinct character is mapped to an
    integer id. Item ``i`` is the window of ``seq_len`` consecutive ids that
    starts at position ``i``, paired with the id of the character following it.

    Args:
        data_url: URL the text is downloaded from when ``fname`` does not exist.
        fname: Path of the local text file (read, and created if missing).
        seq_len: Number of input characters per sample.

    Attributes:
        c2i: Mapping character -> integer id.
        i2c: Mapping integer id -> character.
        char: When True, ``__getitem__`` returns strings instead of tensors.
    """

    def __init__(self, data_url: str, fname: str, seq_len: int=99) -> None:
        super().__init__()

        # Useful props
        self.__data_url = data_url
        self.__seq_len = seq_len
        self.__fname = fname

        # Populated through loading
        self.c2i: t.Dict[str, int] = {}
        self.i2c: t.Dict[int, str] = {}
        self.char: bool = False

        # Load the data
        self.__data = self.__load()

    def seq_to_txt(self, seq: t.Iterable[int]) -> str:
        """Decode a sequence of ids (a list of ints or a 1-D tensor) to text."""
        # int() lets tensor elements be used as dictionary keys
        return ''.join(self.i2c[int(i)] for i in seq)

    def __getitem__(self, index: int):
        """Return the ``index``-th (input window, next character) pair.

        Returns:
            ``(X, y)`` as tensors of ids when ``self.char`` is False, otherwise
            ``(text, character)`` as strings.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)

        # Support Python-style negative indices instead of returning a
        # truncated window for them
        if index < 0:
            index += size
        if not 0 <= index < size:
            raise IndexError('SequenceDataset index out of range')

        stop = index + self.__seq_len
        X = self.__data[index:stop]
        y = self.__data[stop]

        if not self.char:
            X = torch.tensor(X)
            y = torch.tensor(y)
            return X, y

        return self.seq_to_txt(X), self.i2c[y]

    def __len__(self) -> int:
        """Number of complete windows that still have a following character."""
        return max(0, len(self.__data) - self.__seq_len)

    def __load(self) -> t.List[int]:
        """Download the text if needed, build the mappings, return the encoded text."""
        # Download it if does not exist (works for paths outside the cwd too)
        if not os.path.exists(self.__fname):
            urllib.request.urlretrieve(self.__data_url, self.__fname)

        # Load data. The encoding is explicit so the result does not depend on
        # the platform locale; 'utf-8-sig' also drops a leading byte-order mark
        # so it does not become a vocabulary entry.
        with open(self.__fname, 'r', encoding='utf-8-sig') as f:
            text = f.read()

        # Preprocess data
        text = text.lower()

        # Build char-to-int and int-to-char dictionaries.
        # Sorting makes the ids identical across kernel restarts (plain set
        # iteration order changes with string hash randomisation).
        self.c2i = {ch: i for i, ch in enumerate(sorted(set(text)))}
        self.i2c = {i: ch for ch, i in self.c2i.items()}

        # Transform the data from chars to integers, using this instance's
        # own mapping rather than any notebook-level global
        return [self.c2i[ch] for ch in text]

# Create datasets
dataset = SequenceDataset(
    data_url='https://www.gutenberg.org/files/219/219-0.txt',
    fname='heart_of_darkness.txt'
)

# Split into Train & Validation.
# The generator is seeded so the split (and the training shuffle order) is the
# same after every Restart & Run All.
# NOTE(review): neighbouring windows overlap, so a random split lets validation
# windows share most of their characters with training windows.
gen = torch.Generator('cpu').manual_seed(42)
train_d, valid_d  = data.random_split(dataset, [0.85, 0.15], generator=gen)

# Specify the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define dataloaders.
# Only the training loader shuffles; validation does not need it, and keeping
# it off the shared generator means validation passes cannot change the
# training order.
batch_size = 128
train_dl = data.DataLoader(train_d, batch_size, shuffle=True, generator=gen)
valid_dl = data.DataLoader(valid_d, batch_size, shuffle=False)

## Define a model with
- An embedding layer with size 32
- Three LSTM layers with a hidden size of 256 and a dropout rate of 20%
- A final linear classification layer

In [None]:
import torch.nn as nn


class LSTMLayer(nn.Module):
    """One recurrent layer: an ``nn.LSTMCell`` with optional dropout on the incoming hidden state.

    Args:
        input_size: Number of features of the input ``x`` at one time step.
        hidden_size: Number of features of the hidden and cell states.
        dropout: When True, ``nn.Dropout(p=0.2)`` is applied to ``h`` before
            it is fed to the cell.
    """

    def __init__(self, input_size: int, hidden_size: int, dropout: bool=True):
        # nn.Module must be initialised before any sub-module is assigned;
        # without this call the assignments below raise AttributeError.
        super().__init__()

        self.lstm_cell = nn.LSTMCell(input_size, hidden_size, bias=True)
        self.dropout = nn.Dropout(p=0.2)
        self.drop = dropout

    def forward(self, x: torch.Tensor, h: torch.Tensor, c: torch.Tensor) -> t.Tuple[torch.Tensor, torch.Tensor]:
        """Advance the cell by one time step.

        Args:
            x: Input for this time step.
            h: Previous hidden state.
            c: Previous cell state.

        Returns:
            The new ``(h, c)`` pair produced by the LSTM cell.
        """
        # May or may not apply dropout on the initial hidden state
        if self.drop:
            h = self.dropout(h)

        # Apply the LSTM and return the new states
        return self.lstm_cell(x, (h, c))


class Model(nn.Module):
    """Next-character classifier: embedding -> three stacked LSTM layers -> linear layer.

    Args:
        num_embeddings: Vocabulary size; used both as the number of embedding
            rows and as the number of output classes.
    """

    def __init__(self, num_embeddings: int):
        super().__init__()

        embedding_dim = 32
        self.hidden_size = 256

        # From int to internal learnable embeddings
        self.embeddings = nn.Embedding(num_embeddings, embedding_dim=embedding_dim)

        # Define a RNN using three LSTM layers, applied one after another.
        # Only the first layer sees the embeddings; the upper layers take the
        # hidden state of the layer below, so their input size is hidden_size.
        self.rnn = nn.Sequential(
            LSTMLayer(input_size=embedding_dim, hidden_size=self.hidden_size, dropout=False),
            LSTMLayer(input_size=self.hidden_size, hidden_size=self.hidden_size, dropout=True),
            LSTMLayer(input_size=self.hidden_size, hidden_size=self.hidden_size, dropout=True)
        )
        self.dense = nn.Linear(in_features=self.hidden_size, out_features=num_embeddings, bias=True)

    def forward(self, input):
        """Predict the character that follows each input sequence.

        Args:
            input: Integer tensor of character ids, expected as
                ``(batch, seq_len)`` (the shape a DataLoader over
                ``SequenceDataset`` produces).

        Returns:
            Unnormalised class scores of shape ``(batch, num_embeddings)``.
        """
        embedded = self.embeddings(input)  # (batch, seq_len, embedding_dim)
        batch_size, seq_len = embedded.shape[0], embedded.shape[1]

        # One zero-initialised (h, c) pair per layer, on the input's device
        states = [
            (embedded.new_zeros(batch_size, self.hidden_size),
             embedded.new_zeros(batch_size, self.hidden_size))
            for _ in self.rnn
        ]

        # nn.Sequential cannot forward the (x, h, c) triple itself, so the
        # layers are stepped manually: time-major outer loop, layers inside.
        for step in range(seq_len):
            x = embedded[:, step]
            for idx, layer in enumerate(self.rnn):
                h, c = layer(x, *states[idx])
                states[idx] = (h, c)
                x = h  # the hidden state feeds the next layer up

        # Classify from the top layer's final hidden state
        return self.dense(states[-1][0])

## Define the training loop and train the model to predict the next character in the sequence

In [None]:
# define the training loop and train the model


## Evaluate the model by generating text

- Start with 99 characters (potentially chosen from a text)
- Generate a new character using the trained network
- Repeat the process by appending the generated character and making a prediction for a new one

In [None]:
# generate text