## Charlie: text generation based on one-hot encoded characters

This neural network is the simplest of those explored in this project. It takes sentences split into characters, which are then one-hot encoded to provide a machine-readable format as network input.

The first step in developing this neural network is defining its architecture by means of a grid search run for a few epochs, so that the loss can be computed and compared across the tested parameter combinations.

Once an architecture has been chosen and the other optimal parameters, such as the length of the recurrent window and the type of the neural network, have been found, the neural network is trained again on the whole dataset for many epochs to develop the final model, saving its weights in checkpoints at regular epoch intervals in order to avoid losing training progress.

In [1]:
# Dependencies
from src.dataset import WarAndPeace, OneHotEncode, RandomCrop, ToTensor
from src.dataset import split_train_test
from src.network import Charlie
from src.network import grid_search, save_epochs, load_epochs, train_test_epochs
from torch.utils.data import DataLoader
from torchvision import transforms
from torch import optim, nn
import matplotlib.pyplot as plt
import numpy as np
import torch
import json

%matplotlib inline

In [2]:
# CPU device handle, always available
cpu = torch.device('cpu')
# Pick the best device: CUDA GPU when one is available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = cpu

In [3]:
# Build the dataset from the book text, asking for a character-level split
dataset = WarAndPeace(
    './data/war-and-peace-tolstoj.txt',
    split_how='chars',
    min_len=10
)
# Report the dataset length (number of sentences)
print('Dataset length:', len(dataset))

Dataset length: 32594


In [4]:
# Preview the first 3 sentences (each one is a list of characters)
for sentence_idx in range(3):
    print(dataset[sentence_idx])

['"', 'w', 'e', 'l', 'l', ',', ' ', 'p', 'r', 'i', 'n', 'c', 'e', ',', ' ', 's', 'o', ' ', 'g', 'e', 'n', 'o', 'a', ' ', 'a', 'n', 'd', ' ', 'l', 'u', 'c', 'c', 'a', ' ', 'a', 'r', 'e', ' ', 'n', 'o', 'w', ' ', 'j', 'u', 's', 't', ' ', 'f', 'a', 'm', 'i', 'l', 'y', ' ', 'e', 's', 't', 'a', 't', 'e', 's', ' ', 'o', 'f', ' ', 't', 'h', 'e', ' ', 'b', 'u', 'o', 'n', 'a', 'p', 'a', 'r', 't', 'e', 's', '.', ' ']
['b', 'u', 't', ' ', 'i', ' ', 'w', 'a', 'r', 'n', ' ', 'y', 'o', 'u', ',', ' ', 'i', 'f', ' ', 'y', 'o', 'u', ' ', 'd', 'o', 'n', "'", 't', ' ', 't', 'e', 'l', 'l', ' ', 'm', 'e', ' ', 't', 'h', 'a', 't', ' ', 't', 'h', 'i', 's', ' ', 'm', 'e', 'a', 'n', 's', ' ', 'w', 'a', 'r', ',', ' ', 'i', 'f', ' ', 'y', 'o', 'u', ' ', 's', 't', 'i', 'l', 'l', ' ', 't', 'r', 'y', ' ', 't', 'o', ' ', 'd', 'e', 'f', 'e', 'n', 'd', ' ', 't', 'h', 'e', ' ', 'i', 'n', 'f', 'a', 'm', 'i', 'e', 's', ' ', 'a', 'n', 'd', ' ', 'h', 'o', 'r', 'r', 'o', 'r', 's', ' ', 'p', 'e', 'r', 'p', 'e', 't', 'r', 'a'

In [5]:
# Alphabet: the set of all distinct characters found in the dataset
# NOTE(review): a set of strings has no stable iteration order across kernel
# restarts; if OneHotEncode derives indices from that order, saved weights may
# not match a rebuilt alphabet - confirm in src.dataset
alphabet = set()
# Visit every sentence by index
for i in range(len(dataset)):
    # Distinct characters of the current sentence (a list of characters)
    sentence = set(dataset[i])
    # Merge them into the alphabet
    alphabet.update(sentence)

In [6]:
# Show all available characters, one column per character
# First row: zero-padded positions; second row: characters in iteration order
index_row = ' '.join('{:02d}'.format(pos) for pos in range(len(alphabet)))
char_row = ' '.join(str(char) + ' ' for char in alphabet)
print(index_row)  # Index
print(char_row)  # Characters

00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
z  c  .  ,  n     s  b  i  r  f  u  :  v  m  ?  w  d  h  e  o  j  g  )  x  l  a  k  p  q  y  ;  "  (  '  t  ! 


In [7]:
# Preprocessing steps, applied in this order by Compose
pipeline_steps = [OneHotEncode(alphabet), RandomCrop(7), ToTensor()]
# Attach the composed transformation to the dataset
dataset.transform = transforms.Compose(pipeline_steps)

In [8]:
# Show the shapes of the first 3 transformed sentences
shape_message = 'Sentence nr {:d} has shape {:d} x {:d}'
for sentence_nr in range(1, 4):
    # Sentences are numbered from 1 in the report, but indexed from 0
    sentence_shape = dataset[sentence_nr - 1].shape
    print(shape_message.format(sentence_nr, *sentence_shape))

Sentence nr 1 has shape 7 x 37
Sentence nr 2 has shape 7 x 37
Sentence nr 3 has shape 7 x 37


### Hyperparameter tuning

In [9]:
# Split the initial dataset into train and test parts
# (presumably 0.8 is the train fraction - confirm in src.dataset)
train_dataset, test_dataset = split_train_test(dataset, 0.8)
# Train batches are reshuffled, test batches keep their order
train_dataloader = DataLoader(dataset=train_dataset, batch_size=1000, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=1000, shuffle=False)

In [10]:
# Switch for the hyperparameters grid search
HYPER_TRAIN = True

# Run the search only when it is enabled
if HYPER_TRAIN:

    # Candidate values for every searched parameter; keys prefixed with
    # net__ / optim__ are forwarded to grid_search unchanged
    search_space = dict(
        net=[Charlie],
        net__input_size=[len(alphabet)],
        net__hidden_units=[512, 256],
        net__layers_num=[5, 4, 3],
        net__hidden_type=['GRU', 'LSTM'],
        optim=[optim.Adam],
        optim__weight_decay=[5e-4, 5e-3, 5e-2],
        loss_fn=[nn.CrossEntropyLoss],
    )

    # Evaluate every combination for a few epochs
    search_results = grid_search(
        train_dl=train_dataloader,
        test_dl=test_dataloader,
        **search_space,
        num_epochs=10,
        verbose=True,
        device=device
    )
    params, train_losses, train_times, test_losses, test_times = search_results

    # Persist the search results to disk
    save_epochs(
        path='data/charlie/hyper.json',
        params=params,
        train_losses=train_losses,
        train_times=train_times,
        test_losses=test_losses,
        test_times=test_times
    )

Current network parameters (1):
(net: <class 'src.network.Charlie'>, net__input_size: 37, net__hidden_units: 512, net__layers_num: 5, net__hidden_type: GRU, optim: <class 'torch.optim.adam.Adam'>, optim__weight_decay: 0.0005, loss_fn: <class 'torch.nn.modules.loss.CrossEntropyLoss'>)
Training done with loss 0.299 in 221 seconds
Test done with loss 2.978 in 5 seconds

Current network parameters (2):
(net: <class 'src.network.Charlie'>, net__input_size: 37, net__hidden_units: 512, net__layers_num: 5, net__hidden_type: GRU, optim: <class 'torch.optim.adam.Adam'>, optim__weight_decay: 0.005, loss_fn: <class 'torch.nn.modules.loss.CrossEntropyLoss'>)
Training done with loss 0.298 in 220 seconds
Test done with loss 2.968 in 5 seconds

Current network parameters (3):
(net: <class 'src.network.Charlie'>, net__input_size: 37, net__hidden_units: 512, net__layers_num: 5, net__hidden_type: GRU, optim: <class 'torch.optim.adam.Adam'>, optim__weight_decay: 0.05, loss_fn: <class 'torch.nn.modules.los

NameError: name 'save_epochs' is not defined

In [None]:
# Read the stored grid search results back from disk
loaded_results = load_epochs(path='data/charlie/hyper.json')
params, train_losses, train_times, test_losses, test_times = loaded_results

# Report every parameters combination along with its losses and timings
for i in range(len(params)):
    # Header and the combination itself, formatted as "(key: value, ...)"
    print('Current network parameters ({:d}):'.format(i + 1))
    described = ', '.join('{:}: {:}'.format(kw, arg) for kw, arg in params[i].items())
    print('(' + described + ')')
    # Train step summary (both pieces end up on one line)
    print(
        'Training done with loss {:.03f}'.format(train_losses[i]),
        'in {:.0f} seconds'.format(train_times[i])
    )
    # Test step summary (both pieces end up on one line)
    print(
        'Test done with loss {:.03f}'.format(test_losses[i]),
        'in {:.0f} seconds'.format(test_times[i])
    )
    print()

In [None]:
# Plot execution times vs loss

# Initialize a figure with two side-by-side panels
fig, axs = plt.subplots(1, 2, figsize=(21, 7))

# One entry per panel: axis, title, times and losses to draw
panels = [
    (axs[0], 'Training time vs loss', train_times, train_losses),
    (axs[1], 'Test time vs loss', test_times, test_losses),
]

# Small offset so the index label does not sit exactly on its marker
corr = 0.01

for ax, title, times, losses in panels:
    # Set title and labels
    ax.set_title(title)
    ax.set_xlabel('Time (seconds)')
    ax.set_ylabel('Loss (float)')
    # Draw one marker per parameters combination
    ax.scatter(x=times, y=losses)
    # Label each marker with the 1-based index of its combination
    for i in range(len(params)):
        ax.annotate(str(i+1), xy=(times[i] + corr, losses[i] + corr))

# Show plot
plt.show()

### Model training

In [None]:
# Settings of the network to be trained
net_config = dict(input_size=len(alphabet), hidden_units=512, layers_num=3, hidden_type='GRU')
# Build a fresh network and move it to the selected device
net = Charlie(**net_config)
net.to(device)

In [None]:
# Define optimizer
# torch.nn.Module exposes its trainable tensors through parameters(); it has no
# built-in params() method, so the previous net.params() call would raise
# AttributeError unless Charlie defines one itself.
# NOTE(review): assumes Charlie subclasses torch.nn.Module - confirm in src.network
optimizer = optim.Adam(net.parameters(), weight_decay=0.005)
# Define loss function
loss_fn = nn.CrossEntropyLoss()

# Define number of epochs
num_epochs = 1000000
# Define number of epochs per checkpoint
save_after = 100

# Make train and test, writing the model and the per-epoch results to the
# given paths every save_after epochs
train_test_epochs(
    net=net,
    train_dl=train_dataloader,
    test_dl=test_dataloader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    num_epochs=num_epochs,
    save_after=save_after,
    net_path='data/charlie/model.pth',
    epochs_path='data/charlie/train.json',
    verbose=True,
    device=device
)