## Worden: text generation based on vector-encoded words



In [None]:
# Dependencies
# Project-local modules: dataset handling, embeddings and the network itself
from src.dataset import WarAndPeace, WordToVector, RandomCrop, ToTensor
from src.dataset import split_train_test
# NOTE: the package is `src` (as in every other local import), not `scr`
from src.embeddings import Embeddings, Glove
from src.network import Worden
from src.network import grid_search
# Third-party libraries
from torch.utils.data import DataLoader
from torchvision import transforms
from torch import optim, nn
import matplotlib.pyplot as plt
import numpy as np
import torch

%matplotlib inline

In [None]:
# The CPU is always available and doubles as the fallback device
cpu = torch.device('cpu')
# Pick the best device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = cpu

In [None]:
# Build the dataset from the raw text of the novel
dataset = WarAndPeace(
    './data/war-and-peace-tolstoj.txt',
    split_how='words',
    min_len=10
)
# Report the dataset size (number of sentences)
print('Dataset length:', len(dataset))

In [None]:
# Show the first 10 sentences
for sentence_index in range(10):
    print(dataset[sentence_index])

In [None]:
# Collect the vocabulary: every distinct word appearing in any sentence
words = set()
# Visit the sentences by index, using only len() and [] on the dataset
for sentence_index in range(len(dataset)):
    # Add the words of this sentence (a list of words); the set drops duplicates
    words.update(dataset[sentence_index])

In [None]:
# Show 10 of the available words (a set is unordered, so the selection is arbitrary)
print(', '.join(list(words)[:10]))

### Embeddings

In [None]:
# Number of components in every word vector
embedding_dim = 50

# Report the chosen dimension
print('Embedding dimension is: {:d}'.format(embedding_dim))

In [None]:
# Load embeddings from glove, passing along the set of words found in the text
# (presumably so that only those entries are kept - confirm in Glove.from_file).
# The file name is derived from embedding_dim, so that the loaded vectors always
# have the same size as the embeddings sampled with dim=embedding_dim; the
# glove.6B archive ships 50d, 100d, 200d and 300d files.
glove = Glove.from_file(
    path='data/glove.6B/glove.6B.{:d}d.txt'.format(embedding_dim),
    words=words
)

In [None]:
# Preview the first 11 glove embeddings
print('Glove embeddings:')
# Walk the (word, vector) pairs, counting them from 1
for position, (word, vector) in enumerate(glove.items(), start=1):
    # Only the first 10 components of each vector are shown
    preview = ' '.join(str(component) for component in vector[:10])
    print('{0:s} ({1:d}):\t{2:s} ...'.format(word, position, preview))
    # Stop right after the 11th entry has been printed
    if position >= 11:
        break
print('...')

In [None]:
# Plot mean and variance distribution in glove embeddings
# NOTE(review): what is actually drawn is decided by Embeddings.plot_embeddings,
# which is defined outside this notebook
Embeddings.plot_embeddings(glove)

In [None]:
# Embeddings for words unknown to glove are sampled from a normal distribution,
# using the parameters found for the glove embeddings:
# mean is circa 0.0, std. dev. is circa 0.6
mean, std = 0.01, 0.61

# Start from randomly sampled embeddings for every word
embeddings = Embeddings.from_normal(mean, std, dim=embedding_dim, words=words)
# Substitute each sampled vector with the glove one, if available
# (only existing keys are reassigned, so the set of keys does not change)
for word, sampled_vector in embeddings.items():
    embeddings[word] = glove.get(word, sampled_vector)

# List of words, in the same order as the vectors below
words = list(embeddings.keys())
# Vectors as a float tensor
vectors = torch.tensor(list(embeddings.values()), dtype=torch.float)

In [None]:
# Plot mean and variance distribution in retrieved embeddings
# (glove vectors where available, randomly sampled ones otherwise)
# NOTE(review): what is actually drawn is decided by Embeddings.plot_embeddings,
# which is defined outside this notebook
Embeddings.plot_embeddings(embeddings)

In [None]:
# Attach the transformation to the dataset: three steps chained in order.
# NOTE(review): judging by the names, words are encoded using the `words` list,
# a window of 7 items is cropped at random and the result is turned into a
# tensor - confirm against src.dataset
dataset.transform = transforms.Compose([WordToVector(words), RandomCrop(7), ToTensor()])

In [None]:
# Show the shapes of the first 3 sentences, numbered from 1
for sentence_nr in range(1, 4):
    # Fetch the sentence through the dataset, shifting back to a 0-based index
    sentence_shape = dataset[sentence_nr - 1].shape
    print('Sentence nr {:d} has shape {:d}'.format(sentence_nr, *sentence_shape))

### Hyperparameter tuning

In [None]:
# Split the initial dataset in a train and a test dataset, with ratio 0.9
# (presumably the share kept for training - confirm in split_train_test)
train_dataset, test_dataset = split_train_test(dataset, 0.9)
# Both dataloaders serve batches of 1000 items; only the train one is shuffled
train_dataloader, test_dataloader = (
    DataLoader(train_dataset, batch_size=1000, shuffle=True),
    DataLoader(test_dataset, batch_size=1000, shuffle=False),
)

In [None]:
# Run hyperparameters evaluation through grid search.
# Settings wrapped in a list are presumably the candidate values explored for
# that hyperparameter (confirm in src.network.grid_search).
params, train_losses, train_times, test_losses, test_times = grid_search(
    # Set training and testing dataloaders
    train=train_dataloader, test=test_dataloader,
    # Define network architecture
    net=Worden,
    net__embedding_dim=[embedding_dim],
    net__hidden_units=[512, 256],
    net__layers_num=[5, 4, 3],
    net__hidden_type=['GRU', 'LSTM'],
    net__trained_embeddings=[vectors], net__freeze_embeddings=[False],
    # Define optimizer
    optim=[optim.Adam], optim__weight_decay=[5e-4, 5e-3, 5e-2],
    # Define loss function: `nn` already is torch.nn, so the class is nn.NLLLoss
    # (nn.nn.NLLLoss raises AttributeError)
    loss_fn=[nn.NLLLoss],
    # Define (low) number of epochs
    num_epochs=10,
    # Set network device (CPU or GPU)
    device=device
)