In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import os
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from matplotlib import pyplot as plt

import torchtext
import collections

import pickle
import random

### Verify if CUDA is available

In [2]:
# If CUDA is available print devices
if torch.cuda.is_available():
    print('CUDA devices:')
    for device in range(0, torch.cuda.device_count()):
        print('\t{} - {}'.format(device, torch.cuda.get_device_name(device)))
else:
    print('No CUDA devices')

CUDA devices:
	0 - GeForce RTX 2060


### Define encoder and decoder

In [3]:
class Encoder(nn.Module):
  def __init__(self, embedding_size, encoding_size, embedding_layer, use_cuda=False):
    super(Encoder, self).__init__()

    self.encoding_size = encoding_size    
    self.embedding = embedding_layer
    
    self.encoder = nn.GRU(
      input_size=embedding_size, 
      hidden_size=encoding_size,
      dropout=0.2,
      num_layers=3,
      bias=True,
      batch_first=True,
      bidirectional=False
    )
  
  def forward(self, words_indices):
    embeddings = self.embedding(words_indices)
    _, hidden  = self.encoder(embeddings)
    return hidden, hidden

In [4]:
class Decoder(nn.Module):

  def __init__(self, encoding_size, embedding_size, embedding_layer, vocab_size, use_cuda=False):
    super(Decoder, self).__init__()

    self.encoding_size = encoding_size
    self.embedding = embedding_layer

    self.decoder = nn.GRU(
      input_size=embedding_size, 
      hidden_size=encoding_size,
      dropout=0.2,
      num_layers=3,
      bias=True,
      batch_first=True,
      bidirectional=False
    )

    self.dim_linear = nn.Linear(encoding_size, vocab_size)
    self.dim_fn = nn.ReLU()    

  def forward(self, words_indices, init_hidden, init_memory):
    with torch.no_grad():
        embeddings = self.embedding(words_indices)
        
    output_, _ = self.decoder(embeddings, init_hidden)
    linear = self.dim_fn(self.dim_linear(output_))
    
    return linear

### Defining functions related with transforming data

In [5]:
# This function recover a sentence from word's indices
def get_text_fn(vocab):
    itos = {}
    for word in vocab:
        itos[vocab[word]] = word
        
    def get_text(example):
        text = []
        for idx in example:
            text.append(itos[idx])

        return ' '.join(text)
    return get_text

### Define variables related with loading information and training/validation data

In [6]:
VAL_PARTITION = 0.3 
EXAMPLES_PER_EPOCH = 1
BATCH_SIZE = 8
DATASET_FILENAME = '../datasets/books_dataset.pk'
CHECKPOINT_BASE = 'checkpoints'
ENCODER_CHECKPOINT_FILE = 'Hemingway_encoder'
DECODER_CHECKPOINT_FILE = 'Hemingway_decoder'
AUTHOR = 'Ernest Hemingway'

# Keys for dictionary
MEAN_KEY = 'SentenceLengthMean'
DATASET_KEY = 'Dataset'
AUTHOR_KEY = 'Author'
BOOKS_KEY = 'Books'
DATASET_SENTENCES_MEAN = 'SentenceLengthMean'
BOOKS_PATH_KEY = 'Path'
BOOK_CONTENT_KEY = 'Content'
BOOK_SENTENCES_KEY = 'Sentences'
BOOK_SENTENCES_HIST_KEY = 'SentenceLengthHist'
BOOK_SENTENCES_MEAN = 'SentenceLengthMean'

PAD_CHAR = '<pad>'
UNKNOWN_CHAR = '<unk>'
START_CHAR = '<str>'
END_CHAR = '<end>'


### Loading word vectors and trainig/validation dataset

In [7]:
class BookDataset(torch.utils.data.Dataset):
    def __init__(
        self, 
        data, 
        vocabulary,
        unknown_char=UNKNOWN_CHAR, 
        pad_char=PAD_CHAR, 
        start_char=START_CHAR, 
        end_char=END_CHAR
    ):
        # Defines dataset
        self.data = []
        
        # For all sentences
        for sentence in data:
            # Add start character
            example = [vocabulary[start_char]]
                       
            for word in sentence:
                if word not in vocabulary:
                    example.append(vocabulary[unknown_char])
                else:
                    example.append(vocabulary[word])
                    
            # Add end character
            example.append(vocabulary[end_char])
            self.data.append(example)
            
        self.data = torch.LongTensor(self.data)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [8]:
# Load all authors datasets
with open(DATASET_FILENAME, 'rb') as f:
    datasets = pickle.load(f)

# Find current author dataset
dataset = None
for ds in datasets[DATASET_KEY]:
    if ds[AUTHOR_KEY] == AUTHOR:
        dataset = ds
        break

In [9]:
# Creates vocabulary
vocab = {UNKNOWN_CHAR: 0, PAD_CHAR: 1, START_CHAR: 2, END_CHAR: 3}

# Get sentences mean of author's books
sentence_length = math.ceil(dataset[DATASET_SENTENCES_MEAN])

# Join books of same author in a unique dataset
full_dataset = []
for book in dataset[BOOKS_KEY]:
    # Balance sentences length to their mean
    for sentence in book[BOOK_SENTENCES_KEY]:
        balanced_sentence = [word.lower().strip() for word in sentence[:sentence_length]]
        balanced_sentence.extend([PAD_CHAR] * (sentence_length - len(balanced_sentence)))
        full_dataset.append(balanced_sentence)

# Shuffle dataset
random.shuffle(full_dataset)

# Partition and creation of training and validation datasets
partition_idx = math.floor(len(full_dataset) * (1 - VAL_PARTITION))

# Build vocab with training set
for sentence in full_dataset[:partition_idx]:
    for word in sentence:
        if word not in vocab:
            vocab[word] = len(vocab)

train_dataset = BookDataset(full_dataset[:partition_idx], vocab)
test_dataset = BookDataset(full_dataset[partition_idx:], vocab)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

### Defining variables related with training

In [10]:
USE_CUDA = torch.cuda.is_available()
EMBEDDING_SIZE = 300
ENCODING_SIZE = 2048

BATCH_SIZE = 1
LEARNING_RATE = 1e-5
EPOCHS = 50
VOCAB_SIZE = len(vocab)

In [11]:
embedding_layer = nn.Embedding(VOCAB_SIZE, EMBEDDING_SIZE)   

### Defining training components

In [12]:
encoder = Encoder(EMBEDDING_SIZE, ENCODING_SIZE, embedding_layer, use_cuda=USE_CUDA)
decoder = Decoder(ENCODING_SIZE, EMBEDDING_SIZE, embedding_layer, VOCAB_SIZE, use_cuda=USE_CUDA)

try:
    encoder.load_state_dict(torch.load(os.path.join(CHECKPOINT_BASE, ENCODER_CHECKPOINT_FILE + '.pt')))
    decoder.load_state_dict(torch.load(os.path.join(CHECKPOINT_BASE, DECODER_CHECKPOINT_FILE + '.pt')))
except:
    print('No checkpoints found. New training.')
    pass

if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()


No checkpoints found. New training.


In [13]:
def get_loss_function():
    loss_fn = nn.CrossEntropyLoss(reduction='mean')
    
    def my_loss_fn(target, predicted):
        predicted_size = predicted.size()
        flatten_size = predicted_size[0] * predicted_size[1]
        
        target_ = target.reshape([flatten_size])
        predicted_ = predicted.reshape([flatten_size, predicted_size[2]])
        return loss_fn(predicted_, target_)
    
    return my_loss_fn

In [14]:
optimizer = torch.optim.Adam(
    list(encoder.parameters()) + list(decoder.parameters()), 
    lr=LEARNING_RATE,
    betas=(0.1, 0.999)
)

loss_fn = get_loss_function()

In [15]:
get_text = get_text_fn(vocab)

### Training and validation

In [16]:
def train_step(encoder, decoder, loss_fn, optimizer, batch, use_cuda):
    encoder.train()
    decoder.train()

    if use_cuda:
      batch = batch.cuda()

    encoder.zero_grad()
    decoder.zero_grad()
    optimizer.zero_grad()

    representation, memory = encoder(batch[:,1:])
    
    if use_cuda:
      representation = representation.cuda()
      memory = memory.cuda()
    
    decodings = decoder(batch[:,:-1], representation, memory)
    loss = loss_fn(batch[:,1:], decodings)
    loss.backward()
    optimizer.step()

    return loss.item()
    

In [17]:
def val_step(encoder, decoder, loss_fn, batch, use_cuda):
    encoder.eval()
    decoder.eval()

    with torch.no_grad():

        if use_cuda:
            batch = batch.cuda()

        representation, memory = encoder(batch[:,1:])

        if use_cuda:
            representation = representation.cuda()
            memory = memory.cuda()
        
        decodings = decoder(batch[:,:-1], representation, memory)

        loss = loss_fn(batch[:,1:], decodings)

        return loss.item(), decodings

In [18]:
# Define variables for history
train_loss_history = []
val_loss_history = []

In [19]:
# Define steps where examples will be sampled 
example_step = math.floor(len(train_dataloader) / EXAMPLES_PER_EPOCH)
test_examples = iter(test_dataloader)


last_val_loss = None

# For EPOCHS
for epoch in range(EPOCHS):
    
  print('*************************** EPOCH {} ***************************'.format(epoch))

  # Restart train and validation datasets
  examples = iter(train_dataloader)

  # Progress bar for training dataset
  progress_bar = tqdm(range(len(train_dataloader)))
  train_loss = 0
  
  # For all data in training dataset
  
  for batch_idx in progress_bar:

    # Add train loss to progress bar
    progress_bar.set_description('Loss: {}'.format(train_loss / (batch_idx + 1)))
    
    # Train step
    example = next(examples)
    train_loss += train_step(encoder, decoder, loss_fn, optimizer, example, USE_CUDA)


    if batch_idx % example_step == 0:
      with torch.no_grad():
        try:
          example = next(test_examples)
        except:
          test_examples = iter(test_dataloader)
          example = next(test_examples)

        _, decodings = val_step(encoder, decoder, loss_fn, example, USE_CUDA)
        
        decodings = torch.argmax(decodings[0], dim=-1).cpu().numpy()

        print('\nReal: {}'.format(get_text(example[0].numpy())))
        print('Decoded: {}'.format(get_text(decodings)))

  
  val_examples = iter(test_dataloader)
  with torch.no_grad():
    progress_bar = tqdm(range(len(test_dataloader)))
    val_loss = 0

    for batch_idx in progress_bar:
      progress_bar.set_description('Val loss: {}'.format(val_loss / (batch_idx + 1)))
      example = next(val_examples)

      val_loss += val_step(encoder, decoder, loss_fn, example, USE_CUDA)[0]
       

    if last_val_loss is None or val_loss < last_val_loss:
      last_val_loss = val_loss

      torch.save(encoder.state_dict(), os.path.join(CHECKPOINT_BASE, ENCODER_CHECKPOINT_FILE + '_{}.pt').format(epoch))
      torch.save(decoder.state_dict(), os.path.join(CHECKPOINT_BASE, DECODER_CHECKPOINT_FILE + '_{}.pt').format(epoch))
    
  train_loss_history.append(train_loss / len(train_dataloader))
  val_loss_history.append(val_loss / len(test_dataloader))
    

*************************** EPOCH 0 ***************************


HBox(children=(FloatProgress(value=0.0, max=207.0), HTML(value='')))


Real: <str> maybe it will open with the sun , he thought <pad> <pad> <pad> <pad> <pad> <pad> <pad> <end>
Decoded: began teetered teetered baseball baseball baseball motion motion joe joe several right right effectively horned horned yet yet



HBox(children=(FloatProgress(value=0.0, max=89.0), HTML(value='')))


*************************** EPOCH 1 ***************************


HBox(children=(FloatProgress(value=0.0, max=207.0), HTML(value='')))


Real: <str> i spoke to him about the <unk> in milano <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <end>
Decoded: he he the <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>



HBox(children=(FloatProgress(value=0.0, max=89.0), HTML(value='')))


*************************** EPOCH 2 ***************************


HBox(children=(FloatProgress(value=0.0, max=207.0), HTML(value='')))


Real: <str> “ sure is hell <unk> it down , joe ” he ’ d say and <unk> back <end>
Decoded: he he the the the the the the the the the the the the the the the the



HBox(children=(FloatProgress(value=0.0, max=89.0), HTML(value='')))


*************************** EPOCH 3 ***************************


HBox(children=(FloatProgress(value=0.0, max=207.0), HTML(value='')))


Real: <str> " it was only his turn , " he said <pad> <pad> <pad> <pad> <pad> <pad> <pad> <end>
Decoded: " he was the the the the the <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


KeyboardInterrupt: 

In [None]:
x = [epoch for epoch in range(EPOCHS)]
plt.figure(figsize=(15, 8))

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and validation loss')

plt.plot(x, train_loss_history, label='Training loss')
plt.plot(x, val_loss_history, label='Validation loss')

plt.legend()

plt.show()