# Text RNN Autoencoder

In [None]:
# IPython magics: load the autoreload extension in mode 2 and render
# matplotlib figures inline in the notebook output.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

### Do the basic imports

In [None]:
import os
import sys

import torch
import torch.nn as nn

import numpy as np
import json
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader

In [None]:
import os
import sys

# Resolve the user's home directory once; the data/model paths in the
# following cells are all built relative to it.
home_dir = os.path.expanduser('~')

# Make the local ml-toolkit checkout importable (source of the `pytorch.*`
# packages imported in later cells).
toolkit_root = "{}/dev/github/ml-toolkit".format(home_dir)
sys.path.append(toolkit_root)

## Load Data

In [None]:
from pytorch.utils.data.text.vectorizer import Vectorizer

# Reserved vocabulary indexes for the special tokens.
special_tokens = {0: '<pad>', 1: '<unk>', 2: '<sos>', 3: '<eos>'}
vectorizer = Vectorizer(default_indexes=special_tokens)

# Load the dictionary file; word_col=0 is passed through to load_dictionary
# (presumably the column holding the word -- confirm against Vectorizer).
dictionary_path = '{}/data/datasets/hotel-reviews-txt/dictionary'.format(home_dir)
vectorizer.load_dictionary(dictionary_path, word_col=0)

print(vectorizer.vocab_size)

In [None]:
# Read the training file: each line is a whitespace-separated list of integer
# word indexes. `str.split()` without arguments already ignores leading and
# trailing whitespace, so no explicit strip is needed.
train_file = '{}/data/datasets/hotel-reviews-txt/train_permute.txt'.format(home_dir)
with open(train_file) as infile:
    train_seq_list = [[int(token) for token in line.split()] for line in infile]

# Let the vectorizer prepare the sequences (no auto padding, max_len=50,
# index 1 used as the unknown index); also returns indices and lengths.
X, indices, lengths = vectorizer.prepare_sequences(train_seq_list, auto_padding=False, max_len=50, unknown_idx=1, return_lengths=True)

# Free up some memory
train_seq_list = None

### Print an example

In [None]:
# Pick the prepared sequence at position 2, convert it back with
# vectorizer.sequence_to_text and print the result as a sanity check.
idx = 2
seq = X[idx]
seq_decoded = vectorizer.sequence_to_text(seq)
print(seq_decoded)

### Get max index (parameter for embedding layer)

In [None]:
# Largest word index that occurs in any prepared sequence; the model's
# vocab_size is later set to max_idx + 1.
max_idx = int(max(np.max(seq) for seq in X))

### Sample dataset for testing

In [None]:
# Keep only the first `num_samples` prepared sequences as the training set
# (a small subset, per the heading: for testing).
num_samples = 1000
X_train = X[:num_samples]

# Report how many sequences ended up in the training set.
print("Size of training set: {}".format(len(X_train)))

In [None]:
from pytorch.utils.data.text.wordvectorloader import WordVectorLoader

# Toggle: set to True to build an embedding matrix from the GloVe dump.
use_pretrained_embeddings = False

if use_pretrained_embeddings is True:
    # 300 is passed to WordVectorLoader; it matches the model's embed_dim.
    word_vector_loader = WordVectorLoader(300)
    glove_file = '{}/data/dumps/glove/glove.840B.300d.txt'.format(home_dir)
    embed_mat = word_vector_loader.create_embedding_matrix(
        glove_file,
        vectorizer.vocabulary.word_to_index,
        max_idx,
        init='random',
        verbatim=True,
    )
    print(embed_mat.shape)

### Create training data iterator

In [None]:
# Batch size handed to BucketBatchSampler when the training iterator is built.
batch_size = 32

In [None]:
from pytorch.utils.data.text.dataset import BucketBatchSampler, BucketDataset

# The sampler is built from the training sequences and the batch size; the
# dataset wraps the same sequences (second argument is None).
bucket_batch_sampler = BucketBatchSampler(X_train, batch_size)
bucket_dataset = BucketDataset(X_train, None)

# `batch_sampler` yields whole batches of indices and is mutually exclusive
# with `batch_size`, `shuffle` and `drop_last`, so those arguments are left at
# their defaults instead of being spelled out (the former `batch_size=1` did
# not describe the real batch size).
X_train_iter = DataLoader(bucket_dataset, batch_sampler=bucket_batch_sampler, num_workers=8)

# Number of batches per epoch.
print(len(X_train_iter))

## Create network model

In [None]:
from pytorch.models.text.autoencoder.textrnnae import RnnType, Parameters

### Use GPU if available

In [None]:
# CUDA for PyTorch: use the first GPU when one is available, otherwise fall
# back to the CPU. The availability check must not be overridden -- forcing
# `use_cuda = True` selects "cuda:0" on CPU-only machines and the model then
# fails as soon as it is moved to the device.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
print(device)

In [None]:
# Directory (with trailing slash) that receives the hyper-parameter dump.
path = '{}/data/ml-toolkit/pytorch-models/text-rnn-ae/'.format(home_dir)

# Hyper-parameters of the autoencoder as a plain dict; vocab_size is derived
# from the largest index seen in the data.
param_dict = {
    'rnn_type': RnnType.LSTM,
    'rnn_hidden_dim': 512,
    'num_layers': 1,
    'bidirectional_encoder': True,
    'dropout': 0.0,
    'vocab_size': max_idx+1,
    'embed_dim': 300,
    'clip': 0.5,
    'encoder_lr': 0.001,
    'decoder_lr': 0.001,
    'teacher_forcing_prob': 0.0,
    'linear_dims': [],
    'z_dim': 1024,
}

print(param_dict)

# Persist the dict as JSON next to the model files.
params_file = path+'params.json'
with open(params_file, 'w') as outfile:
    json.dump(param_dict, outfile)

# Wrap the dict in the Parameters object that the model constructor receives.
params = Parameters(param_dict)

In [None]:
from pytorch.models.text.autoencoder.textrnnae import TextRnnAE

# Build the autoencoder from the chosen device, the parameters and an NLL
# loss criterion.
criterion = nn.NLLLoss()
text_rnn_ae = TextRnnAE(device, params, criterion)

# Print the encoder first, then the decoder.
for part_name in ('encoder', 'decoder'):
    print(getattr(text_rnn_ae, part_name))

### Set pretrained word embeddings if needed

In [None]:
# With pretrained embeddings: copy the matrix into the embedding layer.
if use_pretrained_embeddings is True:
    pretrained_weights = torch.from_numpy(embed_mat)
    text_rnn_ae.embedding.weight.data.copy_(pretrained_weights)

# Pretrained weights are frozen; otherwise the embedding layer is trainable.
text_rnn_ae.embedding.weight.requires_grad = (use_pretrained_embeddings is not True)

## Train model

In [None]:
# Collects the loss value returned for each epoch by the training loop below;
# the plotting cell reads it afterwards.
losses = []

In [None]:
# Training configuration.
num_epochs = 100
safe_after_epoch = False  # when truthy, the models are saved after every epoch

# Target files used by save_models().
encoder_file_name = '{}/data/ml-toolkit/pytorch-models/text-rnn-ae/textrnnae-encoder.model'.format(home_dir)
decoder_file_name = '{}/data/ml-toolkit/pytorch-models/text-rnn-ae/textrnnae-decoder.model'.format(home_dir)

# Switch the model to training mode before the loop.
text_rnn_ae.train()

#text_rnn_ae.set_learning_rates(0.001, 0.001)
for epoch in range(num_epochs):
    # One pass over the training iterator; the returned value is logged and
    # recorded in `losses`.
    epoch_loss = text_rnn_ae.train_epoch(epoch, X_train_iter, verbatim=True)
    print(epoch_loss)
    losses.append(epoch_loss)

    if safe_after_epoch:
        text_rnn_ae.save_models(encoder_file_name, decoder_file_name)

    # Called with 0.99 for both arguments after every epoch (presumably a
    # multiplicative decay for encoder and decoder learning rates -- confirm
    # against TextRnnAE.update_learning_rates).
    text_rnn_ae.update_learning_rates(0.99, 0.99)

# Back to evaluation mode once training is finished.
text_rnn_ae.eval()

In [None]:
# Scale the recorded losses by their maximum so the curve tops out at 1.
max_loss = np.max(losses)
losses_normalized = np.asarray(losses) / max_loss

# Plot the normalized loss curve; the y-label names the embedding and hidden
# dimensions of the model.
y_label = 'RNN-AE (e_dim={}, h_dim={})'.format(params.embed_dim, params.rnn_hidden_dim)
plt.plot(losses_normalized, label='loss')
plt.legend(loc='upper right')
plt.ylabel(y_label)
plt.show()

## Evaluate model

In [None]:
def check_sequence(sequence, model, vectorizer, max_length=100):
    """Reconstruct one sequence with the model and return both texts.

    Args:
        sequence: Sequence of integer word indexes.
        model: Object exposing a ``device`` attribute and an ``evaluate``
            method that accepts a batch tensor and returns decoded indexes.
        vectorizer: Object whose ``sequence_to_text`` turns a sequence of
            indexes into an iterable of string tokens.
        max_length: Accepted for interface compatibility; currently not used
            by this function.

    Returns:
        Tuple ``(original_text, decoded_text)``, each a space-joined string.
    """
    original_sequence = vectorizer.sequence_to_text(sequence)
    # Wrap the sequence in a list so the model receives a batch of size one.
    batch = torch.tensor([sequence], dtype=torch.long).to(model.device)
    # Pure inference: disable autograd so no graph is built and no activation
    # memory is retained while decoding.
    with torch.no_grad():
        decoded_indices = model.evaluate(batch)
    decoded_sequence = vectorizer.sequence_to_text(decoded_indices)
    return ' '.join(original_sequence), ' '.join(decoded_sequence)
    
# Smoke test: print the (original, decoded) pair for the first training
# sequence. max_length is passed here but check_sequence does not use it.
print(check_sequence(X_train[0], text_rnn_ae, vectorizer, max_length=50))

### Check a sample of the training data

In [None]:
# Show original vs. reconstructed text for the leading training sequences.
# The slice covers indexes 0..201, i.e. exactly the items printed before the
# loop would stop on `idx > 200`.
for idx, s in enumerate(X_train[:202]):
    original, decoded = check_sequence(s, text_rnn_ae, vectorizer)
    print(
        "================================================",
        "",
        original,
        ">>>",
        decoded,
        "",
        sep="\n",
    )