In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from IPython.display import clear_output
import re
import os
import unicodedata
import numpy as np
from tqdm import tqdm
import time
%load_ext autoreload
%autoreload 2

os.environ["CUDA_VISIBLE_DEVICES"]="0"

### Dataset load and prepare

In [2]:
from lib.utils import load_imdb_data
from lib.utils import vocab_idxs

In [3]:
try:
    reviews = torch.load("data/reviews")
except:    
    path = 'aclImdb/train/unsup/'
    reviews = load_imdb_data(path, gen=False)
    torch.save(reviews, "data/reviews")

In [4]:
try:
    reviews_40 = torch.load("data/reviews_40")
except:
    reviews_40 = load_imdb_data(path, gen=True)
    torch.save(reviews_40, "data/reviews_40")

In [5]:
try:
    vocab, word2id, id2word = torch.load("data/vocab_idxs")
except:
    vocab, word2id, id2word = vocab_idxs(reviews) 
    torch.save([vocab, word2id, id2word], "data/vocab_idxs")
    

In [6]:
try:
    matrix = torch.load("data/matrix")
except:    
    matrix = sents2matrix(reviews_40, word2id)
    torch.save(matrix, "data/matrix")

### Generator Model

In [7]:
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# create Tensor datasets
train_data = TensorDataset(torch.LongTensor(matrix))

# dataloaders
batch_size = 128

# make sure the SHUFFLE your training data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

In [8]:
hidden_dim = 256
vocab_size = len(vocab)
embedding_dim = 110
p = 0.5
n_layers = 1
device = torch.device("cuda")

In [9]:
class MaskedEncoderRNN(nn.Module):
    def __init__(
        self, hidden_dim, vocab_size,
        embedding_dim, p=0.5, n_layers=1, device=torch.device("cuda")
    ):
        super(MaskedEncoderRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.device = device
        self.p = p

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)

    def forward(self, input, hidden):    
        input = input.to(self.device)
        mask = self.generate_mask(input.shape) # now masked symbols are <m> pad symbol
        masked_input = torch.mul(input, mask)
        output = self.embedding(input) #.view(self.n_layers, input.shape[0], -1)
        output, hidden = self.lstm(output, hidden)
        return output, hidden, mask
    
    def generate_mask(self, size):
        return torch.randn(size, device=self.device).ge(self.p).long()

    def init_hidden(self, batch_size):
        return (
            torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(self.device),
            torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(self.device)
        )


In [10]:
# from pyt

class AttnMaskedDecoderRNN(nn.Module):
    def __init__(self, hidden_size, vocab_size, dropout_p=0.1, n_layers=1, max_length=41, device="cuda"):
        super(AttnMaskedDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        self.device = device
        
        self.embedding = nn.Embedding(self.vocab_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size, n_layers, batch_first=True)
        self.out = nn.Linear(self.hidden_size, self.vocab_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded, hidden[0].transpose(0,1)), 2)), dim=2)
        attn_applied = torch.bmm(attn_weights, encoder_outputs)
        output = torch.cat((embedded, attn_applied), 2)
        output = self.attn_combine(output)
        output = F.relu(output)
        output, hidden = self.lstm(output, hidden)
        output = F.log_softmax(self.out(output), dim=2)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


In [11]:
class MaskedDecoderRNN(nn.Module):
    def __init__(self, hidden_size, vocab_size, embedding_dim, n_layers=1, device="cuda"):
        super(MaskedDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.device = device
        self.vocab_size = vocab_size
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim=2)
    
    def forward(self, input, hidden):
        output = self.embedding(input.long())
        output, hidden = self.lstm(output, hidden)
        output = self.fc(output)
        output = F.relu(output)
        output = self.softmax(self.out(output))
        
        return output, hidden

In [12]:
decoder =  AttnMaskedDecoderRNN(hidden_dim, vocab_size, dropout_p=0.2, n_layers=n_layers, max_length=41, device=device).to(device)
#MaskedDecoderRNN(hidden_dim, vocab_size, embedding_dim, device=device, n_layers=n_layers).to(device)
encoder = MaskedEncoderRNN(hidden_dim, vocab_size, embedding_dim, device=device, p=p, n_layers=n_layers).to(device)

In [13]:
## load weight for encoder
weights = dict()
weights = torch.load("weights_1_layer")
for layer in encoder.state_dict():
    if layer in weights:
        encoder.state_dict()[layer] = weights[layer]

In [14]:
def plot_history(train_history, title='loss'):
    plt.figure()
    plt.title('{}'.format(title))
    plt.plot(train_history, label='train', zorder=1)    
    plt.xlabel('train steps')
    plt.legend(loc='best')
    plt.grid()
    plt.show()

In [15]:
def trainIters(encoder, decoder, n_epochs, learning_rate=0.01, save_to_disk=True):
    start = time.time()
    train_log = []

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    
    encoder.train()
    decoder.train()

    for epoch in range(n_epochs):
        train_loss = train_epoch(encoder, decoder, encoder_optimizer, decoder_optimizer, train_loader)
        train_log.extend(train_loss)
        
        clear_output()
        print ('Epoch [{}/{}], Loss: {:.4f}' 
                .format(epoch+1, n_epochs, np.mean(train_log[-100:])))
        plot_history(train_log)
        
    if save_to_disk:
        torch.save(model, 'generator.pt')
        
def train_epoch(encoder, decoder, encoder_optimizer, decoder_optimizer, train_loader):
    loss_log = []
    criterion = nn.NLLLoss()

    index = 0
    for sequence in train_loader:
        index += 1
        input = sequence[0].to(encoder.device)
        output = input
        loss = train(input, output, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        loss_log.append(loss.item())
        if index % 100:
            clear_output(True)
            print("mean error : ", np.mean(loss_log[-100:]))

    return loss_log

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    
    # encoder part
    input_length = input_tensor.size(0)
    encoder_hidden = encoder.init_hidden(input_length)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
 
    encoder_output, encoder_hidden, mask = encoder(input_tensor, encoder_hidden)
    #decoder part
    
    decoder_input = torch.ones(input_length, 1).to(decoder.device).long()
    
    decoder_output, decoder_hidden, _ = decoder(decoder_input, encoder_hidden, encoder_output)
    torch.where(
        mask[:, 0].byte().unsqueeze(1).unsqueeze(1),
        torch.zeros(decoder_output[0, 0].shape).to(decoder.device),
        decoder_output
    )

    for batch_index in range(input_tensor.shape[0]):
        if mask[batch_index, 0] == 1:
            torch.add(decoder_output[batch_index, 0, input_tensor[batch_index, 0]], 1.)

    loss = criterion(decoder_output.view(input_tensor.shape[0], -1), input_tensor[:, 0])
    
    for char_index in range(input_tensor.shape[1] - 1):
        decoder_output = torch.argmax(decoder_output, dim=2)
        decoder_output, decoder_hidden, _ = decoder(decoder_output, decoder_hidden, encoder_output)
        torch.where(
            mask[:, char_index + 1].byte().unsqueeze(1).unsqueeze(1),
            torch.zeros(decoder_output[0, 0].shape).to(decoder.device),
            decoder_output
        )
        for batch_index in range(input_tensor.shape[0]):
            if mask[batch_index, char_index + 1] == 1:
                torch.add(decoder_output[batch_index, 0, input_tensor[batch_index, char_index]], 1.)
        loss += criterion(decoder_output.view(input_tensor.shape[0], -1), input_tensor[:, char_index + 1])

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss / input_length


### Learning

In [16]:
trainIters(encoder, decoder, n_epochs=10, learning_rate=0.01)

mean error :  2.3459130454063417


KeyboardInterrupt: 