In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import re
import os
import unicodedata
import time

import pandas as pd
import numpy as np

import torch
import torch.nn as nn

from torch import optim
from torch.nn import functional as F

from tqdm import tqdm
from IPython.display import clear_output

import utils
import models

%load_ext autoreload
%autoreload 2
%matplotlib inline

### Dataset load and prepare

In [2]:
trainloader, validloader, vocab, word2id, id2word = utils.prepare_imdb_dataloaders(path_to_pkl='data/matrix.pkl',
                                                                            forward=True)

Data has been successfully loaded


In [3]:
backward_trainloader, backward_validloader, vocab, word2id, id2word = utils.prepare_imdb_dataloaders(path_to_pkl='data/matrix.pkl',
                                                                            forward=False)

Data has been successfully loaded


### Generator Model

In [4]:
batch_size = 64
hidden_dim = 256
linear_dim = 256
embedding_dim = 128
vocab_size = len(vocab) + 2 # tut ploho
n_layers = 1

model = models.language_model.LanguageModel(hidden_dim, vocab_size, embedding_dim, 
                                            linear_dim, n_layers, train_on_gpu=True)

In [5]:
backward_model = models.language_model.LanguageModel(hidden_dim, vocab_size, embedding_dim, 
                                            linear_dim, n_layers, train_on_gpu=True)

In [6]:
forward_state_dict = torch.load('pretrained_forward_105.pt')
backward_state_dict = torch.load('pretrained_backward_110.pt')

#### Encoder

In [7]:
hidden_dim = 256
vocab_size = len(vocab) + 2
embedding_dim = 128
p = 0.5
n_layers = 1

In [8]:
encoder = models.generator.MaskedEncoderRNN(hidden_dim, vocab_size, 
                                            embedding_dim, train_on_gpu=True, 
                                            p=p, n_layers=n_layers, bidirectional=True)
encoder.cuda()

MaskedEncoderRNN(
  (embeddings): Embedding(80392, 128)
  (lstm): LSTM(128, 256, batch_first=True, bidirectional=True)
  (projection): Linear(in_features=512, out_features=256, bias=True)
)

loading pretrained weights into encoder

In [9]:
encoder_dict = encoder.state_dict()


pretrained_dict = {k: v for k, v in forward_state_dict.items() if k in encoder_dict}
encoder_dict.update(pretrained_dict) 
encoder.load_state_dict(encoder_dict)

In [10]:
from collections import OrderedDict
backward_dict = OrderedDict()

In [11]:
backward_dict['lstm.weight_ih_l0_reverse'] = backward_state_dict['lstm.weight_ih_l0']
backward_dict['lstm.weight_hh_l0_reverse'] = backward_state_dict['lstm.weight_hh_l0']
backward_dict['lstm.bias_ih_l0_reverse'] = backward_state_dict['lstm.bias_ih_l0']
backward_dict['lstm.bias_hh_l0_reverse'] = backward_state_dict['lstm.bias_hh_l0']

In [12]:
pretrained_dict = {k: v for k, v in backward_dict.items() if k in encoder_dict}
encoder_dict.update(pretrained_dict) 
encoder.load_state_dict(encoder_dict)

#### Decoder

In [13]:
hidden_dim = 256
vocab_size = len(vocab) + 2
embedding_dim = 128
p = 0.5
n_layers = 1

In [14]:
decoder =  models.generator.AttnMaskedDecoderRNN(hidden_dim, vocab_size, embedding_dim, 
                                                 dropout_p=0.2, n_layers=n_layers, max_length=41)
decoder.cuda()

AttnMaskedDecoderRNN(
  (embedding): Embedding(80392, 128)
  (attention): Attention(
    (linear_in): Linear(in_features=256, out_features=256, bias=False)
    (linear_out): Linear(in_features=512, out_features=256, bias=False)
    (softmax): Softmax()
    (tanh): Tanh()
  )
  (dropout): Dropout(p=0.2)
  (lstm): LSTM(128, 256, batch_first=True)
  (out): Linear(in_features=256, out_features=80392, bias=True)
  (prediction): Linear(in_features=512, out_features=80392, bias=True)
)

loading pretrained weights into decoder

In [15]:
decoder_dict = decoder.state_dict()


pretrained_dict = {k: v for k, v in forward_state_dict.items() if k in decoder_dict}
decoder_dict.update(pretrained_dict) 
decoder.load_state_dict(decoder_dict)

In [16]:
import matplotlib.pyplot as plt

def plot_history(train_history, title='loss'):
    plt.figure()
    plt.title('{}'.format(title))
    plt.plot(train_history, label='train', zorder=1)    
    plt.xlabel('train steps')
    plt.legend(loc='best')
    plt.grid()
    plt.show()

In [17]:
def trainIters(encoder, decoder, n_epochs, learning_rate=0.01, save_to_disk=True):
    start = time.time()
    train_log = []

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    
    encoder.train()
    decoder.train()

    for epoch in range(n_epochs):
        train_loss = train_epoch(encoder, decoder, encoder_optimizer, decoder_optimizer, trainloader)
        train_log.extend(train_loss)
        
        clear_output()
        print ('Epoch [{}/{}], Loss: {:.4f}' 
                .format(epoch+1, n_epochs, np.mean(train_log[-100:])))
        plot_history(train_log)
        
    if save_to_disk:
        torch.save(model, 'generator.pt')
        
def train_epoch(encoder, decoder, encoder_optimizer, decoder_optimizer, trainloader, train_on_gpu=True):
    loss_log = []
    criterion = nn.NLLLoss()

    index = 0
    for sequence in trainloader:
        index += 1
        if train_on_gpu:
            inp = sequence[0].cuda()
        else:
            inp = sequence[0]
    
        output = inp
        loss = train(inp, output, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        loss_log.append(loss.item())
        if index % 100:
            clear_output(True)
            print("mean error : ", np.mean(loss_log[-100:]))

    return loss_log

def train(input_tensor, target_tensor, encoder, decoder, 
          encoder_optimizer, decoder_optimizer, criterion, train_on_gpu=True):
    
    # encoder part
    input_length = input_tensor.size(0)
    encoder_hidden = encoder.init_hidden(input_length)
    encoder_hidden = tuple([each.data for each in encoder_hidden])
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_output, encoder_hidden, mask = encoder(input_tensor, encoder_hidden)
    
    decoder_output = input_tensor[:, 0].unsqueeze(1)
    decoder_hidden = encoder_hidden
    
    for char_index in range(input_tensor.shape[1] - 1):
        if char_index != 0:
            decoder_output = torch.argmax(decoder_output, dim=2)
        
        decoder_output, decoder_hidden, _ = decoder(decoder_output, decoder_hidden, encoder_output)
        
        loss = criterion(
            decoder_output[mask[:, char_index].byte()].squeeze(1),
            input_tensor[:, char_index][mask[:, char_index].byte()]
        )

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss / input_length



In [None]:
validloader

### Learning

In [19]:
trainIters(encoder, decoder, n_epochs=10, learning_rate=0.001)

mean error :  0.008290719492115864


KeyboardInterrupt: 