In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import re
import os
import unicodedata
import time

import pandas as pd
import numpy as np

import torch
import torch.nn as nn

from torch import optim
from torch.nn import functional as F

from tqdm import tqdm
from IPython.display import clear_output

import utils
import models

%load_ext autoreload
%autoreload 2
%matplotlib inline

### Dataset load and prepare

In [2]:
trainloader, validloader, vocab, word2id, id2word = utils.prepare_imdb_dataloaders(path_to_pkl='data/matrix.pkl',
                                                                            forward=True)

Data has been successfully loaded


In [3]:
backward_trainloader, backward_validloader, vocab, word2id, id2word = utils.prepare_imdb_dataloaders(path_to_pkl='data/matrix.pkl',
                                                                            forward=False)

Data has been successfully loaded


### Generator Model

In [4]:
batch_size = 64
hidden_dim = 256
linear_dim = 256
embedding_dim = 128
vocab_size = len(vocab) + 2 # tut ploho
n_layers = 1

model = models.language_model.LanguageModel(hidden_dim, vocab_size, embedding_dim, 
                                            linear_dim, n_layers, train_on_gpu=True)

In [5]:
backward_model = models.language_model.LanguageModel(hidden_dim, vocab_size, embedding_dim, 
                                            linear_dim, n_layers, train_on_gpu=True)

In [6]:
forward_state_dict = torch.load('pretrained_forward_105.pt')
backward_state_dict = torch.load('pretrained_backward_110.pt')

#### Encoder

In [7]:
hidden_dim = 256
vocab_size = len(vocab) + 2
embedding_dim = 128
p = 0.5
n_layers = 1

In [8]:
encoder = models.generator.MaskedEncoderRNN(hidden_dim, vocab_size, 
                                            embedding_dim, train_on_gpu=True, 
                                            p=p, n_layers=n_layers, bidirectional=True)

loading pretrained weights into encoder

In [9]:
encoder_dict = encoder.state_dict()


pretrained_dict = {k: v for k, v in forward_state_dict.items() if k in encoder_dict}
encoder_dict.update(pretrained_dict) 
encoder.load_state_dict(encoder_dict)

In [10]:
from collections import OrderedDict
backward_dict = OrderedDict()

In [11]:
backward_dict['lstm.weight_ih_l0_reverse'] = backward_state_dict['lstm.weight_ih_l0']
backward_dict['lstm.weight_hh_l0_reverse'] = backward_state_dict['lstm.weight_hh_l0']
backward_dict['lstm.bias_ih_l0_reverse'] = backward_state_dict['lstm.bias_ih_l0']
backward_dict['lstm.bias_hh_l0_reverse'] = backward_state_dict['lstm.bias_hh_l0']

In [12]:
pretrained_dict = {k: v for k, v in backward_dict.items() if k in encoder_dict}
encoder_dict.update(pretrained_dict) 
encoder.load_state_dict(encoder_dict)

In [13]:
encoder.cuda()

MaskedEncoderRNN(
  (embeddings): Embedding(80392, 128)
  (lstm): LSTM(128, 256, batch_first=True, bidirectional=True)
  (projection): Linear(in_features=512, out_features=256, bias=True)
)

#### Decoder

In [14]:
hidden_dim = 256
vocab_size = len(vocab) + 2
embedding_dim = 128
p = 0.5
n_layers = 1

In [15]:
decoder =  models.generator.AttnMaskedDecoderRNN(hidden_dim, vocab_size, embedding_dim, 
                                                 dropout_p=0.2, n_layers=n_layers, max_length=41)

In [16]:
decoder.cuda()

AttnMaskedDecoderRNN(
  (embedding): Embedding(80392, 128)
  (attention): Attention(
    (linear_in): Linear(in_features=256, out_features=256, bias=False)
    (linear_out): Linear(in_features=512, out_features=256, bias=False)
    (softmax): Softmax()
    (tanh): Tanh()
  )
  (dropout): Dropout(p=0.2)
  (lstm): LSTM(128, 256, batch_first=True)
  (out): Linear(in_features=256, out_features=80392, bias=True)
  (prediction): Linear(in_features=512, out_features=80392, bias=True)
)

loading pretrained weights into decoder

In [17]:
decoder_dict = decoder.state_dict()


pretrained_dict = {k: v for k, v in forward_state_dict.items() if k in decoder_dict}
decoder_dict.update(pretrained_dict) 
decoder.load_state_dict(decoder_dict)

In [18]:
## load weight for encoder
#weights = dict()
#weights = torch.load("weights_1_layer")
#for layer in encoder.state_dict():
#    if layer in weights:
#        encoder.state_dict()[layer] = weights[layer]

In [19]:
def plot_history(train_history, title='loss'):
    plt.figure()
    plt.title('{}'.format(title))
    plt.plot(train_history, label='train', zorder=1)    
    plt.xlabel('train steps')
    plt.legend(loc='best')
    plt.grid()
    plt.show()

In [196]:
def trainIters(encoder, decoder, n_epochs, learning_rate=0.01, save_to_disk=True):
    start = time.time()
    train_log = []

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    
    encoder.train()
    decoder.train()

    for epoch in range(n_epochs):
        train_loss = train_epoch(encoder, decoder, encoder_optimizer, decoder_optimizer, trainloader)
        train_log.extend(train_loss)
        
        clear_output()
        print ('Epoch [{}/{}], Loss: {:.4f}' 
                .format(epoch+1, n_epochs, np.mean(train_log[-100:])))
        plot_history(train_log)
        
    if save_to_disk:
        torch.save(model, 'generator.pt')
        
def train_epoch(encoder, decoder, encoder_optimizer, decoder_optimizer, trainloader, train_on_gpu=True):
    loss_log = []
    criterion = nn.NLLLoss()

    index = 0
    for sequence in trainloader:
        index += 1
        if train_on_gpu:
            inp = sequence[0].cuda()
        else:
            inp = sequence[0]
    
        output = inp
        loss = train(inp, output, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        loss_log.append(loss.item())
        if index % 100:
            clear_output(True)
            print("mean error : ", np.mean(loss_log[-100:]))

    return loss_log

def train(input_tensor, target_tensor, encoder, decoder, 
          encoder_optimizer, decoder_optimizer, criterion, train_on_gpu=True):
    
    # encoder part
    input_length = input_tensor.size(0)
    encoder_hidden = encoder.init_hidden(input_length)
    #print(encoder_hidden[0].shape)
    encoder_hidden = tuple([each.data for each in encoder_hidden])
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
 
    encoder_output, encoder_hidden, mask = encoder(input_tensor, encoder_hidden)
    # print('no problems with encoder')
    #decoder part
    if train_on_gpu:
        decoder_input = torch.ones(input_length, 1).cuda().long()
    else:
        decoder_input = torch.ones(input_length, 1).long()
    #print(decoder_input.shape)
    #print('hidden: ', encoder_hidden[0].shape)
    #print('hidden: ', encoder_hidden[1].shape)
    #print('out: ', encoder_output.shape)
    
    decoder_output, decoder_hidden, _ = decoder(decoder_input, encoder_hidden, encoder_output)
    print(decoder_output.shape, decoder_hidden[0].shape)
    
    print('1 ', decoder_output[mask[:, 0].byte()].squeeze(1).shape)
    print('2 ', input_tensor[:, 0][mask[:, 0].byte()])
    print(np.mean(mask.cpu().numpy(), 1))
    print(input_tensor)
    loss = criterion(
        decoder_output[mask[:, 0].byte()].squeeze(1),
        input_tensor[:, 0][mask[:, 0].byte()]
    )
    print('dec_out: ', decoder_output)
    for char_index in range(input_tensor.shape[1] - 1):
        decoder_output = torch.argmax(decoder_output, dim=2)
        print(decoder_output)
        decoder_output, decoder_hidden, _ = decoder(decoder_output, decoder_hidden, encoder_output)
        loss = criterion(
            decoder_output[mask[:, char_index + 1].byte()].squeeze(1),
            input_tensor[:, char_index + 1][mask[:, char_index + 1].byte()]
        )

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss / input_length



### Learning

In [198]:
2 ** 0.17

1.1250584846888094

In [197]:
trainIters(encoder, decoder, n_epochs=10, learning_rate=0.0001)

mean error :  0.17654021084308624
AFTER:  tensor([[[-11.2756, -11.2428, -11.3062,  ..., -11.2637, -11.0469, -11.2953]],

        [[-11.1443, -11.2054, -11.2376,  ..., -11.2918, -11.1806, -11.4930]],

        [[-11.0359, -11.2142, -11.3932,  ..., -11.1650, -11.2296, -11.3143]],

        ...,

        [[-11.1950, -11.2412, -11.1987,  ..., -11.3102, -11.1262, -11.3572]],

        [[-11.0068, -11.1027, -11.3424,  ..., -11.2486, -11.0599, -11.4614]],

        [[-11.0982, -11.2714, -11.1897,  ..., -11.0535, -11.0116, -11.3089]]],
       device='cuda:0', grad_fn=<LogSoftmaxBackward>)
torch.Size([64, 1, 80392]) torch.Size([1, 64, 256])
1  torch.Size([37, 80392])
2  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
[0.53658537 0.56097561 0.58536585 0.36585366 0.41463415 0.41463415
 0.65853659 0.48780488 0.46341463 0.48780488 0.41463415 0.51219512
 0.43902439 0.34146341 0.48780488 0.48780488 0.48780488

AFTER:  tensor([[[-11.4841, -11.2767, -11.4690,  ..., -11.2806, -11.1208, -11.3488]],

        [[-10.9431, -11.2518, -11.3546,  ..., -11.3231, -11.4067, -11.5082]],

        [[-11.1845, -11.2347, -11.1995,  ..., -11.4248, -11.3471, -11.5075]],

        ...,

        [[-11.2004, -11.3243, -11.5068,  ..., -11.0061, -11.2045, -11.5213]],

        [[-10.9576, -11.0906, -11.4243,  ..., -11.4354, -11.4959, -11.4042]],

        [[-11.2964, -11.0082, -11.1721,  ..., -11.5911, -11.4374, -11.2683]]],
       device='cuda:0', grad_fn=<LogSoftmaxBackward>)
tensor([[22286],
        [70044],
        [43931],
        [67893],
        [39910],
        [39818],
        [46770],
        [38997],
        [45926],
        [20523],
        [67832],
        [ 4907],
        [18459],
        [26785],
        [26336],
        [30763],
        [67337],
        [27397],
        [24719],
        [33884],
        [50882],
        [23984],
        [64893],
        [53705],
        [67793],
        [75966],
        

AFTER:  tensor([[[-11.0227, -11.2798, -11.2136,  ..., -11.3684, -11.3251, -11.3495]],

        [[-11.0071, -11.6406, -11.1446,  ..., -11.3477, -11.0971, -11.3023]],

        [[-11.0461, -11.2990, -11.3524,  ..., -11.1408, -11.4355, -11.4294]],

        ...,

        [[-11.2003, -11.1535, -11.1477,  ..., -11.3622, -11.2850, -11.4836]],

        [[-11.2768, -11.3124, -11.3946,  ..., -11.2997, -11.5216, -11.4646]],

        [[-11.1673, -11.3621, -11.3273,  ..., -11.2083, -11.6938, -11.4917]]],
       device='cuda:0', grad_fn=<LogSoftmaxBackward>)
tensor([[ 6427],
        [53325],
        [12308],
        [67450],
        [27748],
        [32220],
        [37485],
        [56144],
        [58046],
        [25298],
        [46282],
        [35612],
        [ 6296],
        [62989],
        [55263],
        [50159],
        [31275],
        [76784],
        [65070],
        [ 8517],
        [20050],
        [64885],
        [71606],
        [33310],
        [55122],
        [28824],
        

AFTER:  tensor([[[-11.3716, -11.0002, -11.2002,  ..., -11.3777, -11.4009, -11.4005]],

        [[-11.0091, -11.1682, -11.1544,  ..., -11.4303, -11.3253, -11.4321]],

        [[-11.0639, -11.3548, -11.2057,  ..., -11.2770, -11.2558, -11.2955]],

        ...,

        [[-10.9955, -11.2765, -11.2068,  ..., -11.1877, -11.3615, -11.5589]],

        [[-11.4340, -11.3208, -11.4732,  ..., -11.2896, -11.3901, -11.2293]],

        [[-11.1208, -11.4132, -10.8510,  ..., -11.1271, -11.3739, -11.3669]]],
       device='cuda:0', grad_fn=<LogSoftmaxBackward>)
tensor([[11325],
        [65617],
        [14390],
        [45537],
        [76632],
        [59161],
        [11837],
        [28335],
        [ 7044],
        [60818],
        [68423],
        [61539],
        [63686],
        [46650],
        [19958],
        [29723],
        [11735],
        [75441],
        [63074],
        [39391],
        [17550],
        [ 1376],
        [61529],
        [22973],
        [32642],
        [22179],
        

KeyboardInterrupt: 

In [None]:
h_test = encoder.init_hidden(64)

In [None]:
h_test[0].shape

In [None]:
torch.cat((h_test[0][0], h_test[0][1]), 1).unsqueeze(0).shape

In [None]:
o = encoder(next(iter(trainloader))[0].cuda(), h_test)

In [None]:
o[0].shape

In [None]:
next(iter(trainloader))[0].shape

In [None]:
encoder_output, encoder_hidden, mask = encoder(input_tensor, encoder_hidden)

In [83]:
a = np.ones((10, 10))
for i in range(a.shape[0]):
    a[i] = np.random.choice(2, (1, 10), p=[0.5, 0.5])

In [186]:
a = np.random.choice(2, (10, 10), p=[0.5, 0.5])

In [187]:
a[:, 0] = 1

In [188]:
np.mean(np.mean(a, 1))

0.51

In [121]:
torch.from_numpy(a)

tensor([[0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 0, 1, 1, 1, 0, 1, 1, 1],
        [1, 0, 1, 0, 1, 1, 1, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
        [1, 1, 1, 0, 1, 0, 0, 1, 1, 0],
        [1, 0, 1, 1, 1, 0, 1, 0, 0, 1],
        [0, 0, 1, 0, 0, 0, 0, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 1, 0, 0, 0],
        [1, 1, 0, 1, 0, 0, 1, 1, 1, 1],
        [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]])