## update:
1. Add setBatchSize function to both encoder and decoder classes.
2. (todo)Change initHidden function of decoder. Now it produces SOS_token for decoder_input. (initHidden function of decoder has never been called before)

In [1]:
import torch 
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import os
from torch import optim
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
import numpy as np

## Parameters Setup

In [24]:
# parameters
MAX_LENGTH = 25
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu") # to be removed
print(device)

SOS_token = 1
EOS_token = 2

hidden_size = 512

batch_size = 128

attention_vector_size = 256

cpu


## Data Setup

In [92]:
# set data path
data_dir = os.path.join('datasets', 'nmt_data_vi')
train_source = 'train.vi'
train_target = 'train.en'
train_source_dir = os.path.join(data_dir, train_source)
train_target_dir = os.path.join(data_dir, train_target)

test_source = 'tst2012.vi'
test_target = 'tst2012.en'
test_source_dir = os.path.join(data_dir, test_source)
test_target_dir = os.path.join(data_dir, test_target)

vocab_source = 'vocab.vi'
vocab_target = 'vocab.en'
vocab_source_dir = os.path.join(data_dir, vocab_source)
vocab_target_dir = os.path.join(data_dir, vocab_target)

In [97]:
# load training sets
with open(train_source_dir) as f_source:
    sentences_source = f_source.readlines()
with open(train_target_dir) as f_target:
    sentences_target = f_target.readlines()

# check the total number of sentencs in training sets    
print("Total number of sentences in source training set: {}".format(len(sentences_source)))
print("Total number of sentences in target training set: {}".format(len(sentences_target)))

# load testing sets
with open(test_source_dir) as f_source:
    test_source = f_source.readlines()
with open(test_target_dir) as f_target:
    test_target = f_target.readlines()

# check the total number of sentencs in training sets    
print("Total number of sentences in source testing set: {}".format(len(test_source)))
print("Total number of sentences in target testing set: {}".format(len(test_target)))

Total number of sentences in source training set: 133317
Total number of sentences in target training set: 133317
Total number of sentences in source testing set: 1553
Total number of sentences in target testing set: 1553


In [5]:
# check the longest sentence after sentence truncation
max = 0
for s in sentences_source:
    if len(s) > max:
        max = len(s)
        max_s = s
print("Number of words in the longest sentence in sentences_source: {}".format(max))
print("The longest sentence: \n{}".format(max_s))

Number of words in the longest sentence in sentences_source: 3199
The longest sentence: 
Thula Mama , Thula Mama , Thula Mama , Thula Mama . Trong kí ức tuổi thơ con , qua những giọt lệ nhoè mắt bà , con thấy chân lý trong nụ cười của bà , con thấy chân lý trong nụ cười của bà , xuyên thấu màn đêm u tối trong sự vô tri của con . Ôi , có một người bà đang nằm nghỉ bà ốm đau và trái tim bà rơi lệ . Băn khoăn , băn khoăn , băn khoăn , băn khoăn liệu thế giới này đang đi về đâu . Lẽ nào chuyện trẻ nhỏ phải tự xoay xở lấy là đúng ? Không , không , không , không , không , không . Lẽ nào phiền muộn dồn hết lên mái đầu người phụ nữ già là đúng ? Những người vô danh bất hạnh . Thula Mama Mama , Thula Mama . Thula Mama Mama . Thula Mama , Thula Mama , Thula Mama Mama , Thula Mama . Ngày mai sẽ tốt đẹp hơn . Ngày mai trèo đèo lội suối sẽ dễ hơn , bà ơi . Thula Mama , Thula Mama . Tôi có nên tan vào bài hát này như người đàn ông hát nhạc blues hay một người hát rong . Và rồi từ rất xa , không phải

In [98]:
# Truncate sentences by maximum length
sentences_source = list(map(lambda src:src.split()[:MAX_LENGTH], sentences_source))
sentences_target = list(map(lambda src:src.split()[:MAX_LENGTH], sentences_target))
test_source = list(map(lambda src:src.split()[:MAX_LENGTH], test_source))
test_target = list(map(lambda src:src.split()[:MAX_LENGTH], test_target))

In [99]:
# Delete empty sentences in source and target
i = 0
while i < len(sentences_source):
    if sentences_source[i]==[] or sentences_target[i]==[]:
        del sentences_source[i]
        del sentences_target[i]
        i -= 1
    i += 1
print(len(sentences_source))
print(len(sentences_target))

i = 0
while i < len(sentences_source):
    if sentences_source[i]==[] or sentences_target[i]==[]:
        del test_source[i]
        del test_target[i]
        i -= 1
    i += 1
print(len(test_source))
print(len(test_target))

133166
133166
1553
1553


In [8]:
# load vocabularies

# build index2word
with open(vocab_source_dir) as f_vocab_source:
    #index2word_source = f_vocab_source.readlines()
    index2word_source = [line.rstrip() for line in f_vocab_source]
with open(vocab_target_dir) as f_vocab_target:
    #index2word_target = f_vocab_target.readlines()
    index2word_target = [line.rstrip() for line in f_vocab_target]

# build word2index
word2index_source = {}
for idx, word in enumerate(index2word_source):
    word2index_source[word] = idx
word2index_target = {}
for idx, word in enumerate(index2word_target):
    word2index_target[word] = idx
    
# check vocabularies size    
source_vocab_size = len(index2word_source)
target_vocab_size = len(index2word_target)
print("Total nummber of words in source vocabulary: {}".format(len(index2word_source)))
print("Total nummber of words in target vocabulary: {}".format(len(index2word_target)))    

Total nummber of words in source vocabulary: 7709
Total nummber of words in target vocabulary: 17191


In [9]:
# helper funtions to convert sentence in natural language to list of word indexes
def sen2idx(sentence, word2index):
    return [word2index.get(word, 0) for word in sentence] # assume that 0 is for <unk>

def sen2tensor(sentence, word2index):
    idxes = sen2idx(sentence, word2index)
    idxes.append(EOS_token)
    return torch.tensor(idxes, dtype=torch.long, device=device)

## Token to be ignored

In [10]:
PAD_token = target_vocab_size # padding value

## Batch Generator

In [11]:
class BatchGenerator():
    def __init__(self, batch_size, sentences_source, sentences_target, word2index_source, word2index_target):
        self.batch_size = batch_size
        self.sentences_source = sentences_source
        self.sentences_target = sentences_target
        self.word2index_source = word2index_source
        self.word2index_target = word2index_target
        self.num_sentence = len(sentences_source)
        self.reset()
    
    def reset(self):
        self.consumed = 0
        self.permutation = np.random.permutation(self.num_sentence)
    
    def get_batch(self):
        # generate id in one batch
        if self.consumed + self.batch_size > self.num_sentence:
            self.reset()
        sample_id = self.permutation[self.consumed:self.consumed + self.batch_size]
        self.consumed += self.batch_size

        #generate a source batch
        sentences_source_tensor = [sen2tensor(self.sentences_source[id], self.word2index_source) for id in sample_id]

        len_array_source = [len(st) for st in sentences_source_tensor]
        reorder_idx_source = np.argsort(len_array_source, kind='mergesort')
        reorder_idx_source = np.argsort(np.flip(reorder_idx_source, 0)) #index to restore unsorted order

        sentences_source_tensor.sort(key=len)
        sentences_source_tensor.reverse()
        sentences_source_packed = pack_sequence(sentences_source_tensor)

        #generate a target batch
        sentences_target_tensor = [sen2tensor(self.sentences_target[id], self.word2index_target) for id in sample_id]

        len_array_target = [len(st) for st in sentences_target_tensor]
        reorder_idx_target = np.argsort(len_array_target, kind='mergesort')
        reorder_idx_target = np.argsort(np.flip(reorder_idx_target, 0)) #index to restore unsorted order

        sentences_target_tensor.sort(key=len)
        sentences_target_tensor.reverse()
        sentences_target_packed = pack_sequence(sentences_target_tensor)

        return (sentences_source_packed, reorder_idx_source), (sentences_target_packed, reorder_idx_target)

In [12]:
# to be depreciated

# def sentences2tensor(sentences, word2index):
#     sentences_tensor = [sen2tensor(s, word2index) for s in sentences]
#     sentences_tensor.sort(key=len, reverse=True)
#     output = pad_sequence(sentences_tensor, batch_first=True)
#     return output

def batch_generator(batch_size, sentences_source, sentences_target, word2index_source, word2index_target):
    #output: two PackedSequence object, two indexes to reorder sentences in a batch
    
    # generate id in one batch
    total = len(sentences_source)
    sample_id = np.random.choice(total, batch_size, replace=False)
    
    #generate a source batch
    sentences_source_tensor = [sen2tensor(sentences_source[id], word2index_source) for id in sample_id]
    
    len_array_source = [len(st) for st in sentences_source_tensor]
    reorder_idx_source = np.argsort(len_array_source, kind='mergesort')
    reorder_idx_source = np.argsort(np.flip(reorder_idx_source, 0)) #index to restore unsorted order
    
    sentences_source_tensor.sort(key=len)
    sentences_source_tensor.reverse()
    sentences_source_packed = pack_sequence(sentences_source_tensor)
    
    #generate a target batch
    sentences_target_tensor = [sen2tensor(sentences_target[id], word2index_target) for id in sample_id]
    
    len_array_target = [len(st) for st in sentences_target_tensor]
    reorder_idx_target = np.argsort(len_array_target, kind='mergesort')
    reorder_idx_target = np.argsort(np.flip(reorder_idx_target, 0)) #index to restore unsorted order
    
    sentences_target_tensor.sort(key=len)
    sentences_target_tensor.reverse()
    sentences_target_packed = pack_sequence(sentences_target_tensor)
    
    return (sentences_source_packed, reorder_idx_source), (sentences_target_packed, reorder_idx_target)

In [13]:
# test batch_generator
a = ['I','am','a','boy']
b = ['a']
c = ['the','goat']

sentences_test = [a,b,c]
sentences_test2 = [c,b,a]

BG_test = BatchGenerator(2, sentences_test, sentences_test2, word2index_target, word2index_target)

(output_test, reorder_idx_test), (output_test2, reorder_idx_test2) = BG_test.get_batch()
print(output_test)
print(reorder_idx_test)
print(pad_packed_sequence(output_test, batch_first=True))

del a, b, c, sentences_test, sentences_test2, output_test, reorder_idx_test, output_test2, reorder_idx_test2

PackedSequence(data=tensor([   20,     8,  1352,     2,     2]), batch_sizes=tensor([ 2,  2,  1]))
[0 1]
(tensor([[   20,  1352,     2],
        [    8,     2,     0]]), tensor([ 3,  2]))


## Helper function to resume order of sentences in a batch

Order recovery is necessary because sentences have to be sorted in descending order of sentence length to be packed as a PackedSequence object. PackedSequence object helps to deal with inputs with variable length in NMT setting. LSTM, RNN can accept PackedSequence objects.

In [14]:
def resume_order(input, idx):
    # input 
    #   input: Tensor: (batch_size, seq_length)
    #   idx: Tensor or ndarray: (batch_size)
    # output
    #   out: Tensor with reordered sentences in batch: (batch_size, seq_length) 
    
    if isinstance(idx, (np.ndarray)):
        idx = torch.from_numpy(idx)
        if device == torch.device("cuda"):
            idx = idx.cuda()
    out = torch.index_select(input, 0, idx)
    return out

In [15]:
# test resume_order
input_test = torch.tensor([[1,2,3],[4,5,6],[7,8,9]],device=device)
idx_test = np.array([2,1,0])
print(resume_order(input_test, idx_test))
input_test = torch.tensor([[1,2,3],[4,5,6],[7,8,9]],device=device)
idx_test = torch.tensor([2,1,0], device=device)
print(resume_order(input_test, idx_test))
print(idx_test.device)
del input_test, idx_test

tensor([[ 7,  8,  9],
        [ 4,  5,  6],
        [ 1,  2,  3]])
tensor([[ 7,  8,  9],
        [ 4,  5,  6],
        [ 1,  2,  3]])
cpu


## Define Encoder and Decoder classes

In [16]:
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, batch_size, num_layers=1, num_directions=1, dropout=0):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.num_directions = num_directions
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
    def forward(self, input_tuple, prev_h, prev_c):
        # input
        # input size: (batch_size, seq_length)
        # prev_h size: (num_layers*num_directions, batch_size, hidden_size)
        # prec_c size: (num_layers*num_directions, batch_size, hidden_size)
        # output
        # h_n size: (num_layers*num_directions, batch_size, hidden_size)
        # c_n size: (num_layers*num_directions, batch_size, hidden_size)
        (sentences_packed, reorder_idx) = input_tuple
        sentences_tensor, sentences_length = pad_packed_sequence(sentences_packed, batch_first=True, padding_value=0)
        #sentences_tensor: (batch_size, seq_length)
        input_embedded = self.embedding(sentences_tensor) # (batch_size, seq_length, hidden_size)
        input_embedded_packed = pack_padded_sequence(input_embedded, sentences_length, batch_first=True)
        output, (h_n, c_n) = self.lstm(input_embedded_packed, (prev_h, prev_c))
        return output, h_n, c_n
    def initHidden(self):
        return torch.zeros(self.num_layers*self.num_directions, self.batch_size, self.hidden_size, device=device)
    def setBatchSize(self, batch_size):
        self.batch_size = batch_size

In [17]:
# num_layers and num_directions for encoder must be 0 when this decoder is used
class DecoderLSTM(nn.Module):
    def __init__(self, hidden_size, output_size, batch_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTMCell(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
    def forward(self, input, prev_h, prev_c, encoder_output=None):
        input_embedded = self.embedding(input)
        h, c = self.lstm(input_embedded, (prev_h, prev_c))
        output = self.softmax(self.out(h))
        return output, h, c
    def initHidden(self):
        return torch.zeros(self.batch_size, self.hidden_size, device=device)
    def setBatchSize(self, batch_size):
        self.batch_size = batch_size

In [18]:
class DotAttenDecoderLSTM(nn.Module):
    def __init__(self, hidden_size, output_size, batch_size):
        super(DotAttenDecoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTMCell(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size*2, attention_vector_size)
        self.out2 = nn.Linear(attention_vector_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, input, prev_h, prev_c, encoder_output):
        # encoder_output: PackedSequence to be converted to (batch_size, seq_length, hidden_size*num_directions)
        input_embedded = self.embedding(input)
        h, c = self.lstm(input_embedded, (prev_h, prev_c))
        
        # trick: use padding value = -inf to do variable length attention correctly 
        encoder_output, _ = pad_packed_sequence(encoder_output, batch_first=True, padding_value=0)
        #print(encoder_output) # to be removed
        scores = torch.matmul(encoder_output, h.unsqueeze(-1)) # (batch_size, seq_length, 1)
        scores[scores==0] = -10e10
        #print(scores) # to be removed
        scores = F.softmax(scores, dim=1)
        context_vector = torch.matmul(torch.transpose(encoder_output, 1, 2), scores).squeeze(-1) # (batch_size, hidden_size)
        attention_vector = F.tanh(self.out(torch.cat((context_vector, h), -1))) # (batch_size, attention_vector_size)
        output = self.softmax(self.out2(attention_vector))
        return output, h, c
        
    def initHidden(self):
        return torch.zeros(self.batch_size, self.hidden_size, device=device)
    def setBatchSize(self, batch_size):
        self.batch_size = batch_size

In [19]:
# This DecoderLSTM can't do teacher forcing in training

# class DecoderLSTM(nn.Module):
#     def __init__(self, hidden_size, output_size, batch_size, num_layers=1, num_directions=1, dropout=0):
#         super(DecoderLSTM, self).__init__()
#         self.hidden_size = hidden_size
#         self.batch_size = batch_size
#         self.num_layers = num_layers
#         self.num_directions = num_directions
#         self.dropout = dropout
        
#         self.embedding = nn.Embedding(output_size, hidden_size)
#         self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
#         self.out = nn.Linear(hidden_size*num_directions, output_size)
#         self.softmax = nn.LogSoftmax(dim=2)
#     def forward(self, input, prev_h, prev_c):
#         # input 
#         # input size: (batch_size, seq_length)
#         # prev_h size: (num_layers*num_directions, batch_size, hidden_size)
#         # prec_c size: (num_layers*num_directions, batch_size, hidden_size)
#         # output
#         # h_n size: (num_layers*num_directions, batch_size, hidden_size)
#         # c_n size: (num_layers*num_directions, batch_size, hidden_size)
#         # output size: (batch_size, seq_length, output_size)
#         input_embedded = self.embedding(input)
#         output, (h_n, c_n) = self.lstm(input_embedded, (prev_h, prev_c))
#         output =self.softmax(self.out(h))
#         return output, h_n, c_n
#     def initHidden(self, batch_size):
#         return torch.zeros(self.num_layers*self.num_directions, self.batch_size, self.hidden_size, device=device)

## Training

In [20]:
def train(source_tuple, target_tuple, encoder, decoder, encoder_optimizer, decoder_optimizer,batch_size, max_length=MAX_LENGTH):
    
    encoder_hidden_h = encoder.initHidden()
    encoder_hidden_c = encoder.initHidden()
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0
    
    criterion = nn.NLLLoss(ignore_index=PAD_token, size_average=False)
    
    (sentences_source_packed, reorder_idx_source) = source_tuple
    (sentences_target_packed, reorder_idx_target) = target_tuple
    sentences_target_tensor, sentences_target_length = pad_packed_sequence(sentences_target_packed, batch_first=True, padding_value=PAD_token)
    sentences_target_tensor = resume_order(sentences_target_tensor, reorder_idx_target)

    target_length = sentences_target_tensor.size(1)
    
    # encoder_output size: (batch_size, seq_length, hidden_size*num_directions)
    # encoder_hidden_h size: (num_layers*num_directions, batch_size, hidden_size)
    # encoder_hidden_c size: (num_layers*num_directions, batch_size, hidden_size)
    encoder_output, encoder_hidden_h, encoder_hidden_c = encoder(source_tuple, encoder_hidden_h, encoder_hidden_c)
    
    
    decoder_input = torch.full((batch_size,), SOS_token, dtype=torch.long, device=device)
    decoder_hidden_c = resume_order(encoder_hidden_c[0], reorder_idx_source)
    decoder_hidden_h = resume_order(encoder_hidden_h[0], reorder_idx_source)
    
    to_print = []
    
    for di in range(target_length):
        decoder_output, decoder_hidden_h, decoder_hidden_c = decoder(decoder_input, decoder_hidden_h, decoder_hidden_c, encoder_output)
        loss += criterion(decoder_output, sentences_target_tensor[:,di])
        decoder_input = sentences_target_tensor[:,di]
        
        decoder_output_np = np.argmax(decoder_output.detach().cpu().numpy(), 1)[0]
        to_print.append(index2word_target[decoder_output_np])
    
    denominator = torch.sum(sentences_target_length).float()
    if device == torch.device("cuda"):
        denominator = denominator.cuda()
    loss = loss / denominator
    
    loss.backward()
    
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.item(), to_print

In [21]:
def trainIters(batch_generator, encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.1):
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0
    
    encoder_optimizer = optim.Adam(encoder.parameters(), learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), learning_rate)
    
    for iter in range(1, n_iters+1):
        source_tuple, target_tuple = batch_generator.get_batch()
        loss, to_print = train(source_tuple, target_tuple, encoder, decoder, encoder_optimizer, decoder_optimizer, batch_size)
        print_loss_total += loss
        plot_loss_total += loss
        
        if iter%print_every ==0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('(%d %d%%) %.4f' % (iter, iter / n_iters * 100, print_loss_avg))
            print(to_print)

In [None]:
batch_generator1 = BatchGenerator(batch_size, sentences_source, sentences_target, word2index_source, word2index_target)
encoder1 = EncoderLSTM(source_vocab_size, hidden_size, batch_size).to(device)
decoder1 = DotAttenDecoderLSTM(hidden_size, target_vocab_size+1, batch_size).to(device) # +1 is a wordaround for ignore_index field of NLLLoss
trainIters(batch_generator1, encoder1, decoder1, 12000, print_every=100, learning_rate = 0.001)
torch.save(encoder1.state_dict(), os.path.join("model_ckpt","model_maxlen25_dotatten_lr0.001_step12000_encoder"))
torch.save(decoder1.state_dict(), os.path.join("model_ckpt","model_maxlen25_dotatten_lr0.001_step12000_decoder"))

In [None]:
trainIters(batch_generator1, encoder1, decoder1, 12000, print_every=100, learning_rate = 0.001)
torch.save(encoder1.state_dict(), os.path.join("model_ckpt","model_maxlen25_dotatten_lr0.001_step24000_encoder"))
torch.save(decoder1.state_dict(), os.path.join("model_ckpt","model_maxlen25_dotatten_lr0.001_step24000_decoder"))

In [None]:
trainIters(batch_generator1, encoder1, decoder1, 12000, print_every=100, learning_rate = 0.001)
torch.save(encoder1.state_dict(), os.path.join("model_ckpt","model_maxlen25_dotatten_lr0.001_step36000_encoder"))
torch.save(decoder1.state_dict(), os.path.join("model_ckpt","model_maxlen25_dotatten_lr0.001_step36000_decoder"))

## Inference

In [83]:
def infer(encoder, decoder, sentence, max_length=MAX_LENGTH):

    sentence_tensor = sen2tensor(sentence, word2index_source)
    #print(sentence_tensor) # to be removed
    
    encoder.setBatchSize(1)
    encoder_hidden_h = encoder.initHidden()
    encoder_hidden_c = encoder.initHidden()
    #print(sentence_tensor.size()) # to be removed
    input_tuple = (pack_padded_sequence(sentence_tensor.unsqueeze(0), [len(sentence_tensor)], batch_first=True), np.array([0]))
    #print(input_tuple) # to be removed
    encoder_output, h_n, c_n = encoder(input_tuple, encoder_hidden_h, encoder_hidden_c)
    #print(encoder_output) # to be removed
    # encoder_output is a PackedSequence object
    # encoder_hidden_h: (1, 1, hidden_size)
    # encoder_hidden_c: (1, 1, hidden_size)
    #encoder_output = pad_packed_sequence(encoder_output, batch_first=True) # encoder_output: (1, seq_length, hidden_size)
    #encoder_output = encoder_output[0] # encoder_output: (seq_length, hidden_size)

    decoder.setBatchSize(1)
    decoder_input = torch.full((1,), SOS_token, dtype=torch.long, device=device)
    decoder_hidden_h = h_n[0]
    decoder_hidden_c = c_n[0]
    
    output = []
    
    for di in range(max_length):
        decoder_output, decoder_hidden_h, decoder_hidden_c = decoder(decoder_input, decoder_hidden_h, decoder_hidden_c, encoder_output)
        idx = torch.argmax(decoder_output)
        decoder_input = idx.unsqueeze(0)
        output.append(index2word_target[idx])
        if idx == EOS_token:
            break
    return output

In [69]:
encoder = EncoderLSTM(source_vocab_size, hidden_size, batch_size).to(device)
decoder = DotAttenDecoderLSTM(hidden_size, target_vocab_size+1, batch_size).to(device) # +1 is a wordaround for ignore_index field of NLLLoss
encoder.load_state_dict(torch.load(os.path.join("model_ckpt","model_maxlen25_dotatten_lr0.001_step36000_encoder"), map_location='cpu'))
decoder.load_state_dict(torch.load(os.path.join("model_ckpt","model_maxlen25_dotatten_lr0.001_step36000_decoder"), map_location='cpu'))

In [104]:
# Some examples in training set
print("Some examples in training set")
print()
for i in range(3):
    idx = np.random.randint(len(sentences_source))
    translated_sentence = infer(encoder, decoder, sentences_source[idx])
    translated_sentence = ' '.join(w for w in translated_sentence)
    print("Source: {}".format(' '.join(w for w in sentences_source[idx])))
    print("Target: {}".format(' '.join(w for w in sentences_target[idx])))
    print ("NMT: {}".format(translated_sentence))
    print()

# Some examples in testing set
print("Some examples in testing set")
print()
for i in range(3):
    idx = np.random.randint(len(test_source))
    translated_sentence = infer(encoder, decoder, test_source[idx])
    translated_sentence = ' '.join(w for w in translated_sentence)
    print("Source: {}".format(' '.join(w for w in test_source[idx])))
    print("Target: {}".format(' '.join(w for w in test_target[idx])))
    print ("NMT: {}".format(translated_sentence))
    print()

Some examples in training set

Source: Nó phát ra toàn bộ năng lượng ngay lập tức , và đó là một vụ nổ khó có thể tưởng tượng nổi .
Target: It released its energy all at once , and it was an explosion that was mind-numbing .
NMT: It brought fusion to its energy exposure , and the one that was finished &#91; with &#93; . </s>

Source: Hẳn bạn có thể tưởng tượng , khi là một nhà văn , mọi thứ sẽ trở nên rất căng thẳng và gấp gáp .
Target: And you can imagine , if you &apos;re a writer , that things would get really crowded around deadlines .
NMT: You can imagine , they start with a very , very , very different character , rich , like a very slight difference . </s>

Source: Đây là năng lượng tạo bằng sức gió . Tất cả bóng đèn đều là bóng đèn tích kiệm năng lượng .
Target: This is powered by wind . All of the lights are daylight bulbs .
NMT: This is powered by wind . I mean every light about water quantum function flow . </s>

Some examples in testing set

Source: Tôi là một y sĩ theo khá