In [1]:
import torch 
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import os
from torch import optim
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
import numpy as np

## Parameters Setup

In [2]:
# parameters
MAX_LENGTH = 25  # sentences are cut to at most this many tokens before encoding
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

SOS_token = 1  # index fed to the decoder as its first input
EOS_token = 2  # index appended to the end of every encoded sentence

hidden_size = 512  # width of the embeddings and of the LSTM state

batch_size = 128  # number of sentence pairs drawn per training step

# Batches are drawn with np.random and the model weights are initialised by
# torch, so seed both generators to make "Restart & Run All" reproducible.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

## Data Setup

In [3]:
# Locations of the parallel training corpus and of the vocabulary files.
data_dir = os.path.join('datasets', 'nmt_data_vi')

# File names: the source side is the `.vi` files, the target side the `.en` files.
train_source, train_target = 'train.vi', 'train.en'
vocab_source, vocab_target = 'vocab.vi', 'vocab.en'

# NOTE: despite the `_dir` suffix, these four names hold *file* paths.
train_source_dir, train_target_dir, vocab_source_dir, vocab_target_dir = (
    os.path.join(data_dir, file_name)
    for file_name in (train_source, train_target, vocab_source, vocab_target)
)

In [4]:
# load training sets
# The corpus contains non-ASCII text, so decode it explicitly instead of relying
# on the platform's default encoding (the files are assumed to be UTF-8).
with open(train_source_dir, encoding='utf-8') as f_source:
    sentences_source = f_source.readlines()
with open(train_target_dir, encoding='utf-8') as f_target:
    sentences_target = f_target.readlines()

# The batch generator indexes both lists with the same ids, so they must be
# line-aligned: sentence i of the source is the translation of sentence i of the target.
assert len(sentences_source) == len(sentences_target), "source/target corpora are not line-aligned"

# check the total number of sentences in training sets
print("Total number of sentences in source training set: {}".format(len(sentences_source)))
print("Total number of sentences in target training set: {}".format(len(sentences_target)))

Total number of sentences in source training set: 133317
Total number of sentences in target training set: 133317


In [5]:
# Truncate sentences by maximum length
def _truncate(sentence, max_length=MAX_LENGTH):
    """Return the first `max_length` whitespace-separated tokens of `sentence`.

    `sentence` may be a raw line (str) or an already tokenised list; accepting
    both keeps this cell safe to re-run after the lists have been tokenised.
    """
    tokens = sentence.split() if isinstance(sentence, str) else sentence
    return tokens[:max_length]

sentences_source = [_truncate(sentence) for sentence in sentences_source]
sentences_target = [_truncate(sentence) for sentence in sentences_target]

# check the longest sentence after sentence truncation
# (the builtin max() is used rather than a variable named `max`, which would
# shadow the builtin for every later cell; the first longest sentence wins,
# and an empty corpus yields an empty sentence instead of a NameError)
longest_source = max(sentences_source, key=len, default=[])
print("Number of words in the longest sentence in sentences_source: {}".format(len(longest_source)))
print("The longest sentence: \n{}".format(longest_source))

Number of words in the longest sentence in sentences_source: 25
The longest sentence: 
['Trong', '4', 'phút', ',', 'chuyên', 'gia', 'hoá', 'học', 'khí', 'quyển', 'Rachel', 'Pike', 'giới', 'thiệu', 'sơ', 'lược', 'về', 'những', 'nỗ', 'lực', 'khoa', 'học', 'miệt', 'mài', 'đằng']


In [6]:
# load vocabularies

# build index2word: one word per line, the line number is the word's index
# (files are assumed to be UTF-8; decode explicitly rather than relying on the
# platform's default encoding)
with open(vocab_source_dir, encoding='utf-8') as f_vocab_source:
    index2word_source = [line.rstrip() for line in f_vocab_source]
with open(vocab_target_dir, encoding='utf-8') as f_vocab_target:
    index2word_target = [line.rstrip() for line in f_vocab_target]

# build word2index (inverse of index2word; if a word is listed twice the last
# occurrence wins)
word2index_source = {word: idx for idx, word in enumerate(index2word_source)}
word2index_target = {word: idx for idx, word in enumerate(index2word_target)}

# check vocabularies size
source_vocab_size = len(index2word_source)
target_vocab_size = len(index2word_target)
print("Total nummber of words in source vocabulary: {}".format(source_vocab_size))
print("Total nummber of words in target vocabulary: {}".format(target_vocab_size))

Total nummber of words in source vocabulary: 7709
Total nummber of words in target vocabulary: 17191


In [7]:
# helper functions to convert a sentence in natural language to a list of word indexes
def sen2idx(sentence, word2index, unk_index=0):
    """Map a tokenised sentence to a list of vocabulary indexes.

    Args:
        sentence: iterable of word strings.
        word2index: dict mapping a word to its integer index.
        unk_index: index substituted for words missing from `word2index`.
            Defaults to 0, which this notebook assumes is the <unk> entry.

    Returns:
        A list of ints, one per word, in the same order as `sentence`.
    """
    return [word2index.get(word, unk_index) for word in sentence]

def sen2tensor(sentence, word2index):
    """Encode a tokenised sentence as a 1-D long tensor terminated by EOS_token.

    The words are looked up with sen2idx, EOS_token is appended, and the
    resulting tensor is created on the notebook-wide `device`.
    """
    token_ids = sen2idx(sentence, word2index) + [EOS_token]
    return torch.tensor(token_ids, dtype=torch.long, device=device)

## Token to be ignored

In [8]:
# Index used to pad target batches. It equals target_vocab_size, i.e. one past
# the largest index that enumerate() hands out when word2index_target is built,
# so it cannot collide with the index of a real word.
PAD_token = target_vocab_size # padding value

## Batch Generator

In [9]:
# batch generator
def sentences2tensor(sentences, word2index):
    """Encode sentences and stack them into one padded (batch, seq) tensor.

    Every sentence is encoded with sen2tensor, the encodings are ordered from
    longest to shortest (equal lengths keep their input order), and
    pad_sequence fills the shorter rows with its default padding value.
    """
    encoded = sorted(
        (sen2tensor(sentence, word2index) for sentence in sentences),
        key=len,
        reverse=True,
    )
    return pad_sequence(encoded, batch_first=True)

def _pack_sorted(tensors):
    """Pack 1-D tensors longest-first and return the index that undoes the sort.

    Returns:
        (packed, restore_idx): `packed` is the PackedSequence built from the
        tensors in descending length order; `restore_idx` is an ndarray such
        that selecting rows `restore_idx` from the sorted batch yields the
        tensors in their original order.
    """
    lengths = np.array([len(t) for t in tensors])
    # A stable sort on the negated lengths gives descending order while keeping
    # equal-length tensors in their original relative order. The packed order
    # and the restore index are both derived from this single permutation, so
    # they can never disagree on how ties were broken.
    order = np.argsort(-lengths, kind='mergesort')
    packed = pack_sequence([tensors[i] for i in order])
    restore_idx = np.argsort(order)  # inverse permutation of `order`
    return packed, restore_idx


def batch_generator(batch_size, sentences_source, sentences_target, word2index_source, word2index_target):
    """Draw one random batch of aligned source/target sentences.

    `batch_size` distinct sentence ids are sampled without replacement; the
    same ids are used for the source and the target side. Each side is encoded
    with sen2tensor, sorted by descending length and packed.

    Args:
        batch_size: number of sentence pairs to draw.
        sentences_source, sentences_target: lists of tokenised sentences,
            indexed by the same ids.
        word2index_source, word2index_target: word -> index dicts.

    Returns:
        ((source_packed, reorder_idx_source), (target_packed, reorder_idx_target))
        where each `*_packed` is a PackedSequence and each `reorder_idx_*` is
        an ndarray of indexes that restores the sampled (unsorted) order of
        that side, so that row i of both restored batches belongs to the same
        sentence pair.
    """
    # generate id in one batch
    total = len(sentences_source)
    sample_id = np.random.choice(total, batch_size, replace=False)

    # generate a source batch
    source_tensors = [sen2tensor(sentences_source[i], word2index_source) for i in sample_id]
    sentences_source_packed, reorder_idx_source = _pack_sorted(source_tensors)

    # generate a target batch
    target_tensors = [sen2tensor(sentences_target[i], word2index_target) for i in sample_id]
    sentences_target_packed, reorder_idx_target = _pack_sorted(target_tensors)

    return (sentences_source_packed, reorder_idx_source), (sentences_target_packed, reorder_idx_target)

In [10]:
# smoke test for batch_generator on three toy sentences
a, b, c = ['I', 'am', 'a', 'boy'], ['a'], ['the', 'goat']
sentences_test, sentences_test2 = [a, b, c], [c, b, a]

# draw a batch of two; the target vocabulary is used for both sides here
batch_test, batch_test2 = batch_generator(2, sentences_test, sentences_test2, word2index_target, word2index_target)
output_test, reorder_idx_test = batch_test
output_test2, reorder_idx_test2 = batch_test2

# show the packed source batch, its restore index, and the padded view
print(output_test)
print(reorder_idx_test)
print(pad_packed_sequence(output_test, batch_first=True))

# remove the scratch names so they do not leak into later cells
del a, b, c, sentences_test, sentences_test2
del batch_test, batch_test2, output_test, reorder_idx_test, output_test2, reorder_idx_test2

PackedSequence(data=tensor([   47,    20,   382,  1352,     8,     2,   897,     2]), batch_sizes=tensor([ 2,  2,  2,  1,  1]))
[0 1]
(tensor([[   47,   382,     8,   897,     2],
        [   20,  1352,     2,     0,     0]]), tensor([ 5,  3]))


## Helper function to resume order of sentences in a batch

Order recovery is necessary because the sentences in a batch have to be sorted in descending order of length before they can be packed into a PackedSequence object. A PackedSequence lets a batch of variable-length inputs, as in the NMT setting, be processed without computing on padding; LSTM and RNN modules accept PackedSequence objects directly.

In [11]:
def resume_order(input, idx):
    """Reorder the rows (dim 0) of `input` according to `idx`.

    Args:
        input: Tensor whose first dimension is the batch, e.g.
            (batch_size, seq_length).
        idx: Tensor or ndarray of shape (batch_size,) holding integer row
            indexes; row i of the result is `input[idx[i]]`.

    Returns:
        Tensor with the same shape as `input` and its rows reordered.
    """
    if isinstance(idx, np.ndarray):
        # from_numpy rejects negative strides (e.g. a flipped view), so copy
        # to a contiguous array first
        idx = torch.from_numpy(np.ascontiguousarray(idx))
    # index_select needs the index on the same device as `input` and with an
    # integer index dtype, whatever the caller handed in
    idx = idx.to(input.device).long()
    return torch.index_select(input, 0, idx)

In [12]:
# smoke test for resume_order: the index may be an ndarray or a tensor
for idx_test in (np.array([2,1,0]), torch.tensor([2,1,0])):
    input_test = torch.tensor([[1,2,3],[4,5,6],[7,8,9]])
    print(resume_order(input_test, idx_test))
del input_test, idx_test

tensor([[ 7,  8,  9],
        [ 4,  5,  6],
        [ 1,  2,  3]])
tensor([[ 7,  8,  9],
        [ 4,  5,  6],
        [ 1,  2,  3]])


## Define Encoder and Decoder classes

In [13]:
class EncoderLSTM(nn.Module):
    """Embedding + LSTM encoder that consumes a packed batch of word indexes.

    Args:
        input_size: size of the source vocabulary (number of embeddings).
        hidden_size: width of the embeddings and of the LSTM state.
        batch_size: batch size used by initHidden() to shape the initial state.
        num_layers: number of stacked LSTM layers.
        num_directions: 1 for a unidirectional LSTM, 2 for a bidirectional one.
        dropout: dropout applied between LSTM layers (see nn.LSTM).

    Raises:
        ValueError: if `num_directions` is neither 1 nor 2.
    """
    def __init__(self, input_size, hidden_size, batch_size, num_layers=1, num_directions=1, dropout=0):
        super(EncoderLSTM, self).__init__()
        if num_directions not in (1, 2):
            raise ValueError("num_directions must be 1 or 2, got {}".format(num_directions))
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.num_directions = num_directions
        self.dropout = dropout

        self.embedding = nn.Embedding(input_size, hidden_size)
        # Forward the layer/direction/dropout settings to the LSTM itself so
        # that it matches the state shape produced by initHidden().
        self.lstm = nn.LSTM(
            hidden_size,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=(num_directions == 2),
        )
    def forward(self, input_tuple, prev_h, prev_c):
        """Encode one batch and return the final LSTM state.

        Args:
            input_tuple: (sentences_packed, reorder_idx); `sentences_packed` is
                a PackedSequence of word indexes. `reorder_idx` is unpacked but
                not used here.
            prev_h: initial hidden state,
                (num_layers*num_directions, batch_size, hidden_size).
            prev_c: initial cell state, same shape as `prev_h`.

        Returns:
            (h_n, c_n), each (num_layers*num_directions, batch_size, hidden_size),
            in the (length-sorted) row order of `sentences_packed`.
        """
        (sentences_packed, reorder_idx) = input_tuple
        # unpack to a padded (batch_size, seq_length) index tensor so the
        # embedding layer can be applied
        sentences_tensor, sentences_length = pad_packed_sequence(sentences_packed, batch_first=True, padding_value=0)
        input_embedded = self.embedding(sentences_tensor) # (batch_size, seq_length, hidden_size)
        # re-pack so the LSTM stops at each sentence's true length
        input_embedded_packed = pack_padded_sequence(input_embedded, sentences_length, batch_first=True)
        _, (h_n, c_n) = self.lstm(input_embedded_packed, (prev_h, prev_c))
        return h_n, c_n
    def initHidden(self):
        """Return an all-zero state of shape (num_layers*num_directions, batch_size, hidden_size).

        The tensor is created on the device that holds the module's parameters.
        """
        return torch.zeros(
            self.num_layers*self.num_directions,
            self.batch_size,
            self.hidden_size,
            device=self.embedding.weight.device,
        )

In [14]:
# The training loop hands only slice [0] of the encoder's final state to this
# decoder, so the encoder must use num_layers=1 and num_directions=1 when this
# decoder is used.
class DecoderLSTM(nn.Module):
    """Single-step LSTM decoder: one call consumes one token per sentence.

    Args:
        hidden_size: width of the embeddings and of the LSTM state.
        output_size: number of output classes (also the number of embeddings).
        batch_size: batch size used by initHidden() to shape the initial state.
    """
    def __init__(self, hidden_size, output_size, batch_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.batch_size = batch_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTMCell(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
    def forward(self, input, prev_h, prev_c):
        """Advance the decoder by one time step.

        Args:
            input: long tensor (batch_size,) with the current input token of
                every sentence.
            prev_h: previous hidden state, (batch_size, hidden_size).
            prev_c: previous cell state, (batch_size, hidden_size).

        Returns:
            (output, h, c): `output` holds log-probabilities over the output
            classes, (batch_size, output_size); `h` and `c` are the new state.
        """
        input_embedded = self.embedding(input)
        h, c = self.lstm(input_embedded, (prev_h, prev_c))
        output = self.softmax(self.out(h))
        return output, h, c
    def initHidden(self):
        """Return an all-zero (batch_size, hidden_size) state.

        The tensor is created on the device that holds the module's parameters.
        """
        return torch.zeros(self.batch_size, self.hidden_size, device=self.out.weight.device)

In [15]:
# This DecoderLSTM can't do teacher forcing in training

# class DecoderLSTM(nn.Module):
#     def __init__(self, hidden_size, output_size, batch_size, num_layers=1, num_directions=1, dropout=0):
#         super(DecoderLSTM, self).__init__()
#         self.hidden_size = hidden_size
#         self.batch_size = batch_size
#         self.num_layers = num_layers
#         self.num_directions = num_directions
#         self.dropout = dropout
        
#         self.embedding = nn.Embedding(output_size, hidden_size)
#         self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
#         self.out = nn.Linear(hidden_size*num_directions, output_size)
#         self.softmax = nn.LogSoftmax(dim=2)
#     def forward(self, input, prev_h, prev_c):
#         # input 
#         # input size: (batch_size, seq_length)
#         # prev_h size: (num_layers*num_directions, batch_size, hidden_size)
#         # prec_c size: (num_layers*num_directions, batch_size, hidden_size)
#         # output
#         # h_n size: (num_layers*num_directions, batch_size, hidden_size)
#         # c_n size: (num_layers*num_directions, batch_size, hidden_size)
#         # output size: (batch_size, seq_length, output_size)
#         input_embedded = self.embedding(input)
#         output, (h_n, c_n) = self.lstm(input_embedded, (prev_h, prev_c))
#         output =self.softmax(self.out(h))
#         return output, h_n, c_n
#     def initHidden(self, batch_size):
#         return torch.zeros(self.num_layers*self.num_directions, self.batch_size, self.hidden_size, device=device)

## Training

In [19]:
def train(source_tuple, target_tuple, encoder, decoder, encoder_optimizer, decoder_optimizer,batch_size, max_length=MAX_LENGTH):
    """Run one teacher-forced optimisation step on a single batch.

    Args:
        source_tuple: (PackedSequence, restore index) for the source side.
        target_tuple: (PackedSequence, restore index) for the target side.
        encoder, decoder: the two models; the decoder is stepped one token at
            a time.
        encoder_optimizer, decoder_optimizer: optimizers stepped after backward.
        batch_size: number of sentences in the batch (sizes the SOS vector).
        max_length: accepted but not used by this function.

    Returns:
        The batch loss as a Python float: the summed NLL over all time steps
        divided by the total number of target tokens (padding excluded).
    """
    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.zero_grad()

    # Summed (not averaged) NLL, skipping padded positions.
    # NOTE(review): size_average=False is the legacy spelling of
    # reduction='sum' in newer PyTorch releases.
    criterion = nn.NLLLoss(ignore_index=PAD_token, size_average=False)

    # Target side: unpack to a padded (batch, seq) tensor and undo the length sort.
    _, source_restore_idx = source_tuple
    target_packed, target_restore_idx = target_tuple
    target_padded, target_lengths = pad_packed_sequence(target_packed, batch_first=True, padding_value=PAD_token)
    target_padded = resume_order(target_padded, target_restore_idx)

    # Encode the source batch starting from an all-zero state.
    # Both returned states: (num_layers*num_directions, batch_size, hidden_size)
    initial_h = encoder.initHidden()
    initial_c = encoder.initHidden()
    encoded_h, encoded_c = encoder(source_tuple, initial_h, initial_c)

    # The decoder starts from slice [0] of the encoder state, put back into the
    # unsorted order so that it lines up with the restored targets.
    state_h = resume_order(encoded_h[0], source_restore_idx)
    state_c = resume_order(encoded_c[0], source_restore_idx)
    step_input = torch.full((batch_size,), SOS_token, dtype=torch.long, device=device)

    loss = 0
    for step in range(target_padded.size(1)):
        step_target = target_padded[:, step]
        log_probs, state_h, state_c = decoder(step_input, state_h, state_c)
        loss = loss + criterion(log_probs, step_target)
        step_input = step_target  # teacher forcing: feed the reference token

    # normalise by the number of real (non-padding) target tokens
    loss = loss / target_lengths.sum().float()
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item()

In [20]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.1):
    """Train for `n_iters` steps, one freshly sampled batch per step.

    Each step draws a batch with batch_generator (using the notebook-level
    corpus, vocabularies and batch_size) and runs train() on it. Every
    `print_every` steps the mean loss since the previous report is printed
    together with the step number and the percentage of steps done.

    Args:
        encoder, decoder: models to optimise, each with its own SGD optimizer.
        n_iters: number of training steps (batches), not epochs.
        print_every: reporting interval in steps.
        plot_every: accepted but not used by this function.
        learning_rate: learning rate passed to both SGD optimizers.
    """
    encoder_optimizer = optim.SGD(encoder.parameters(), learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), learning_rate)

    running_loss = 0
    for step in range(1, n_iters + 1):
        source_tuple, target_tuple = batch_generator(batch_size, sentences_source, sentences_target, word2index_source, word2index_target)
        running_loss += train(source_tuple, target_tuple, encoder, decoder, encoder_optimizer, decoder_optimizer, batch_size)

        if step % print_every != 0:
            continue
        mean_loss = running_loss / print_every
        running_loss = 0
        print('(%d %d%%) %.4f' % (step, step / n_iters * 100, mean_loss))

In [22]:
encoder1 = EncoderLSTM(
    input_size=source_vocab_size,
    hidden_size=hidden_size,
    batch_size=batch_size,
).to(device)

# The decoder gets one class more than the vocabulary: the extra index is
# PAD_token, which must be a valid class for NLLLoss's ignore_index (workaround).
decoder1 = DecoderLSTM(
    hidden_size=hidden_size,
    output_size=target_vocab_size+1,
    batch_size=batch_size,
).to(device)

# n_iters counts training steps (one random batch each); report after every step
trainIters(encoder=encoder1, decoder=decoder1, n_iters=133317, print_every=1)

(1 0%) 9.7555
(2 0%) 9.7505
(3 0%) 9.7461
(4 0%) 9.7432
(5 0%) 9.7314
(6 0%) 9.7277
(7 0%) 9.7227
(8 0%) 9.7164
(9 0%) 9.7141
(10 0%) 9.7076
(11 0%) 9.7011
(12 0%) 9.6968
(13 0%) 9.6917
(14 0%) 9.6858
(15 0%) 9.6867
(16 0%) 9.6771
(17 0%) 9.6731
(18 0%) 9.6552
(19 0%) 9.6647
(20 0%) 9.6512
(21 0%) 9.6441
(22 0%) 9.6451
(23 0%) 9.6312
(24 0%) 9.6185


KeyboardInterrupt: 