In [25]:
%matplotlib inline
import math
import numpy as np
import os
import random
import tensorflow as tf
from matplotlib import pylab
from collections import Counter
import csv
import nltk
from underthesea import word_tokenize

# Seq2Seq Items
import tensorflow.contrib.seq2seq as seq2seq
from tensorflow.python.ops.rnn_cell import LSTMCell
from tensorflow.python.ops.rnn_cell import MultiRNNCell
from tensorflow.contrib.seq2seq.python.ops import attention_wrapper
from tensorflow.python.layers.core import Dense
from tensorflow.contrib.rnn import DropoutWrapper

In [26]:
# ---- Model / inference configuration -------------------------------------
vocab_size = 31411                 # size of the decoder's output projection
num_units = 256                    # hidden units per LSTM cell
input_size = 128
batch_size = 128                   # fixed batch dimension of every placeholder
source_sequence_length = 40        # number of encoder time steps that are unrolled
target_sequence_length = 60
decoder_type = 'luong-attention'   # could be Luong or Bahdanau styled attention
sentences_to_read = 10             # how many predictions get printed at the end
cell_type = 'LSTM'
embedding_dimensions = 512
encoder_type = 'bi_directional'    # 'uni_directional' selects the single-LSTM encoder
# Must be a real boolean: the decoder cell tests it with `if beam_search:`,
# and any non-empty string (including 'false') would count as true.
beam_search = True
beam_width = 10

In [27]:
# Loading Vocabulary from vocabulary files
def load_vocabulary(path):
    """Build a word -> ID mapping from a vocabulary file with one word per line.

    IDs are assigned in file order, starting at 0: each word receives the
    current size of the dictionary as its ID.

    Args:
        path: path of a UTF-8 encoded vocabulary file.

    Returns:
        dict mapping each word (without its line terminator) to an int ID.
    """
    vocabulary = dict()
    with open(path, encoding='utf-8') as f:
        for line in f:
            # Strip only the line terminator. Slicing with [:-1] would eat the
            # last character of the final word when the file does not end
            # with a newline.
            vocabulary[line.rstrip('\n')] = len(vocabulary)
    return vocabulary


src_dictionary = load_vocabulary('vocab.vi.txt')
# Inverse mapping: ID -> word. dicts keep insertion order, so the entries
# come out in ascending ID order.
src_reverse_dictionary = {word_id: word for word, word_id in src_dictionary.items()}

print('Source')
print('\t',list(src_dictionary.items())[:10])
print('\t',list(src_reverse_dictionary.items())[:10])
print('\t','Vocabulary size: ', len(src_dictionary))

tgt_dictionary = load_vocabulary('vocab.en.txt')
tgt_reverse_dictionary = {word_id: word for word, word_id in tgt_dictionary.items()}

print('Target')
print('\t',list(tgt_dictionary.items())[:10])
print('\t',list(tgt_reverse_dictionary.items())[:10])
print('\t','Vocabulary size: ', len(tgt_dictionary))

Source
	 [('<unk>', 0), ('<s>', 1), ('</s>', 2), (',', 3), ('.', 4), ('và', 5), ('tôi', 6), ('là', 7), ('một', 8), ('những', 9)]
	 [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, ','), (4, '.'), (5, 'và'), (6, 'tôi'), (7, 'là'), (8, 'một'), (9, 'những')]
	 Vocabulary size:  31411
Target
	 [('<unk>', 0), ('<s>', 1), ('</s>', 2), (',', 3), ('.', 4), ('the', 5), ('and', 6), ('to', 7), ('of', 8), ('a', 9)]
	 [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, ','), (4, '.'), (5, 'the'), (6, 'and'), (7, 'to'), (8, 'of'), (9, 'a')]
	 Vocabulary size:  31411


In [28]:
#Loading Sentences(English and Vietnamese)
source_sent = []
target_sent = []

test_source_sent = []
test_target_sent = []

# Disabled loader for the training corpus, kept for reference as comments
# (it used to sit in a bare string literal, which is an executed statement):
#
# with open('train.en.txt', encoding='utf-8') as f_en:
#     for l_i, line in enumerate(f_en):
#         target_sent.append(line)
#         if len(target_sent)>=sentences_to_read:
#             break
#
# with open('train.vi.txt', encoding='utf-8') as f_vi:
#     for l_i, line in enumerate(f_vi):
#         source_sent.append(line)
#         if len(source_sent)>=sentences_to_read:
#             break


def read_tokenized_sentences(path, limit):
    """Word-tokenize the leading lines of a UTF-8 text file.

    Args:
        path: file with one sentence per line.
        limit: reading stops once this many sentences have been collected
            (the check runs after each append, so at least one line is read
            from a non-empty file).

    Returns:
        list with one entry per line read, each the result of
        `word_tokenize(line)`.
    """
    sentences = []
    with open(path, encoding='utf-8') as sentence_file:
        for line in sentence_file:
            sentences.append(word_tokenize(line))
            if len(sentences) >= limit:
                break
    return sentences


# One full batch of test sentences is read, because the graph placeholders
# are built with a fixed batch dimension of batch_size.
test_source_sent = read_tokenized_sentences('test.vi.txt', batch_size)

# assert len(source_sent)==len(target_sent),'Source: %d, Target: %d'%(len(source_sent),len(target_sent))

In [29]:
def split_to_tokens(sent,is_source):
    """Split a sentence on single spaces and mask out-of-vocabulary tokens.

    Commas and full stops are detached from the preceding text by inserting a
    space in front of them, and newlines are turned into spaces, before the
    sentence is split on ' '. Every resulting piece (empty pieces included)
    that is not a key of the selected vocabulary is replaced by '<unk>'.

    Args:
        sent: the sentence as a single string.
        is_source: True to check against `src_dictionary`, False to check
            against `tgt_dictionary`.

    Returns:
        list of str tokens, each either a vocabulary key or '<unk>'.
    """
    spaced = sent.replace(',', ' ,').replace('.', ' .').replace('\n', ' ')
    vocabulary = src_dictionary if is_source else tgt_dictionary
    return [piece if piece in vocabulary else '<unk>' for piece in spaced.split(' ')]

In [30]:
train_inputs = []
train_inp_lengths = []

# Stored sentences carry one position more than the number of unrolled steps:
# the data generator reads position `cursor` as input and `cursor + 1` as label.
src_max_sent_length = source_sequence_length + 1
tgt_max_sent_length = target_sequence_length + 1

for src_sent in test_source_sent:
    src_sent_tokens = split_to_tokens(' '.join(src_sent), True)

    # Source language: map tokens to IDs and put the start marker in front.
    num_src_sent = [src_dictionary['<s>']]
    num_src_sent.extend(src_dictionary[tok] for tok in src_sent_tokens)

    # Recorded length = '<s>' + words + one '</s>', capped at the stored width.
    train_inp_lengths.append(min(src_max_sent_length, len(num_src_sent) + 1))

    # Bring every sentence to exactly src_max_sent_length IDs:
    # pad short ones with '</s>', truncate long ones.
    missing = src_max_sent_length - len(num_src_sent)
    if missing > 0:
        num_src_sent.extend([src_dictionary['</s>']] * missing)
    else:
        num_src_sent = num_src_sent[:src_max_sent_length]
    assert len(num_src_sent)==src_max_sent_length,len(num_src_sent)
    train_inputs.append(num_src_sent)

train_inputs = np.array(train_inputs, dtype=np.int32)
train_inp_lengths = np.array(train_inp_lengths, dtype=np.int32)

In [31]:
# Re-assigns the same value that the configuration cell already gives input_size.
input_size = 128

class DataGeneratorMT(object):
    """Serves time-unrolled (input, label) token-ID batches for a fixed set of sentences.

    Sentences are read from the module-level arrays `train_inputs` (source
    side) or `train_outputs` (target side). One cursor per batch slot tracks
    the current position inside that slot's sentence.

    NOTE(review): `train_outputs` is not defined in the visible notebook, so
    only `is_source=True` can work here -- confirm before using the target side.
    """

    def __init__(self,batch_size,num_unroll,is_source):
        """
        Args:
            batch_size: number of sentences served in parallel.
            num_unroll: number of time steps produced by `unroll_batches`.
            is_source: True to read source sentences, False for target ones.
        """
        self._batch_size = batch_size
        self._num_unroll = num_unroll
        self._cursor = [0] * self._batch_size

        # NOTE(review): no method of this class reads these two matrices; they
        # are still loaded so the attributes keep existing for outside code.
        self._src_word_embeddings = np.load('vi_embeddings.npy')
        self._tgt_word_embeddings = np.load('en_embeddings.npy')

        self._sent_ids = None
        self._is_source = is_source

    def next_batch(self, sent_ids, first_set):
        """Produce one time step of data and advance every cursor by one.

        Args:
            sent_ids: for each batch slot, the row index of its sentence.
            first_set: unused; kept for interface compatibility.

        Returns:
            (batch_data, batch_labels): two float32 arrays of shape
            [batch_size]; the label of a slot is the token that follows its
            data token in the sentence.
        """
        if self._is_source:
            max_sent_length = src_max_sent_length
            sentences = train_inputs
        else:
            max_sent_length = tgt_max_sent_length
            sentences = train_outputs

        # float32 is kept to preserve the dtype this method has always returned.
        batch_data = np.zeros((self._batch_size),dtype=np.float32)
        batch_labels = np.zeros((self._batch_size),dtype=np.float32)

        for b in range(self._batch_size):
            sent_text = sentences[sent_ids[b]]
            position = self._cursor[b]

            batch_data[b] = sent_text[position]
            batch_labels[b] = sent_text[position + 1]

            # Wrap one step early so that `position + 1` always stays in range.
            self._cursor[b] = (position + 1) % (max_sent_length - 1)

        return batch_data,batch_labels

    def unroll_batches(self,sent_ids):
        """Produce `num_unroll` consecutive time steps.

        Args:
            sent_ids: sentence row indices for a new batch (cursors are reset
                to 0), or None to continue with the sentences of the previous
                call from the current cursor positions.

        Returns:
            (unroll_data, unroll_labels, sent_ids, inp_lengths): two lists of
            per-step arrays, the sentence IDs in use, and the entry of
            `train_inp_lengths` for each of those sentences.

        Raises:
            ValueError: if `sent_ids` is None and no earlier call supplied any.
        """
        if sent_ids is not None:
            self._sent_ids = sent_ids
            self._cursor = [0] * self._batch_size

        if self._sent_ids is None:
            raise ValueError('unroll_batches needs sent_ids on its first call')

        unroll_data,unroll_labels = [],[]
        for _ in range(self._num_unroll):
            data, labels = self.next_batch(self._sent_ids, False)
            unroll_data.append(data)
            unroll_labels.append(labels)

        # Use the remembered IDs: the `sent_ids` argument may be None when a
        # previous batch is being continued.
        inp_lengths = [train_inp_lengths[x] for x in self._sent_ids]

        return unroll_data, unroll_labels, self._sent_ids, inp_lengths

    def reset_indices(self):
        """Move every cursor back to the start of its sentence."""
        self._cursor = [0] * self._batch_size
        
# Smoke test: unroll the first two sentences and print, step by step, the
# label words (i.e. each sentence shifted one position past its '<s>').
dg = DataGeneratorMT(batch_size=2, num_unroll=40, is_source=True)
u_data, u_labels, _, _ = dg.unroll_batches([0, 1])

print('Source data')
for step_labels in u_labels:
    step_words = [src_reverse_dictionary[w] for w in step_labels.tolist()]
    print(step_words)


Source data
['xin', 'tôi']
['chào', 'tên']
['</s>', 'là']
['</s>', 'john']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']
['</s>', '</s>']


In [32]:
tf.reset_default_graph()

# Embedding tables come from pre-built embeddings stored as .npy files and
# enter the graph as constant float32 tensors.
encoder_emb_layer = tf.convert_to_tensor(np.load('vi_embeddings.npy'),dtype=tf.float32)
decoder_emb_layer = tf.convert_to_tensor(np.load('en_embeddings.npy'),dtype=tf.float32)

# One int32 placeholder of token IDs per unrolled encoder time step.
enc_train_inputs = [
    tf.placeholder(tf.int32, shape=[batch_size], name='enc_train_inputs_%d'%step)
    for step in range(source_sequence_length)
]

# Look up every step and stack along a new leading axis, so the encoder
# input is time-major: [time, batch, embedding depth].
encoder_emb_inp = tf.stack(
    [tf.nn.embedding_lookup(encoder_emb_layer, step_ids) for step_ids in enc_train_inputs]
)

# Length of each source sentence in the batch.
enc_train_inp_lengths = tf.placeholder(tf.int32, shape=[batch_size], name= "train_input_lengths")

In [33]:
# Define Encoder
# Both branches leave behind the same three names:
#   encoder_outputs      per-step outputs, batch-major after the final transpose
#   encoder_final_state  final LSTM state(s)
#   encoder_state        alias of encoder_final_state
# so that later cells work whichever of the two state names they refer to.
if encoder_type == 'uni_directional':
    encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)

    initial_state = encoder_cell.zero_state(batch_size, dtype=tf.float32)

    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
        encoder_cell, encoder_emb_inp, initial_state=initial_state,
        sequence_length=enc_train_inp_lengths,
        time_major=True, swap_memory=True)

    encoder_final_state = encoder_state

else:
    forward_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)
    backward_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)

    initial_state_fw = forward_cell.zero_state(batch_size, dtype=tf.float32)
    initial_state_bw = backward_cell.zero_state(batch_size, dtype=tf.float32)

    # encoder_final_state is the pair (forward final state, backward final state).
    bi_outputs, encoder_final_state = tf.nn.bidirectional_dynamic_rnn(
        forward_cell, backward_cell, encoder_emb_inp,
        initial_state_fw=initial_state_fw,
        initial_state_bw=initial_state_bw,
        sequence_length=enc_train_inp_lengths,
        time_major=True)

    # Join forward and backward outputs along the feature axis.
    encoder_outputs = tf.concat(bi_outputs,-1)

    encoder_state = encoder_final_state

# The RNNs ran time-major; swap the first two axes to get [batch, time, depth].
encoder_outputs = tf.transpose(encoder_outputs,[1,0,2])

In [34]:
# Inference decoder with Luong attention over the encoder outputs.
# The attention memory is encoder_outputs, laid out [batch_size, max_time, depth].
# Whichever branch runs, `translations` ends up as [batch, beam, time]
# (the greedy decoder simply has a beam axis of size 1).
tgt_sos_id = 1   # ID of '<s>' in the target vocabulary
tgt_eos_id = 2   # ID of '</s>' in the target vocabulary

if encoder_type == 'uni_directional':
    decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)
    # The uni-directional encoder exposes its final state as `encoder_state`.
    decoder_init_cell_state = encoder_state

else:
    # Two stacked cells, so that the (forward, backward) pair of final encoder
    # states can be used as the initial state of the stack.
    cells = [tf.nn.rnn_cell.BasicLSTMCell(num_units),tf.nn.rnn_cell.BasicLSTMCell(num_units)]
    decoder_cell = tf.nn.rnn_cell.MultiRNNCell(cells)
    # The bi-directional encoder exposes its final state as `encoder_final_state`.
    decoder_init_cell_state = encoder_final_state

projection_layer = Dense(units=vocab_size,use_bias=True)


if beam_search:
    # Beam search runs beam_width hypotheses per sentence, so everything that
    # has a batch dimension is repeated beam_width times.
    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
        encoder_outputs, multiplier=beam_width)

    tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
        decoder_init_cell_state, multiplier=beam_width)

    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
        num_units, tiled_encoder_outputs, scale=True)
    attention_cell = tf.contrib.seq2seq.AttentionWrapper(
        decoder_cell, attention_mechanism, attention_layer_size=num_units)

    decoder_initial_state = attention_cell.zero_state(
        dtype=tf.float32, batch_size=batch_size * beam_width
    ).clone(cell_state=tiled_encoder_final_state)

    decoder = tf.contrib.seq2seq.BeamSearchDecoder(
        cell=attention_cell,
        embedding=decoder_emb_layer,
        start_tokens=tf.fill([batch_size], tgt_sos_id),
        end_token=tgt_eos_id,
        initial_state=decoder_initial_state,
        beam_width=beam_width,
        output_layer=projection_layer,
        length_penalty_weight=1.0)

    # NOTE(review): no maximum_iterations is passed here, so decoding only
    # stops once the decoder reports every hypothesis as finished.
    outputs_test, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder, output_time_major=True)

    # Time-major [time, batch, beam] -> [batch, beam, time].
    translations = tf.transpose(outputs_test.predicted_ids, perm=[1, 2, 0])

else:
    # The attention mechanism has to exist before the wrapper that uses it.
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
        num_units, encoder_outputs, scale=True)
    attention_cell = tf.contrib.seq2seq.AttentionWrapper(
        decoder_cell, attention_mechanism, attention_layer_size=num_units)

    decoder_initial_state = attention_cell.zero_state(
        dtype=tf.float32, batch_size=batch_size
    ).clone(cell_state=decoder_init_cell_state)

    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embedding=decoder_emb_layer,
        start_tokens=tf.fill([batch_size], tgt_sos_id),
        end_token=tgt_eos_id)

    decoder = tf.contrib.seq2seq.BasicDecoder(
        attention_cell, helper, decoder_initial_state,
        output_layer=projection_layer)

    # Allow at most twice as many output steps as the longest source sentence.
    maximum_iterations = tf.reduce_max(enc_train_inp_lengths) * 2

    # Get Outputs from Decoder
    outputs_test, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder, output_time_major=True,
        maximum_iterations=maximum_iterations)

    # Time-major [time, batch] -> [batch, 1, time], the layout of the beam branch.
    translations = tf.expand_dims(
        tf.transpose(outputs_test.sample_id, perm=[1, 0]), 1)

In [35]:
#tf.reset_default_graph()

In [36]:
def format_prediction(beams, reverse_dictionary):
    """Turn the decoded hypotheses of one sentence into a printable line.

    Hypotheses are read in order; words are appended (each followed by a
    space) until a hypothesis yields '</s>', which ends the line. A hypothesis
    that never yields '</s>' is followed by the next one.

    Args:
        beams: 2-D array-like of token IDs, one row per hypothesis.
        reverse_dictionary: mapping from token ID to word.

    Returns:
        'Predicted: ' followed by the words, with '<unk>' and '</s>' blanked out.
    """
    pieces = ['Predicted: ']
    for beam in beams:
        finished = False
        for token_id in beam:
            word = reverse_dictionary[token_id]
            pieces.append(word + ' ')
            if word == '</s>':
                finished = True
                break
        if finished:
            break
    return ''.join(pieces).replace('<unk>','').replace('</s>','')


checkpoint_path = './models/Attention-Bi-Tokenized-292K-ViEn-B64-Luong-98000'

with tf.Session() as sess:
    # Instead of running an initializer, every trainable variable is filled
    # with the value stored under the same name in the checkpoint.
    for var_train in tf.trainable_variables():
        arr_init = tf.train.load_variable(checkpoint_path, var_train.name)
        var_train.load(arr_init, session=sess)

    print('Done')

    enc_data_generator = DataGeneratorMT(batch_size = batch_size, num_unroll=source_sequence_length,is_source=True)

    # ====================== ENCODER DATA COLLECTION ================================================
    # Feed the first batch_size sentences, one placeholder per unrolled step.
    inp = list(range(batch_size))
    eu_data, eu_labels, _, eu_lengths = enc_data_generator.unroll_batches(inp)

    feed_dict = {enc_train_inp_lengths: eu_lengths}
    for ui, dat in enumerate(eu_data):
        feed_dict[enc_train_inputs[ui]] = dat

    # ======================= TRANSLATION ==========================
    tr_pred = sess.run(translations, feed_dict=feed_dict)

    print('Done')
    print(tr_pred.shape)

    # Loop bounds come from the data itself: the number of hypotheses and of
    # decoded steps varies with the configuration and with the batch.
    for i in range(min(sentences_to_read, tr_pred.shape[0])):
        print(format_prediction(tr_pred[i], tgt_reverse_dictionary))
        print()
        print('\n')


Done
Done
(128, 10, 21)
Predicted: hi  .   



Predicted: my name is john  .   



Predicted: i like to eat a dog  .   



Predicted: the day  ,  i went to dinner with my mother at home  .    



Predicted:  likes to smoke  .    



Predicted: the robot is a very interesting robot  .   



Predicted: i am going to the door  .   



Predicted: he goes out  .    



Predicted: her mother had a cat  ,  a dog and a mouse  .    



Predicted: i was 16  .    



