In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
import math
import numpy as np
import os
import random
import tensorflow as tf
from matplotlib import pylab
from collections import Counter
import csv
import nltk

# Seq2Seq Items
import tensorflow.contrib.seq2seq as seq2seq
from tensorflow.nn.rnn_cell import BasicLSTMCell
from tensorflow.python.ops.rnn_cell import MultiRNNCell
from tensorflow.contrib.seq2seq.python.ops import attention_wrapper
from tensorflow.python.layers.core import Dense
from tensorflow.contrib.rnn import DropoutWrapper

In [2]:
vocab_size = 31411
num_units = 256
input_size = 128
batch_size = 128
source_sequence_length=40
target_sequence_length=60
encoder_type = 'bidirectional'
decoder_type = 'attention' # could be basic or attention
sentences_to_read = 292275

In [3]:
# Loading Vocabulary from vocabulary files
src_dictionary = dict()
with open('vocab.vi.txt', encoding='utf-8') as f:
    for line in f:
        src_dictionary[line[:-1]] = len(src_dictionary) # Assign an ID to each word in dictionary, ID is currently the entry and the word is the key

src_reverse_dictionary = dict(zip(src_dictionary.values(),src_dictionary.keys())) # Reverse order, now ID is the key and the word is the entry
#Somehow zipping the two list rearranges the zip to be in order.

print('Source')
print('\t',list(src_dictionary.items())[:10]) 
print('\t',list(src_reverse_dictionary.items())[:10])
print('\t','Vocabulary size: ', len(src_dictionary))

tgt_dictionary = dict()
with open('vocab.en.txt', encoding='utf-8') as f:
    for line in f:
        #we are discarding last char as it is new line char
        tgt_dictionary[line[:-1]] = len(tgt_dictionary)

tgt_reverse_dictionary = dict(zip(tgt_dictionary.values(),tgt_dictionary.keys()))

print('Target')
print('\t',list(tgt_dictionary.items())[:10])
print('\t',list(tgt_reverse_dictionary.items())[:10])
print('\t','Vocabulary size: ', len(tgt_dictionary))

Source
	 [('<unk>', 0), ('<s>', 1), ('</s>', 2), (',', 3), ('.', 4), ('và', 5), ('tôi', 6), ('là', 7), ('một', 8), ('những', 9)]
	 [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, ','), (4, '.'), (5, 'và'), (6, 'tôi'), (7, 'là'), (8, 'một'), (9, 'những')]
	 Vocabulary size:  31411
Target
	 [('<unk>', 0), ('<s>', 1), ('</s>', 2), (',', 3), ('.', 4), ('the', 5), ('and', 6), ('to', 7), ('of', 8), ('a', 9)]
	 [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, ','), (4, '.'), (5, 'the'), (6, 'and'), (7, 'to'), (8, 'of'), (9, 'a')]
	 Vocabulary size:  31411


In [4]:
#Loading Sentences(English and Vietnamese)
source_sent = []
target_sent = []

test_source_sent = []
test_target_sent = []


with open('train.en.txt', encoding='utf-8') as f_en, open('train.vi.txt', encoding='utf-8') as f_vi:
    for l_i, (line,line_2) in enumerate(zip(f_vi,f_en)):
        if (len(line)==1 or len(line_2)==1):
            continue
        source_sent.append(line)
        target_sent.append(line_2)
        if len(target_sent)>=sentences_to_read:
            break

assert len(source_sent)==len(target_sent),'Source: %d, Target: %d'%(len(source_sent),len(target_sent))

sentences_to_read = len(source_sent)
print('Sample translations (%d)'%len(source_sent))
for i in range(0,sentences_to_read,10000):
    print('(',i,') VI: ', source_sent[i])
    print('(',i,') EN: ', target_sent[i])

Sample translations (292125)
( 0 ) VI:  khoa_học đằng_sau một tiêu_đề về khí_hậu

( 0 ) EN:  rachel pike : the science behind a climate headline

( 10000 ) VI:  vì thế việc chúng_tôi cần làm là lấy những phân_tử đó và cấy chúng vào các nơ - ron .

( 10000 ) EN:  so what we need to do is to take these molecules and somehow install them in neurons .

( 20000 ) VI:  nếu bạn không chỉ muốn ong_chúa , bạn cũng có_thể mua , thực_tế là , 1 gói 1.4 kg ong , đến bằng bưu_phẩm , và dĩ_nhiên , bưu_điện luôn_luôn quan_tâm khi họ nhận được , cái gói 1.4 kg ong của bạn

( 20000 ) EN:  if you do not just want a queen , you can buy , actually , a three-pound package of bees , which comes in the mail , and of course , the postal office is always very concerned when they get , you know , your three-pound packages of bees .

( 30000 ) VI:  và vì_vậy chường một thực_sự bắt đầu hôm_nay , với tất_cả chúng_ta , vì trong mỗi chúng_ta là sức_mạnh để sang bằng các thiếu_hụt cơ_hội và để đóng lại lỗ_hổng hy_vọng

In [5]:
def split_to_tokens(sent,is_source):
    #sent = sent.replace('-',' ')
    sent = sent.replace(',',' ,')
    sent = sent.replace('.',' .')
    sent = sent.replace('\n',' ') 
    
    sent_toks = sent.split(' ')
    for t_i, tok in enumerate(sent_toks):
        if is_source:
            if tok not in src_dictionary.keys():
                sent_toks[t_i] = '<unk>'
        else:
            if tok not in tgt_dictionary.keys():
                sent_toks[t_i] = '<unk>'
    return sent_toks

# Let us first look at some statistics of the sentences
source_len = []
source_mean, source_std = 0,0
for sent in source_sent:
    source_len.append(len(split_to_tokens(sent,True)))

print('(Source) Sentence mean length: ', np.mean(source_len))
print('(Source) Sentence stddev length: ', np.std(source_len))

target_len = []
target_mean, target_std = 0,0
for sent in target_sent:
    target_len.append(len(split_to_tokens(sent,False)))

print('(Target) Sentence mean length: ', np.mean(target_len))
print('(Target) Sentence stddev length: ', np.std(target_len))

(Source) Sentence mean length:  22.32731536157467
(Source) Sentence stddev length:  15.228319460982052
(Target) Sentence mean length:  23.9698622165169
(Target) Sentence stddev length:  16.154302459360334


In [6]:
train_inputs=[]
train_outputs=[]

train_inp_lengths=[]
train_out_lengths=[]

src_max_sent_length= 41
tgt_max_sent_length= 61

for s_i, (src_sent, tgt_sent) in enumerate(zip(source_sent,target_sent)):
    src_sent_tokens= split_to_tokens(src_sent,True)
    tgt_sent_tokens= split_to_tokens(tgt_sent,False)
    
    #Process sentences for batch training(ADD special tokens and MAKE sentences into SAME length)
    #Source Language
    ############################################
    num_src_sent = []
    for tok in src_sent_tokens:
        num_src_sent.append(src_dictionary[tok])
    
    num_src_rvs= num_src_sent[::-1]
    num_src_sent.insert(0,src_dictionary['<s>'])
    
    train_inp_lengths.append(min(src_max_sent_length, len(num_src_sent)+1))
    
    # append until the sentence reaches max length
    if len(num_src_sent)<src_max_sent_length:
        for i in range(src_max_sent_length-len(num_src_sent)):
            num_src_sent.append(src_dictionary['</s>'])
    
    #else truncate sentence until it reaches max length
    elif len(num_src_sent) > src_max_sent_length:
        num_src_sent = num_src_sent[:src_max_sent_length]
    assert len(num_src_sent)==src_max_sent_length,len(num_src_sent)
    
    train_inputs.append(num_src_sent)
    ##############################################
    
    #Target Language
    ##############################################
    num_tgt_sent=[tgt_dictionary['</s>']]
    for tok in tgt_sent_tokens:
        num_tgt_sent.append(tgt_dictionary[tok])
        
    train_out_lengths.append(min(len(num_tgt_sent)+1,tgt_max_sent_length))
    
    # append until the sentence reaches max length
    if len(num_tgt_sent)< tgt_max_sent_length:
        for i in range(tgt_max_sent_length-len(num_tgt_sent)):
            num_tgt_sent.append(tgt_dictionary['</s>'])
            
    #else truncate sentence until it reaches max length
    elif len(num_tgt_sent) > tgt_max_sent_length:
        num_tgt_sent = num_tgt_sent[:tgt_max_sent_length]
    assert len(num_tgt_sent)==tgt_max_sent_length,len(num_tgt_sent)
    
    train_outputs.append(num_tgt_sent)
    ################################################
    
assert len(train_inputs)  == len(source_sent),\
        'Size of total bin elements: %d, Total sentences: %d'\
                %(len(train_inputs),len(source_sent))


train_inputs = np.array(train_inputs, dtype=np.int32)
train_outputs = np.array(train_outputs, dtype=np.int32)
train_inp_lengths = np.array(train_inp_lengths, dtype=np.int32)
      

In [7]:
class DataGeneratorMT(object):
    def __init__(self,batch_size,num_unroll,is_source):
        self._batch_size= batch_size
        self._num_unroll= num_unroll
        self._cursor= [0 for offset in range (self._batch_size)]
        
        self._src_word_embeddings = np.load('vi_embeddings.npy')
        self._tgt_word_embeddings = np.load('en_embeddings.npy')
        
        self._sent_ids = None
        self._is_source =  is_source
        
    def next_batch(self,sent_ids, first_set):
        if self._is_source:
            max_sent_length= src_max_sent_length
        else:
            max_sent_length= tgt_max_sent_length
        batch_label_ind= []
        
        batch_data= np.zeros((self._batch_size),dtype= np.float32)
        batch_labels= np.zeros((self._batch_size),dtype= np.float32)
        
        for b in range(self._batch_size):
            sent_id= sent_ids[b]
            
            if self._is_source:
                sent_text= train_inputs[sent_id]
                
                batch_data[b]= sent_text[self._cursor[b]]
                batch_labels[b]= sent_text[self._cursor[b]+1]
            else:
                sent_text= train_outputs[sent_id]
                
                batch_data[b]= sent_text[self._cursor[b]]
                batch_labels[b]= sent_text[self._cursor[b]+1]
            self._cursor[b]= (self._cursor[b]+1)%(max_sent_length-1)
        return batch_data, batch_labels
    
    def unroll_batches(self,sent_ids):
        
        if sent_ids is not None:
            self._sent_ids= sent_ids
            self._cursor= [0 for _ in range(self._batch_size)]
            
        unroll_data,unroll_labels=[],[]
        inp_lengths= None
        
        for ui in range(self._num_unroll):
            
            data,labels= self.next_batch(self._sent_ids,False)
            unroll_data.append(data)
            unroll_labels.append(labels)
            inp_lengths= train_inp_lengths[sent_ids]
        
        return unroll_data, unroll_labels, self._sent_ids, inp_lengths
    
    def reset_indices(self):
        self._cursor = [0 for offset in range(self._batch_size)]
        
# Running a tiny set to see if the implementation correct        
dg= DataGeneratorMT(batch_size=5,num_unroll=40,is_source=True)

inp= []
for i in range(5):
    inp.append(i)

u_data, u_labels,_,_= dg.unroll_batches(inp)
print('Source data')
for _, lbl in zip(u_data,u_labels):
     print([src_reverse_dictionary[w] for w in lbl.tolist()])
    
dg= DataGeneratorMT(batch_size=5,num_unroll=60,is_source=False)
u_data, u_labels,_,_= dg.unroll_batches([0,2,3,4,5])

print('\nTarget data batch (first time)')
for d_i,(_, lbl) in enumerate(zip(u_data,u_labels)):
    print([tgt_reverse_dictionary[w] for w in lbl.tolist()])

Source data
['khoa_học', 'trong', 'tôi', 'có', 'cả']
['đằng_sau', '4', 'muốn', 'những', 'hai']
['một', 'phút', 'cho', 'dòng', 'đều']
['tiêu_đề', '<unk>', 'các', 'trông', 'là']
['về', ',', 'bạn', 'như', 'một']
['khí_hậu', 'chuyên_gia', 'biết', 'thế_này', 'nhánh']
['<unk>', 'hoá_học', 'về', 'khi', 'của']
['</s>', 'khí_quyển', 'sự', 'bàn', 'cùng']
['</s>', 'rachel', 'to_lớn', 'về', 'một']
['</s>', 'pike', 'của', 'biến_đổi', 'lĩnh_vực']
['</s>', 'giới_thiệu', 'những', 'khí_hậu', 'trong']
['</s>', 'sơ_lược', 'nỗ_lực', '<unk>', 'ngành']
['</s>', 'về', 'khoa_học', ',', 'khoa_học']
['</s>', 'những', 'đã', 'và', 'khí_quyển']
['</s>', 'nỗ_lực', 'góp_phần', 'như', '<unk>']
['</s>', 'khoa_học', 'làm_nên', 'thế_này', '.']
['</s>', 'miệt_mài', 'các', 'khi', '<unk>']
['</s>', 'đằng_sau', 'dòng', 'nói', '</s>']
['</s>', 'những', 'tít', 'về', '</s>']
['</s>', 'tiêu_đề', 'bạn', 'chất_lượng', '</s>']
['</s>', 'táo_bạo', 'thường', 'không_khí', '</s>']
['</s>', 'về', 'thấy', 'hay', '</s>']
['</s>', 'biến_đ

In [8]:
# Initial Encoding and Decoding mask layer
tf.reset_default_graph()

enc_train_inputs= []
dec_train_inputs= []

#Embedding Layer, received from pre-built word2vec embedding
encoder_emb_layer = tf.convert_to_tensor(np.load('vi_embeddings.npy'),dtype=tf.float32)
decoder_emb_layer = tf.convert_to_tensor(np.load('en_embeddings.npy'),dtype=tf.float32)


#Defined unrolled training inputs
for ui in range(source_sequence_length):
    enc_train_inputs.append(tf.placeholder(tf.int32, shape=[batch_size], name='enc_train_inputs_%d'%ui))
    
dec_train_labels=[]
dec_label_masks= []

for ui in range(target_sequence_length):
    dec_train_inputs.append(tf.placeholder(tf.int32, shape=[batch_size], name='dec_train_inputs_%d'%ui))
    dec_train_labels.append(tf.placeholder(tf.int32, shape=[batch_size], name='dec_train_inputs_%d'%ui))
    dec_label_masks.append(tf.placeholder(tf.float32, shape=[batch_size], name='dec_train_inputs_%d'%ui))
    
encoder_emb_inp=[tf.nn.embedding_lookup(encoder_emb_layer,src) for src in enc_train_inputs]
encoder_emb_inp= tf.stack(encoder_emb_inp)


decoder_emb_inp=[tf.nn.embedding_lookup(decoder_emb_layer,src) for src in dec_train_inputs]  
decoder_emb_inp= tf.stack(decoder_emb_inp)

enc_train_inp_lengths= tf.placeholder(tf.int32, shape=[batch_size], name= "train_input_lengths")
dec_train_inp_lengths= tf.placeholder(tf.int32, shape=[batch_size], name= "train_output_lengths")

In [9]:
# Define Encoder
forward_cell= BasicLSTMCell(num_units)
backward_cell= BasicLSTMCell(num_units)

forward_cell = DropoutWrapper(forward_cell, input_keep_prob = 0.8)
backward_cell = DropoutWrapper(backward_cell, input_keep_prob = 0.8)

initial_state_fw= forward_cell.zero_state(batch_size, dtype= tf.float32)
initial_state_bw= backward_cell.zero_state(batch_size, dtype= tf.float32)

bi_outputs, encoder_state = tf.nn.bidirectional_dynamic_rnn(
    forward_cell, backward_cell, encoder_emb_inp, initial_state_fw=initial_state_fw,
    initial_state_bw=initial_state_bw,
    sequence_length=enc_train_inp_lengths, 
    time_major=True)

encoder_outputs = tf.concat(bi_outputs,-1)
encoder_outputs= tf.transpose(encoder_outputs,[1,0,2])

print(encoder_outputs)

Instructions for updating:
seq_dim is deprecated, use seq_axis instead
Instructions for updating:
batch_dim is deprecated, use batch_axis instead
Tensor("transpose:0", shape=(128, 40, 512), dtype=float32)


In [10]:
#Define Decoder
cells = [BasicLSTMCell(num_units),BasicLSTMCell(num_units)]
decoder_cell = tf.nn.rnn_cell.MultiRNNCell(cells)

projection_layer= Dense(units=vocab_size,use_bias=True)

helper = tf.contrib.seq2seq.TrainingHelper(
    decoder_emb_inp, [tgt_max_sent_length-1 for _ in range(batch_size)], time_major=True)


# Implement Bahdanau Attention
# @attention_states=@memory in documentation  [batch_size, max_time, num_units]

attention_states = encoder_outputs
attention_mechanism= tf.contrib.seq2seq.LuongAttention(num_units, attention_states,
                                                          scale=True)


decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell, attention_mechanism,attention_layer_size=num_units)

initial_state= decoder_cell.zero_state(dtype=tf.float32, batch_size= batch_size).clone(cell_state=encoder_state)

decoder = tf.contrib.seq2seq.BasicDecoder(
        decoder_cell, helper, initial_state,
        output_layer=projection_layer)

outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
    decoder, output_time_major=True,
    swap_memory=True
)
print(outputs)

BasicDecoderOutput(rnn_output=<tf.Tensor 'decoder/TensorArrayStack/TensorArrayGatherV3:0' shape=(?, 128, 31411) dtype=float32>, sample_id=<tf.Tensor 'decoder/TensorArrayStack_1/TensorArrayGatherV3:0' shape=(?, 128) dtype=int32>)


In [11]:
logits= outputs.rnn_output

cross_entropy= tf.nn.sparse_softmax_cross_entropy_with_logits(labels=dec_train_labels, logits=logits)
loss = (tf.reduce_sum(cross_entropy*tf.stack(dec_label_masks) / (batch_size*target_sequence_length)))

train_prediction = outputs.sample_id
print(train_prediction)

Tensor("decoder/TensorArrayStack_1/TensorArrayGatherV3:0", shape=(?, 128), dtype=int32)


In [None]:
print('Defining Optimizer')


global_step= tf.Variable(0,trainable=False)
inc_gstep= tf.assign(global_step,global_step+1)
learning_rate= tf.train.exponential_decay(0.001, global_step,decay_steps=20,decay_rate=0.9,staircase=True)

with tf.variable_scope('Adam'):
    adam_optimizer= tf.train.AdamOptimizer(learning_rate)
    
adam_gradients, v= zip(*adam_optimizer.compute_gradients(loss))
adam_gradients, _ = tf.clip_by_global_norm(adam_gradients,25.0)
adam_optimize= adam_optimizer.apply_gradients(zip(adam_gradients,v))

with tf.variable_scope('AdaDelta'):
    ada_optimizer = tf.train.AdadeltaOptimizer(learning_rate)

ada_gradients, v = zip(*ada_optimizer.compute_gradients(loss))
ada_gradients, _ = tf.clip_by_global_norm(ada_gradients, 25.0)
ada_optimize = ada_optimizer.apply_gradients(zip(ada_gradients, v))
sess = tf.InteractiveSession()

Defining Optimizer


In [None]:
saver= tf.train.Saver()

if not os.path.exists('logs'):
    os.mkdir('logs')
log_dir = 'logs'

bleu_scores_over_time = []
loss_over_time = []
tf.global_variables_initializer().run()

src_word_embeddings = np.load('vi_embeddings.npy')
tgt_word_embeddings = np.load('en_embeddings.npy')

# Defining data generators
enc_data_generator = DataGeneratorMT(batch_size=batch_size,num_unroll=source_sequence_length,is_source=True)
dec_data_generator = DataGeneratorMT(batch_size=batch_size,num_unroll=target_sequence_length,is_source=False)

num_steps = 5000
avg_loss = 0

#writer = tf.summary.FileWriter('./graphs', sess.graph)
#scalar_summary = tf.summary.scalar('Loss: ', loss)

print('Started Training')

for step in range(num_steps):

    # input_sizes for each bin: [40]
    # output_sizes for each bin: [60]
    print('.',end='')
    if (step+1)%100==0:
        print('')
        
    sent_ids = np.random.randint(low=0,high=train_inputs.shape[0],size=(batch_size))
    # ====================== ENCODER DATA COLLECTION ================================================
    
    eu_data, eu_labels, _, eu_lengths = enc_data_generator.unroll_batches(sent_ids=sent_ids)
    
    feed_dict = {}
    feed_dict[enc_train_inp_lengths] = eu_lengths
    for ui,(dat,lbl) in enumerate(zip(eu_data,eu_labels)):            
        feed_dict[enc_train_inputs[ui]] = dat                
    
    # ====================== DECODER DATA COLLECITON ===========================
    # First step we change the ids in a batch
    du_data, du_labels, _, du_lengths = dec_data_generator.unroll_batches(sent_ids=sent_ids)
     
    feed_dict[dec_train_inp_lengths] = du_lengths
    for ui,(dat,lbl) in enumerate(zip(du_data,du_labels)):            
        feed_dict[dec_train_inputs[ui]] = dat
        feed_dict[dec_train_labels[ui]] = lbl
        feed_dict[dec_label_masks[ui]] = (np.array([ui for _ in range(batch_size)])<du_lengths).astype(np.int32)
    
    # ======================= OPTIMIZATION ==========================
    if step < 10000:
        _,l,tr_pred = sess.run([adam_optimize,loss,train_prediction], feed_dict=feed_dict)
    else:
        _,l,tr_pred = sess.run([ada_optimize,loss,train_prediction], feed_dict=feed_dict)

    # writer.add_summary(summary, step)    
    tr_pred = tr_pred.flatten()
    # print(len(feed_dict[dec_train_inputs[1]]))
        
    if (step+1)%250==0:  
        bleu_labels, bleu_preds = [],[]
        print('Step ',step+1)

        print_str = 'Actual: '
        for w in np.concatenate(du_labels,axis=0)[::batch_size].tolist():
            
            print_str += tgt_reverse_dictionary[w] + ' '
            bleu_labels.append(tgt_reverse_dictionary[w])
            
            if tgt_reverse_dictionary[w] == '</s>':
                break
                      
        print(print_str)
        print()
        
        print_str = 'Predicted: '
        for w in tr_pred[::batch_size].tolist():
            
            print_str += tgt_reverse_dictionary[w] + ' '
            bleu_preds.append(tgt_reverse_dictionary[w])
            
            if tgt_reverse_dictionary[w] == '</s>':
                break
                
        print(print_str)
        print()
        
        print('BLEU: ')
        print(nltk.translate.bleu_score.sentence_bleu([bleu_labels], bleu_preds))
       
        print('\n')  
        
        bleu_labels, bleu_preds = [],[]
        
        
        rand_idx = np.random.randint(low=1,high=batch_size)
        print_str = 'Actual: '
        for w in np.concatenate(du_labels,axis=0)[rand_idx::batch_size].tolist():
            
            print_str += tgt_reverse_dictionary[w] + ' '
            bleu_labels.append(tgt_reverse_dictionary[w])
            
            if tgt_reverse_dictionary[w] == '</s>':
                break
                
        print(print_str)
        print()
        
        print_str = 'Predicted: '
        for w in tr_pred[rand_idx::batch_size].tolist():
            
            print_str += tgt_reverse_dictionary[w] + ' '
            bleu_preds.append(tgt_reverse_dictionary[w])
            
            if tgt_reverse_dictionary[w] == '</s>':
                break
        print(print_str)
        print()
        
        print('BLEU: ')
        print(nltk.translate.bleu_score.sentence_bleu([bleu_labels], bleu_preds))
        print()  
        
    avg_loss += l
    
    #sess.run(reset_train_state) # resetting hidden state for each batchb 
    
    if (step+1)%500==0:
        print('============= Step ', str(step+1), ' =============')
        print('\t Loss: ',avg_loss/500.0)
        
        loss_over_time.append(avg_loss/500.0)
             
        avg_loss = 0.0
        sess.run(inc_gstep)
    
    #if(step)%2000==0:
        #saver.save(sess,'/NMT/models/Attention-Bi-Tokenized-292K-ViEn-B64-Luong',global_step=step)

