In [1]:
import tensorflow as tf
import os
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters


In [None]:
def load_data(path):
    """Read a UTF-8 text file and return its whole content in lower case.

    Args:
        path: Location of the text file to read.

    Returns:
        The complete file content as one lower-cased string.
    """
    with open(os.path.join(path), mode='r', encoding='utf-8') as handle:
        return handle.read().lower()

In [None]:
# Locations of the parallel corpora (one sentence per line, line i of the
# source corresponds to line i of the target).
# NOTE(review): the directory name starts with "." (a hidden directory relative
# to the working directory) - confirm this is not meant to be "./nmt-eng-to-fr".
source_path = ".nmt-eng-to-fr/small_vocab_en"
target_path = ".nmt-eng-to-fr/small_vocab_fr"

# Keep at most the first 100000 lines of each lower-cased corpus.
_source_data = "\n".join(load_data(source_path).split("\n")[:100000])
_target_data = "\n".join(load_data(target_path).split("\n")[:100000])

In [None]:
# Keep only the sentence pairs in which BOTH sides have fewer than `max_words`
# words. The result is stored as two newline-joined strings whose i-th lines
# are translations of each other.
#
# Changes compared with the previous version of this cell:
#   * the target side is now measured in words, like the source side (it used
#     to be measured in characters, which silently dropped long-but-valid pairs);
#   * each corpus is split once instead of once per iteration (was quadratic);
#   * zip() stops at the shorter corpus, so a line-count mismatch cannot raise
#     IndexError;
#   * blank lines are skipped and no trailing "\n" is appended, so the filtered
#     corpora no longer contain empty phantom sentences;
#   * the per-sentence print(idx) was removed (it emitted one line per kept pair).
max_words = 50
source_lines = _source_data.split("\n")
target_lines = _target_data.split("\n")

kept_source = []
kept_target = []
for source_sentence, target_sentence in zip(source_lines, target_lines):
    source_word_count = len(source_sentence.split())
    target_word_count = len(target_sentence.split())
    if source_word_count == 0 or target_word_count == 0:
        continue  # nothing to translate from / to
    if source_word_count < max_words and target_word_count < max_words:
        kept_source.append(source_sentence)
        kept_target.append(target_sentence)

source_data = "\n".join(kept_source)
target_data = "\n".join(kept_target)

In [None]:
# Corpus statistics: number of "\n"-separated sentences and the mean / max /
# min number of whitespace-separated words per sentence, for each language.
source_word_counts = [len(sentence.split()) for sentence in source_data.split("\n")]
target_word_counts = [len(sentence.split()) for sentence in target_data.split("\n")]

print("Total sentences in Source Data : ", len(source_word_counts))
print("Total sentences in Target Data : ", len(target_word_counts))
print("Average words in source data : ", np.average(source_word_counts))
print("Average words in target data : ", np.average(target_word_counts))
print("Max words in source data : ", max(source_word_counts))
print("Max words in Target data : ", max(target_word_counts))
print("Min words in source data : ", min(source_word_counts))
print("Min words in Target data : ", min(target_word_counts))

In [None]:
#Def Dictionary
def create_dictionary(corpus):
    """Build word<->id lookup tables for a tokenised corpus.

    Ids 0-3 are reserved for the special tokens '<PAD>', '<EOS>', '<UNK>' and
    '<GO>'. Every distinct word of `corpus` then receives the next free id, in
    sorted order.

    Sorting makes the mapping identical on every run. Iterating over a bare
    ``set`` of strings yields an order that can differ between interpreter
    sessions, which would make weights saved in one session meaningless in
    the next.

    Args:
        corpus: Iterable of word strings (duplicates allowed).

    Returns:
        A tuple ``(dictionary, reverse_dictionary, vocab_size)``: the
        word -> id dict, the id -> word dict, and the number of entries.
    """
    dictionary = {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3 }
    for word in sorted(set(corpus)):
        # A corpus word spelled like a special token must keep the reserved
        # id; re-assigning it would leave a gap and make the next word
        # collide with it.
        if word not in dictionary:
            dictionary[word] = len(dictionary)
    reverse_dictionary = {index: word for word, index in dictionary.items()}
    return dictionary, reverse_dictionary, len(dictionary)

In [None]:
# Build the word -> id table, the id -> word table and the vocabulary size of
# each language from the whitespace-separated tokens of the filtered corpora.
source_tokens = source_data.split()
target_tokens = target_data.split()
source_word_to_int, source_int_to_word, source_vocab_size = create_dictionary(source_tokens)
target_word_to_int, target_int_to_word, target_vocab_size = create_dictionary(target_tokens)

In [None]:
# Show both vocabulary sizes (special tokens included), one per line.
print(source_vocab_size, target_vocab_size, sep="\n")

In [None]:
def text_to_int(corpus, dictionary, isTarget=False):
    """Convert a newline-separated corpus into per-sentence lists of word ids.

    Args:
        corpus: String holding one sentence per "\\n"-separated line; words are
            separated by whitespace.
        dictionary: Mapping word -> integer id. It must contain '<EOS>' when
            `isTarget` is true. If it contains '<UNK>', words missing from the
            mapping are encoded with that id.
        isTarget: When true, the '<EOS>' id is appended to every sentence.

    Returns:
        A 1-D ``numpy`` array of dtype ``object`` with one Python list of ids
        per sentence. The array is always 1-D, also when all sentences happen
        to have the same length, so callers can rely on each element being a
        plain list.

    Raises:
        KeyError: If a word is missing from `dictionary` and no '<UNK>' entry
            exists to fall back on.
    """
    unk_id = dictionary.get("<UNK>")
    converted_data = []
    for sentence in corpus.split("\n"):
        sentence_ids = []
        for word in sentence.split():
            word_id = dictionary.get(word, unk_id)
            if word_id is None:
                raise KeyError(word)
            sentence_ids.append(word_id)
        if isTarget:
            sentence_ids.append(dictionary["<EOS>"])
        converted_data.append(sentence_ids)

    # np.array(list_of_ragged_lists) is rejected by recent NumPy releases and
    # silently becomes a 2-D int array when all rows have equal length.
    # Filling an object array element by element avoids both problems.
    result = np.empty(len(converted_data), dtype=object)
    for row, sentence_ids in enumerate(converted_data):
        result[row] = sentence_ids
    return result

In [None]:
#Add Padding to data
def add_padding_in_batch(data, start_index, end_index, dictionary):
    """Cut ``data[start_index:end_index]`` and right-pad it to a rectangle.

    Args:
        data: Sequence of sentences, each a sequence of integer word ids.
        start_index: Index of the first sentence of the batch.
        end_index: Index one past the last sentence of the batch.
        dictionary: Mapping that contains the '<PAD>' id used for padding.

    Returns:
        A tuple ``(created_batch, batch_length)``:
        * ``created_batch``: list of lists, every row padded with the '<PAD>'
          id up to the length of the longest sentence in the batch;
        * ``batch_length``: the true (unpadded) length of every sentence.
          The previous implementation returned the padded length for every
          row, so a sequence mask built from it could never exclude padding.
        Both lists are empty when the slice selects no sentences.
    """
    batch = data[start_index:end_index]
    if len(batch) == 0:
        # max() over an empty batch would raise ValueError.
        return [], []

    pad_id = dictionary["<PAD>"]
    # list(...) copies each row (inputs are never mutated) and also accepts
    # ndarray rows, for which `row + [pad, ...]` would not concatenate.
    rows = [list(sentence) for sentence in batch]
    max_words_in_batch = max(len(row) for row in rows)

    created_batch = [row + [pad_id] * (max_words_in_batch - len(row)) for row in rows]
    batch_length = [len(row) for row in rows]
    return created_batch, batch_length

def reverse_encoder_data(data):
    """Return `data` as an array with the order of its columns reversed.

    Args:
        data: Rectangular (at least 2-D) array-like.

    Returns:
        A ``numpy`` array built from `data` whose second axis is flipped.
    """
    as_array = np.array(data)
    return np.flip(as_array, axis=1)

In [None]:
def get_accuracy(target, logits):
    """Element-wise accuracy between two 2-D id arrays.

    The narrower array is right-padded with zeros along its second axis so
    that both have the same number of columns; the result is the fraction of
    positions at which the two arrays are equal.

    Args:
        target: 2-D array of expected ids.
        logits: 2-D array of predicted ids.

    Returns:
        Mean of the element-wise equality of the two (padded) arrays.
    """
    widest = max(target.shape[1], logits.shape[1])

    def _pad_columns(matrix):
        # Right-pad with zeros up to `widest` columns; untouched if already wide enough.
        missing = widest - matrix.shape[1]
        if missing == 0:
            return matrix
        return np.pad(matrix, [(0, 0), (0, missing)], 'constant')

    return np.mean(np.equal(_pad_columns(target), _pad_columns(logits)))

In [None]:
# Encode both corpora as per-sentence id lists; only the target sentences get
# the '<EOS>' id appended.
source_processed_corpus = text_to_int(corpus=source_data, dictionary=source_word_to_int)
target_processed_corpus = text_to_int(corpus=target_data, dictionary=target_word_to_int, isTarget=True)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the sentence pairs for evaluation; the fixed random_state
# makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    source_processed_corpus,
    target_processed_corpus,
    test_size=0.10,
    random_state=0,
)

In [None]:
#Hyperparameters
# --- data ---
batch_size = 128                                # sentence pairs per mini-batch
dataset_size = len(source_data.split("\n"))     # number of "\n"-separated entries in source_data

# --- model ---
embedding_size = 200                            # width of the word embeddings
n_layers = 2                                    # stacked LSTM layers per encoder / decoder
n_neurons = 512                                 # units per LSTM layer
output_size = target_vocab_size                 # units of the output projection layer
keep_prob = 0.5                                 # keep ratio fed to dropout while training

# --- optimisation ---
training_steps = 15                             # iterations of the outer training loop
l_rate = 0.001                                  # Adam learning rate

# Last expression of the cell: displays the target vocabulary size.
target_vocab_size

In [None]:
# Input placeholders.
# Start from an empty default graph so that re-running this cell does not add
# a second copy of every op.
tf.reset_default_graph()
# Integer ids of the source sentences; both dimensions are left dynamic.
encoder_input = tf.placeholder(tf.int32, shape=[None, None], name="question_input")
# Integer ids of the target sentences; both dimensions are left dynamic.
target = tf.placeholder(tf.int32, shape=[None, None], name="answer")

# Dropout keep ratio, fed per sess.run call (keep_prob while training, 1.0 otherwise).
keep_ratio = tf.placeholder(tf.float32, name="keep_ratio")
# One length per target sentence; used as the training helper's sequence
# length and to build the loss mask.
target_length = tf.placeholder(tf.int32, shape=[None], name="target_length")
# Largest fed target length; used as the mask width and as the iteration cap
# of the inference decoder.
max_target_length = tf.reduce_max(input_tensor=target_length, name="max_target_length")

In [None]:
#Encoder Cell
def encoder(inputs, n_neurons, keep_ratio, embedding_size, vocab_size, n_layers):
    """Embed the source ids and run them through a stacked LSTM encoder.

    Args:
        inputs: Integer id tensor passed to ``embed_sequence``.
        n_neurons: Number of units of every LSTM layer.
        keep_ratio: Output keep probability of the dropout wrapper.
        embedding_size: Dimension of the learned word embeddings.
        vocab_size: Number of distinct ids in the source vocabulary.
        n_layers: Number of stacked LSTM layers.

    Returns:
        The ``(outputs, states)`` pair returned by ``tf.nn.dynamic_rnn``.
    """
    # Learned word embeddings for the source ids.
    embedded_inputs = tf.contrib.layers.embed_sequence(ids=inputs,
                                                       vocab_size=vocab_size,
                                                       embed_dim=embedding_size)
    print(embedded_inputs)

    # n_layers LSTM cells, stacked, with dropout applied to the stack's output.
    lstm_layers = []
    for _ in range(n_layers):
        lstm_layers.append(tf.nn.rnn_cell.LSTMCell(num_units=n_neurons))
    stacked_cell = tf.nn.rnn_cell.MultiRNNCell(cells=lstm_layers)
    dropout_cell = tf.nn.rnn_cell.DropoutWrapper(cell=stacked_cell, output_keep_prob=keep_ratio)

    # Unroll the encoder over the embedded sequence.
    rnn_outputs, final_states = tf.nn.dynamic_rnn(cell=dropout_cell,
                                                  inputs=embedded_inputs,
                                                  dtype=tf.float32)
    return rnn_outputs, final_states

In [None]:
#Attention Mechanism
def attention_mechanism(n_neurons, encoder_outputs, decoder_cell, batch_size, encoder_states):
    """Wrap the decoder cell with Luong attention over the encoder outputs.

    Args:
        n_neurons: Units of the attention mechanism and size of the attention layer.
        encoder_outputs: Encoder outputs used as the attention memory.
        decoder_cell: RNN cell to wrap.
        batch_size: Batch size used to build the wrapper's zero state.
        encoder_states: Encoder final state, installed as the wrapper's cell state.

    Returns:
        A tuple ``(decoder_cell, initial_state)``: the attention-wrapped cell
        and its zero state with ``cell_state`` replaced by `encoder_states`.
    """
    luong = tf.contrib.seq2seq.LuongAttention(num_units=n_neurons, memory=encoder_outputs)
    attentive_cell = tf.contrib.seq2seq.AttentionWrapper(cell=decoder_cell,
                                                         attention_mechanism=luong,
                                                         attention_layer_size=n_neurons)
    zero_state = attentive_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
    initial_state = zero_state.clone(cell_state=encoder_states)
    return attentive_cell, initial_state



In [None]:
#Decoder Word Embeddings
def process_decoder_inputs(inputs, target_vocab_size, embedding_size, start_token, batch_size):
    """Build the decoder input: drop the last column and prepend the start token.

    Args:
        inputs: 2-D target id tensor.
        target_vocab_size: Not used by this function.
        embedding_size: Not used by this function.
        start_token: Id written into the new first column.
        batch_size: Number of rows sliced from `inputs` and of the start column.

    Returns:
        The tensor named "decoder_input": a column of `start_token` followed
        by `inputs` without its last column.
    """
    # Every row, all columns except the last one.
    without_last = tf.strided_slice(input_=inputs, begin=(0,0), end=(batch_size, -1), strides=(1,1))
    go_column = tf.fill(dims=[batch_size,1], value=start_token)
    decoder_input = tf.concat([go_column, without_last], 1, name="decoder_input")
    print(decoder_input)
    return decoder_input

#Decoder Embeddings
def create_decoder_embeddings(inputs, target_vocab_size, embedding_size):
    """Create the decoder embedding matrix and look up the decoder inputs in it.

    Args:
        inputs: Integer id tensor to embed.
        target_vocab_size: Number of rows of the embedding matrix.
        embedding_size: Number of columns of the embedding matrix.

    Returns:
        A tuple ``(embedding_weights, embeddings)``: the variable named
        "decoder_embedding_weights" (random-normal initialised) and the result
        of looking `inputs` up in it.
    """
    weights_shape = [target_vocab_size, embedding_size]
    embedding_weights = tf.get_variable(name="decoder_embedding_weights",
                                        shape=weights_shape,
                                        initializer=tf.random_normal_initializer())
    print(embedding_weights)
    looked_up = tf.nn.embedding_lookup(params=embedding_weights, ids=inputs, name="decoder_embedding")
    print(looked_up)
    return embedding_weights, looked_up

In [None]:
#Decoder Cell
def create_decoder_cell(n_neurons, n_layers, keep_ratio):
    """Build the decoder cell: stacked LSTM layers with output dropout.

    Args:
        n_neurons: Number of units of every LSTM layer.
        n_layers: Number of stacked LSTM layers.
        keep_ratio: Output keep probability of the dropout wrapper.

    Returns:
        A ``DropoutWrapper`` around a ``MultiRNNCell`` of `n_layers` LSTM cells.
    """
    layers = []
    for _ in range(n_layers):
        layers.append(tf.nn.rnn_cell.LSTMCell(num_units=n_neurons, name="decoder_cell"))
    stacked = tf.nn.rnn_cell.MultiRNNCell(layers)
    return tf.nn.rnn_cell.DropoutWrapper(stacked, output_keep_prob=keep_ratio)

#Create Training Helper
def create_training_helper(inputs, target_seq_length):
    """Create the helper that feeds the ground-truth inputs to the decoder.

    Args:
        inputs: Embedded decoder inputs.
        target_seq_length: Length of every target sequence in the batch.

    Returns:
        A ``tf.contrib.seq2seq.TrainingHelper`` named "decoder_helper".
    """
    return tf.contrib.seq2seq.TrainingHelper(sequence_length=target_seq_length,
                                             inputs=inputs,
                                             name="decoder_helper")

    #Attention Mechanism
#     decoder_cell, initial_states = attention_mechanism(batch_size=batch_size, 
#                                                        decoder_cell=decoder_cell, 
#                                                        encoder_outputs=encoder_outputs, 
#                                                        encoder_states=initial_states, 
#                                                        n_neurons=n_neurons)
    
def create_basic_decoder(decoder_cell, helper, initial_states, projection_layer):
    """Assemble a ``BasicDecoder`` and unroll it with ``dynamic_decode``.

    Args:
        decoder_cell: RNN cell run by the decoder.
        helper: Helper that supplies the decoder's inputs.
        initial_states: Initial state of `decoder_cell`.
        projection_layer: Layer applied to the cell outputs.

    Returns:
        A tuple ``(final_decoder_outputs, final_decoder_states)`` - the first
        two values returned by ``dynamic_decode``.
    """
    decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell,
                                              helper=helper,
                                              initial_state=initial_states,
                                              output_layer=projection_layer)

    # Unroll the decoder; the returned sequence lengths are not needed here.
    final_decoder_outputs, final_decoder_states, _ = tf.contrib.seq2seq.dynamic_decode(decoder=decoder)

    # Show the graph objects for inspection.
    for item in (final_decoder_outputs, final_decoder_states, initial_states, decoder_cell):
        print(item)
    return final_decoder_outputs, final_decoder_states



In [None]:
def inference(decoder_cell, decoder_embedding_weights, start_token, end_token, batch_size, projection_layer, max_sequence_length, initial_state):
    """Build the greedy decoding branch of the graph.

    Args:
        decoder_cell: RNN cell run by the decoder.
        decoder_embedding_weights: Embedding matrix used to embed each sampled id.
        start_token: Id fed as the first input of every sequence.
        end_token: Id that ends a sequence.
        batch_size: Number of sequences decoded together.
        projection_layer: Layer applied to the cell outputs.
        max_sequence_length: Upper bound on the number of decoding iterations.
        initial_state: Initial state of `decoder_cell`.

    Returns:
        The first value returned by ``dynamic_decode`` (the decoder outputs).
    """
    go_tokens = tf.fill(dims=[batch_size], value=start_token)
    greedy_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding=decoder_embedding_weights,
                                                             start_tokens=go_tokens,
                                                             end_token=end_token)

    greedy_decoder = tf.contrib.seq2seq.BasicDecoder(cell=decoder_cell,
                                                     helper=greedy_helper,
                                                     initial_state=initial_state,
                                                     output_layer=projection_layer)

    # Unroll for at most max_sequence_length iterations.
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=greedy_decoder,
                                                      impute_finished=True,
                                                      maximum_iterations=max_sequence_length)
    print(outputs)
    return outputs

In [None]:
# Assemble the training graph: encoder -> decoder inputs -> decoder embeddings
# -> decoder cell (with attention) -> teacher-forced decoder.

# The encoder must receive the `keep_ratio` placeholder, not the Python
# constant `keep_prob`: with the constant, encoder dropout stayed at 0.5 even
# when keep_ratio was fed as 1.0 for evaluation and inference.
encoder_outputs, encoder_states = encoder(inputs=encoder_input, 
                                          embedding_size=embedding_size, 
                                          keep_ratio=keep_ratio, 
                                          n_layers=n_layers, 
                                          n_neurons=n_neurons,
                                          vocab_size=source_vocab_size)

# Targets shifted right by one step, starting with '<GO>'.
processed_decoder_inputs = process_decoder_inputs(inputs=target, 
                                                  batch_size=batch_size, 
                                                  embedding_size=embedding_size, 
                                                  start_token=target_word_to_int["<GO>"], 
                                                  target_vocab_size=target_vocab_size)

decoder_embedding_weights, decoder_embeddings = create_decoder_embeddings(inputs=processed_decoder_inputs, 
                                                                          embedding_size=embedding_size, 
                                                                          target_vocab_size=target_vocab_size)

decoder_cell = create_decoder_cell(n_layers=n_layers, 
                                   keep_ratio=keep_ratio, 
                                   n_neurons=n_neurons)

helper = create_training_helper(inputs=decoder_embeddings, target_seq_length=target_length)

# Projects the decoder outputs onto the target vocabulary; the same layer
# object is also passed to the inference decoder.
projection_layer = tf.layers.Dense(units=output_size)

# Wrap the decoder cell with attention and start it from the encoder state.
decoder_cell, initial_states = attention_mechanism(decoder_cell=decoder_cell,
                                                   batch_size=batch_size, 
                                                   n_neurons=n_neurons, 
                                                   encoder_states=encoder_states, 
                                                   encoder_outputs=encoder_outputs)

decoder_outputs, decoder_states = create_basic_decoder(decoder_cell=decoder_cell, 
                                                       helper=helper, 
                                                       initial_states=initial_states, 
                                                       projection_layer=projection_layer)


In [None]:
#Inference Layer
# Greedy decoding branch: reuses the attention-wrapped decoder cell, the
# decoder embedding matrix, the projection layer and the initial state of the
# training decoder. Decoding runs for at most max_target_length iterations.
infer_outputs = inference(decoder_cell=decoder_cell,
                          decoder_embedding_weights=decoder_embedding_weights,
                          start_token=target_word_to_int["<GO>"],
                          end_token=target_word_to_int["<EOS>"],
                          batch_size=batch_size,
                          projection_layer=projection_layer,
                          max_sequence_length=max_target_length,
                          initial_state=initial_states)

In [None]:
# Training logits come from the teacher-forced decoder, predicted ids from the
# greedy inference decoder.
logits = decoder_outputs.rnn_output
predictions = infer_outputs.sample_id

# Show the three tensors for inspection.
for tensor in (logits, target, predictions):
    print(tensor)
# test = tf.argmax(logits,axis=2)
# print(test)

In [None]:
#Sequence Masking
# Weight 1.0 for the first target_length[i] positions of row i, 0.0 afterwards.
mask = tf.sequence_mask(lengths=target_length,
                        maxlen=max_target_length,
                        dtype=tf.float32,
                        name="mask")

# Mask-weighted sequence loss between the training logits and the targets.
loss = tf.contrib.seq2seq.sequence_loss(logits=logits, targets=target, weights=mask)

# One Adam update step on that loss.
optimizer = tf.train.AdamOptimizer(learning_rate=l_rate)
train_opt = optimizer.minimize(loss)


In [None]:
# gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
# Session configured with the GPU option allow_growth=True.
gpu_options = tf.GPUOptions(allow_growth=True)
session_config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.InteractiveSession(config=session_config)

# Saver for the variables created so far, then initialise all of them.
saver = tf.train.Saver()
init = tf.global_variables_initializer()
sess.run(init)


In [None]:
#Training
# Every pass of the outer loop (a) trains on consecutive, non-overlapping
# mini-batches of the training split, (b) evaluates on the test split and
# (c) prints a few sample translations.
#
# Fixes compared with the previous version of this cell:
#   * batches start at mini_batch * batch_size; they used to start at
#     mini_batch, which produced windows that overlapped by batch_size - 1
#     sentences and only ever covered the beginning of the data;
#   * the number of training batches is derived from len(x_train) instead of
#     the size of the whole corpus;
#   * the evaluation loop indexes with its own counter; it used to reuse the
#     stale `mini_batch` of the training loop and so evaluated one and the
#     same batch over and over;
#   * the decode length of the sample translations is based on the sentence
#     length; it used to be len() of the reversed batch array, i.e. batch_size;
#   * the final summary reports the mean test loss under a "Test" label; it
#     used to print the last test batch loss labelled as train loss.
display_step = 300       # print running metrics every `display_step` mini-batches
max_infer_length = 30    # target_length fed to greedy decoding when measuring accuracy

n_train_batches = len(x_train) // batch_size
n_test_batches = len(x_test) // batch_size

# Split once; the sample translations index these lists in every epoch.
source_sentences = source_data.split("\n")
target_sentences = target_data.split("\n")

total_loss = []
total_acc = []

test_total_loss = []
test_total_acc = []
for step in range(training_steps):
    batch_loss = []
    batch_acc = []

    test_batch_loss = []
    test_batch_acc = []

    for mini_batch in range(n_train_batches):
        batch_start = mini_batch * batch_size
        batch_end = batch_start + batch_size

        source_batch, _ = add_padding_in_batch(x_train, batch_start, batch_end, source_word_to_int)
        # The encoder is fed each padded source row in reversed order.
        source_batch = reverse_encoder_data(source_batch)

        target_batch, batch_target_length = add_padding_in_batch(y_train, batch_start, batch_end, target_word_to_int)

        source_batch, target_batch, batch_target_length = shuffle(source_batch, target_batch, batch_target_length)
        _, _loss = sess.run([train_opt, loss], feed_dict={encoder_input: source_batch,
                                                          target: target_batch,
                                                          target_length: np.array(batch_target_length),
                                                          keep_ratio: keep_prob})
        # Accuracy is measured on greedy decoding with dropout switched off.
        Infer_pred = sess.run(predictions, feed_dict={encoder_input: source_batch,
                                                      target_length: [max_infer_length] * batch_size,
                                                      keep_ratio: 1.0})
        train_acc = get_accuracy(logits=np.array(Infer_pred), target=np.array(target_batch))
        batch_acc.append(train_acc)
        total_acc.append(train_acc)
        batch_loss.append(_loss)
        total_loss.append(_loss)
        if mini_batch % display_step == 0:
            print("Epoch : ", step+1, " MiniBatch : ", mini_batch, "/", n_train_batches, " Loss : ", np.mean(np.array(batch_loss)), " Accuracy : ", np.mean(np.array(batch_acc)))

    for test_mini_batch in range(n_test_batches):
        test_start = test_mini_batch * batch_size
        test_end = test_start + batch_size

        test_source_batch, _ = add_padding_in_batch(x_test, test_start, test_end, source_word_to_int)
        test_source_batch = reverse_encoder_data(test_source_batch)

        test_target_batch, test_batch_target_length = add_padding_in_batch(y_test, test_start, test_end, target_word_to_int)

        test_source_batch, test_target_batch, test_batch_target_length = shuffle(test_source_batch, test_target_batch, test_batch_target_length)
        test_loss = sess.run(loss, feed_dict={encoder_input: test_source_batch,
                                              target: test_target_batch,
                                              target_length: np.array(test_batch_target_length),
                                              keep_ratio: 1.0})
        test_Infer_pred = sess.run(predictions, feed_dict={encoder_input: test_source_batch,
                                                           target_length: [max_infer_length] * batch_size,
                                                           keep_ratio: 1.0})
        test_acc = get_accuracy(logits=np.array(test_Infer_pred), target=np.array(test_target_batch))
        test_batch_acc.append(test_acc)
        test_total_acc.append(test_acc)
        test_batch_loss.append(test_loss)
        test_total_loss.append(test_loss)
        if test_mini_batch % display_step == 0:
            print("Epoch : ", step+1, " MiniBatch : ", test_mini_batch, "/", n_test_batches, " Loss : ", np.mean(np.array(test_batch_loss)), " Accuracy : ", np.mean(np.array(test_batch_acc)))

    print("Epoch : ", step+1, " Batch Train Average Loss : ", np.mean(np.array(batch_loss)), " Batch Train Average Accuracy : ", np.mean(np.array(batch_acc)))
    print("Epoch : ", step+1, " Batch Test Average Loss : ", np.mean(np.array(test_batch_loss)), " Batch Test Average Accuracy : ", np.mean(np.array(test_batch_acc)))

    # Sample translations; the range is clipped so that a small corpus cannot
    # raise IndexError.
    start_num, end_num = 1000, 1050
    last_sample = min(end_num, len(source_sentences), len(target_sentences))
    for i in range(start_num, last_sample):
        translate_sentence = source_sentences[i].split()
        if not translate_sentence:
            continue  # an empty line cannot be fed to the encoder
        translate_sequence = [source_word_to_int[word] for word in translate_sentence]
        # The graph is built for exactly batch_size rows, so the sentence is
        # repeated batch_size times and only the first prediction is used.
        translate_batch = reverse_encoder_data([translate_sequence] * batch_size)

        translate_logits = sess.run(predictions, feed_dict={encoder_input: translate_batch,
                                                            target_length: [len(translate_sequence) * 2] * batch_size,
                                                            keep_ratio: 1.0})[0]
        print("Source : ", source_sentences[i])
        print("Target : ", target_sentences[i])
        print("Predicted : ", " ".join([target_int_to_word[idx] for idx in translate_logits]))
print( " Total Train Loss : ", np.mean(np.array(total_loss)), " \nTotal Train Accuracy : ", np.mean(np.array(total_acc)))
print( " Total Test Loss : ", np.mean(np.array(test_total_loss)), " \nTotal Test Accuracy : ", np.mean(np.array(test_total_acc)))

In [None]:
# Load the variable values stored in the checkpoint into the live session,
# replacing whatever values the session currently holds.
saver.restore(sess,"weights/weights.ckpt")

# Translate sentences 1000..1049 of the filtered corpus.
# NOTE(review): the source ids are fed in their original order here, whereas
# the training cell feeds reversed batches (reverse_encoder_data) - confirm
# which order the restored weights expect.
start_num, end_num = 1000, 1050
source_lines = source_data.split("\n")
target_lines = target_data.split("\n")
for i in range(start_num, end_num):
    translate_sentence = source_lines[i].split()
    translate_sequence = [source_word_to_int[word] for word in translate_sentence]

    # The sentence is repeated batch_size times; only the first prediction is used.
    feed = {encoder_input: [translate_sequence]*batch_size,
            target_length: [len(translate_sequence)*2]*batch_size,
            keep_ratio: 1.0}
    translate_logits = sess.run(predictions, feed_dict=feed)[0]

    print("Source : ",source_lines[i])
    print("Target : ",target_lines[i])
    print("Predicted : "," ".join(target_int_to_word[idx] for idx in translate_logits))
    print()

In [None]:
# Translate sentences 10..59 of the filtered corpus with the current weights.
# NOTE(review): the source ids are fed in their original order here, whereas
# the training cell feeds reversed batches (reverse_encoder_data) - confirm
# which order the current weights expect.
start_num, end_num = 10, 60
source_lines = source_data.split("\n")
target_lines = target_data.split("\n")
for i in range(start_num, end_num):
    translate_sentence = source_lines[i].split()
    translate_sequence = [source_word_to_int[word] for word in translate_sentence]

    # The sentence is repeated batch_size times; only the first prediction is used.
    feed = {encoder_input: [translate_sequence]*batch_size,
            target_length: [len(translate_sequence)*2]*batch_size,
            keep_ratio: 1.0}
    translate_logits = sess.run(predictions, feed_dict=feed)[0]

    print("Source : ",source_lines[i])
    print("Target : ",target_lines[i])
    print("Predicted : "," ".join(target_int_to_word[idx] for idx in translate_logits))
    print()

In [None]:
# Display the complete target word -> id table as the cell output.
# NOTE(review): this prints one entry per vocabulary word; consider showing
# only a few entries before sharing the notebook.
target_word_to_int

In [None]:
# Show the first sentence pair of the filtered corpora.
first_source_sentence = source_data.split("\n", 1)[0]
first_target_sentence = target_data.split("\n", 1)[0]
print("Source : ",first_source_sentence)
print("Target : ",first_target_sentence)

In [None]:
# Create a fresh Saver; it replaces the `saver` created in the session-setup cell.
saver = tf.train.Saver()

In [None]:
# Write the session's current variable values to a checkpoint.
# NOTE(review): this path differs from the one read by saver.restore
# ("weights/weights.ckpt") - confirm that this is intended.
saver.save(sess, "weights/weights_new.ckpt")

In [None]:
# Display the filtered source corpus as the cell output.
# NOTE(review): this renders the whole corpus string; consider a short slice
# such as source_data[:500] before sharing the notebook.
source_data