In [2]:
import os

def load_data(path, encoding="utf-8"):
    """
    Read a text file and return its contents split into lines.
    :param path: Path of the file to read
    :param encoding: Text encoding used to decode the file (UTF-8 by default)
    :return: List of strings obtained by splitting the file contents on "\\n"
    """
    # Decode with an explicit encoding instead of the platform's locale default,
    # so accented characters are read the same way on every machine.
    with open(path, "r", encoding=encoding) as f:
        data = f.read()

    # split("\n") keeps a trailing empty string when the file ends with a newline.
    return data.split("\n")

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import GRU, Dense, TimeDistributed, Dropout
from keras.models import load_model
from IPython.display import JSON


import numpy as np
# Build the RNN layers
def simple_model(input_shape, french_vocab_size):
    """
    Assemble and compile a two-layer GRU network; no training happens here.
    :param input_shape: Tuple of input shape; its first entry is dropped and the
                        rest is used as the per-sample shape of the first GRU
    :param french_vocab_size: Number of unique French words in the dataset; the
                              softmax layer has one more unit than this
                              (presumably for the 0 padding index - TODO confirm)
    :return: Keras model built and compiled, but not trained
    """
    per_sample_shape = input_shape[1:]

    # return_sequences=True keeps one output per time step, so the
    # TimeDistributed layers below apply to every time step.
    layer_stack = [
        GRU(128, input_shape=per_sample_shape, return_sequences=True),
        Dropout(0.5),
        GRU(128, return_sequences=True),
        Dropout(0.5),
        TimeDistributed(Dense(256, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(french_vocab_size + 1, activation='softmax')),
    ]
    model = Sequential(layer_stack)

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=['accuracy'],
    )
    return model

# Read the parallel corpora: English sentences first, then the French ones
english_sentences, french_sentences = (
    load_data(corpus_path)
    for corpus_path in (
        'neural_machine_translation/small_vocab_en',
        'neural_machine_translation/small_vocab_fr',
    )
)

# One tokenizer per language, created with default settings; nothing is fitted here
french_tokenizer, english_tokenizer = Tokenizer(), Tokenizer()

In [3]:
# INPUT PROCESSING
# x: list of strings (English sentences)
# y: list of strings (French translations)
# STEP 1: tokenize and pad
#   1a: initialize tokenizer
#   1b: convert sentences to lists of integer word identifiers
#   1c: pad sequences: convert data to 2D ndarray (sentence number, word number)
#         by adding 0s to fill in blanks
def tokenize_and_pad(tokenizer, x):
    """
    Tokenize and pad x
    - Use the tokenizer to process the data set into a numeric form
    - Then ensure that it is padded properly to a consistent length
    :param tokenizer: Tokenizer object from Keras; it is fitted on x by this call
    :param x: list of sentences/strings to be tokenized and padded
    :return: tokenized and padded x data with pads added to end, as a 2D ndarray
             (sentence number, word number)
    """
    # 1a: build the tokenizer's vocabulary from the sentences
    tokenizer.fit_on_texts(x)
    # 1b: convert each sentence to a list of integer word identifiers
    sequences = tokenizer.texts_to_sequences(x)
    # 1c: with no maxlen given, rows are padded to the longest sentence;
    #     padding='post' puts the 0s at the end of each row
    return pad_sequences(sequences, padding='post')
# preprocess sentences in x and y
def preprocess(x, x_tokenizer, y, y_tokenizer):
    """
    Preprocess x and y
    :param x: list of sentence strings
    :param x_tokenizer: tokenizer for x
    :param y: list of sentence strings
    :param y_tokenizer: tokenizer for y
    :return: Tuple of (Preprocessed x, Preprocessed y); both are 3D ndarrays of
             shape (sentence number, word number, 1) with the same word number
    """
    # STEP 1: tokenize and pad each language to its own longest sentence
    raw_x = tokenize_and_pad(x_tokenizer, x)
    raw_y = tokenize_and_pad(y_tokenizer, y)
    print("X Raw Shape: %s"%(raw_x.shape,))
    print("Y Raw Shape: %s"%(raw_y.shape,))

    # STEP 2: pad English data to have same sentence length as French.
    # The longer of the two lengths is used so that nothing is ever truncated;
    # when the French side is the longer one this is exactly the French length.
    sentence_length = max(raw_x.shape[1], raw_y.shape[1])
    padded_x = pad_sequences(raw_x, maxlen=sentence_length, padding='post')
    padded_y = pad_sequences(raw_y, maxlen=sentence_length, padding='post')

    # STEP 3: Add third dimension of size 1 required to work with loss function.
    # NOTE(review): the axis is added to x as well, on the assumption that the
    # recurrent model consumes (word number, 1) samples - confirm against the
    # saved model's input shape.
    return np.expand_dims(padded_x, axis=-1), np.expand_dims(padded_y, axis=-1)
# OUTPUT PROCESSING
# Once we remove the extra dimension needed by the loss function,
# the output shape is (word number in sentence, French vocabulary items)
def output_to_text(output, tokenizer):
    """
    Turn output from neural network into text using the tokenizer
    :param output: output from the neural network with the extra dimension removed,
                   i.e. one row of vocabulary scores per word position
    :param tokenizer: Keras Tokenizer for the output language
    :return: String that represents the text of the output: the words joined by
             single spaces, with padding positions shown as '<PAD>'
    """
    index_to_words = create_word_lookup_dict(tokenizer.word_index)     # Returns: {int index : str 'word' }
    # Pick the highest-scoring vocabulary item at every word position
    word_ids = np.argmax(output, axis=-1)
    # int() turns the NumPy integers into plain ints before the dictionary lookup
    return ' '.join(index_to_words[int(word_id)] for word_id in word_ids)
def create_word_lookup_dict(word_index):
    """
    Invert a word -> identifier mapping and add an entry for the padding value.
    :param word_index: Tokenizer dictionary mapping words to integer identifiers {'str' : int}
    :return dictionary mapping integer to words or '<PAD>' {int : 'str'}
    """
    index_to_words = {index: word for word, index in word_index.items()}
    # 0 is the value used to fill in blanks when sequences are padded, so it is
    # mapped to '<PAD>' (this replaces any word that was given identifier 0).
    index_to_words[0] = '<PAD>'
    return index_to_words

In [None]:
# VALIDATION: run the preprocessing pipeline on the loaded corpora
preproc_english_sentences, preproc_french_sentences = preprocess(
    english_sentences, english_tokenizer,
    french_sentences, french_tokenizer,
)

# VERIFY: inspect the type and shape of each preprocessed array
print("English")
print(" * preproc_english_sentences_type", type(preproc_english_sentences))
print(" * preproc_english_sentences_shape", preproc_english_sentences.shape)
print("French")
print(" * preproc_french_sentences_type", type(preproc_french_sentences))
print(" * preproc_french_sentences_shape", preproc_french_sentences.shape)

# Load the pre-trained network from disk
# NOTE(review): french_vocab_size is not used anywhere in this cell - confirm
# whether a later cell still needs it.
french_vocab_size = len(french_tokenizer.word_index)
model = load_model("neural_machine_translation/translation.tf")

# TEST: translate a single sentence from English to French.
# Only the first sentence is passed in, so the prediction has a leading axis of
# size 1: (1, word number in sentence, French vocabulary items). Indexing with
# [0] drops that leading axis.
pred = model.predict(preproc_english_sentences[:1], verbose=0)[0]
translation = output_to_text(pred, french_tokenizer)   # human-readable French sentence

print("Test Case - English to French")
print(" * English sentence:", english_sentences[0])
print(" * Actual translation:", french_sentences[0])
print(" * Predicted translation:", translation)