In [5]:
import os

def load_data(path, encoding="utf-8"):
    """
    Read a text file and return its contents split into lines.

    :param path: Location of the file to read (str, bytes or os.PathLike)
    :param encoding: Text encoding used to decode the file; defaults to UTF-8
    :return: List of strings obtained by splitting the file text on newline
        characters. A file that ends with a newline yields a trailing empty string.
    """
    # os.fspath gives the same result the previous single-argument
    # os.path.join(path) call did, without suggesting that anything is joined.
    input_file = os.fspath(path)

    # Decode with an explicit encoding so accented characters are read the same
    # way on every platform instead of depending on the locale's default.
    with open(input_file, "r", encoding=encoding) as f:
        data = f.read()

    return data.split("\n")

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import GRU, Dense, TimeDistributed, Dropout
from keras.models import load_model

import numpy as np
# Build the RNN layers
def simple_model(input_shape, french_vocab_size):
    """
    Assemble and compile a stacked-GRU sequence model; no training happens here.

    :param input_shape: Shape tuple of the input data. Its first entry is dropped
        and the remainder is given to the first GRU layer as its input shape.
    :param french_vocab_size: Number of unique French words in the dataset. The
        softmax output layer has one more unit than this.
    :return: Compiled, untrained Keras Sequential model
    """
    # One output unit beyond the vocabulary size
    # (presumably for the padding index 0 — confirm against the preprocessing).
    output_classes = french_vocab_size + 1

    network = Sequential([
        GRU(128, input_shape=input_shape[1:], return_sequences=True),
        Dropout(0.5),
        GRU(128, return_sequences=True),
        Dropout(0.5),
        # Per-timestep dense layers: every sequence position gets its own prediction.
        TimeDistributed(Dense(256, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(output_classes, activation='softmax')),
    ])

    network.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=['accuracy'],
    )
    return network

# Read both corpora from disk: English first, then French.
# load_data returns each file's text split on newlines.
english_sentences, french_sentences = (
    load_data('neural_machine_translation/small_vocab_en'),
    load_data('neural_machine_translation/small_vocab_fr'),
)

# One fresh Keras Tokenizer per language.
english_tokenizer = Tokenizer()
french_tokenizer = Tokenizer()

In [None]:
def preprocess_func(x, x_tokenizer, y, y_tokenizer):
    """
    Preprocess x and y (placeholder: the body has not been written yet).

    :param x: Feature list of sentences
    :param x_tokenizer: Tokenizer for x
    :param y: Label list of sentences
    :param y_tokenizer: Tokenizer for y
    :return: Tuple of (preprocessed x, preprocessed y)
    :raises NotImplementedError: Always, until the TODO below is filled in
    """
    # TODO: add code here
    # Note: Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    # NotImplementedError is the conventional signal for a missing implementation;
    # it subclasses Exception, so existing `except Exception` handlers still catch it.
    raise NotImplementedError("Function not implemented")

def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    (placeholder: the body has not been written yet).

    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    :raises NotImplementedError: Always, until the TODO below is filled in
    """
    # TODO: add code here
    # NotImplementedError is the conventional signal for a missing implementation;
    # it subclasses Exception, so existing `except Exception` handlers still catch it.
    raise NotImplementedError("Function not implemented")

In [None]:
# Preprocess the English sentences (features) and French sentences (labels),
# handing over the tokenizer that belongs to each language.
preproc_english_sentences, preproc_french_sentences = preprocess_func(
    english_sentences,
    english_tokenizer,
    french_sentences,
    french_tokenizer,
)

# Number of entries in the French tokenizer's word index.
french_vocab_size = len(french_tokenizer.word_index)

# The model is loaded from a previously saved file; nothing is trained in this cell.
model = load_model("translation.tf")

# NOTE(review): JSON is not defined or imported in the visible cells — presumably
# supplied by the notebook environment (e.g. IPython.display.JSON); confirm.
JSON(preproc_english_sentences)
JSON(preproc_french_sentences)
print(type(preproc_english_sentences))
print(type(preproc_french_sentences))

# Spot-check the first sentence pair against the model's output.
print("English sentence:", english_sentences[0])
print("Actual translation:", french_sentences[0])
first_sentence_logits = model.predict(preproc_english_sentences[:1])[0]
print("Predicted translation:", logits_to_text(first_sentence_logits, french_tokenizer))