In [None]:
import os

def load_data(path, encoding="utf-8"):
    """
    Read a text file and return its contents as a list of lines.

    :param path: Path of the file to read
    :param encoding: Text encoding used to decode the file. Defaults to UTF-8
        rather than the platform's locale encoding, so accented characters
        decode the same way on every machine
    :return: List of strings obtained by splitting the file contents on
        newline characters; a file that ends with a newline therefore yields
        a trailing empty string
    """
    # os.path.join with a single argument only normalises the argument via
    # os.fspath, so call that directly.
    input_file = os.fspath(path)
    with open(input_file, "r", encoding=encoding) as f:
        data = f.read()

    return data.split("\n")

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import GRU, Dense, TimeDistributed, Dropout

import numpy as np
# Build the RNN layers
def simple_model(input_shape, french_vocab_size, rnn_units=128, dense_units=256, dropout_rate=0.5):
    """
    Build and compile (but do not train) a two-layer GRU sequence model.

    :param input_shape: Shape tuple of the full input array; its first entry is
        dropped and the remainder is used as the per-sample input shape
    :param french_vocab_size: Number of unique French words in the dataset
    :param rnn_units: Number of units in each of the two GRU layers
    :param dense_units: Number of units in the intermediate time-distributed
        Dense layer
    :param dropout_rate: Rate used by each of the three Dropout layers
    :return: Keras model built and compiled, but not trained
    """
    model = Sequential()
    # Both GRUs return the full sequence so that every timestep reaches the
    # time-distributed output layers.
    model.add(GRU(rnn_units, input_shape=input_shape[1:], return_sequences=True))
    model.add(Dropout(dropout_rate))
    model.add(GRU(rnn_units, return_sequences=True))
    model.add(Dropout(dropout_rate))
    model.add(TimeDistributed(Dense(dense_units, activation='relu')))
    model.add(Dropout(dropout_rate))
    # One class more than the vocabulary size: presumably for index 0, which
    # the Keras Tokenizer reserves and never assigns to a word.
    model.add(TimeDistributed(Dense(french_vocab_size + 1, activation='softmax')))

    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer="adam",
                  metrics=['accuracy'])
    return model

# One tokenizer per language; preprocess_data_and_fit_model hands both to the
# preprocessing function it is given.
english_tokenizer = Tokenizer()
french_tokenizer = Tokenizer()

# Sentence lists, one entry per line of each file. Entry 0 of each list is
# later printed as a sentence / translation pair.
english_sentences = load_data('neural_network/small_vocab_en')
french_sentences = load_data('neural_network/small_vocab_fr')

def preprocess_data_and_fit_model(preprocess_func):
    """
    Preprocess the sentence lists, then build and fit the model on the result.

    Reads the module-level english_sentences, french_sentences,
    english_tokenizer and french_tokenizer.

    :param preprocess_func: Callable taking (x, x_tokenizer, y, y_tokenizer)
        and returning the tuple (preprocessed x, preprocessed y)
    :return: The object returned by model.fit
    """
    # Later cells read `model` and `preproc_english_sentences` at module level
    # while the call site discards this function's return value, so both are
    # published as globals instead of staying local.
    global model, preproc_english_sentences

    preproc_english_sentences, preproc_french_sentences = preprocess_func(english_sentences,
                                                                 english_tokenizer,
                                                                 french_sentences,
                                                                 french_tokenizer)
    french_vocab_size = len(french_tokenizer.word_index)

    # Train the neural network
    model = simple_model(
        preproc_english_sentences.shape,
        french_vocab_size)
    model.summary()

    history = model.fit(preproc_english_sentences, preproc_french_sentences, batch_size=300,
                         epochs=2, validation_split=0.2)
    return history

In [None]:
def preprocess_func(x, x_tokenizer, y, y_tokenizer):
    """
    Tokenize, pad and reshape the source (x) and target (y) sentences.

    :param x: List of source sentences
    :param x_tokenizer: Keras Tokenizer to fit on x (fitted in place)
    :param y: List of target sentences
    :param y_tokenizer: Keras Tokenizer to fit on y (fitted in place)
    :return: Tuple (x_out, y_out) of integer arrays, each shaped
        (number of sentences, longest sequence length, 1)
    """
    x_tokenizer.fit_on_texts(x)
    y_tokenizer.fit_on_texts(y)
    x_ids = x_tokenizer.texts_to_sequences(x)
    y_ids = y_tokenizer.texts_to_sequences(y)

    # simple_model emits one prediction per input timestep, so inputs and
    # labels must share a single length: pad both sides to the longest
    # sequence found in either of them.
    max_len = max((len(seq) for seqs in (x_ids, y_ids) for seq in seqs), default=0)
    x_out = pad_sequences(x_ids, maxlen=max_len, padding="post")
    y_out = pad_sequences(y_ids, maxlen=max_len, padding="post")

    # Trailing axis of size 1: the GRU input needs a feature dimension and the
    # sparse categorical labels carry one class index per timestep.
    return x_out.reshape(*x_out.shape, 1), y_out.reshape(*y_out.shape, 1)

# Runs preprocess_func on the loaded sentences, then builds and fits the model.
preprocess_data_and_fit_model(preprocess_func) # Do not change this line

def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network for one sequence, with one row
        of class scores per timestep
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits: the highest-scoring
        word of every timestep joined by single spaces, with index 0 rendered
        as '<PAD>'
    """
    # Invert the tokenizer's word -> index mapping. Index 0 is not assigned to
    # any word by the Keras Tokenizer, so it is labelled as padding here.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    index_to_word[0] = '<PAD>'

    # Highest-scoring class per timestep; atleast_1d keeps a single timestep
    # iterable.
    token_ids = np.atleast_1d(np.argmax(np.asarray(logits), axis=-1))
    return ' '.join(index_to_word.get(int(token_id), '<UNK>') for token_id in token_ids)

In [None]:
# Show the first sentence pair next to the model's output for that sentence.
# Relies on module-level `model` and `preproc_english_sentences` being defined
# by an earlier cell.
print("English sentence:", english_sentences[0])
print("Actual translation:", french_sentences[0])
print(
    "Predicted translation:",
    logits_to_text(
        # The [:1] slice keeps a batch of one; [0] selects that sample's output.
        model.predict(preproc_english_sentences[:1])[0],
        french_tokenizer,
    ),
)