In [None]:
# Uneditable
# Preprocessing data: User is responsible for preprocessing
import collections
import os
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import GRU, Dense, TimeDistributed, Dropout
import matplotlib.pyplot as plt
def load_data(path):
    """
    Load a text file and split it into lines
    :param path: Path of the text file to read
    :return: List of strings, one per "\n"-separated line of the file
    """
    # Decode explicitly as UTF-8: the platform default encoding varies between
    # systems and can garble or reject the accented characters in the corpus.
    with open(path, "r", encoding="utf-8") as f:
        data = f.read()

    # Split on "\n" only, so a trailing newline still yields a final empty string
    return data.split("\n")

# Load the parallel corpus: English source sentences first, then the French targets
english_sentences, french_sentences = (
    load_data('small_vocab_en'),
    load_data('small_vocab_fr'),
)

In [21]:
# preprocessing - tokenize and pad
# write a tokenize function
def tokenize(x):
    """
    Fit a Keras Tokenizer on x and encode x with it
    :param x: List of sentences/strings to be tokenized
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    fitted_tokenizer = Tokenizer()
    # The vocabulary has to be built from x before x can be encoded
    fitted_tokenizer.fit_on_texts(x)
    encoded = fitted_tokenizer.texts_to_sequences(x)
    return encoded, fitted_tokenizer


# Pad sentences to a given length
def pad(x, length=None):
    """
    Pad every sequence in x at its end ('post') to a common length
    :param x: List of sequences.
    :param length: Length to pad the sequence to.  If None, use length of longest sequence in x.
    :return: Padded numpy array of sequences
    """
    # Only the padding side is set here; every other pad_sequences option keeps its default
    padded = pad_sequences(x, padding='post', maxlen=length)
    return padded


# Pad Tokenized output. Default padding value is 0.
# test_pad = pad(text_tokenized)
#
# for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
#     print('Sequence {} in x'.format(sample_i + 1))
#     print('  Input:  {}'.format(np.array(token_sent)))
#     print('  Output: {}'.format(pad_sent))

# preprocess sentences = tokenize + pad + reshape labels
def preprocess(x, y):
    """
    Tokenize and pad x and y, and give the labels a trailing axis of size 1
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    # Each side gets its own tokenizer, i.e. its own vocabulary
    feature_ids, x_tk = tokenize(x)
    label_ids, y_tk = tokenize(y)

    # Each side is padded independently to its own longest sequence
    features = pad(feature_ids)
    labels = pad(label_ids)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    labels = labels.reshape(labels.shape + (1,))

    return features, labels, x_tk, y_tk


# Tokenize and pad both corpora; the French labels come back with a trailing axis of size 1
(preproc_english_sentences, preproc_french_sentences,
 english_tokenizer, french_tokenizer) = preprocess(english_sentences, french_sentences)

max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

# Reshaping the input to work with a basic RNN: pad the English ids to the
# French sequence length, then add a trailing feature axis of size 1
tmp_x = pad(preproc_english_sentences, max_french_sequence_length).reshape(
    (-1, max_french_sequence_length, 1))

# Undoes the preprocessing (so that you can read the output of the neural network)
def logits_to_text(logits, tokenizer):
    """
    Decode per-position network scores into a space-separated string of words
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    # Invert the tokenizer's word -> index mapping
    words_by_index = dict((index, word) for word, index in tokenizer.word_index.items())
    # Index 0 is what padded positions contain, so give it a readable marker
    words_by_index[0] = "<PAD>"

    # The highest-scoring index along axis 1 is the predicted word for each row
    best_indices = np.argmax(logits, axis=1)
    return ' '.join(words_by_index[index] for index in best_indices)


In [23]:
# Uneditable Display Code

# Define the NN: Build the RNN layers
def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and compile (but do not train) a stacked-GRU network for x -> y
    :param input_shape: Tuple of input shape; its first entry is dropped before use
    :param output_sequence_length: Length of output sequence (not used by this architecture)
    :param english_vocab_size: Number of unique English words in the dataset (not used by this architecture)
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    per_sample_shape = input_shape[1:]
    # One output class per French word index, plus one extra so index 0 is also a valid class
    output_classes = french_vocab_size + 1

    # Build the layers
    model = Sequential([
        GRU(128, input_shape=per_sample_shape, return_sequences=True),
        Dropout(0.5),
        GRU(128, return_sequences=True),
        Dropout(0.5),
        TimeDistributed(Dense(256, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(output_classes, activation='softmax')),
    ])

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer="adam",
        metrics=['accuracy'],
    )
    return model



# Build the network from the training tensor's shape and the vocabulary sizes
model = simple_model(
    input_shape=tmp_x.shape,
    output_sequence_length=max_french_sequence_length,
    english_vocab_size=english_vocab_size,
    french_vocab_size=french_vocab_size,
)
model.summary()

# Train silently (verbose=0), holding out 20% of the samples for validation
history = model.fit(
    x=tmp_x,
    y=preproc_french_sentences,
    batch_size=300,
    epochs=2,
    validation_split=0.2,
    verbose=0,
)

2023-03-24 16:28:01.921655: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-24 16:28:01.923061: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-24 16:28:01.924477: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_10 (GRU)                (None, 21, 128)           50304     
                                                                 
 dropout_15 (Dropout)        (None, 21, 128)           0         
                                                                 
 gru_11 (GRU)                (None, 21, 128)           99072     
                                                                 
 dropout_16 (Dropout)        (None, 21, 128)           0         
                                                                 
 time_distributed_10 (TimeDi  (None, 21, 256)          33024     
 stributed)                                                      
                                                                 
 dropout_17 (Dropout)        (None, 21, 256)           0         
                                                      

2023-03-24 16:28:02.396719: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-03-24 16:28:02.399385: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-03-24 16:28:02.401378: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [17]:
# Print prediction(s)
def translate_first(model, source_sentences, target_sentences, source_preprocessed_sentences, target_tokenizer):
    """
    Print the first source sentence, its reference translation and the model's translation
    :param model: Model whose predict() output is decoded
    :param source_sentences: List of raw source-language sentences
    :param target_sentences: List of raw target-language sentences
    :param source_preprocessed_sentences: Model-ready source data; only the first sample is predicted
    :param target_tokenizer: Keras Tokenizer fit on the target sentences, used to decode the prediction
    :return: None
    """
    print("source sentence", source_sentences[0])
    print("actual translation", target_sentences[0])
    # Slice with [:1] to keep the batch axis for predict(), then take the single result
    first_prediction = model.predict(source_preprocessed_sentences[:1])[0]
    # Decode to words rather than printing the raw score matrix
    print("predicted translation", logits_to_text(first_prediction, target_tokenizer))


# Predict a user-entered sentence
user_sentence = input("Enter English sentence: ")
print("You entered", user_sentence)

# Encode with the fitted tokenizer itself so the input gets the same
# normalisation as the training data; a manual word_index lookup raises
# KeyError on anything that is not an exact vocabulary key.
# Words the tokenizer does not know are skipped.
user_token_ids = english_tokenizer.texts_to_sequences([user_sentence])

if not user_token_ids[0]:
    print("None of the words entered are in the English vocabulary")
else:
    # Use dedicated names: overwriting tmp_x here would replace the training
    # tensor and make a re-run of the training cell fit on this one sentence.
    user_x = pad(user_token_ids, max_french_sequence_length)
    user_x = user_x.reshape((-1, max_french_sequence_length, 1))

    print(user_x.shape)
    prediction = model.predict(user_x)
    print(prediction)
    print("Translation is", logits_to_text(prediction[0], french_tokenizer))

You entered he dislikes grapefruit
(1, 21, 1)
[[[6.9548046e-06 3.9689516e-04 2.6896394e-06 ... 1.0030043e-08
   2.5017338e-08 1.7226814e-08]
  [5.4793063e-09 7.5050690e-07 5.3460656e-08 ... 4.3677586e-13
   1.6343497e-12 4.1852456e-13]
  [2.0476557e-06 1.9601950e-06 3.5413148e-05 ... 1.2125438e-13
   4.6765407e-13 1.5464953e-13]
  ...
  [9.9995375e-01 8.2138467e-07 4.4887479e-06 ... 2.5995577e-14
   6.1177327e-14 6.7227582e-14]
  [9.9995756e-01 8.0436075e-07 4.3679634e-06 ... 2.2576132e-14
   5.2994474e-14 5.8953859e-14]
  [9.9996042e-01 7.9111283e-07 4.2709362e-06 ... 2.0005219e-14
   4.6808643e-14 5.2668644e-14]]]
Translation is elle aime le et pamplemousse <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
