In [None]:
%reload_ext autoreload
%autoreload 1

Using TensorFlow backend.


In [None]:
#imports
import os
import collections
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, LSTM, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional ,Dropout
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [None]:
# Verify access to a GPU: print every device TensorFlow can see.
# A usable GPU appears in the list with device_type "GPU".
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6164194445097868486
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 666378641026499071
physical_device_desc: "device: XLA_CPU device"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 7263606609566238232
physical_device_desc: "device: XLA_GPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 7304675328
locality {
  bus_id: 1
  links {
  }
}
incarnation: 3163008019492340206
physical_device_desc: "device: 0, name: Tesla P4, pci bus id: 0000:00:04.0, compute capability: 6.1"
]


In [None]:
#load dataset
def load_data(path):
    """
    Load a text dataset and split it into lines
    :param path: Path of the text file to read
    :return: List of strings, one per line of the file (split on the newline character)
    """
    # Decode explicitly as UTF-8: the French corpus contains accented
    # characters, and the platform default encoding is not UTF-8 everywhere.
    with open(path, "r", encoding="utf-8") as f:
        data = f.read()

    return data.split('\n')

### Load Data
The data is located in `data/small_vocab_en` and `data/small_vocab_fr`. The `small_vocab_en` file contains English sentences, and the `small_vocab_fr` file contains their French translations. Load the English and French data from these files by running the cell below.

In [None]:
# Load the English sentences (one sentence per line of the file)
english_sentences = load_data('data/small_vocab_en')
# Load the French sentences; line i is the translation of English line i
french_sentences = load_data('data/small_vocab_fr')

### Files
Each line in `small_vocab_en` contains an English sentence with the respective translation in each line of `small_vocab_fr`.  View the first two lines from each file.

In [None]:
# Preview the first two sentence pairs, numbered from 1
for number in (1, 2):
    print('Englist text {}: {}'.format(number, english_sentences[number - 1]))
    print('French text {}: {}'.format(number, french_sentences[number - 1]))


Englist text 1: new jersey is sometimes quiet during autumn , and it is snowy in april .
French text 1: new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
Englist text 2: the united states is usually chilly during july , and it is usually freezing in november .
French text 2: les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .


In [None]:
# Count how often each word occurs in each corpus. Sentences are split on
# whitespace, so punctuation tokens such as "," and "." count as words.
english_word_counter = collections.Counter(word for sentence in english_sentences for word in sentence.split())
french_word_counter = collections.Counter(word for sentence in french_sentences for word in sentence.split())

# The total word count is the sum of the per-word counts, so the corpus does
# not have to be flattened into a list a second time.
print('{} English word.'.format(sum(english_word_counter.values())))
print('{} unique Englist words.'.format(len(english_word_counter)))
print('10 Most common words in English dataset:')
print('"' + '" "'.join(word for word, _ in english_word_counter.most_common(10)) + '"')
print("")
print('{} French word.'.format(sum(french_word_counter.values())))
print('{} unique French words.'.format(len(french_word_counter)))
print('10 Most common words in French dataset:')
print('"' + '" "'.join(word for word, _ in french_word_counter.most_common(10)) + '"')

1823250 English word.
227 unique Englist words.
10 Most common words in English dataset:
"is" "," "." "in" "it" "during" "the" "but" "and" "sometimes"

1961295 French word.
355 unique French words.
10 Most common words in French dataset:
"est" "." "," "en" "il" "les" "mais" "et" "la" "parfois"


## Preprocess
We need to convert the text into sequences of integers using the following preprocessing steps:
1. Tokenize the words into ids
2. Add padding to make all the sequences the same length.

### Tokenize
We will turn each sentence into a sequence of word ids using Keras's [`Tokenizer`](https://keras.io/preprocessing/text/#tokenizer) class.

In [None]:
def tokenize(sentences):
    """
    Tokenize sentences
    :param sentences: List of sentences to be tokenized
    :return: Tuple of (tokenized sentences as lists of word ids, fitted tokenizer)
    """
    # The docstring and the body must share one indentation level; the mixed
    # 2/4-space indentation used previously raised an IndentationError.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    text_tokenized = tokenizer.texts_to_sequences(sentences)

    return text_tokenized, tokenizer


# Demonstrate tokenize() on three short sentences
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .',
]
text_tokenized, text_tokenizer = tokenize(text_sentences)

# Show the learned word -> id mapping, then each sentence next to its ids
print(text_tokenizer.word_index)
print()
for number, (original, word_ids) in enumerate(zip(text_sentences, text_tokenized), start=1):
    print('Sequence {} in x'.format(number))
    print('  Input:  {}'.format(original))
    print('  Output: {}'.format(word_ids))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
  Input:  The quick brown fox jumps over the lazy dog .
  Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
  Input:  By Jove , my quick study of lexicography won a prize .
  Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
  Input:  This is a short sentence .
  Output: [18, 19, 3, 20, 21]


### Padding
We need to make sure that all the sequences have the same length by adding padding to the end of each sequence.

In [None]:
def pad(sentences, length=None):
    """
    Pad sentences
    :param sentences: List of sequences (lists of word ids)
    :param length: Length to pad the sequences to. If None, use the length of the longest sequence
    :return: Padded numpy array of sequences
    """
    # The docstring and the body must share one indentation level; the mixed
    # 2/4-space indentation used previously raised an IndentationError.
    if length is None:
        length = max(len(sentence) for sentence in sentences)

    # padding='post' appends the padding after each sequence instead of before it
    return pad_sequences(sentences, maxlen=length, padding='post')


# Pad the tokenized example sentences and show each one before and after
test_pad = pad(text_tokenized)
for number, (word_ids, padded_ids) in enumerate(zip(text_tokenized, test_pad), start=1):
    print('Sequence {} in x'.format(number))
    print('   Input: {}'.format(np.array(word_ids)))
    print('   Output: {}'.format(padded_ids))

Sequence 1 in x
   Input: [1 2 4 5 6 7 1 8 9]
   Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
   Input: [10 11 12  2 13 14 15 16  3 17]
   Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
   Input: [18 19  3 20 21]
   Output: [18 19  3 20 21  0  0  0  0  0]


### Preprocess Pipeline


In [None]:
def preprocess(x, y):
    """
    Tokenize and pad the feature and label sentences
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x_tokenizer, y_tokenizer)
    """
    # Each side gets its own tokenizer, fitted only on its own sentences
    x_ids, x_tokenizer = tokenize(x)
    y_ids, y_tokenizer = tokenize(y)

    # Pad each side to the length of its own longest sentence
    x_padded = pad(x_ids)
    y_padded = pad(y_ids)

    # keras's sparse_categorical_crossentropy function requires the labels to
    # be in 3 dimensions, so append a trailing axis of size 1
    y_padded = y_padded.reshape(y_padded.shape + (1,))

    return x_padded, y_padded, x_tokenizer, y_tokenizer

# Tokenize and pad both corpora; keep the fitted tokenizers for decoding later
preprocessed_english_sentences, preprocessed_french_sentences, english_tokenizer, french_tokenizer =\
  preprocess(english_sentences, french_sentences)


# Axis 1 of each padded array is the padded sequence length, i.e. the length
# of the longest tokenized sentence on that side
max_english_sequence_length = preprocessed_english_sentences.shape[1]
max_french_sequence_length = preprocessed_french_sentences.shape[1]
# Number of distinct words known to each tokenizer.
# NOTE(review): word ids appear to start at 1 while 0 is used for padding, so
# the largest id equals the vocabulary size; anything indexed by word id needs
# vocab_size + 1 slots.
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

# Report the sizes the models below are built from
print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 344


In [None]:
def logits_to_text(logits, tokenizer):
    """
    Decode the output of the Neural Network into text using the tokenizer
    :param logits: Logits from Neural Network, one row of word scores per position
    :param tokenizer: keras Tokenizer fit on the labels
    :return: String with the highest-scoring word of every row, joined by spaces
    """
    # Invert the tokenizer's word -> id mapping; id 0 is shown as '<PAD>'
    vocabulary = {index: word for word, index in tokenizer.word_index.items()}
    vocabulary[0] = '<PAD>'

    # Pick the best-scoring id in each row and look up its word
    predicted_ids = np.argmax(logits, axis=1)
    words = [vocabulary[predicted] for predicted in predicted_ids]
    return ' '.join(words)


print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


### Model 1: RNN
A basic RNN model is a good baseline for sequence data. 

In [None]:
def simple_model(input_shape, out_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build a basic RNN (a single GRU layer) on x and y
    :param input_shape: Tuple of input shape, including the leading batch dimension
    :param out_sequence_length: Length of output sequence (also used as the number of GRU units)
    :param english_vocab_size: Number of unique English words in the dataset (not used by this model)
    :param french_vocab_size: Number of unique French words in the dataset
    :return: keras model built and compiled, but not trained
    """
    lr = 1e-3
    # Word ids start at 1 and id 0 is the padding value, so the labels take
    # french_vocab_size + 1 distinct values. The softmax needs one unit per
    # value; otherwise the highest word id is out of range for the loss and
    # can never be predicted.
    num_classes = french_vocab_size + 1

    # Drop the batch dimension: Input expects the shape of a single sample
    input_seq = Input(input_shape[1:])
    # NOTE(review): the GRU width is tied to the output sequence length, which
    # looks incidental rather than a tuned hidden size; confirm before changing.
    rnn = GRU(out_sequence_length, return_sequences=True)(input_seq)
    # Apply the same Dense projection to every time step of the GRU output
    logits = TimeDistributed(Dense(num_classes))(rnn)
    model = Model(input_seq, Activation('softmax')(logits))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=Adam(learning_rate=lr),
                  metrics=['accuracy'])
    return model


# Reshape input: pad the English ids to the French sequence length so the
# input and the labels have the same number of time steps, then add a trailing
# feature axis of size 1 (each time step carries a single word id)
tmp_x = pad(preprocessed_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preprocessed_french_sentences.shape[-2], 1))

# Build the model from the reshaped input
simple_rnn_model = simple_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size
)

# Train the neural network; validation_split=0.2 holds out 20% of the samples
# for validation
simple_rnn_model.fit(tmp_x, preprocessed_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

# Print the decoded prediction for the first sentence
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

Train on 110288 samples, validate on 27573 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
les les est est est en en est est est en est en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [None]:
def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build a basic RNN (a single LSTM layer followed by dropout) on x and y

    Note: this redefines the GRU-based `simple_model` from the previous cell.

    :param input_shape: Tuple of input shape, including the leading batch dimension
    :param output_sequence_length: Length of output sequence (not used by this model)
    :param english_vocab_size: Number of unique English words in the dataset (not used by this model)
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built and compiled, but not trained
    """
    learning_rate = 1e-2
    # Word ids start at 1 and id 0 is the padding value, so the labels take
    # french_vocab_size + 1 distinct values. The softmax needs one unit per
    # value; otherwise the highest word id is out of range for the loss and
    # can never be predicted.
    num_classes = french_vocab_size + 1

    # Drop the batch dimension: Input expects the shape of a single sample
    input_seq = Input(input_shape[1:])
    rnn = LSTM(64, return_sequences=True)(input_seq)
    dropout = Dropout(0.2)(rnn)
    # Apply the same Dense projection to every time step of the LSTM output
    logits = TimeDistributed(Dense(num_classes))(dropout)

    model = Model(input_seq, Activation('softmax')(logits))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])

    return model


# Reshape input: pad the English ids to the French sequence length so the
# input and the labels have the same number of time steps, then add a trailing
# feature axis of size 1 (each time step carries a single word id)
tmp_x = pad(preprocessed_english_sentences, max_french_sequence_length)
tmp_x = tmp_x.reshape((-1, preprocessed_french_sentences.shape[-2], 1))

# Build the model from the reshaped input
simple_rnn_model = simple_model(
    tmp_x.shape,
    max_french_sequence_length,
    english_vocab_size,
    french_vocab_size
)

# Train the neural network; validation_split=0.2 holds out 20% of the samples
# for validation
simple_rnn_model.fit(tmp_x, preprocessed_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

# Print the decoded prediction for the first sentence
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

Train on 110288 samples, validate on 27573 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
new jersey est parfois calme en l' et il est il est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
