In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np
import collections
from keras_preprocessing.text import  Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
import keras

**Language Translation**<br>English to French

In [0]:
#Helper Function
import os


def load_data(path):
    """
    Load dataset
    """
    input_file = os.path.join(path)
    with open(input_file, "r") as f:
        data = f.read()

    return data.split('\n')

In [0]:
#Getting the data

source_path ='/content/drive/My Drive/Colab Notebooks/data/small_vocab_en'
target_path = '/content/drive/My Drive/Colab Notebooks/data/small_vocab_fr'
#loading English data
source_text = load_data(source_path)
#loading French data
target_text = load_data(target_path)

print('Loading Complete')

Loading Complete


**Sample data and its french transaltion**

In [0]:
print('small_vocab_en Line {} : {}'.format(1,source_text[0]))
print('small_vocab_fr Line {} : {}'.format(1, target_text[0]))
print('small_vocab_en Line {} : {}'.format(2,source_text[1]))
print('small_vocab_fr Line {} : {}'.format(2, target_text[1]))

small_vocab_en Line 1 : new jersey is sometimes quiet during autumn , and it is snowy in april .
small_vocab_fr Line 1 : new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
small_vocab_en Line 2 : the united states is usually chilly during july , and it is usually freezing in november .
small_vocab_fr Line 2 : les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .


**Vocabulary**<br> Let's look at the vocabulary of the dataset

In [0]:
english_word_counter = collections.Counter([word for sentence in source_text for word in sentence.split()])
french_word_counter = collections.Counter([word for sentence in target_text for word in sentence.split() ])

print('{} of English Words'.format(len([word for sentence in source_text for word in sentence.split()])))
print('10 most common English words {}'.format(english_word_counter.most_common(10)))
print('{} of French Words'.format(len([word for sentence in target_text for word in sentence.split()])))
print('10 most common French words {}'.format(french_word_counter.most_common(10)))

1823250 of English Words
10 most common English words [('is', 205858), (',', 140897), ('.', 129039), ('in', 75525), ('it', 75137), ('during', 74933), ('the', 67628), ('but', 63987), ('and', 59850), ('sometimes', 37746)]
1961295 of French Words
10 most common French words [('est', 196809), ('.', 135619), (',', 123135), ('en', 105768), ('il', 84079), ('les', 65255), ('mais', 63987), ('et', 59851), ('la', 49861), ('parfois', 37746)]


In [0]:
def tokenize(x):
    """
    Tokenize x
    :param x: List of sentences/strings to be tokenized
    :return: tokenized x data
    """
    # TODO: Implement
    tokenized_x =Tokenizer()
    tokenized_x.fit_on_texts(x)
    return tokenized_x.texts_to_sequences(x),tokenized_x

**Tokenization of data**<br>tokenization function

In [0]:
# Sample tokenization
text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .']
text_tokenized,x_tokenizer= tokenize(text_sentences)
print(text_tokenized)
print(x_tokenizer.word_index)

[[1, 2, 4, 5, 6, 7, 1, 8, 9], [10, 11, 12, 2, 13, 14, 15, 16, 3, 17], [18, 19, 3, 20, 21]]
{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}


**Padding**<br>for making all the sequences of same length

In [0]:
def pad(x,length=None):
    """Padding x
        params:
        x: list of sequences
        length: length to Pad the sequence to if None is provided use length of the longest sequence
        return: padded numpy array of sequence
    """
    return pad_sequences(x,maxlen=length,truncating='post',padding='post')

In [0]:
#test pad
test_pad = pad(text_tokenized)
print(test_pad)

[[ 1  2  4  5  6  7  1  8  9  0]
 [10 11 12  2 13 14 15 16  3 17]
 [18 19  3 20 21  0  0  0  0  0]]


**Preprocess Pipeline**<br>Preprocessing the data

In [0]:
def preprocess(x,y):
    """
    Pre-process x and y
    params:
    x: Feature list of sentences
    y: Label list of sentences
    return :
    return preprocessed_x, pre_processed_y
    """
    preprocessed_x,x_tokenizer = tokenize(x)
    preprocessed_y,y_tokenizer= tokenize(y)

    preprocessed_x = pad(preprocessed_x)
    preprocessed_y = pad(preprocessed_y)

    #Keras's sparse_categorical_crossentropy requires the labels to be in 3 dimensions
    preprocessed_y = preprocessed_y.reshape(*preprocessed_y.shape,1)

    return preprocessed_x,preprocessed_y,x_tokenizer,y_tokenizer


preprocessd_english_sentences,preprocessd_french_sentences,en_tokenizer,fr_tokenizer = preprocess(source_text,target_text)
max_en_seq_length = preprocessd_english_sentences.shape[1]
max_fr_seq_length = preprocessd_french_sentences.shape[1]
en_vocab_size = len(en_tokenizer.word_index)
fr_vocab_size = len(fr_tokenizer.word_index)
print('Data Pre-Processed...')
print('Max sequence length of English ',max_en_seq_length)
print('Max sequence length of French ',max_fr_seq_length)
print('Length of English dictionary ',en_vocab_size)
print('Length of French dictionary ',fr_vocab_size)

Data Pre-Processed...
Max sequence length of English  15
Max sequence length of French  21
Length of English dictionary  199
Length of French dictionary  344


**Ids Back to text**

In [0]:
def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


**Model 1** **RNN** <br>creating
Simple RNN Model

In [0]:

def simple_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a basic RNN on x and y

    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    model = Sequential()
    # layer 1 uses an GRU module with english_vocab_size hidden units
    # By setting return_sequences to True, return not only the last output but
    # all the outputs so far in the form of (num_samples, timesteps,
    # output_dim). This is necessary as TimeDistributed in the below expects
    # the first dimension to be the timesteps.
    model.add(GRU(english_vocab_size, return_sequences=True,
                  input_shape=input_shape[1:]))
    model.add(TimeDistributed(Dense(french_vocab_size)))
    model.add(Activation('softmax'))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(10e-3),
                  metrics=['accuracy'])
    return model

# Reshaping the input to work with a basic RNN
tmp_x = pad(preprocessd_english_sentences, max_fr_seq_length)
# print(tmp_x.shape)
tmp_x = tmp_x.reshape((-1, preprocessd_french_sentences.shape[-2],1))
# print(tmp_x.shape)
# print((tmp_x.shape[1:][0]))
# print(preprocessd_french_sentences.shape)

# Build the model
simple_rnn_model = simple_model(
   tmp_x.shape,
   max_fr_seq_length,
   en_vocab_size,
    fr_vocab_size)

# print a summary of the model
simple_rnn_model.summary()
print('\n')

# Train the neural network
simple_rnn_model.fit(tmp_x, preprocessd_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)

# Print prediction(s)
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], fr_tokenizer))

Model: "sequential_32"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_30 (GRU)                 (None, 21, 199)           119997    
_________________________________________________________________
time_distributed_22 (TimeDis (None, 21, 344)           68800     
_________________________________________________________________
activation_28 (Activation)   (None, 21, 344)           0         
Total params: 188,797
Trainable params: 188,797
Non-trainable params: 0
_________________________________________________________________


Train on 110288 samples, validate on 27573 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
new jersey est parfois calme en l' et il est beau en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


**Creating A RNN model using embedding**

In [0]:
def embed_model(input_shape, output_sequence_length, en_vocab_size, fr_vocab_size):
    """Bulid and train a RNN model using word embedding on x and y
    params:
        input_shape: tuple of input shape
        output_sequnce_length: length of output sequence
        en_vocab_size: Number of unique words in english dictionary
        fr_vocab_size: Number of unique words in french dictionary
        return:
            RNN model built but not trained 
    """
    model = Sequential()
    #Layer 1 uses embedding layer to enhance the word representation
    model.add(Embedding(input_dim=en_vocab_size, output_dim=output_sequence_length,
                        input_length=input_shape[1:][0]))    
    #Layer 2 uses GRU with english vocab size hidden units
    model.add(TimeDistributed(Dense(fr_vocab_size)))
    model.add(Activation('softmax'))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(10e-3),
                  metrics=['accuracy'])
    return model

#Reshaping Input
tmp_x = pad(preprocessd_english_sentences,max_fr_seq_length)
#print('temp-x shape ',tmp_x.shape)
#print('frnch sentence shape',preprocessd_french_sentences.shape)
tmp_x = tmp_x.reshape((-1,preprocessd_french_sentences.shape[-2]))
#print('final shape', tmp_x.shape)

#Building the model
embedded_rnn_model = embed_model(
   tmp_x.shape,
   max_fr_seq_length,
   en_vocab_size,
   fr_vocab_size)
#Summary of the model
embedded_rnn_model.summary()
print('\n')

#Train the model
embedded_rnn_model.fit(tmp_x, preprocessd_french_sentences, batch_size = 1024, epochs = 10, validation_split = 0.2)

#Make Predictions
print(logits_to_text(embedded_rnn_model.predict(tmp_x[:1])[0], fr_tokenizer))



Model: "sequential_33"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 21, 21)            4179      
_________________________________________________________________
time_distributed_23 (TimeDis (None, 21, 344)           7568      
_________________________________________________________________
activation_29 (Activation)   (None, 21, 344)           0         
Total params: 11,747
Trainable params: 11,747
Non-trainable params: 0
_________________________________________________________________


Train on 110288 samples, validate on 27573 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
new jersey est parfois calme en l' et il est enneigée en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


**Bi-Directional RNN**<br> They are able to see future data.

In [0]:
def bidirectional_model(input_shape, output_sequence_length, en_vocab_size, fr_vocab_size):
    """
    Build and Train a bidirectional RNN model on x and y
    params:
        input_shape: tuple of input shape
        output_sequnece_length: length of the output sequence
        en_vocab_size: number of unique words in english dictionary
        fr_vocab_size: number of unique word in french vocabulary
        Return:
    RNN model built but not trained
    """
    model = Sequential()
    #Layer 1 is BiDirectional Wrapper and GRU layer
    model.add(Bidirectional(GRU(en_vocab_size,return_sequences=True),input_shape=input_shape[1:]))
    # A dense layer to the every temporal slice of an input. For each of step
    # of the output sequence, decide which sequence should be chosen.
    model.add(TimeDistributed(Dense(fr_vocab_size)))
    model.add(Activation('softmax'))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(10e-3),
                  metrics=['accuracy'])
    return model

#Reshaping input
tmp_x = pad(preprocessd_english_sentences,max_fr_seq_length)
tmp_x = tmp_x.reshape((-1, preprocessd_french_sentences.shape[-2], 1))

#Building Model

bid_model = bidirectional_model(tmp_x.shape,
                                max_fr_seq_length,
                                en_vocab_size,
                                fr_vocab_size)
#Printing Summary of the model
bid_model.summary()
print('\n')

#Training Model
bid_model.fit(tmp_x,preprocessd_french_sentences, batch_size = 1024, epochs = 10, validation_split = 0.2)

# Making Prediction
print(logits_to_text(bid_model.predict(tmp_x[:1])[0],fr_tokenizer))

Model: "sequential_34"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_10 (Bidirectio (None, 21, 398)           239994    
_________________________________________________________________
time_distributed_24 (TimeDis (None, 21, 344)           137256    
_________________________________________________________________
activation_30 (Activation)   (None, 21, 344)           0         
Total params: 377,250
Trainable params: 377,250
Non-trainable params: 0
_________________________________________________________________


Train on 110288 samples, validate on 27573 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
new jersey est parfois calme au mois de il est il en en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


**Encoder-Decoder Model**<br>
Encoder creates a matrix and decoder takes this matrix and predicts output transaltion.

In [0]:
def encod_decod_model(input_shape, output_sequence_length, en_vocab_size, fr_vocab_size):
    """
    Build and Train a encoder-decoder RNN model on x and y
    params:
        input_shape: tuple of input shape
        output_sequnece_length: length of the output sequence
        en_vocab_size: number of unique words in english dictionary
        fr_vocab_size: number of unique word in french vocabulary
        Return:
    RNN model built but not trained
    """
    model = Sequential()

    #The first network is encoder which accepts source language sentence, one word at a
    #time and stores its overall meaning in a vector
    #Encoder network is not used to produce any output

    model = Sequential()
    model.add(GRU(en_vocab_size, return_sequences=False,
                  input_shape=input_shape[1:]))
    model.add(Dense(fr_vocab_size))
    model.add(Activation('relu'))

    # The second network is called decoder which takes vector from encoder and 
    #expands it into the transaltion of target language, one word at a time
    
    #Get the last output of GRU and repeat it
    model.add(RepeatVector(output_sequence_length))
    model.add(GRU(en_vocab_size, return_sequences=True))
    model.add(TimeDistributed(Dense(fr_vocab_size)))
    model.add(Activation('softmax'))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(10e-3),
                  metrics=['accuracy'])
    return model
#Reshaping input
tmp_x = pad(preprocessd_english_sentences,max_fr_seq_length)
tmp_x = tmp_x.reshape((-1, preprocessd_french_sentences.shape[-2], 1))

#Building Model

enc_dec_model = encod_decod_model(tmp_x.shape,
                                max_fr_seq_length,
                                en_vocab_size,
                                fr_vocab_size)
#Printing Summary of the model
enc_dec_model.summary()
print('\n')

#Training Model
enc_dec_model.fit(tmp_x,preprocessd_french_sentences, batch_size = 1024, epochs = 10, validation_split = 0.2)

# Making Prediction
print(logits_to_text(bid_model.predict(tmp_x[:1])[0],fr_tokenizer))

Model: "sequential_36"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_32 (GRU)                 (None, 199)               119997    
_________________________________________________________________
dense_31 (Dense)             (None, 344)               68800     
_________________________________________________________________
activation_31 (Activation)   (None, 344)               0         
_________________________________________________________________
repeat_vector_9 (RepeatVecto (None, 21, 344)           0         
_________________________________________________________________
gru_33 (GRU)                 (None, 21, 199)           324768    
_________________________________________________________________
time_distributed_25 (TimeDis (None, 21, 344)           68800     
_________________________________________________________________
activation_32 (Activation)   (None, 21, 344)         

**Final Model**<br>Custom Implementation

In [0]:
def final_model(input_shape, output_sequence_length, en_vocab_size, fr_vocab_size):
    """
    Build and Train a RNN model that incorporates embedding, encoder-decoder, bidirectional on x and y
    params:
        input_shape: tuple of input shape
        output_sequnece_length: length of the output sequence
        en_vocab_size: number of unique words in english dictionary
        fr_vocab_size: number of unique word in french vocabulary
        Return:
    RNN model built but not trained
    """  
    model = Sequential()

    #Layer 1 is uses embedding layer
    model.add(Embedding(input_dim=en_vocab_size, output_dim=output_sequence_length,
                        input_length=input_shape[1:][0])) 
    #Layer 2 Uses Bidirectional Wrapper and GRU Layer
    model.add(Bidirectional(GRU(en_vocab_size, return_sequences=False),
                                input_shape=input_shape[1:])) 
    #Layer 3 Uses Encode layer
    model.add(Dense(fr_vocab_size))
    model.add(Activation('relu'))
    #Layer 4 uses decoder layer
    model.add(RepeatVector(output_sequence_length))
    model.add(Bidirectional(GRU(en_vocab_size, return_sequences=True)))
    #Layer 5 is dense layer
    model.add(TimeDistributed(Dense(fr_vocab_size)))
    model.add(Activation('softmax'))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(10e-3),
                  metrics=['accuracy'])
    return model

#Building Model

final_rnn_model = final_model(preprocessd_english_sentences.shape,
                                max_fr_seq_length,
                                en_vocab_size,
                                fr_vocab_size)
#Printing Summary of the model
final_rnn_model.summary()
print('\n')

#Training Model
final_rnn_model.fit(preprocessd_english_sentences,preprocessd_french_sentences, batch_size = 1024, epochs = 10, validation_split = 0.2)


Model: "sequential_37"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 15, 21)            4179      
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 398)               263874    
_________________________________________________________________
dense_33 (Dense)             (None, 344)               137256    
_________________________________________________________________
activation_33 (Activation)   (None, 344)               0         
_________________________________________________________________
repeat_vector_10 (RepeatVect (None, 21, 344)           0         
_________________________________________________________________
bidirectional_12 (Bidirectio (None, 21, 398)           649536    
_________________________________________________________________
time_distributed_26 (TimeDis (None, 21, 344)         

<keras.callbacks.History at 0x7fceec93e6a0>

In [0]:
    
    ## Making final predictions
    y_id_to_word = {value: key for key, value in fr_tokenizer.word_index.items()}
    y_id_to_word[0] = '<EOD>'

    sentence = 'he saw a old yellow truck'
    sentence = [en_tokenizer.word_index[word] for word in sentence.split()]
    sentence = pad_sequences([sentence], maxlen=preprocessd_english_sentences.shape[-1], padding='post')
    sentences = np.array([sentence[0], preprocessd_english_sentences[0]])
    predictions = final_rnn_model.predict(sentences, len(sentences))

    print('Sample 1:')
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[0]]))
    print('Il a vu un vieux camion jaune')
    print('Sample 2:')
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[1]]))
    print(' '.join([y_id_to_word[np.max(x)] for x in preprocessd_french_sentences[0]]))

Sample 1:
il a vu un vieux camion jaune <EOD> <EOD> <EOD> <EOD> <EOD> <EOD> <EOD> <EOD> <EOD> <EOD> <EOD> <EOD> <EOD> <EOD>
Il a vu un vieux camion jaune
Sample 2:
new jersey est parfois calme pendant l' automne et il est neigeux en avril <EOD> <EOD> <EOD> <EOD> <EOD> <EOD> <EOD>
new jersey est parfois calme pendant l' automne et il est neigeux en avril <EOD> <EOD> <EOD> <EOD> <EOD> <EOD> <EOD>
