In [0]:
import os
import re
import collections
import numpy as np

In [0]:
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, CuDNNLSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [0]:
def load_data(path, encoding="utf-8"):
    """Read a text file and return its contents as a list of lines.

    Args:
        path: Location of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            accented characters decode the same way on every platform instead
            of depending on the locale's default encoding.

    Returns:
        list[str]: The file contents split on newline characters. A file that
        ends with a newline therefore yields a trailing empty string.
    """
    with open(path, "r", encoding=encoding) as f:
        data = f.read()
    return data.split('\n')

def process_data(data):
    """Strip angle-bracket markup from each sentence and drop the last two entries.

    Args:
        data: Sequence of sentence strings.

    Returns:
        list[str]: A new list with the cleaned sentences, excluding the final
        two entries of `data`.
    """
    # The pattern is greedy: everything from the first '<' to the last '>'
    # on a line is removed in one match.
    tag_pattern = re.compile('<.*>')
    cleaned = [tag_pattern.sub('', line) for line in data]
    # The final two entries are always discarded.
    return cleaned[:-2]


In [0]:
# Load and clean both corpora with the same pipeline (English first, then French).
english, french = (
    process_data(load_data(corpus_path))
    for corpus_path in ('/content/small_vocab_en.txt', '/content/small_vocab_fr.txt')
)

In [5]:
# Word frequencies, obtained by splitting every sentence on single spaces.
english_words_counter = collections.Counter(
    word for sentence in english for word in sentence.split(" ")
)
# The per-word counts add up to the total number of words.
print("Total no. of words in english: ", sum(english_words_counter.values()))
print("Total no. of unique words: ", len(english_words_counter))

french_words_counter = collections.Counter(
    word for sentence in french for word in sentence.split(" ")
)
print("Total no. of words in french: ", sum(french_words_counter.values()))
print("Total no. of unique words: ", len(french_words_counter))

Total no. of words in english:  1823292
Total no. of unique words:  228
Total no. of words in french:  1961298
Total no. of unique words:  356


In [0]:
def tokenize(x):
    """Fit a word-level Keras Tokenizer on `x` and encode `x` with it.

    Args:
        x: Collection of sentence strings. It is traversed twice: once to fit
            the tokenizer and once to encode.

    Returns:
        tuple: `(sequences, tokenizer)`, where `sequences` is the result of
        `texts_to_sequences(x)` and `tokenizer` is the fitted Tokenizer.
    """
    tokenizer = Tokenizer(char_level=False)
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return sequences, tokenizer

def pad(x, length=None):
    """Pad every sequence in `x` at the end so all share the same length.

    Args:
        x: Collection of sequences to pad.
        length: Target length. When None, the length of the longest sequence
            in `x` is used.

    Returns:
        The result of `pad_sequences` with post-padding applied.
    """
    target_length = length if length is not None else max(len(sequence) for sequence in x)
    return pad_sequences(x, maxlen=target_length, padding='post')

In [0]:
def preprocess(x, y):
    """Tokenize and pad the source (`x`) and target (`y`) sentences.

    Args:
        x: Source sentences.
        y: Target sentences.

    Returns:
        tuple: `(data_x, data_y, x_tk, y_tk)`: the padded source ids, the
        padded target ids with an extra trailing axis of size 1, and the two
        fitted tokenizers.
    """
    x_sequences, x_tk = tokenize(x)
    y_sequences, y_tk = tokenize(y)

    padded_x = pad(x_sequences)
    padded_y = pad(y_sequences)

    # Append a trailing axis of size 1 to the targets
    # (presumably the label layout the sparse loss expects; confirm).
    padded_y = padded_y.reshape(padded_y.shape + (1,))

    return padded_x, padded_y, x_tk, y_tk

In [0]:
# Tokenize and pad both corpora; keep the fitted tokenizers for decoding predictions later.
processed_english, processed_french, english_tokenizer, french_tokenizer = preprocess(english, french)

In [9]:
# Axis 1 of the padded arrays is the (padded) sentence length.
max_english_sequence_length = processed_english.shape[1]
max_french_sequence_length = processed_french.shape[1]
print(max_english_sequence_length)
print(max_french_sequence_length)
# Number of distinct words each tokenizer learned.
# NOTE(review): several model-building cells pass `french_vocab_size + 1`,
# presumably to leave room for padding id 0; these counts exclude it. Confirm.
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

15
21


In [0]:
def logits_to_text(logits, tokenizer):
    """Decode per-timestep scores into a space-separated string of words.

    Args:
        logits: 2-D array-like of scores; the argmax is taken along axis 1.
        tokenizer: Object exposing a `word_index` mapping of word -> integer id.

    Returns:
        str: The highest-scoring word at each position, joined by spaces.
        Id 0 is rendered as '<PAD>'.
    """
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    index_to_word[0] = '<PAD>'
    predicted_ids = np.argmax(logits, 1)
    return ' '.join(index_to_word[token_id] for token_id in predicted_ids)

# **Simple RNN**



In [0]:
def simple_model(input_shape, french_vocab_size):
    """Build and compile a single-layer GRU model with one softmax per timestep.

    Args:
        input_shape: Shape of the full input array; the leading entry is
            dropped and the remainder is used as the per-sample input shape.
        french_vocab_size: Number of units in the per-timestep Dense layer.

    Returns:
        A compiled Keras `Model` (sparse categorical cross-entropy loss,
        Adam with learning rate 1e-3, accuracy metric).
    """
    inputs = Input(input_shape[1:])
    recurrent_out = GRU(64, return_sequences=True)(inputs)
    per_step_scores = TimeDistributed(Dense(french_vocab_size))(recurrent_out)
    probabilities = Activation('softmax')(per_step_scores)

    model = Model(inputs, probabilities)
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(1e-3),
        metrics=['accuracy'],
    )
    return model

In [0]:
# Pad the English ids to the French sequence length, then add a trailing
# feature axis of size 1 for the recurrent layer.
tmp_x = pad(processed_english, max_french_sequence_length).reshape(
    (-1, processed_french.shape[-2], 1)
)

In [13]:
# The output layer needs one unit per possible target id. Tokenizer ids run
# from 1 to len(word_index) and 0 is the padding id, so the layer must have
# len(word_index) + 1 units (as the later model cells in this notebook already do).
simple_rnn_model = simple_model(tmp_x.shape, french_vocab_size + 1)

W0625 08:47:12.225424 140256952489856 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [14]:
# Train for 30 epochs in batches of 1024, with 20% of the samples set aside for validation.
simple_rnn_model.fit(tmp_x, processed_french, batch_size=1024, epochs=30, validation_split=0.2)

W0625 08:47:12.639853 140256952489856 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 110288 samples, validate on 27572 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f8fd28eb390>

In [15]:
# Decode the model's prediction for the first sample into French words.
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

new jersey est parfois calme en mois de et il il est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


# **Using Embeddings**

In [0]:
def embedding_model(input_shape, french_vocab_size, english_vocab_size=None):
    """Build and compile an Embedding -> GRU -> per-timestep softmax model.

    Args:
        input_shape: Shape of the full input array; `input_shape[1]` is used
            as the input sequence length.
        french_vocab_size: Number of units in the output Dense layer
            (one per target class).
        english_vocab_size: Number of rows in the embedding table, i.e. how
            many distinct source ids can be looked up. When None, it falls
            back to `french_vocab_size`, which is what this function used
            before the parameter existed.

    Returns:
        A compiled Keras `Sequential` model (sparse categorical cross-entropy
        loss, Adam with learning rate 1e-3, accuracy metric).
    """
    # The embedding looks up *source* ids, so it should be sized by the source
    # vocabulary; the fallback keeps existing two-argument callers unchanged.
    if english_vocab_size is None:
        english_vocab_size = french_vocab_size

    embedding = Embedding(english_vocab_size, 64, input_length = input_shape[1])
    rnn = GRU(64, return_sequences = True, activation='tanh')
    logits = TimeDistributed(Dense(french_vocab_size, activation='softmax'))

    model = Sequential()

    model.add(embedding)
    model.add(rnn)
    model.add(logits)

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(1e-3),
                  metrics=['accuracy'])
    return model

In [0]:
# Pad the English ids to the French sequence length; the embedding layer
# takes 2-D integer input, so no feature axis is added.
tmp_x = pad(processed_english, max_french_sequence_length).reshape(
    (-1, processed_french.shape[-2])
)

In [18]:
# Tokenizer ids run from 1 to len(word_index) and 0 is the padding id, so the
# model needs len(word_index) + 1 classes to cover every target id
# (as the later model cells in this notebook already do).
simple_embedd_model = embedding_model(tmp_x.shape, french_vocab_size + 1)

W0625 08:51:08.874375 140256952489856 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [19]:
# Train for 10 epochs in batches of 1024, with 20% of the samples set aside for validation.
simple_embedd_model.fit(tmp_x, processed_french, batch_size=1024, epochs=10, validation_split=0.2)

Train on 110288 samples, validate on 27572 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f8fd18419b0>

In [20]:
# Decode the model's prediction for the first sample into French words.
print(logits_to_text(simple_embedd_model.predict(tmp_x[:1])[0], french_tokenizer))

new jersey est parfois calme en l' et il il est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


# **Bidirectional RNN**

In [0]:
def simple_bidirectional_model(input_shape, french_vocab_size):
    """Build and compile a bidirectional GRU model with one softmax per timestep.

    Args:
        input_shape: Shape of the full input array; the leading entry is
            dropped and the remainder is used as the per-sample input shape.
        french_vocab_size: Number of units in the output Dense layer.

    Returns:
        A compiled Keras `Sequential` model (sparse categorical cross-entropy
        loss, Adam with learning rate 1e-3, accuracy metric).
    """
    encoder = Bidirectional(
        GRU(128, return_sequences=True, dropout=0.1),
        input_shape=input_shape[1:],
    )
    classifier = TimeDistributed(Dense(french_vocab_size, activation='softmax'))

    model = Sequential([encoder, classifier])
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(1e-3),
        metrics=['accuracy'],
    )
    return model

In [0]:
# Pad the English ids to the French sequence length, then add a trailing
# feature axis of size 1 for the recurrent layer.
tmp_x = pad(processed_english, processed_french.shape[1]).reshape(
    (-1, processed_french.shape[-2], 1)
)

In [23]:
# Build the bidirectional model with one output unit more than the tokenizer's
# word count (presumably so padding id 0 gets its own class; confirm).
simple_bdd_model = simple_bidirectional_model(tmp_x.shape,  french_vocab_size+1)

W0625 08:52:32.487078 140256952489856 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0625 08:52:32.489733 140256952489856 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0625 08:52:32.491825 140256952489856 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is de

In [24]:
# Train for 20 epochs in batches of 1024, with 20% of the samples set aside for validation.
simple_bdd_model.fit(tmp_x, processed_french, batch_size=1024, epochs=20, validation_split=0.2)

Train on 110288 samples, validate on 27572 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f8fcf2e4390>

In [25]:
# Decode the model's prediction for the first sample into French words.
print(logits_to_text(simple_bdd_model.predict(tmp_x[:1])[0], french_tokenizer))

new jersey est parfois froid au janvier mais il est agréable en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


# ***Encoder Decoder***

In [0]:
def encdec_model(input_shape, output_sequence_length, french_vocab_size):
    """Build and compile a GRU encoder-decoder model.

    The encoder GRU returns only its final output, which `RepeatVector`
    copies `output_sequence_length` times as input to the decoder GRU.

    Args:
        input_shape: Shape of the full input array; the leading entry is
            dropped and the remainder is used as the per-sample input shape.
        output_sequence_length: Number of timesteps the decoder produces.
        french_vocab_size: Number of units in the output Dense layer.

    Returns:
        A compiled Keras `Sequential` model (sparse categorical cross-entropy
        loss, Adam with learning rate 1e-3, accuracy metric).
    """
    encoder = GRU(128, input_shape=input_shape[1:], return_sequences=False)
    repeat_context = RepeatVector(output_sequence_length)
    decoder = GRU(128, return_sequences=True)
    classifier = TimeDistributed(Dense(french_vocab_size, activation='softmax'))

    model = Sequential([encoder, repeat_context, decoder, classifier])
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(1e-3),
        metrics=['accuracy'],
    )
    return model

In [0]:
# Keep the English ids at their own padded length (the decoder sets the output
# length) and add a trailing feature axis of size 1 for the encoder GRU.
tmp_x = pad(processed_english).reshape(
    (-1, processed_english.shape[1], 1)
)

In [0]:
# Build the encoder-decoder: the output length is the padded French length, and
# the output layer has one unit more than the tokenizer's word count
# (presumably so padding id 0 gets its own class; confirm).
enco_deco_model = encdec_model(
    tmp_x.shape,
    processed_french.shape[1],
    french_vocab_size+1)

In [30]:
# Train for 20 epochs in batches of 1024, with 20% of the samples set aside for validation.
enco_deco_model.fit(tmp_x, processed_french, batch_size=1024, epochs=20, validation_split=0.2)

Train on 110288 samples, validate on 27572 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f8fcdeabfd0>

In [31]:
# Predict the first ten samples and decode the third one (index 2) into French words.
print(logits_to_text(enco_deco_model.predict(tmp_x[:10])[2], french_tokenizer))

la est jamais agréable en mois et il est est est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


# *Custom*

Create a single model that combines an embedding layer with a bidirectional RNN.

In [0]:
def final_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile an embedding + bidirectional GRU encoder-decoder model.

    Args:
        input_shape: Shape of the full input array; `input_shape[1]` is used
            as the input sequence length.
        output_sequence_length: Number of timesteps the decoder produces.
        english_vocab_size: Number of rows in the embedding table.
        french_vocab_size: Number of units in the output Dense layer.

    Returns:
        A compiled Keras `Sequential` model (sparse categorical cross-entropy
        loss, Adam with learning rate 5e-3, accuracy metric).
    """
    layers = [
        Embedding(input_dim=english_vocab_size, output_dim=128, input_length=input_shape[1]),
        # Encoder: only the final output is kept, then repeated once per output timestep.
        Bidirectional(GRU(256, return_sequences=False)),
        RepeatVector(output_sequence_length),
        # Decoder: emits one output per timestep.
        Bidirectional(GRU(256, return_sequences=True)),
        TimeDistributed(Dense(french_vocab_size, activation='softmax')),
    ]

    model = Sequential(layers)
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(5e-3),
        metrics=['accuracy'],
    )
    return model

In [0]:
# No target length is given, so the English ids are padded to their own longest sequence.
tmp_x = pad(processed_english)

In [0]:
# Both vocabulary sizes are passed with +1 (presumably to account for padding id 0; confirm).
custom_model = final_model(tmp_x.shape, processed_french.shape[1], english_vocab_size + 1, french_vocab_size + 1)

In [35]:
# Train for 18 epochs in batches of 1024, with 20% of the samples set aside for validation.
custom_model.fit(tmp_x, processed_french, batch_size = 1024, epochs = 18, validation_split = 0.2)

Train on 110288 samples, validate on 27572 samples
Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18


<tensorflow.python.keras.callbacks.History at 0x7f8f8189c9e8>

In [36]:
# Predict the first ten samples and decode the third one (index 2) into French words.
print(logits_to_text(custom_model.predict(tmp_x[:10])[2], french_tokenizer))

california est généralement calme en mars et il est généralement chaud en juin <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


# *LSTM*

In [0]:
def lstm_model(input_shape, french_vocab_size, english_vocab_size=None):
    """Build and compile an Embedding -> CuDNNLSTM -> per-timestep softmax model.

    Args:
        input_shape: Shape of the full input array; `input_shape[1]` is used
            as the input sequence length.
        french_vocab_size: Number of units in the output Dense layer
            (one per target class).
        english_vocab_size: Number of rows in the embedding table, i.e. how
            many distinct source ids can be looked up. When None, it falls
            back to `french_vocab_size`, which is what this function used
            before the parameter existed.

    Returns:
        A compiled Keras `Sequential` model (sparse categorical cross-entropy
        loss, Adam with learning rate 5e-3, accuracy metric).
    """
    # The embedding looks up *source* ids, so it should be sized by the source
    # vocabulary; the fallback keeps existing two-argument callers unchanged.
    if english_vocab_size is None:
        english_vocab_size = french_vocab_size

    model = Sequential()

    embedding = Embedding(english_vocab_size, 64, input_length = input_shape[1])
    # NOTE(review): CuDNNLSTM is the cuDNN-backed LSTM layer and presumably
    # needs a GPU runtime; confirm before running elsewhere.
    lstm_layer_1 = CuDNNLSTM(64, return_sequences = True)
    logits = TimeDistributed(Dense(french_vocab_size, activation = 'softmax'))

    model.add(embedding)
    model.add(lstm_layer_1)
    model.add(logits)

    model.compile(loss = sparse_categorical_crossentropy, optimizer = Adam(5e-3), metrics = ['accuracy'])
    return model

In [0]:
# Pad the English ids to the French sequence length; the embedding layer
# takes 2-D integer input, so no feature axis is added.
tmp_x = pad(processed_english, max_french_sequence_length).reshape(
    (-1, processed_french.shape[-2])
)

In [0]:
# Tokenizer ids run from 1 to len(word_index) and 0 is the padding id, so the
# model needs len(word_index) + 1 classes to cover every target id
# (as the other model cells in this notebook already do).
simple_lstm_model = lstm_model(tmp_x.shape, french_vocab_size + 1)

In [46]:
# Train for 80 epochs in batches of 1024, with 20% of the samples set aside for validation.
simple_lstm_model.fit(tmp_x, processed_french, batch_size=1024, epochs=80, validation_split=0.2)

Train on 110288 samples, validate on 27572 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/8

<tensorflow.python.keras.callbacks.History at 0x7f8f80d461d0>

In [47]:
# Decode the model's prediction for the first sample into French words.
print(logits_to_text(simple_lstm_model.predict(tmp_x[:1])[0], french_tokenizer))

new jersey est parfois calme en l' et et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
