In [0]:
import os
import re
import collections
import numpy as np

In [0]:
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [0]:
def load_data(path):
    """Read a text file and return its contents as a list of lines.

    Args:
        path: Location of the text file to read.

    Returns:
        list[str]: The file contents split on the newline character. A file
        that ends with a newline therefore yields a trailing empty string.
    """
    # Decode explicitly as UTF-8: the French corpus contains accented
    # characters and the platform's default encoding is not guaranteed to be
    # UTF-8, which would otherwise make the result machine-dependent.
    with open(path, "r", encoding="utf-8") as f:
        data = f.read()
    return data.split('\n')

def process_data(data):
    """Strip angle-bracket tags from every line and drop the last two entries.

    Args:
        data: Iterable of strings (one sentence per element).

    Returns:
        list[str]: The cleaned sentences, without the final two elements of
        the input.
    """
    # Non-greedy match so that each ``<...>`` tag is removed on its own.
    # A greedy ``<.*>`` would also delete the text between the first ``<``
    # and the last ``>`` on a line.  Compiled once instead of per sentence.
    tag_pattern = re.compile('<.*?>')
    new_sentence = [tag_pattern.sub('', sentence) for sentence in data]
    # The trailing two entries are discarded unconditionally.
    # NOTE(review): presumably these are blank/incomplete lines at the end of
    # the corpus files -- confirm against the data.
    del new_sentence[-2:]
    return new_sentence


In [0]:
# Read the raw lines of both sides of the parallel corpus, then clean them.
english_lines = load_data('/data/small_vocab_en.txt')
french_lines = load_data('/data/small_vocab_fr.txt')
english = process_data(english_lines)
french = process_data(french_lines)

In [5]:
# Word-frequency tables; a "word" here is whatever results from splitting a
# sentence on single spaces.  The total word count is the sum of all
# frequencies, so the flattened word list does not have to be built twice.
english_words_counter = collections.Counter(
    word for sentence in english for word in sentence.split(" ")
)
print("Total no. of words in english: ", sum(english_words_counter.values()))
print("Total no. of unique words: ", len(english_words_counter))

french_words_counter = collections.Counter(
    word for sentence in french for word in sentence.split(" ")
)
print("Total no. of words in french: ", sum(french_words_counter.values()))
print("Total no. of unique words: ", len(french_words_counter))

Total no. of words in english:  1823292
Total no. of unique words:  228
Total no. of words in french:  1961298
Total no. of unique words:  356


In [0]:
def tokenize(x):
    """Fit a word-level Keras ``Tokenizer`` on ``x`` and encode ``x`` with it.

    Args:
        x: Collection of sentences (strings).

    Returns:
        tuple: ``(sequences, tokenizer)`` where ``sequences`` is the result of
        ``texts_to_sequences(x)`` and ``tokenizer`` is the fitted tokenizer.
    """
    tokenizer = Tokenizer(char_level=False)
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return sequences, tokenizer

def pad(x, length=None):
    """Pad every sequence in ``x`` at the end to a common length.

    Args:
        x: Collection of integer sequences.
        length: Target length. When ``None``, the length of the longest
            sequence in ``x`` is used.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``.
    """
    target_length = max(len(sentence) for sentence in x) if length is None else length
    return pad_sequences(x, maxlen=target_length, padding='post')

In [0]:
def preprocess(x, y):
    """Tokenize and pad a source/target sentence pair collection.

    Args:
        x: Source sentences (feature side).
        y: Target sentences (label side).

    Returns:
        tuple: ``(data_x, data_y, x_tk, y_tk)`` -- the padded source ids, the
        padded target ids with an extra trailing axis of size 1, and the two
        fitted tokenizers.
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    x_padded = pad(x_ids)
    y_padded = pad(y_ids)

    # Append a trailing axis of size 1 to the labels; the models in this
    # notebook are compiled with sparse_categorical_crossentropy.
    y_padded = y_padded.reshape(y_padded.shape + (1,))

    return x_padded, y_padded, x_tk, y_tk

In [0]:
# Tokenize and pad both corpora once; every model below reuses these arrays.
(processed_english,
 processed_french,
 english_tokenizer,
 french_tokenizer) = preprocess(english, french)

In [11]:
# Vocabulary sizes count the entries of word_index only.  The padding id 0 is
# not part of word_index (logits_to_text adds it by hand), so a model that has
# to represent every id needs ``vocab_size + 1`` classes.
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

# Axis 1 of the padded arrays is the (padded) sentence length.
max_english_sequence_length = processed_english.shape[1]
max_french_sequence_length = processed_french.shape[1]
print(max_english_sequence_length, max_french_sequence_length, sep='\n')

15
21


In [0]:
def logits_to_text(logits, tokenizer):
    """Decode per-step scores into a space-separated sentence.

    Args:
        logits: 2-D array; the highest-scoring column of each row (argmax
            over axis 1) is taken as that step's token id.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id.

    Returns:
        str: The decoded words joined by single spaces; id 0 is rendered as
        ``'<PAD>'``.
    """
    lookup = {token_id: token for token, token_id in tokenizer.word_index.items()}
    lookup[0] = '<PAD>'
    best_ids = np.argmax(logits, axis=1)
    return ' '.join(lookup[token_id] for token_id in best_ids)

# **Simple RNN**



In [0]:
def simple_model(input_shape, french_vocab_size):
    """Build and compile a single-GRU model that emits one word per time step.

    Args:
        input_shape: Shape of the full input array; the leading (batch) entry
            is dropped and the rest is used as the per-sample input shape.
        french_vocab_size: Number of output classes of the per-step Dense
            layer.

    Returns:
        A compiled Keras ``Model`` (sparse categorical cross-entropy loss,
        Adam with learning rate 1e-3, accuracy metric).
    """
    inputs = Input(input_shape[1:])
    hidden_states = GRU(64, return_sequences=True)(inputs)
    scores = TimeDistributed(Dense(french_vocab_size))(hidden_states)
    probabilities = Activation('softmax')(scores)

    model = Model(inputs, probabilities)
    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(1e-3),
        metrics=['accuracy'],
    )
    return model

In [0]:
# This model produces one output per input step, so the English ids are padded
# out to the French sentence length and given a trailing feature axis of size 1.
tmp_x = pad(processed_english, max_french_sequence_length).reshape(
    (-1, processed_french.shape[-2], 1)
)

In [0]:
# word_index ids start at 1 and 0 is the padding id, so the labels span
# 0..french_vocab_size inclusive; the output layer therefore needs
# french_vocab_size + 1 classes (as the later models in this notebook use).
simple_rnn_model = simple_model(tmp_x.shape, french_vocab_size + 1)

In [0]:
# Train the baseline GRU model, holding out 20% of the samples for validation.
simple_rnn_model.fit(
    tmp_x,
    processed_french,
    batch_size=1024,
    epochs=30,
    validation_split=0.2,
)

In [0]:
# Decode the model's output for the first sentence as a quick sanity check.
first_prediction = simple_rnn_model.predict(tmp_x[:1])[0]
print(logits_to_text(first_prediction, french_tokenizer))

new jersey est parfois chaud en l' et il est est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [0]:
def logits_to_text(logits, tokenizer):
    """Turn per-step scores into a sentence using the tokenizer's vocabulary.

    NOTE(review): a function with this exact name and behaviour is already
    defined earlier in the notebook; this cell simply rebinds it and is a
    candidate for removal.

    Args:
        logits: 2-D array; argmax over axis 1 gives each step's token id.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id.

    Returns:
        str: Decoded words joined by single spaces, with id 0 shown as
        ``'<PAD>'``.
    """
    id_to_word = dict((position, word) for word, position in tokenizer.word_index.items())
    id_to_word[0] = '<PAD>'
    words = []
    for position in np.argmax(logits, 1):
        words.append(id_to_word[position])
    return ' '.join(words)

# **Using Embeddings**

In [0]:
def embedding_model(input_shape, french_vocab_size, english_vocab_size=None):
    """Build and compile an Embedding -> GRU -> per-step softmax model.

    Args:
        input_shape: Shape of the full input array; ``input_shape[1]`` is used
            as the sequence length fed to the embedding layer.
        french_vocab_size: Number of output classes of the per-step Dense
            layer.
        english_vocab_size: Number of distinct input ids the embedding must
            cover (largest input id + 1).  Defaults to ``french_vocab_size``
            so existing two-argument calls keep their previous behaviour.

    Returns:
        A compiled Keras ``Sequential`` model (sparse categorical
        cross-entropy loss, Adam with learning rate 1e-3, accuracy metric).
    """
    # The embedding looks up *input* (English) ids, so its table should be
    # sized by the input vocabulary.  Falling back to the French size only
    # works while the English ids all stay below that number.
    if english_vocab_size is None:
        english_vocab_size = french_vocab_size

    embedding = Embedding(english_vocab_size, 64, input_length = input_shape[1])
    rnn = GRU(64, return_sequences = True, activation='tanh')
    logits = TimeDistributed(Dense(french_vocab_size, activation='softmax'))

    model = Sequential()

    model.add(embedding)
    model.add(rnn)
    model.add(logits)

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(1e-3),
                  metrics=['accuracy'])
    return model

In [0]:
# The embedding layer consumes integer ids directly, so the input stays 2-D
# (no trailing feature axis); it is still padded out to the French length.
tmp_x = pad(processed_english, max_french_sequence_length).reshape(
    (-1, processed_french.shape[-2])
)

In [0]:
# Confirm the embedding-model input is 2-D: (number of sentences, padded length).
print(tmp_x.shape)

(137860, 21)


In [0]:
# word_index ids start at 1 and 0 is the padding id, so the labels span
# 0..french_vocab_size inclusive; size the model with french_vocab_size + 1
# (as the later models in this notebook do).
simple_embedd_model = embedding_model(tmp_x.shape, french_vocab_size + 1)

In [0]:
# Train the embedding model, holding out 20% of the samples for validation.
simple_embedd_model.fit(
    tmp_x,
    processed_french,
    batch_size=1024,
    epochs=10,
    validation_split=0.2,
)

In [0]:
# Decode the embedding model's output for the first sentence.
first_prediction = simple_embedd_model.predict(tmp_x[:1])[0]
print(logits_to_text(first_prediction, french_tokenizer))

new jersey est parfois calme en l' et il il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


# **Bidirectional RNN**

In [0]:
def simple_bidirectional_model(input_shape, french_vocab_size):
    """Build and compile a bidirectional-GRU model with a per-step softmax.

    Args:
        input_shape: Shape of the full input array; the leading (batch) entry
            is dropped and the rest is used as the per-sample input shape.
        french_vocab_size: Number of output classes of the per-step Dense
            layer.

    Returns:
        A compiled Keras ``Sequential`` model (sparse categorical
        cross-entropy loss, Adam with learning rate 1e-3, accuracy metric).
    """
    recurrent = Bidirectional(
        GRU(128, return_sequences=True, dropout=0.1),
        input_shape=input_shape[1:],
    )
    classifier = TimeDistributed(Dense(french_vocab_size, activation='softmax'))

    model = Sequential()
    for layer in (recurrent, classifier):
        model.add(layer)

    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(1e-3),
        metrics=['accuracy'],
    )
    return model

In [0]:
# Pad the English ids to the French sentence length and add a trailing
# feature axis of size 1 for the recurrent layer.
french_length = processed_french.shape[1]
tmp_x = pad(processed_english, french_length).reshape(
    (-1, processed_french.shape[-2], 1)
)

In [0]:
# + 1 leaves room for the padding id 0 next to the word_index ids.
simple_bdd_model = simple_bidirectional_model(
    input_shape=tmp_x.shape,
    french_vocab_size=french_vocab_size + 1,
)

In [0]:
# Train the bidirectional model, holding out 20% of the samples for validation.
simple_bdd_model.fit(
    tmp_x,
    processed_french,
    batch_size=1024,
    epochs=20,
    validation_split=0.2,
)

Train on 110288 samples, validate on 27572 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f84049b5f28>

In [0]:
# Decode the bidirectional model's output for the first sentence.
first_prediction = simple_bdd_model.predict(tmp_x[:1])[0]
print(logits_to_text(first_prediction, french_tokenizer))

new jersey est parfois occupé en printemps mais il est agréable en mai <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


# ***Encoder Decoder***

In [0]:
def encdec_model(input_shape, output_sequence_length, french_vocab_size):
    """Build and compile a GRU encoder / GRU decoder model.

    The first GRU returns only its final output, which ``RepeatVector``
    copies ``output_sequence_length`` times as input to the second GRU.

    Args:
        input_shape: Shape of the full input array; the leading (batch) entry
            is dropped and the rest is used as the per-sample input shape.
        output_sequence_length: Number of time steps to generate.
        french_vocab_size: Number of output classes of the per-step Dense
            layer.

    Returns:
        A compiled Keras ``Sequential`` model (sparse categorical
        cross-entropy loss, Adam with learning rate 1e-3, accuracy metric).
    """
    layers = (
        GRU(128, input_shape=input_shape[1:], return_sequences=False),
        RepeatVector(output_sequence_length),
        GRU(128, return_sequences=True),
        TimeDistributed(Dense(french_vocab_size, activation='softmax')),
    )

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(1e-3),
        metrics=['accuracy'],
    )
    return model

In [0]:
# The encoder-decoder generates the output length itself (RepeatVector), so
# the input keeps the English sentence length; only a trailing feature axis
# of size 1 is added.
tmp_x = pad(processed_english).reshape((-1, processed_english.shape[1], 1))

In [0]:
# Output length is the padded French sentence length; + 1 leaves room for the
# padding id 0 next to the word_index ids.
enco_deco_model = encdec_model(
    input_shape=tmp_x.shape,
    output_sequence_length=processed_french.shape[1],
    french_vocab_size=french_vocab_size + 1,
)

In [0]:
# Train the encoder-decoder built above.  The model is bound to
# ``enco_deco_model``; the previous ``encodeco_model`` spelling is not defined
# anywhere in the notebook and fails with NameError on a fresh kernel.
enco_deco_model.fit(tmp_x, processed_french, batch_size=1024, epochs=20, validation_split=0.2)

Train on 110288 samples, validate on 27572 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f84036dd518>

# *Custom*

Create a single model that combines an embedding layer with bidirectional RNNs, arranged as an encoder-decoder.

In [0]:
def final_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile an embedding + bidirectional encoder-decoder model.

    Layer order: Embedding -> Bidirectional GRU (final output only) ->
    RepeatVector -> Bidirectional GRU (all steps) -> per-step softmax.

    Args:
        input_shape: Shape of the full input array; ``input_shape[1]`` is used
            as the sequence length fed to the embedding layer.
        output_sequence_length: Number of time steps to generate.
        english_vocab_size: ``input_dim`` of the embedding layer.
        french_vocab_size: Number of output classes of the per-step Dense
            layer.

    Returns:
        A compiled Keras ``Sequential`` model (sparse categorical
        cross-entropy loss, Adam with learning rate 5e-3, accuracy metric).
    """
    layers = (
        Embedding(input_dim=english_vocab_size, output_dim=128, input_length=input_shape[1]),
        Bidirectional(GRU(256, return_sequences=False)),
        RepeatVector(output_sequence_length),
        Bidirectional(GRU(256, return_sequences=True)),
        TimeDistributed(Dense(french_vocab_size, activation='softmax')),
    )

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(5e-3),
        metrics=['accuracy'],
    )
    return model

In [0]:
# Input for the final model: the English ids at their own padded length, kept
# 2-D because the model starts with an Embedding layer.  processed_english was
# already padded in preprocess(), so this call re-pads to the same length.
tmp_x = pad(processed_english)

In [15]:
# Both vocabulary sizes get + 1 to leave room for the padding id 0.
custom_model = final_model(
    input_shape=tmp_x.shape,
    output_sequence_length=processed_french.shape[1],
    english_vocab_size=english_vocab_size + 1,
    french_vocab_size=french_vocab_size + 1,
)

W0625 06:50:06.253511 140378541336448 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0625 06:50:06.257546 140378541336448 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0625 06:50:06.269702 140378541336448 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init

In [21]:
# Train the final model, holding out 20% of the samples for validation.
custom_model.fit(
    tmp_x,
    processed_french,
    batch_size=1024,
    epochs=18,
    validation_split=0.2,
)

Train on 110288 samples, validate on 27572 samples


<tensorflow.python.keras.callbacks.History at 0x7fac21d42278>

In [31]:
# Predict the first ten sentences and decode the third one (index 2).
batch_predictions = custom_model.predict(tmp_x[:10])
print(logits_to_text(batch_predictions[2], french_tokenizer))

california est généralement calme en mars et il est généralement chaud en juin <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [30]:
la californie est généralement calme en mars et habituellement chaude en juin

array([[17, 23,  1, ..., 44,  0,  0],
       [ 5, 20, 21, ..., 51,  2, 45],
       [22,  1,  9, ..., 34,  0,  0],
       ...,
       [19,  1, 10, ..., 37,  0,  0],
       [24,  1, 10, ..., 54,  0,  0],
       [ 5, 84,  1, ...,  0,  0,  0]], dtype=int32)