In [1]:
import requests
import numpy as np
import tensorflow as tf 
from tensorflow import keras

from tensorflow.keras.layers import RepeatVector, Dense, TimeDistributed

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
from tensorflow.keras.utils import to_categorical

In [5]:
french_url = r"https://assets.datacamp.com/production/repositories/4609/datasets/644e461abb0910edb038e8b2c4ce7071b5aeca12/vocab_fr.txt"
english_url = r"https://assets.datacamp.com/production/repositories/4609/datasets/3459f954752fb2fce7c0b29e25f067e9784b69fb/vocab_en.txt"

In [6]:
french_sentences = requests.get(french_url).text
english_sentences = requests.get(english_url).text

In [7]:
french_sentences[:100]

"new jersey est parfois calme pendant l' automne , et il est neigeux en avril .\nles Ã©tats-unis est g"

In [8]:
french_sentences = french_sentences.split("\n")
english_sentences = english_sentences.split("\n")

In [9]:
len(french_sentences)

137861

In [10]:
len(english_sentences)

137861

In [11]:
en_len = 20
en_vocab = 100
fr_len = 25
fr_vocab = 125
hsize = 48

In [12]:
encoder_input = keras.layers.Input(shape = (en_len,en_vocab))
encoded_out, encoded_state =  keras.layers.GRU(hsize, return_state=True)(encoder_input)

In [13]:
decoder_input = RepeatVector(fr_len)(encoded_state)
decoder_gru_output = keras.layers.GRU(hsize, return_sequences=True)(decoder_input, initial_state=encoded_state)

In [14]:
dense_time = TimeDistributed(Dense(fr_vocab, activation = "softmax"))

In [15]:
pred = dense_time(decoder_gru_output)

In [16]:
machine_translation = keras.models.Model(inputs = encoder_input, outputs = pred)

In [17]:
machine_translation.compile(optimizer = "adam", loss = "categorical_crossentropy", metrics = ["acc"])

In [18]:
machine_translation.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 20, 100)]    0                                            
__________________________________________________________________________________________________
gru (GRU)                       [(None, 48), (None,  21600       input_1[0][0]                    
__________________________________________________________________________________________________
repeat_vector (RepeatVector)    (None, 25, 48)       0           gru[0][1]                        
__________________________________________________________________________________________________
gru_1 (GRU)                     (None, 25, 48)       14112       repeat_vector[0][0]              
                                                                 gru[0][1]                    

In [19]:
english_sentences[0]

'new jersey is sometimes quiet during autumn , and it is snowy in april .'

In [20]:
french_sentences[0]

"new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."

In [21]:
french_token = Tokenizer(num_words=fr_vocab, oov_token="UNK")
english_token = Tokenizer(num_words=en_vocab, oov_token="UNK")

In [22]:
french_token.fit_on_texts(french_sentences)
english_token.fit_on_texts(english_sentences)

In [23]:
english_token.texts_to_sequences(["I have never done this"])

[[97, 1, 11, 1, 1]]

In [24]:
english_token.index_word[97]

'i'

In [25]:
a = [1,2,4,5,6]

In [26]:
a[::-1]

[6, 5, 4, 2, 1]

In [27]:
def english_to_vector(sentences):
    tokenized_sent = english_token.texts_to_sequences(sentences)
    preprocessed = pad_sequences(tokenized_sent, padding="post", truncating="post", maxlen=en_len)
    
    preprocessed = preprocessed[:,::-1]
    preprocessed = to_categorical(preprocessed, num_classes=en_vocab)
    return preprocessed

In [28]:
inp = english_to_vector(["I have never done this"])

In [29]:
inp.shape

(1, 20, 100)

In [30]:
def french_to_vector(sentences):
    tokenized_sent = french_token.texts_to_sequences(sentences)
    preprocessed = pad_sequences(tokenized_sent, padding="post", truncating="post", maxlen=fr_len)
    
    preprocessed = to_categorical(preprocessed, num_classes=fr_vocab)
    return preprocessed

In [31]:
english_X = english_to_vector(english_sentences)
french_y = french_to_vector(french_sentences)

In [32]:
english_X.shape

(137861, 20, 100)

In [33]:
machine_translation

<tensorflow.python.keras.engine.training.Model at 0x7f119842ee48>

In [34]:
english_X.shape[0] / 1000

137.861

In [35]:
machine_translation.fit(english_X[:10000], french_y[:10000], epochs = 5,batch_size=500)

Train on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f11ad989048>

In [36]:
machine_translation.fit(english_X[:10000], french_y[:10000], epochs = 50,batch_size=500)

Train on 10000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f117a599f28>

In [37]:
required_text = [english_sentences[0]]

In [38]:
required_text

['new jersey is sometimes quiet during autumn , and it is snowy in april .']

In [39]:
vector = english_to_vector(required_text)

In [40]:
vector

array([[[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]], dtype=float32)

In [41]:
preds = machine_translation.predict(vector)

In [42]:
preds

array([[[1.80368329e-06, 3.38521576e-03, 2.80015287e-04, ...,
         1.69900304e-05, 8.38451888e-06, 1.01187696e-04],
        [7.40098983e-07, 8.81211273e-03, 8.54672026e-03, ...,
         7.23950870e-05, 3.95257739e-05, 2.80389999e-04],
        [3.48503386e-06, 5.61072771e-03, 7.60605395e-01, ...,
         7.47767772e-05, 4.07623447e-05, 7.58126029e-04],
        ...,
        [9.97195244e-01, 2.34961757e-04, 7.04787744e-05, ...,
         1.70168630e-06, 1.39013741e-06, 4.56218373e-07],
        [9.97227013e-01, 2.31085811e-04, 6.99774973e-05, ...,
         1.68194674e-06, 1.38182929e-06, 4.49790662e-07],
        [9.97245073e-01, 2.28846649e-04, 6.96911957e-05, ...,
         1.67034807e-06, 1.37698032e-06, 4.46097999e-07]]], dtype=float32)

In [43]:
preds.shape

(1, 25, 125)

In [45]:
np.argmax(preds, axis = -1)[0]

array([36, 35,  2, 10, 16,  3,  3,  7,  4,  4,  2,  3,  3,  3,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0])

In [46]:
french_token.index_word[36]

'new'

In [49]:
" ".join([french_token.index_word[index] for index in np.argmax(preds, axis = -1)[0] if index != 0])

'new jersey est jamais au en en et il il est en en en'