In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/ita.txt


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
from nltk.corpus import stopwords
stop = stopwords.words('english')


In [3]:
train, test = train_test_split(pd.read_csv('../input/ita.txt', sep = '\t', header = None), test_size = 0.10)

In [4]:
train.columns = ['english', 'italian']
print (train.shape)
print (test.shape)
train.head()

(289289, 2)
(32144, 2)


Unnamed: 0,english,italian
133839,We can't leave Tom here.,Noi non possiamo lasciare qui Tom.
146675,Tom got on the wrong bus.,Tom è salito sull'autobus sbagliato.
157856,I'm up to my neck in work.,Io ne ho fino al collo con il lavoro.
131707,Tom can understand Mary.,Tom riesce a comprendere Mary.
28763,Tom called back.,Tom ha richiamato.


In [5]:
# basic text preprocessing to both 'english' and 'italian' text 
# converting all text to lowercase and removing unneccesary characters
train['eng_lower'] = train.english.str.lower()
train['eng_no_punc'] = train['eng_lower'].str.replace('[^\w\s]','')

'''
we add 'start' and 'end' at the begining and ending off the italian text so have some indecation so that our decoder
can tell where to start and where to stop
'''
train['ita_lower'] = train['italian'].str.lower()
train['ita_no_punc'] = '_start_'+' '+train['ita_lower'].str.replace('[^\w\s]','')+' '+'_end_'




In [6]:
train.head()

Unnamed: 0,english,italian,eng_lower,eng_no_punc,ita_lower,ita_no_punc
133839,We can't leave Tom here.,Noi non possiamo lasciare qui Tom.,we can't leave tom here.,we cant leave tom here,noi non possiamo lasciare qui tom.,_start_ noi non possiamo lasciare qui tom _end_
146675,Tom got on the wrong bus.,Tom è salito sull'autobus sbagliato.,tom got on the wrong bus.,tom got on the wrong bus,tom è salito sull'autobus sbagliato.,_start_ tom è salito sullautobus sbagliato _end_
157856,I'm up to my neck in work.,Io ne ho fino al collo con il lavoro.,i'm up to my neck in work.,im up to my neck in work,io ne ho fino al collo con il lavoro.,_start_ io ne ho fino al collo con il lavoro _...
131707,Tom can understand Mary.,Tom riesce a comprendere Mary.,tom can understand mary.,tom can understand mary,tom riesce a comprendere mary.,_start_ tom riesce a comprendere mary _end_
28763,Tom called back.,Tom ha richiamato.,tom called back.,tom called back,tom ha richiamato.,_start_ tom ha richiamato _end_


In [7]:
max_features = 5000
max_len = 35


In [8]:
# word embedding
tok1 = tf.keras.preprocessing.text.Tokenizer(num_words = max_features)
tok1.fit_on_texts(list(train['eng_no_punc']))


In [9]:
tf_train_eng = tok1.texts_to_sequences(list(train['eng_no_punc']))
tf_train_eng = tf.keras.preprocessing.sequence.pad_sequences(tf_train_eng, maxlen = max_len)


In [10]:
tok2 = tf.keras.preprocessing.text.Tokenizer(num_words = max_features, filters = '*')
tok2.fit_on_texts(list(train['ita_no_punc']))
tf_train_ita = tok2.texts_to_sequences(list(train['ita_no_punc']))
tf_train_ita = tf.keras.preprocessing.sequence.pad_sequences(tf_train_ita, maxlen = max_len, padding = 'post')

In [11]:
#defining model inputs and outputs for ecoder and decoder
encoder_input_data = tf_train_eng
doc_length = encoder_input_data.shape[1]
print (f'shape of encoder input:{encoder_input_data.shape}')

# decoder
vectorized_italian = tf_train_ita
# for decoder input we dont need last word as it for prediction
decoder_input_data = vectorized_italian[:,:-1]
# decoder target data is 1 time step ahead from 'decoder inputdata' 
decoder_target_data = vectorized_italian[:,1:]

print (f'shape of decoder input data:{decoder_input_data.shape}')
print (f'shape of decder target data:{decoder_target_data.shape}')

shape of encoder input:(289289, 35)
shape of decoder input data:(289289, 34)
shape of decder target data:(289289, 34)


In [12]:
vocab_size_encoder = len(tok1.word_index)+1
vocab_size_decoder = len(tok1.word_index)+1

In [13]:
# model atchitecture
latent_dim = 40

# encoder model
encoder_inputs = tf.keras.Input(shape = (doc_length,), name= 'Encoder-Input')
# word embedding for encoder
x=tf.keras.layers.Embedding(vocab_size_encoder,latent_dim,name='Body-word-embedding',mask_zero=False)(encoder_inputs)
# batch normalization
x = tf.keras.layers.BatchNormalization(name = 'Encoder-Batchnorm-1')(x)
# we need only hidden state of the encoder we dont need the output
_,state_h = tf.keras.layers.GRU(latent_dim, return_state = True, name = 'Encoder-last-gru')(x)
# encoder model
encoder_model = tf.keras.Model(inputs = encoder_inputs, outputs = state_h, name = 'Encoder-model')
seq2seq_encoder_out = encoder_model(encoder_inputs)

# decoder model
decoder_inputs = tf.keras.Input(shape = (None,), name = 'Decoder-input')
# word embedding for decoder italian
dec_emb=tf.keras.layers.Embedding(vocab_size_decoder,latent_dim,name='Decoder-word-embedding',mask_zero=False)(decoder_inputs)
#batch normalization
dec_bn = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)
decoder_gru = tf.keras.layers.GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)  
x = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense layer for prediction
decoder_dense = tf.keras.layers.Dense(vocab_size_decoder, activation='softmax', name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)
 
# Seq2Seq Model 

seq2seq_Model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
seq2seq_Model.compile(optimizer=tf.keras.optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')

In [14]:
seq2seq_Model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Decoder-word-embedding (Embeddi (None, None, 40)     526600      Decoder-input[0][0]              
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      [(None, 35)]         0                                            
__________________________________________________________________________________________________
Decoder-Batchnorm-1 (BatchNorma (None, None, 40)     160         Decoder-word-embedding[0][0]     
______________________________________________________________________________________________

In [15]:
batch_size = 1200
epochs = 15
history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
          batch_size=batch_size,  epochs=epochs,  validation_split=0.12) 

Train on 254574 samples, validate on 34715 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [16]:
seq2seq_Model1 = seq2seq_Model

In [17]:
test_text = ['where is this restaurant?']

In [18]:
tok1.fit_on_texts(test_text)

In [19]:
raw_tokenized = tok1.texts_to_sequences(test_text)
raw_tokenized = tf.keras.preprocessing.sequence.pad_sequences(raw_tokenized, maxlen=max_len)

In [20]:
body_encoding = encoder_model.predict(raw_tokenized)

In [21]:
latent_dim = seq2seq_Model.get_layer('Decoder-word-embedding').output_shape[-1]

In [22]:
decoder_inputs = seq2seq_Model.get_layer('Decoder-input').input
dec_emb = seq2seq_Model.get_layer('Decoder-word-embedding')(decoder_inputs)
dec_bn = seq2seq_Model.get_layer('Decoder-Batchnorm-1')(dec_emb)

In [23]:
gru_inference_state_input = tf.keras.Input(shape=(latent_dim,), name='hidden_state_input')


In [24]:
gru_out, gru_state_out = seq2seq_Model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])

In [25]:
dec_bn2 = seq2seq_Model.get_layer('Decoder-Batchnorm-2')(gru_out)
dense_out = seq2seq_Model.get_layer('Final-Output-Dense')(dec_bn2)

In [26]:
decoder_model = tf.keras.Model([decoder_inputs, gru_inference_state_input],
                          [dense_out, gru_state_out])

In [27]:
original_body_encoding = body_encoding

In [28]:
state_value = np.array(tok2.word_index['_start_']).reshape(1, 1)

In [29]:
state_value

array([[1]])

In [30]:
decoded_sentence = []
stop_condition = False

In [31]:
vocabulary_inv = dict((v, k) for k, v in tok2.word_index.items())


In [32]:

while not stop_condition:
    preds, st = decoder_model.predict([state_value, body_encoding])

    pred_idx = np.argmax(preds[:, :, 2:]) + 2
    pred_word_str = vocabulary_inv[pred_idx]
    #print(pred_idx)
    print(pred_word_str)
    if pred_word_str == '_end_' or len(decoded_sentence) >= max_len:
        stop_condition = True
        break
    decoded_sentence.append(pred_word_str)

    body_encoding = st
    state_value = np.array(pred_idx).reshape(1, 1)



dovè
questo
ristorante
_end_


In [33]:
print ('sample text is %s' % test_text)
translated_text = ' '.join(decoded_sentence)
print ('-----------------------')
print ('translated sample text is: "%s"' %translated_text)

sample text is ['where is this restaurant?']
-----------------------
translated sample text is: "dovè questo ristorante"


when we translate "**where is this restaurant?**" in google-translate we get "**dov'è questo ristorante?**"<br> and when we translated the sentence we got "**dovè questo ristorante**" which means "**where this restaurant is**"...<br>the difference is "**dov'è**" from google translate means "**where is it**" ,while "**dovè**" from this model's translation mean "**where is**". <br> we can see the model that can translate the essence of the sample text correctly, well since this a simple model i think it did a good job. 