In [169]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

In [170]:
data_path = "por-eng/por.txt"

In [184]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2
        self.n_sentence = 0
        self.max_len = 0 

    def add_sentence(self, sentence):
        self.n_sentence += 1
        if len(sentence.split()) > self.max_len:
            self.max_len = len(sentence.split()) 
        for word in sentence.split():
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [187]:
def read_langs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('%s-%s/%s.txt' % (lang2, lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[s for s in l.split('\t')] for l in lines[:10000]]

    input_lang = Lang(lang1)
    output_lang = Lang(lang2)
    
    for pair in pairs:
        if len(pair[0]) < 20:
            input_lang.add_sentence(pair[0])
            output_lang.add_sentence(pair[1])

    print("Vocabulário de input %d" % input_lang.n_words) 
    print("Vocabulário de output %d" %  output_lang.n_words)      
    
    return input_lang, output_lang, pairs

In [188]:
input_lang, output_lang, pairs = read_langs('eng', 'por')

Reading lines...
Vocabulário de input 3104
Vocabulário de output 5485


In [189]:
encoder_input_data = np.zeros(
    (input_lang.n_sentence , input_lang.max_len),
    dtype='float32')
decoder_input_data = np.zeros(
    (output_lang.n_sentence, output_lang.max_len),
    dtype='float32')
decoder_target_data = np.zeros(
    (output_lang.n_sentence, output_lang.max_len, output_lang.n_words),
    dtype='float32')

for i,pair  in enumerate(pairs):
    for t, word in enumerate(pair[0].split()):
        encoder_input_data[i, t] = input_lang.word2index[word] 
    for t, word in enumerate(pair[1].split()):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t] =  output_lang.word2index[word] 
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1,  output_lang.word2index[word] ] = 1


In [196]:
def get_model():

    embedding_size = 50

    encoder_inputs = Input(shape=(None,))
    decoder_inputs = Input(shape=(None,))

    input_lang_embedding = Embedding(input_lang.n_words, embedding_size)(encoder_inputs)
    output_lang_embedding = Embedding(output_lang.n_words, embedding_size)(decoder_inputs)

    # Encoder lstm
    encoder = LSTM(embedding_size, return_state=True)
    encoder_outputs, state_h, state_c = encoder(input_lang_embedding)
    encoder_states = [state_h, state_c]

    # decoder lstm
    decoder_lstm = LSTM(embedding_size, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(output_lang_embedding,
                                         initial_state=encoder_states)
    decoder_dense = Dense(output_lang.n_words, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    return  Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [197]:
model = get_model()
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data, batch_size=128,epochs=100,validation_split=0.20)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 8000 samples, validate on 2000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7f607a709b38>

In [201]:
# define the encoder model 
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

dex = Embedding(output_lang.n_words, embedding_size)

# Redefine the decoder model with decoder will be getting below inputs from encoder while in prediction
decoder_state_input_h = Input(shape=(50,))
decoder_state_input_c = Input(shape=(50,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
final_dex2= dex(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dex2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

# sampling model will take encoder states and decoder_input(seed initially) and output the predictions(french word index) We dont care about decoder_states2
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_37 (InputLayer)        (None, None)              0         
_________________________________________________________________
embedding_4 (Embedding)      (None, None, 50)          1135600   
_________________________________________________________________
lstm_3 (LSTM)                [(None, 50), (None, 50),  20200     
Total params: 1,155,800
Trainable params: 1,155,800
Non-trainable params: 0
_________________________________________________________________


NameError: name 'input_token_index' is not defined

In [199]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']
# Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)
# Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char
# Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 52):
            stop_condition = True
# Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
# Update states
        states_value = [h, c]
return decoded_sentence

SyntaxError: 'return' outside function (<ipython-input-199-731284743224>, line 29)

In [200]:
for seq_index in [14077,20122,40035,40064, 40056, 40068, 40090, 40095, 40100, 40119, 40131, 40136, 40150, 40153]:
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', lines.eng[seq_index: seq_index + 1])
    print('Decoded sentence:', decoded_sentence)

NameError: name 'target_token_index' is not defined

In [181]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_47 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
embedding_14 (Embedding)        (None, None, 50)     155200      input_47[0][0]                   
__________________________________________________________________________________________________
embedding_15 (Embedding)        (None, None, 50)     274250      input_47[0][0]                   
__________________________________________________________________________________________________
lstm_13 (LSTM)                  [(None, 50), (None,  20200       embedding_14[0][0]               
__________________________________________________________________________________________________
lstm_14 (L