In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

<h2>Carregando os dados</h2>

In [2]:
# Pares de sequências English -> French.
# http://www.manythings.org/anki/fra-eng.zip

In [3]:
fra_eng_data = pd.read_csv('fra.txt', delimiter='\t', header=None, names=['eng_sent', 'fra_sent'])
fra_eng_data.head()

Unnamed: 0,eng_sent,fra_sent
0,Go.,Va !
1,Hi.,Salut !
2,Run!,Cours !
3,Run!,Courez !
4,Wow!,Ça alors !


<h2>Processamento dos dados</h2>

<h3>Separando os dados de entrada e saída</h3>

In [4]:
eng_texts = []
fra_texts = []

eng_chars = set()
fra_chars = set()

for i in range(len(fra_eng_data)):
    eng_sent = fra_eng_data['eng_sent'][i]
    eng_texts.append(eng_sent)
    
    fra_sent = '\t' + fra_eng_data['fra_sent'][i] + '\n' # Adicionando token de inicio "\t" e token de fim de sentença "\n"
    fra_texts.append(fra_sent)
    
    for c in eng_sent:
        if not c in eng_chars:
            eng_chars.add(c)
            
    for c in fra_sent:
        if not c in fra_chars:
            fra_chars.add(c)

print("Número de sentenças: {}".format(len(eng_texts)))
print("Número de caracteres em inglês: {}".format(len(eng_chars)))
print("Número de caracteres em francês: {}".format(len(fra_chars)))
maxlen_eng_text = max([len(sent) for sent in eng_texts])
maxlen_fra_text = max([len(sent) for sent in fra_texts])
print("Maior sentença em inglês tem {} caracteres".format(maxlen_eng_text))
print("Maior sentença em francês tem {} caracteres".format(maxlen_fra_text))

Número de sentenças: 160872
Número de caracteres em inglês: 95
Número de caracteres em francês: 112
Maior sentença em inglês tem 286 caracteres
Maior sentença em francês tem 362 caracteres


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
train_eng_texts, test_eng_texts, train_fra_texts, test_fra_texts = train_test_split(eng_texts, fra_texts, test_size=0.33, random_state=42)

In [13]:
train_eng_texts[0], train_fra_texts[0]

('The bottle smashed to pieces.', '\tLa bouteille se brisa en morceaux.\n')

In [14]:
# Training data
train_data = []

for i in range(len(train_eng_texts)):
    aux = {}
    aux['encoder_text'] = train_eng_texts[i]
    aux['decoder_text'] = train_fra_texts[i]
    
    train_data.append(aux)

# Evaluation/Test data
valid_data = []

for i in range(len(test_eng_texts)):
    aux = {}
    aux['encoder_text'] = test_eng_texts[i]
    aux['decoder_text'] = test_fra_texts[i]
    
    valid_data.append(aux)

<h3>Mapeamento char - id</h3>

In [15]:
eng_char_id = {c: i for i, c in enumerate(sorted(eng_chars))}
fra_char_id = {c: i for i, c in enumerate(sorted(fra_chars))}

<h3>Data Generator</h3>

In [16]:
from __future__ import division
from keras.utils import Sequence

In [43]:
class BatchGenerator(Sequence):
    
    def __init__(self, instances, encoder_char_dict, decoder_char_dict, maxlen_encoder_text, maxlen_decoder_text, batch_size):
        self.instances = instances
        self.encoder_char_dict = encoder_char_dict
        self.decoder_char_dict = decoder_char_dict
        self.maxlen_encoder_text = maxlen_encoder_text
        self.maxlen_decoder_text = maxlen_decoder_text
        self.batch_size = batch_size
        self.shuffle = True
        
        if self.shuffle: np.random.shuffle(self.instances)
        
    def __len__(self):
        return int(np.ceil(len(self.instances) / self.batch_size))
    
    def on_epoch_end(self):
        if self.shuffle: np.random.shuffle(self.instances)
            
    def eval_batch(self, idx):
        l_bound = idx * self.batch_size
        r_bound = (idx + 1) * self.batch_size
        
        if r_bound > len(self.instances):
            r_bound = len(self.instances)
            l_bound = r_bound - self.batch_size
        
        train_instances = self.instances[l_bound:r_bound]
        
        encoder_input_data = []
        decoder_input_data = []
        decoder_target_data = []
        
        for i in range(self.batch_size):
            input_text = train_instances[i]['encoder_text']
            target_text = train_instances[i]['decoder_text']
            
            aux1 = []
            aux2 = []
            aux3 = []
            
            for t, char in enumerate(input_text):
                aux1.append(char)
            for t, char in enumerate(target_text):
                aux2.append(char)
                if t > 0:
                    aux3.append(char)
            
            encoder_input_data.append(''.join(aux1))
            decoder_input_data.append(''.join(aux2))
            decoder_target_data.append(''.join(aux3))
        
        return [encoder_input_data, decoder_input_data], decoder_target_data
        
    
    def __getitem__(self, idx):
        
        l_bound = idx * self.batch_size
        r_bound = (idx + 1) * self.batch_size
        
        if r_bound > len(self.instances):
            r_bound = len(self.instances)
            l_bound = r_bound - self.batch_size
        
        train_instances = self.instances[l_bound:r_bound]
        
        encoder_input_data = np.zeros((self.batch_size, self.maxlen_encoder_text, len(self.encoder_char_dict)))
        decoder_input_data = np.zeros((self.batch_size, self.maxlen_decoder_text, len(self.decoder_char_dict)))
        decoder_target_data = np.zeros((self.batch_size, self.maxlen_decoder_text, len(self.decoder_char_dict)))
        
        for i in range(self.batch_size):
            input_text = train_instances[i]['encoder_text']
            target_text = train_instances[i]['decoder_text']
            
            for t, char in enumerate(input_text):
                encoder_input_data[i, t, self.encoder_char_dict[char]] = 1.
            for t, char in enumerate(target_text):
                decoder_input_data[i, t, self.decoder_char_dict[char]] = 1.
                if t > 0:
                    decoder_target_data[i, t - 1, self.decoder_char_dict[char]] = 1.
        
        return [encoder_input_data, decoder_input_data], decoder_target_data

In [75]:
train_generator = BatchGenerator(train_data + valid_data, eng_char_id, fra_char_id, maxlen_eng_text, maxlen_fra_text, 128)
valid_generator = BatchGenerator(valid_data, eng_char_id, fra_char_id, maxlen_eng_text, maxlen_fra_text, 128)

In [76]:
train_generator.__len__(), valid_generator.__len__()

(1257, 415)

In [77]:
[a, b], c = train_generator.eval_batch(0)

In [78]:
a[1], b[1], c[1]

('I need a little time.',
 "\tJ'ai besoin d'un peu de temps.\n",
 "J'ai besoin d'un peu de temps.\n")

<h2>Criando o modelo encoder-decoder</h2>

In [21]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

from keras.models import *
from keras.layers import *

config = tf.ConfigProto()
config.gpu_options.allow_growth=True
set_session(tf.Session(config=config))

In [37]:
encoder_input_dim = len(eng_char_id)
decoder_input_dim = len(fra_char_id)
lstm_units = 256
    
# Encoder
encoder_inputs = Input(shape=(None, encoder_input_dim))
encoder = LSTM(lstm_units, return_state=True)
_, state_h, state_c = encoder(encoder_inputs)

encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None, decoder_input_dim))
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(decoder_input_dim, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Modelo completo
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='Adam', loss='categorical_crossentropy')

In [39]:
model.load_weights("temp_weights.hdf5")

In [28]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 95)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 112)    0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 360448      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  377856      input_2[0][0]                    
                                                                 lstm_1[0][1]                     
          

In [81]:
# model.fit_generator(train_generator, 
#                     epochs=100, 
#                     verbose=1,
#                     validation_data=valid_generator,
#                     workers=8)

In [82]:
model.save_weights("temp_weights.hdf5")

<h1>Inferência</h1>

<h2>Modelos Encoder e Decoder</h2>

In [83]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

<h2>Mapeamento reverso -> id - char</h2>

In [84]:
eng_id_char = {eng_char_id[key]: key for key in eng_char_id.keys()}
fra_id_char = {fra_char_id[key]: key for key in fra_char_id.keys()}

<h2>Decodificando</h2>

In [85]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, len(fra_id_char)))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, fra_char_id['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = fra_id_char[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or len(decoded_sentence) > maxlen_fra_text):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, len(fra_id_char)))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [86]:
eval_generator = BatchGenerator(valid_data, eng_char_id, fra_char_id, maxlen_eng_text, maxlen_fra_text, 1)

In [87]:
[encoder_input_data, _], _ = eval_generator.__getitem__(0)

In [88]:
encoder_input_data.shape

(1, 286, 95)

In [89]:
for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    [encoder_input_data, _], _ = eval_generator.__getitem__(seq_index)
    [encoder_input_text, _], _ = eval_generator.eval_batch(seq_index)
    input_seq = encoder_input_data
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', encoder_input_text[0])
    print('Decoded sentence:', decoded_sentence)

-
('Input sentence:', "I can't understand anything he said.")
('Decoded sentence:', 'Je ne peux pas vous dire ce que tu veux.\n')
-
('Input sentence:', 'I may as well go now.')
('Decoded sentence:', "J'ai perdu la porte de chance.\n")
-
('Input sentence:', 'The situation became chaotic.')
('Decoded sentence:', 'Le chien de concert de conseil.\n')
-
('Input sentence:', 'Please smile.')
('Decoded sentence:', 'Veuillez me rendre.\n')
-
('Input sentence:', "I don't believe it happened that way.")
('Decoded sentence:', 'Je ne pense pas que tu aies de mois de cette chance.\n')
-
('Input sentence:', 'All you do is party.')
('Decoded sentence:', 'Les chats sont tr\xc3\xa8s souvents.\n')
-
('Input sentence:', "You're double-parked.")
('Decoded sentence:', 'Vous \xc3\xaates tr\xc3\xa8s bonnes.\n')
-
('Input sentence:', 'I changed the arrangement of the furniture in my room.')
('Decoded sentence:', 'Je pensais que tu as dit que tu ne peux pas vous dire.\n')
-
('Input sentence:', "He's got an uncl

-
('Input sentence:', 'French was the language of diplomacy.')
('Decoded sentence:', 'Les chats sont des probl\xc3\xa8mes de la police.\n')
-
('Input sentence:', 'She asked him questions.')
('Decoded sentence:', "Elle l'a dit qu'il a dit.\n")
-
('Input sentence:', "I'll lend you the tools that you need to do that.")
('Decoded sentence:', 'Je ne peux pas vous dire ce que tu as dit de ne pas vouloir.\n')
-
('Input sentence:', "Aren't you going to give me a kiss?")
('Decoded sentence:', 'Nous avons d\xc3\xa9j\xc3\xa0 entendu des choses de la police.\n')
-
('Input sentence:', "It's no use thinking about one's lost youth.")
('Decoded sentence:', "Ce n'est pas la porte de la porte de la porte.\n")
-
('Input sentence:', "Tom's grandfather was illiterate.")
('Decoded sentence:', "Tom a dit qu'il ne pouvait pas le faire.\n")
-
('Input sentence:', "Didn't you promise never to tell a lie?")
('Decoded sentence:', 'Ne voulez-vous pas de ce que je voulais ?\n')
-
('Input sentence:', 'Where are you p