In [2]:
import tensorflow as tf
import string
import numpy as np
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
#from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [31]:
def tokenize(sentences):
    """Fit a fresh Keras ``Tokenizer`` on ``sentences`` and encode them.

    Args:
        sentences: Texts used both to fit the tokenizer and as the input
            that is converted to integer sequences.

    Returns:
        A ``(sequences, tokenizer)`` tuple: the result of
        ``texts_to_sequences(sentences)`` and the fitted tokenizer itself.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(sentences)
    encoded_sentences = fitted_tokenizer.texts_to_sequences(sentences)
    return encoded_sentences, fitted_tokenizer

def clean_sentence(sentence):
    """Lower-case ``sentence`` and delete every punctuation character.

    The characters removed are those in ``string.punctuation`` plus the
    Spanish opening marks "¡" and "¿". Whitespace and every other character
    are kept unchanged.

    Args:
        sentence: The text to normalise.

    Returns:
        The lower-cased text with the punctuation characters removed.
    """
    chars_to_drop = string.punctuation + "¡" + '¿'
    # A translation table whose third argument lists characters to delete.
    deletion_table = str.maketrans('', '', chars_to_drop)
    return sentence.lower().translate(deletion_table)

In [32]:
# Read the whole CSV up front; the context manager closes the file even if
# the read raises.
with open("pruebaLSTM.csv", "r", encoding='utf-8') as translation_file:
    raw_data = translation_file.read()

# Parse data: one comma-separated record per line.
# NOTE: this is a plain split on ',', so fields that themselves contain a
# comma are not supported.
raw_data = raw_data.split('\n')
# Skip blank lines (including the empty string left after a trailing newline)
# instead of unconditionally dropping the last element, which would discard a
# real record when the file does not end with a newline.
pairs = [sentence.split(',') for sentence in raw_data if sentence.strip()]

In [33]:
# Normalise both columns of every record: column 1 becomes the model's
# output text, column 0 its input text.
out_sentences = [clean_sentence(row[1]) for row in pairs]
in_sentences = [clean_sentence(row[0]) for row in pairs]

# Fit one tokenizer per side and encode the sentences as integer sequences.
in_text_tokenized, in_text_tokenizer = tokenize(in_sentences)
out_text_tokenized, out_text_tokenizer = tokenize(out_sentences)

# Report the longest encoded sequence on each side.
longest_in = max(len(sequence) for sequence in in_text_tokenized)
print('Maximum length spanish sentence: {}'.format(longest_in))
longest_out = max(len(sequence) for sequence in out_text_tokenized)
print('Maximum length english sentence: {}'.format(longest_out))


# Vocabulary sizes: the number of distinct words plus one extra slot for
# index 0, which the padding step uses and word_index does not assign.
in_vocab = 1 + len(in_text_tokenizer.word_index)
out_vocab = 1 + len(out_text_tokenizer.word_index)
print("Spanish vocabulary is of {} unique words".format(in_vocab))
print("English vocabulary is of {} unique words".format(out_vocab))

Maximum length spanish sentence: 10
Maximum length english sentence: 6
Spanish vocabulary is of 7 unique words
English vocabulary is of 10 unique words


In [5]:

# The longest tokenized sequence on each side fixes the padded width.
max_in_len = max(map(len, in_text_tokenized))
max_out_len = max(map(len, out_text_tokenized))

# Pad every sequence at the end ("post") up to that width.
in_pad_sentence = pad_sequences(in_text_tokenized, maxlen=max_in_len, padding="post")
out_pad_sentence = pad_sequences(out_text_tokenized, maxlen=max_out_len, padding="post")

# Reshape data: append a trailing axis of size 1,
# i.e. (samples, steps) -> (samples, steps, 1).
in_pad_sentence = np.expand_dims(in_pad_sentence, axis=-1)
out_pad_sentence = np.expand_dims(out_pad_sentence, axis=-1)

In [6]:
# Encoder-decoder graph. The output shapes mentioned below match the model
# summary printed after the model is compiled.
# Input: one padded sequence of max_in_len token ids per sample.
input_sequence = Input(shape=(max_in_len,))
# Map each token id (vocabulary size in_vocab) to a 128-dimensional vector.
embedding = Embedding(input_dim=in_vocab, output_dim=128,)(input_sequence)
# Encoder: with return_sequences=False only a single 64-unit vector per
# sample is produced.
encoder = LSTM(64, return_sequences=False)(embedding)
# Repeat that vector max_out_len times so the decoder receives one copy per
# output step.
r_vec = RepeatVector(max_out_len)(encoder)
# Decoder: with return_sequences=True a 64-unit vector is emitted for every
# output step; dropout is set to 0.2.
decoder = LSTM(64, return_sequences=True, dropout=0.2)(r_vec)
# Apply the same Dense layer at every step, giving out_vocab scores per
# output position.
logits = TimeDistributed(Dense(out_vocab))(decoder)

In [7]:

# Turn the per-step logits into probabilities, then wrap the graph in a Model
# that maps the input sequence to those probabilities.
softmax_output = Activation('softmax')(logits)
enc_dec_model = Model(input_sequence, softmax_output)

# Integer targets, hence the sparse categorical cross-entropy loss.
enc_dec_model.compile(
    optimizer='Adam',
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
enc_dec_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10)]              0         
                                                                 
 embedding (Embedding)       (None, 10, 128)           896       
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 repeat_vector (RepeatVector  (None, 6, 64)            0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 6, 64)             33024     
                                                                 
 time_distributed (TimeDistr  (None, 6, 10)            650       
 ibuted)                                                     

In [8]:
# Train on the padded input/target arrays; the object returned by fit() is
# kept in model_results.
model_results = enc_dec_model.fit(
    x=in_pad_sentence,
    y=out_pad_sentence,
    batch_size=30,
    epochs=500,
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [9]:
def logits_to_sentence(logits, tokenizer):
    """Decode per-step scores into a space-separated string of words.

    Args:
        logits: 2-D array-like of scores. The argmax is taken along axis 1,
            so every row yields one token id.
        tokenizer: Object exposing ``word_index``, a word -> id mapping.

    Returns:
        The decoded words joined by single spaces. Id 0 is rendered as
        ``'<empty>'``.
    """
    # Invert the word -> id mapping, then register id 0 as the filler token.
    id_to_word = {token_id: word for word, token_id in tokenizer.word_index.items()}
    id_to_word[0] = '<empty>'

    decoded_words = []
    for token_id in np.argmax(logits, axis=1):
        decoded_words.append(id_to_word[token_id])
    return ' '.join(decoded_words)

# Show one training example next to the model's prediction for it.
index = 60
print("The english sentence is: {}".format(out_sentences[index]))
print("The spanish sentence is: {}".format(in_sentences[index]))
print('The predicted sentence is :')

# Slice (rather than index) so the model still receives a batch of one.
sample_batch = in_pad_sentence[index:index+1]
sample_scores = enc_dec_model.predict(sample_batch)[0]
print(logits_to_sentence(sample_scores, out_text_tokenizer))

The english sentence is: 3
The spanish sentence is: 2 2 0 2 2 2 2 2 2 2
The predicted sentence is :
3 <empty> <empty> <empty> <empty> <empty>


In [10]:
# Encode a hand-written input with the tokenizer fitted on the training
# inputs and pad it to the width the model was built for.
in_text_tokenized_p = in_text_tokenizer.texts_to_sequences(['0 0 0 0 3 3 3 0 0 0'])

in_pad_sentence_p = pad_sequences(in_text_tokenized_p, max_in_len, padding = "post")

# Predict on the single-row batch and decode the scores into words.
resul = logits_to_sentence(enc_dec_model.predict(in_pad_sentence_p[0:1])[0], out_text_tokenizer)

# Keep only the words before the first '<empty>' filler. Working on tokens
# (instead of splitting on ' <empty>') also handles a prediction whose very
# first token is '<empty>', which has no leading space and would otherwise be
# returned verbatim.
predicted_words = resul.split(' ')
if '<empty>' in predicted_words:
    predicted_words = predicted_words[:predicted_words.index('<empty>')]
resul = ' '.join(predicted_words)
resul

'0 1 6 7'

In [12]:
import io
import json

# saving
# Pickle each fitted tokenizer to its own file.
for pickle_path, fitted_tokenizer in (
    ('in_tokenizer.pickle', in_text_tokenizer),
    ('out_tokenizer.pickle', out_text_tokenizer),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(fitted_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


# Data to be written: the padded widths used for the input and output sides.
dictionary = {
    "in_max_length": max_in_len,
    "out_max_length": max_out_len,
}
with io.open('max_length.json', 'w', encoding='utf-8') as f:
    json.dump(dictionary, f, ensure_ascii=False)


In [6]:

# loading
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    # NOTE: pickle.load can execute arbitrary code; only use it on files
    # written by a trusted source (here, the tokenizer pickles saved above).
    with open(path, 'rb') as handle:
        return pickle.load(handle)


in_text_tokenizer = _load_pickle('in_tokenizer.pickle')
out_text_tokenizer = _load_pickle('out_tokenizer.pickle')

Spanish vocabulary is of 5 unique words
English vocabulary is of 7 unique words


In [13]:
# Persist the trained model to disk under a fixed file name.
model_path = 'detector_secuencias.h5'
enc_dec_model.save(model_path)