In [14]:
import tensorflow as tf
import string
import numpy as np
import pickle
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
#from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [15]:
from keras.utils import pad_sequences

In [16]:
def tokenize(sentences):
    """Fit a new Keras ``Tokenizer`` on `sentences` and encode them with it.

    Args:
        sentences: the texts used both to build the vocabulary and to encode.

    Returns:
        A 2-tuple ``(sequences, tokenizer)``: the output of
        ``texts_to_sequences(sentences)`` and the fitted tokenizer itself.
    """
    fitted_tokenizer = Tokenizer()
    # Build the vocabulary from the very texts that are about to be encoded.
    fitted_tokenizer.fit_on_texts(sentences)
    encoded = fitted_tokenizer.texts_to_sequences(sentences)
    return encoded, fitted_tokenizer

def clean_sentence(sentence):
    """Return `sentence` lower-cased and with punctuation characters removed.

    The removed characters are everything in ``string.punctuation`` plus the
    Spanish opening marks "¡" and "¿". Whitespace is left untouched.

    Args:
        sentence: the text to normalise.

    Returns:
        The normalised string.
    """
    # Characters deleted by the translation table below.
    chars_to_drop = string.punctuation + "¡" + '¿'
    deletion_table = str.maketrans('', '', chars_to_drop)
    return sentence.lower().translate(deletion_table)

In [17]:
# Read the whole training CSV. The context manager closes the file even if
# reading raises.
with open("pruebaLSTM.csv", "r", encoding='utf-8') as translation_file:
    raw_data = translation_file.read()

# Parse data: each line is split on ',' into its fields.
# Blank lines are skipped, which covers the empty string split() yields after
# a trailing newline. Unlike unconditionally dropping the last element, this
# keeps the final record when the file does not end with a newline, and it
# avoids empty records that would fail on pair[1] downstream.
raw_data = raw_data.split('\n')
pairs = [sentence.split(',') for sentence in raw_data if sentence.strip()]

In [18]:
# Field 1 of each record is the expected output, field 0 the model input;
# both are normalised with clean_sentence before tokenizing.
out_sentences = [clean_sentence(record[1]) for record in pairs]
in_sentences = [clean_sentence(record[0]) for record in pairs]

# Tokenize words: each side gets its own tokenizer and vocabulary.
in_text_tokenized, in_text_tokenizer = tokenize(in_sentences)
out_text_tokenized, out_text_tokenizer = tokenize(out_sentences)

# Report the longest tokenized sequence on each side.
longest_in_sequence = max(in_text_tokenized, key=len)
print('Maximum length input sequence: {}'.format(len(longest_in_sequence)))
longest_out_sequence = max(out_text_tokenized, key=len)
print('Maximum length output sequence: {}'.format(len(longest_out_sequence)))


# Vocabulary sizes. The +1 leaves room for index 0, which is not a key of
# word_index (it is shown as '<empty>' when predictions are decoded).
in_vocab = 1 + len(in_text_tokenizer.word_index)
out_vocab = 1 + len(out_text_tokenizer.word_index)
print("Input vocabulary is of {} unique symbols".format(in_vocab))
print("Output vocabulary is of {} unique actions".format(out_vocab))

Maximum length input sequence: 10
Maximum length output sequence: 6
Input vocabulary is of 7 unique symbols
Output vocabulary is of 10 unique actions


In [19]:

# Longest tokenized sequence on each side; every sample is padded to it.
# len() already returns an int, so no extra int() conversion is needed.
max_in_len = max(len(sequence) for sequence in in_text_tokenized)
max_out_len = max(len(sequence) for sequence in out_text_tokenized)

# Pad at the end ("post") so all samples share one fixed length.
in_pad_sentence = pad_sequences(in_text_tokenized, maxlen=max_in_len, padding="post")
out_pad_sentence = pad_sequences(out_text_tokenized, maxlen=max_out_len, padding="post")

# Reshape data
# Only the targets get a trailing axis of size 1. The inputs stay 2-D,
# (samples, max_in_len), which is exactly what Input(shape=(max_in_len,))
# declares and what the single-sequence prediction cell feeds the model;
# adding a trailing axis to them only worked through Keras' implicit rank
# adjustment.
out_pad_sentence = out_pad_sentence.reshape(*out_pad_sentence.shape, 1)

In [20]:
# Encoder: token ids -> embeddings -> one fixed-size vector per sample
# (return_sequences=False keeps only the last LSTM output).
input_sequence = Input(shape=(max_in_len,))
embedding = Embedding(
    input_dim=in_vocab,
    output_dim=7,
)(input_sequence)
encoder = LSTM(units=25, return_sequences=False)(embedding)

# Decoder: repeat the encoder vector once per output step, run a second LSTM
# over those steps and score every output token at each step.
r_vec = RepeatVector(max_out_len)(encoder)
decoder = LSTM(
    units=25,
    return_sequences=True,
    dropout=0.2,
)(r_vec)
logits = TimeDistributed(Dense(units=out_vocab))(decoder)

In [21]:

# Turn the per-step logits into probabilities and wrap the graph in a Model.
probabilities = Activation('softmax')(logits)
enc_dec_model = Model(inputs=input_sequence, outputs=probabilities)

# Sparse categorical cross-entropy takes integer token ids as targets.
enc_dec_model.compile(
    optimizer='Adam',
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
enc_dec_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 10)]              0         
                                                                 
 embedding_1 (Embedding)     (None, 10, 9)             63        
                                                                 
 lstm_2 (LSTM)               (None, 28)                4256      
                                                                 
 repeat_vector_1 (RepeatVect  (None, 6, 28)            0         
 or)                                                             
                                                                 
 lstm_3 (LSTM)               (None, 6, 28)             6384      
                                                                 
 time_distributed_1 (TimeDis  (None, 6, 10)            290       
 tributed)                                                 

In [22]:
# Train on the padded training pairs; the object returned by fit() is kept in
# model_results.
model_results = enc_dec_model.fit(
    x=in_pad_sentence,
    y=out_pad_sentence,
    batch_size=30,
    epochs=1000,
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [66]:
def logits_to_sentence(logits, tokenizer):
    """Decode a matrix of per-step scores into a space-separated string.

    Args:
        logits: 2-D array-like of scores; the arg-max is taken along axis 1,
            i.e. one winning index per row.
        tokenizer: object exposing ``word_index``, a mapping word -> index.

    Returns:
        The words of the winning indices joined by single spaces. Index 0,
        which has no entry in ``word_index``, is rendered as ``'<empty>'``.
    """
    # Invert the vocabulary (index -> word) and add the placeholder for 0.
    index_to_words = dict((idx, word) for word, idx in tokenizer.word_index.items())
    index_to_words[0] = '<empty>'

    winning_indices = np.argmax(logits, axis=1)
    decoded_words = []
    for token_index in winning_indices:
        decoded_words.append(index_to_words[token_index])
    return ' '.join(decoded_words)

# Spot-check one training example: print its target, its input and the
# model's prediction for it.
# NOTE(review): the "english"/"spanish" wording looks inherited from a
# translation example; the first line prints the target sequence and the
# second the input sequence.
index = 60
expected_output = out_sentences[index]
print("The english sentence is: {}".format(expected_output))
source_input = in_sentences[index]
print("The spanish sentence is: {}".format(source_input))
print('The predicted sentence is :')
# Slice rather than index so the single sample keeps its batch dimension.
sample_batch = in_pad_sentence[index:index+1]
sample_scores = enc_dec_model.predict(sample_batch)[0]
print(logits_to_sentence(sample_scores, out_text_tokenizer))

The english sentence is: 3
The spanish sentence is: 2 2 0 2 2 2 2 2 2 2
The predicted sentence is :
3 <empty> <empty> <empty> <empty> <empty>


In [67]:
# Encode one hand-written input sequence with the input tokenizer and pad it
# to the length the model expects.
in_text_tokenized_p = in_text_tokenizer.texts_to_sequences(['0 0 0 0 3 3 3 0 0 0'])

in_pad_sentence_p = pad_sequences(in_text_tokenized_p, maxlen=max_in_len, padding="post")

# The batch holds a single sample, so row 0 of the prediction is its result.
resul = logits_to_sentence(enc_dec_model.predict(in_pad_sentence_p[:1])[0], out_text_tokenizer)

# Keep only the tokens before the first '<empty>' placeholder. Working on the
# token list (instead of splitting the string on ' <empty>') also handles a
# prediction whose very first token is '<empty>': that now yields '' rather
# than the literal '<empty>'.
predicted_tokens = resul.split(' ')
if '<empty>' in predicted_tokens:
    predicted_tokens = predicted_tokens[:predicted_tokens.index('<empty>')]
resul = ' '.join(predicted_tokens)
resul



'0 1 6 7 0 1'

In [68]:
import io
import json

# saving
# Pickle both fitted tokenizers, one file each.
for pickle_path, fitted_tokenizer in (
    ('in_tokenizer.pickle', in_text_tokenizer),
    ('out_tokenizer.pickle', out_text_tokenizer),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(fitted_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


# Data to be written: the padding lengths used for the inputs and outputs.
dictionary = {"in_max_length": max_in_len, "out_max_length": max_out_len}
with io.open('max_length.json', 'w', encoding='utf-8') as f:
    json.dump(dictionary, f, ensure_ascii=False)

In [69]:

# loading
# Restore both tokenizers from their pickle files.
# NOTE: pickle.load can execute arbitrary code from the file, so only load
# pickles that come from a trusted source.
with open('in_tokenizer.pickle', 'rb') as in_handle:
    in_text_tokenizer = pickle.load(in_handle)

with open('out_tokenizer.pickle', 'rb') as out_handle:
    out_text_tokenizer = pickle.load(out_handle)

In [70]:
# Persist the trained model to 'detector_secuencias.h5'; the evaluation cell
# reloads it from this path with tf.keras.models.load_model.
enc_dec_model.save('detector_secuencias.h5')

In [23]:

# Test the model with the test dataset
# NOTE: this cell rebinds pairs, in_sentences, out_sentences, max_in_len,
# max_out_len and the padded arrays to the test data, replacing the training
# values held under the same names.
test_model = tf.keras.models.load_model('detector_secuencias.h5')

# Read the whole test CSV. The context manager closes the file even if
# reading raises.
with open("test_Secuencias.csv", "r", encoding='utf-8') as translation_file:
    raw_data = translation_file.read()

# Parse data: skip blank lines (including the empty string left by a trailing
# newline) instead of unconditionally dropping the last element, so the final
# record survives when the file does not end with a newline.
raw_data = raw_data.split('\n')
pairs = [sentence.split(',') for sentence in raw_data if sentence.strip()]
out_sentences = [clean_sentence(pair[1]) for pair in pairs]
in_sentences = [clean_sentence(pair[0]) for pair in pairs]

# Take the padding lengths from the loaded model rather than hard-coding
# them, so they cannot drift from what the model was built with.
# input_shape is (None, max_in_len); output_shape is (None, max_out_len, vocab).
max_in_len = test_model.input_shape[1]
max_out_len = test_model.output_shape[1]

# Encode the test inputs with the existing input tokenizer (no refitting).
in_text_tokenized = in_text_tokenizer.texts_to_sequences(in_sentences)
in_pad_sentence = pad_sequences(in_text_tokenized, maxlen=max_in_len, padding="post")

# Encode the expected outputs with the existing output tokenizer.
out_text_tokenized = out_text_tokenizer.texts_to_sequences(out_sentences)
out_pad_sentence = pad_sequences(out_text_tokenized, maxlen=max_out_len, padding="post")

# Reshape data
# Only the targets get a trailing axis of size 1; the inputs stay 2-D,
# (samples, max_in_len), matching the model's declared input shape.
out_pad_sentence = out_pad_sentence.reshape(*out_pad_sentence.shape, 1)

# Last expression of the cell: displays the [loss, accuracy] pair.
test_model.evaluate(in_pad_sentence, out_pad_sentence, batch_size=30)



[0.5067500472068787, 0.9270833134651184]