In [1]:
import numpy as np
import json
import re
import tensorflow as tf
import warnings
import random

import spacy

# Load spaCy's Spanish pipeline ("es_core_news_sm") as the shared NLP object.
nlp = spacy.load("es_core_news_sm")

# Silence every Python warning for the rest of the session.
# NOTE(review): 'ignore' with no category hides all warnings, including
# deprecation notices that may matter; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [4]:
# Load the dataset from data.json; the next cell iterates over its 'data' entry.
with open('data.json', 'rb') as file:
    data = json.load(file)

In [5]:
# Flatten the intents stored in `data` into parallel training lists and lookups.
# Each item of data['data'] is read through its 'type', 'texto' and 'response' keys.
inputs, targets = [], []   # training texts and, position by position, their intent label
cls = []                   # distinct intent labels in order of first appearance
intent_doc = {}            # intent label -> list of candidate replies

# The loop variables are deliberately NOT called `response`: at notebook level
# they are globals, and a variable named `response` would overwrite the
# `response()` function defined further down whenever this cell is re-run.
for intent in data['data']:
    intent_type = intent['type']

    if intent_type not in cls:
        cls.append(intent_type)

    # Create the reply list the first time a label is seen, reuse it afterwards.
    intent_replies = intent_doc.setdefault(intent_type, [])

    for utterance in intent['texto']:
        inputs.append(utterance)
        targets.append(intent_type)

    intent_replies.extend(intent['response'])

In [40]:
# Show the flattened training pairs, the label list and the reply lookup.
print(inputs, targets)
print(cls)
print(intent_doc)

['Hola', 'Hola, me ayudas', 'Ey', 'oye', 'me ayudas', 'necesito ayuda', 'me puedes ayudar', 'Adios', 'Bye', 'cuidate', 'hasta luego', 'ya no te necesito'] ['saludo', 'saludo', 'saludo', 'pedir_ayuda', 'pedir_ayuda', 'pedir_ayuda', 'pedir_ayuda', 'despedida', 'despedida', 'despedida', 'despedida', 'despedida']
['saludo', 'pedir_ayuda', 'despedida']
{'saludo': ['Hola humano, ¿En qué puedo ayudarte?', 'Hola humano, ¿En qué te ayudo?'], 'pedir_ayuda': ['¿En qué puedo ayudarte?', '¿En qué te ayudo?', '¿Dime lo q necesitas?'], 'despedida': ['Adios humanos', 'Good bye, adios en ingles', 'Cualquier cosa me puedes volver hablar', 'Cuidate, adios']}


In [10]:
'''tokenize data'''
def token_data(inp_list):
    """Fit a Keras tokenizer on ``inp_list`` and encode it as a padded matrix.

    Args:
        inp_list: texts used both to build the vocabulary and to be encoded.

    Returns:
        A ``(tokenizer, padded)`` pair: the fitted tokenizer and the output of
        ``pad_sequences`` applied to the encoded texts.
    """
    # No characters are filtered out; out-of-vocabulary words map to '<unk>'.
    text_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters='')
    text_tokenizer.fit_on_texts(inp_list)

    encoded = text_tokenizer.texts_to_sequences(inp_list)

    # padding='pre' places the fill values in front of the shorter sequences.
    padded = tf.keras.preprocessing.sequence.pad_sequences(encoded, padding='pre')

    return text_tokenizer, padded

'''preprocesss input data'''
# Fit the tokenizer on the training texts and keep their padded encoding.
tokenizer,inp_tensor = token_data(inputs)

In [11]:
# Inspect the fitted tokenizer object and the padded input matrix.
print(tokenizer)
print(inp_tensor)

<keras.preprocessing.text.Tokenizer object at 0x00000259417CB190>
[[ 0  0  0  5]
 [ 0  6  2  3]
 [ 0  0  0  7]
 [ 0  0  0  8]
 [ 0  0  2  3]
 [ 0  0  4  9]
 [ 0  2 10 11]
 [ 0  0  0 12]
 [ 0  0  0 13]
 [ 0  0  0 14]
 [ 0  0 15 16]
 [17 18 19  4]]


In [18]:
def cr_cat_target(targets):
    """One-hot encode a sequence of labels.

    Labels are numbered from 0 in order of first appearance.

    Args:
        targets: iterable of hashable labels, one per training example.

    Returns:
        A ``(one_hot, index_to_label)`` pair: the int32 result of
        ``to_categorical`` with one row per element of ``targets``, and a dict
        mapping each class index back to its label.
    """
    label_to_index = {}
    encoded = []

    for label in targets:
        # setdefault hands out the next free index the first time a label
        # shows up and returns the existing index on every later occurrence.
        encoded.append(label_to_index.setdefault(label, len(label_to_index)))

    one_hot = tf.keras.utils.to_categorical(
        encoded,
        num_classes=len(label_to_index),
        dtype="int32",
    )
    index_to_label = {index: label for label, index in label_to_index.items()}
    return one_hot, index_to_label

'''preprocess output data'''
# One-hot encode the labels and keep the index -> label lookup used to decode predictions.
target_tensor, target_idx_word= cr_cat_target(targets)

In [19]:
# Inspect the one-hot target matrix and the index -> label lookup.
print(target_tensor)
print(target_idx_word)

[[1 0 0]
 [1 0 0]
 [1 0 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]]
{0: 'saludo', 1: 'pedir_ayuda', 2: 'despedida'}


In [45]:
# Sanity check: inputs and targets were appended in pairs, so both row counts should match.
f'input shape: {np.shape(inp_tensor)} and output shape: {target_tensor.shape}'

'input shape: (12, 4) and output shape: (12, 3)'

In [46]:
from tensorflow.keras import models
from tensorflow.keras import layers

# Hyper-parameters of the intent classifier.
epochs = 50
embed_dim = 512
units = 128
# +1 because the tokenizer's word indices start at 1; 0 only appears as padding.
vocab_size = len(tokenizer.word_index) + 1
# One output unit per column of the one-hot target matrix.
target_len = target_tensor.shape[1]

# Embedding -> bidirectional LSTM -> Dense(ReLU) -> Dropout -> softmax.
model = models.Sequential([
    layers.Embedding(vocab_size, embed_dim),
    layers.Bidirectional(layers.LSTM(units, dropout=0.2)),
    layers.Dense(units, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(target_len, activation='softmax'),
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 512)         10240     
                                                                 
 bidirectional_1 (Bidirectio  (None, 256)              656384    
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 3)                 387       
                                                                 
Total params: 699,907
Trainable params: 699,907
Non-trainable params: 0
________________________________________________

In [47]:
# Adam with categorical cross-entropy, matching the one-hot targets and the
# softmax output layer; accuracy is reported alongside the loss.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [52]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

# Stop once the monitored loss has not improved for 5 epochs, and write the
# model to disk only when that loss reaches a new minimum.
# NOTE(review): both callbacks monitor 'loss' (the training loss); no
# validation data is passed to fit() below.
es = EarlyStopping(monitor='loss', mode='min', patience = 5)
mc = ModelCheckpoint('modelAI/modeloSpanish.h5', monitor='loss', mode='min', save_best_only=True)

'''training'''
# Train on the full padded input matrix against the one-hot targets.
history = model.fit(inp_tensor, 
    target_tensor, 
    epochs=epochs,
    callbacks=[es, mc])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50


In [14]:
import tensorflow
# Reload the file written by the ModelCheckpoint callback (same path).
# compile=False: the model is only called for inference below, never trained again.
model = tensorflow.keras.models.load_model('modelAI/modeloSpanish.h5', compile=False)

In [15]:
def response(sentence):
    """Classify ``sentence`` into an intent and pick a reply for it.

    The sentence is encoded with the same fitted Keras ``tokenizer`` that
    produced the training matrix, so lower-casing, token splitting and the
    ``<unk>`` fallback are identical at training and inference time.

    Relies on the notebook globals ``tokenizer``, ``inp_tensor``, ``model``,
    ``target_idx_word`` and ``intent_doc``.

    Args:
        sentence: raw user text.

    Returns:
        A ``(reply, intent)`` tuple: a reply drawn at random from
        ``intent_doc[intent]`` and the predicted intent label.
    """
    # The text used to be run through spaCy on ``repr(sentence)``. That added
    # literal quote tokens, split punctuation differently from the training
    # tokenizer and skipped lower-casing, so even known words such as 'Hola'
    # were looked up as <unk>.
    sent_seq = tokenizer.texts_to_sequences([str(sentence)])

    # Left-pad short inputs to the width of the training matrix, because the
    # network was only ever fitted on 'pre'-padded rows. Longer inputs are
    # kept whole (maxlen never drops below the sentence length).
    width = max(len(sent_seq[0]), inp_tensor.shape[1])
    sent_seq = tf.keras.preprocessing.sequence.pad_sequences(
        sent_seq, maxlen=width, padding='pre'
    )

    # training=False keeps the dropout layers inactive during prediction.
    pred = model(tf.constant(sent_seq), training=False)

    pred_class = int(np.argmax(pred.numpy(), axis=1)[0])
    intent = target_idx_word[pred_class]

    rest = random.choice(intent_doc[intent])
    return rest, intent

In [27]:

# Try a sentence whose words are all absent from the training texts.
response('que pedo perro')

('¿Dime lo q necesitas?', 'pedir_ayuda')