<img src="https://github.com/hernancontigiani/ceia_memorias_especializacion/raw/master/Figures/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## LSTM Bot QA

### Datos
El objecto es utilizar datos disponibles del challenge ConvAI2 (Conversational Intelligence Challenge 2) de conversaciones en inglés. Se construirá un BOT para responder a preguntas del usuario (QA).\
[LINK](http://convai.io/data/)

In [1]:
!pip install --upgrade --no-cache-dir gdown --quiet

In [2]:
import re

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM, SimpleRNN
from keras.models import Model
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.utils import to_categorical


In [3]:
# Parámetros
MAX_VOCAB_SIZE = 8000
MAX_LENGTH = 10
EMBEDDING_DIM = 300
N_UNITS = 256
DROPOUT = 0.1
EPOCHS = 60

In [4]:
# Descargar la carpeta de dataset
import os
import gdown
if os.access('data_volunteers.json', os.F_OK) is False:
    url = 'https://drive.google.com/uc?id=1awUxYwImF84MIT5-jCaYAPe2QwSgS1hN&export=download'
    output = 'data_volunteers.json'
    gdown.download(url, output, quiet=False)
else:
    print("El dataset ya se encuentra descargado")

El dataset ya se encuentra descargado


In [5]:
# dataset_file
import json

text_file = "data_volunteers.json"
with open(text_file) as f:
    data = json.load(f) # la variable data será un diccionario



In [6]:
# Observar los campos disponibles en cada linea del dataset
data[0].keys()

dict_keys(['dialog', 'start_time', 'end_time', 'bot_profile', 'user_profile', 'eval_score', 'profile_match', 'participant1_id', 'participant2_id'])

In [7]:
chat_in = []
chat_out = []

input_sentences = []
output_sentences = []
output_sentences_inputs = []
global_output = []
max_len = 30

def clean_text(txt):
    txt = txt.lower()
    txt.replace("\'d", " had")
    txt.replace("\'s", " is")
    txt.replace("\'m", " am")
    txt.replace("don't", "do not")
    txt = re.sub(r'\W+', ' ', txt)

    return txt

for line in data:
    for i in range(len(line['dialog'])-1):
        # vamos separando el texto en "preguntas" (chat_in)
        # y "respuestas" (chat_out)
        chat_in = clean_text(line['dialog'][i]['text'])
        chat_out = clean_text(line['dialog'][i+1]['text'])

        if len(chat_in) >= max_len or len(chat_out) >= max_len:
            continue

        input_sentence, output = chat_in.lower(), chat_out.lower()

        # output sentence (decoder_output) tiene <eos>
        output_sentence = output + ' <eos>'
        # output sentence input (decoder_input) tiene <sos>
        output_sentence_input = '<sos> ' + output

        input_sentences.append(input_sentence)
        output_sentences.append(output_sentence)
        output_sentences_inputs.append(output_sentence_input)
        global_output.append(output)

print("Cantidad de rows utilizadas:", len(input_sentences))

Cantidad de rows utilizadas: 6033


In [8]:
input_sentences[1], output_sentences[1], output_sentences_inputs[1]

('hi how are you ', 'not bad and you  <eos>', '<sos> not bad and you ')

### 2 - Preprocesamiento
Realizar el preprocesamiento necesario para obtener:
- word2idx_inputs, max_input_len
- word2idx_outputs, max_out_len, num_words_output
- encoder_input_sequences, decoder_output_sequences, decoder_targets

In [9]:
# tokenizador de inglés
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='!"#$%&()*+,-./:;=¿?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(input_sentences + global_output + ['<sos>', '<eos>'])

input_sequences  = tokenizer.texts_to_sequences(input_sentences)
output_sequences = tokenizer.texts_to_sequences(global_output)

output_sequences = [[tokenizer.word_index['<sos>']] + seq + [tokenizer.word_index['<eos>']] for seq in output_sequences]

word2idx_inputs = tokenizer.word_index
print("Palabras en el vocabulario:", len(word2idx_inputs))

max_input_len = max(len(sen) for sen in input_sequences )
print("Sentencia de entrada más larga:", max_input_len)

Palabras en el vocabulario: 2157
Sentencia de entrada más larga: 9


In [10]:
# Padding
encoder_input_data = pad_sequences(input_sequences, maxlen=MAX_LENGTH, padding='post')
decoder_input_data = pad_sequences([seq[:-1] for seq in output_sequences], maxlen=MAX_LENGTH, padding='post')
decoder_target_data = pad_sequences([seq[1:] for seq in output_sequences], maxlen=MAX_LENGTH, padding='post')

# Convertir a numpy arrays
encoder_input_data = np.array(encoder_input_data)
decoder_input_data = np.array(decoder_input_data)
decoder_target_data = np.array(decoder_target_data)

### 3 - Preparar los embeddings
Utilizar los embeddings de Glove o FastText para transformar los tokens de entrada en vectores

In [11]:
!pip install --quiet gdown
import gdown

file_id = '1IxxOgufC_9h0a-44c5EbVp2rmYiFlipm'  # Reemplaza con tu ID de archivo
destination = 'crawl-300d-2M.vec'  # Reemplaza con el nombre deseado para el archivo

gdown.download(id=file_id, output=destination, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1IxxOgufC_9h0a-44c5EbVp2rmYiFlipm
From (redirected): https://drive.google.com/uc?id=1IxxOgufC_9h0a-44c5EbVp2rmYiFlipm&confirm=t&uuid=31b2ba9d-0326-418d-9518-e8d9274131e0
To: /content/crawl-300d-2M.vec
100%|██████████| 4.51G/4.51G [00:50<00:00, 88.6MB/s]


'crawl-300d-2M.vec'

In [13]:
import gensim
fasttext_model = gensim.models.KeyedVectors.load_word2vec_format('./crawl-300d-2M.vec', binary=False)

In [14]:
# Crear matriz de embeddings
word_index = tokenizer.word_index
num_tokens = min(MAX_VOCAB_SIZE, len(word_index) + 1)
embedding_matrix = np.zeros((num_tokens, EMBEDDING_DIM))

for word, i in word_index.items():
    if i >= MAX_VOCAB_SIZE:
        continue
    if word in fasttext_model:
        embedding_matrix[i] = fasttext_model[word]
    else:
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(EMBEDDING_DIM,))

print("Matriz de embeddings creada.")

Matriz de embeddings creada.


In [15]:
# Dimensión de los embeddings de la secuencia en inglés
embedding_matrix.shape

(2158, 300)

### 4 - Entrenar el modelo
Entrenar un modelo basado en el esquema encoder-decoder utilizando los datos generados en los puntos anteriores. Utilce como referencias los ejemplos vistos en clase.

In [16]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Dropout

# Tamaño del vocabulario
vocab_size = num_tokens

# Embedding Layer
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_LENGTH,
    trainable=False  # Puedes ajustar a True si deseas entrenar las embeddings
)

# Encoder
encoder_inputs = Input(shape=(MAX_LENGTH,), name='encoder_inputs')
encoder_embedding = embedding_layer(encoder_inputs)
encoder_lstm = LSTM(N_UNITS, return_state=True, dropout=DROPOUT, name='encoder_lstm')
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(MAX_LENGTH,), name='decoder_inputs')
decoder_embedding = embedding_layer(decoder_inputs)
decoder_lstm = LSTM(N_UNITS, return_sequences=True, return_state=True, dropout=DROPOUT, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# Modelo
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()




In [17]:
# Añadir una dimensión extra a decoder_target_data para que coincida con la salida esperada
decoder_target_data = np.expand_dims(decoder_target_data, -1)

# Entrenamiento
history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=EPOCHS
)


Epoch 1/60
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.4841 - loss: 4.1168
Epoch 2/60
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 18ms/step - accuracy: 0.6219 - loss: 2.0974
Epoch 3/60
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.6987 - loss: 1.7719
Epoch 4/60
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.7096 - loss: 1.6142
Epoch 5/60
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.7225 - loss: 1.5120
Epoch 6/60
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.7308 - loss: 1.4454
Epoch 7/60
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.7387 - loss: 1.3853
Epoch 8/60
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.7456 - loss: 1.3366
Epoch 9/60
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━

### 5 - Inferencia
Experimentar el funcionamiento de su modelo. Recuerde que debe realizar la inferencia de los modelos por separado de encoder y decoder.

In [18]:
# Modelos de inferencia

# Encoder inference
encoder_model_inf = Model(encoder_inputs, encoder_states)

# Decoder inference
decoder_state_input_h = Input(shape=(N_UNITS,), name='input_h')
decoder_state_input_c = Input(shape=(N_UNITS,), name='input_c')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embedding_inf = embedding_layer(decoder_inputs)
decoder_outputs_inf, state_h_inf, state_c_inf = decoder_lstm(
    decoder_embedding_inf, initial_state=decoder_states_inputs
)
decoder_states_inf = [state_h_inf, state_c_inf]
decoder_outputs_inf = decoder_dense(decoder_outputs_inf)

decoder_model_inf = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs_inf] + decoder_states_inf
)


In [19]:
import sys

# Invertir el word index para obtener palabras a partir de índices
reverse_word_index = {i: word for word, i in tokenizer.word_index.items()}

def decode_sequence(input_seq):
    # Obtener los estados internos del encoder
    states_value = encoder_model_inf.predict(input_seq)

    # Generar una secuencia vacía de tokens de inicio
    target_seq = np.array([[tokenizer.word_index.get('<sos>', 1)]])

    # Inicializar la secuencia de salida
    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model_inf.predict([target_seq] + states_value)

        # Obtener el token con mayor probabilidad
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_word_index.get(sampled_token_index, '<unk>')

        if sampled_word == '<eos>' or len(decoded_sentence.split()) > MAX_LENGTH:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        # Actualizar la secuencia de entrada para el siguiente paso
        target_seq = np.array([[sampled_token_index]])

        # Actualizar los estados
        states_value = [h, c]

    return decoded_sentence.strip()

# Función para limpiar y preparar la entrada del usuario
def prepare_input(text):
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=MAX_LENGTH, padding='post')
    return padded

## Hello

In [20]:
user_input = 'Hello'
input_seq = prepare_input(user_input.lower())
response = decode_sequence(input_seq)
print(f"Chatbot: {response}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 245ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Chatbot: hello how are you


## How are you?

In [21]:
user_input = 'How are you?'
input_seq = prepare_input(user_input.lower())
response = decode_sequence(input_seq)
print(f"Chatbot: {response}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Chatbot: i am doing well how are you


## Do you like music?

In [22]:
user_input = 'Do you like music?'
input_seq = prepare_input(user_input.lower())
response = decode_sequence(input_seq)
print(f"Chatbot: {response}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Chatbot: yes


## What do you do for a living?

In [23]:
user_input = 'What do you do for a living?'
input_seq = prepare_input(user_input.lower())
response = decode_sequence(input_seq)
print(f"Chatbot: {response}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Chatbot: i m a student


## Chatbot en vivo

In [None]:
# Interacción con el usuario
print("¡Hola! Soy tu chatbot. Escribe 'salir' para terminar.")

while True:
    user_input = input("Tú: ")
    if user_input.lower() == 'salir':
        print("Chatbot: ¡Hasta luego!")
        break

    input_seq = prepare_input(user_input.lower())
    response = decode_sequence(input_seq)
    print(f"Chatbot: {response}")
