**Carga de Datos**

In [28]:
import os
import re
import zipfile
import requests
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
import gensim.downloader as api

# 1. Descargar el Cornell Movie Dialogues Dataset
dataset_url = "http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"
dataset_path = "cornell_movie_dialogs_corpus.zip"
extracted_folder = "cornell_movie_dialogs_corpus"

if not os.path.exists(extracted_folder):
    print("Descargando el dataset...")
    response = requests.get(dataset_url)
    with open(dataset_path, 'wb') as f:
        f.write(response.content)
    print("Dataset descargado. Extrayendo...")
    with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
        zip_ref.extractall(extracted_folder)
    print("Extracción completada.")

# 2. Función para procesar datos
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text.strip()

def load_and_process_data(filepath, max_pairs=20000):
    input_sentences = []
    output_sentences = []
    with open(filepath, "r", encoding="iso-8859-1") as file:
        for line in file:
            parts = line.strip().split(" +++$+++ ")
            if len(parts) >= 2:
                question = clean_text(parts[0])
                answer = clean_text(parts[1])
                if len(question.split()) > 3 and len(answer.split()) > 3:  # Filtra frases muy cortas
                    input_sentences.append(question)
                    output_sentences.append(f"<sos> {answer} <eos>")
            if len(input_sentences) >= max_pairs:
                break
    return input_sentences, output_sentences


# 3. Procesar los datos
lines_file = os.path.join(extracted_folder, "movie_lines.txt")
conversations_file = os.path.join(extracted_folder, "movie_conversations.txt")

input_sentences, output_sentences = load_and_process_cornell_data(lines_file, conversations_file, max_pairs=10000)
print("Número de pares de conversación cargados:", len(input_sentences))

# 4. Tokenización y padding
MAX_VOCAB_SIZE = 8000
MAX_LEN = 10

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, filters="", oov_token="<unk>")
tokenizer.fit_on_texts(input_sentences + output_sentences)

vocab_size = len(tokenizer.word_index) + 1
print("Tamaño del vocabulario:", vocab_size)

encoder_input_seq = pad_sequences(tokenizer.texts_to_sequences(input_sentences), maxlen=MAX_LEN, padding='pre')
decoder_input_seq = pad_sequences(tokenizer.texts_to_sequences(output_sentences), maxlen=MAX_LEN, padding='post')

# Targets
decoder_target_seq = np.zeros_like(decoder_input_seq)
decoder_target_seq[:, :-1] = decoder_input_seq[:, 1:]

# Guardar los datos
np.savez("datos_entrenamiento.npz", 
         encoder_input_seq=encoder_input_seq, 
         decoder_input_seq=decoder_input_seq, 
         decoder_target_seq=decoder_target_seq, 
         vocab_size=vocab_size, 
         tokenizer_word_index=tokenizer.word_index)
print("Datos guardados en 'datos_entrenamiento.npz'.")


Número de pares de conversación cargados: 10000
Tamaño del vocabulario: 11059
Datos guardados en 'datos_entrenamiento.npz'.


**Entrenamiento**

In [29]:
# entrenamiento.py
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
import gensim.downloader as api

# Cargar los datos
data = np.load("datos_entrenamiento.npz", allow_pickle=True)
encoder_input_seq = data["encoder_input_seq"]
decoder_input_seq = data["decoder_input_seq"]
decoder_target_seq = data["decoder_target_seq"]
vocab_size = int(data["vocab_size"])
tokenizer_word_index = data["tokenizer_word_index"].item()

MAX_LEN = encoder_input_seq.shape[1]
embedding_dim = 300
n_units = 256

# Embeddings FastText
embedding_matrix = np.zeros((vocab_size, embedding_dim))
print("Cargando embeddings FastText...")
fasttext_model = api.load("fasttext-wiki-news-subwords-300")
for word, idx in tokenizer_word_index.items():
    if word in fasttext_model:
        embedding_matrix[idx] = fasttext_model[word]
print("Embeddings cargados.")

dropout_rate = 0.4

encoder_inputs = Input(shape=(MAX_LEN,))
encoder_embedding = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(n_units, return_state=True, dropout=dropout_rate)(encoder_embedding)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(MAX_LEN,))
decoder_embedding = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(decoder_inputs)
decoder_lstm = LSTM(n_units, return_sequences=True, return_state=True, dropout=dropout_rate)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

# Entrenar el modelo
print("Iniciando el entrenamiento...")
model.fit(
    [encoder_input_seq, decoder_input_seq],
    np.expand_dims(decoder_target_seq, -1),
    batch_size=64,
    epochs=30,
    validation_split=0.2
)
print("Entrenamiento finalizado.")
model.save("seq2seq_qa_model.h5")
print("Modelo guardado como 'seq2seq_qa_model.h5'.")


Cargando embeddings FastText...
Embeddings cargados.


Iniciando el entrenamiento...
Epoch 1/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 218ms/step - accuracy: 0.3135 - loss: 6.1356 - val_accuracy: 0.3007 - val_loss: 4.6639
Epoch 2/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 208ms/step - accuracy: 0.3617 - loss: 4.1651 - val_accuracy: 0.3871 - val_loss: 4.3959
Epoch 3/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 209ms/step - accuracy: 0.4197 - loss: 3.9122 - val_accuracy: 0.4031 - val_loss: 4.2893
Epoch 4/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 210ms/step - accuracy: 0.4335 - loss: 3.7891 - val_accuracy: 0.4078 - val_loss: 4.2223
Epoch 5/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 207ms/step - accuracy: 0.4390 - loss: 3.7075 - val_accuracy: 0.4120 - val_loss: 4.1800
Epoch 6/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 212ms/step - accuracy: 0.4468 - loss: 3.6174 - val_accuracy: 0.4138



Entrenamiento finalizado.
Modelo guardado como 'seq2seq_qa_model.h5'.


**Inferencia**

In [34]:
# inferencia.py
import numpy as np
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# Cargar datos
data = np.load("datos_entrenamiento.npz", allow_pickle=True)
tokenizer_word_index = data["tokenizer_word_index"].item()
vocab_size = int(data["vocab_size"])
MAX_LEN = 10

# Cargar el modelo entrenado
model = load_model("seq2seq_qa_model.h5")
print("Modelo cargado.")

# Configurar el modelo para inferencia
encoder_inputs = model.input[0]
encoder_embedding = model.layers[2]
encoder_lstm = model.layers[4]
encoder_model = Model(encoder_inputs, encoder_lstm.output[1:])

decoder_inputs = model.input[1]
decoder_embedding = model.layers[3]
decoder_lstm = model.layers[5]
decoder_dense = model.layers[6]

state_input_h = Input(shape=(256,))
state_input_c = Input(shape=(256,))
decoder_states_inputs = [state_input_h, state_input_c]
decoder_embedding_input = decoder_embedding(decoder_inputs)
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding_input, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

# Generar respuestas
tokenizer_index_word = {v: k for k, v in tokenizer_word_index.items()}
def generate_response(input_text):
    input_seq = pad_sequences(tokenizer.texts_to_sequences([input_text]), maxlen=MAX_LEN)
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer_word_index['<sos>']
    stop_condition = False
    decoded_sentence = ""

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_idx = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer_index_word.get(sampled_idx, "")

        if sampled_word == "<eos>" or len(decoded_sentence.split()) >= MAX_LEN:
            stop_condition = True
        else:
            decoded_sentence += " " + sampled_word

        target_seq[0, 0] = sampled_idx
        states_value = [h, c]

    return decoded_sentence.strip()




Modelo cargado.


In [35]:

# Prueba del modelo
print("what's your name? QA Bot: ", generate_response("what's your name"))
print("do you like movies? QA Bot: ", generate_response("do you like movies"))
print("Do you read? QA Bot: ", generate_response("Do you read?"))
print("Do you have any pet? QA Bot: ", generate_response("Do you have any pet?"))
print("Where are you from? QA Bot: ", generate_response("Where are you from?"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
what's your name? QA Bot:  i dont know
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
do you like movies? QA Bot:  i dont know
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0