In [11]:
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
import gensim.downloader as api

# 1. Cargar el Dataset ConvAI2
import os
import requests

dataset_url = "https://raw.githubusercontent.com/huggingface/datasets/master/datasets/conv_ai/convai2/train.txt"
dataset_path = "convai2_train.txt"

if not os.path.exists(dataset_path):
    print("Descargando el dataset...")
    response = requests.get(dataset_url)
    with open(dataset_path, 'wb') as f:
        f.write(response.content)
    print("Dataset descargado.")

# Leer y procesar las conversaciones
def load_and_process_data(filepath, max_pairs=10000):
    input_sentences = []
    output_sentences = []
    with open(filepath, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.strip().split("\t")
            if len(parts) >= 2:  # Solo tomamos líneas con al menos 2 partes
                question = clean_text(parts[0])
                answer = clean_text(parts[1])
                if question and answer:  # Evitar textos vacíos
                    input_sentences.append(question)
                    output_sentences.append(f"<sos> {answer} <eos>")
            if len(input_sentences) >= max_pairs:
                break
    return input_sentences, output_sentences


def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text.strip()

input_sentences, output_sentences = load_and_process_data(dataset_path, max_pairs=10000)
print("Número de pares de conversación:", len(input_sentences))

# 2. Tokenización
MAX_VOCAB_SIZE = 8000
MAX_LEN = 10

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, filters="")
tokenizer.fit_on_texts(input_sentences + output_sentences)

vocab_size = len(tokenizer.word_index) + 1
print("Tamaño del vocabulario:", vocab_size)

encoder_input_seq = pad_sequences(tokenizer.texts_to_sequences(input_sentences), maxlen=MAX_LEN, padding='pre')
decoder_input_seq = pad_sequences(tokenizer.texts_to_sequences(output_sentences), maxlen=MAX_LEN, padding='post')

# Targets (decodificador sin token <sos>)
decoder_target_seq = np.zeros_like(decoder_input_seq)
decoder_target_seq[:, :-1] = decoder_input_seq[:, 1:]

# 3. Preparar Embeddings FastText
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))

print("Cargando embeddings FastText...")
fasttext_model = api.load("fasttext-wiki-news-subwords-300")
for word, idx in tokenizer.word_index.items():
    if word in fasttext_model:
        embedding_matrix[idx] = fasttext_model[word]
print("Embeddings cargados.")

# 4. Construcción del Modelo Seq2Seq
n_units = 128

# Encoder
encoder_inputs = Input(shape=(MAX_LEN,), name="encoder_inputs")
encoder_embedding = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(n_units, return_state=True, dropout=0.2)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(MAX_LEN,), name="decoder_inputs")
decoder_embedding = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)(decoder_inputs)
decoder_lstm = LSTM(n_units, return_sequences=True, return_state=True, dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Modelo de entrenamiento
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

# 5. Entrenamiento del Modelo
decoder_targets = np.expand_dims(decoder_target_seq, -1)

print("Iniciando el entrenamiento...")
model.fit(
    [encoder_input_seq, decoder_input_seq],
    decoder_targets,
    batch_size=64,
    epochs=30,
    validation_split=0.2
)
print("Entrenamiento finalizado.")

# 6. Configuración de Inferencia
# Modelo del encoder
encoder_model = Model(encoder_inputs, encoder_states)

# Modelo del decoder para inferencia
decoder_state_input_h = Input(shape=(n_units,))
decoder_state_input_c = Input(shape=(n_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embedding_infer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)
decoder_embedding_input = decoder_embedding_infer(decoder_inputs)

decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding_input, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

# 7. Generación de Respuestas
def generate_response(input_text):
    states_value = encoder_model.predict(pad_sequences(tokenizer.texts_to_sequences([input_text]), maxlen=MAX_LEN))
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['<sos>']
    stop_condition = False
    decoded_sentence = ""
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word.get(sampled_token_index, "")
        
        if sampled_word == "<eos>" or len(decoded_sentence.split()) >= MAX_LEN:
            stop_condition = True
        else:
            decoded_sentence += " " + sampled_word
        
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]
    
    return decoded_sentence.strip()

# Prueba del modelo
print("QA Bot:", generate_response("do you have any pets"))
print("QA Bot:", generate_response("where are you from"))
print("QA Bot:", generate_response("what is your name"))


Número de pares de conversación: 0
Tamaño del vocabulario: 1
Cargando embeddings FastText...
Embeddings cargados.


Iniciando el entrenamiento...


ValueError: Training data contains 0 samples, which is not sufficient to split it into a validation and training set as specified by `validation_split=0.2`. Either provide more data, or a different value for the `validation_split` argument.