# Desafío 4
## Construir un QA Bot basado en el ejemplo del traductor pero con un dataset QA.

In [1]:
# 0. Gestionar versiones para evitar incompatibilidades
#!pip uninstall -y numpy scipy tensorflow ml-dtypes


Found existing installation: numpy 1.26.1
Uninstalling numpy-1.26.1:
  Successfully uninstalled numpy-1.26.1
Found existing installation: scipy 1.13.1
Uninstalling scipy-1.13.1:
  Successfully uninstalled scipy-1.13.1
Found existing installation: tensorflow 2.19.0
Uninstalling tensorflow-2.19.0:
  Successfully uninstalled tensorflow-2.19.0
Found existing installation: ml_dtypes 0.5.1
Uninstalling ml_dtypes-0.5.1:
  Successfully uninstalled ml_dtypes-0.5.1


# Carga de datos

In [1]:
# Instalar HF Datasets
!pip install --quiet datasets
#!pip install --upgrade --force-reinstall pandas numpy scipy

# Importar
from datasets import load_dataset

# Cargar XQuAD-es, use split='validation'
ds = load_dataset("xquad", "xquad.es", split="validation")

# descarga directa (aprox. 6 GB descomprimido)
!wget -c https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz
!gunzip cc.es.300.vec.gz

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


--2025-04-26 19:05:20--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.170.152.30, 3.170.152.69, 3.170.152.93, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.170.152.30|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

gzip: cc.es.300.vec already exists; do you wish to overwrite (y or n)? n
	not overwritten


In [2]:
from gensim.models import KeyedVectors

# Text-format fastText vector file produced by the download/extract step.
vec_path = 'cc.es.300.vec'

# Read the file in word2vec text format, keeping only the first 100,000
# entries (limit=100_000).
fasttext = KeyedVectors.load_word2vec_format(
    vec_path,
    limit=100_000,
    binary=False,
)

# Imports y funciones auxiliares

In [5]:
import numpy as np
import pandas as pd
import re
import pickle
!pip install tensorflow
from tensorflow.keras.preprocessing.text     import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

def clean_text(s):
    """Lowercase and trim ``s``, then delete every disallowed character.

    Only ASCII lowercase letters, digits, the letters ``áéíóúñü``, the
    opening marks ``¿¡`` and the space character are kept. Anything else is
    removed without a replacement, so the text on both sides is joined.

    Args:
        s: Text to normalize.

    Returns:
        The normalized string.
    """
    lowered = s.lower().strip()
    return re.sub(r"[^a-z0-9áéíóúñü¿¡ ]+", "", lowered)




# Carga y preprocesamiento de datos

In [6]:
# Preparar listas de question/answer

# Concatenamos context + pregunta como entrada
# Encoder text: cleaned "context + question" for every example.
questions = [
    clean_text(ex["context"] + " " + ex["question"])
    for ex in ds
]

# Decoder text: the first listed answer, cleaned and wrapped in start/end
# marker words.
answers = [
    "<start> " + clean_text(ex["answers"]["text"][0]) + " <end>"
    for ex in ds
]


# Tokenización y secuencias

In [7]:
MAX_VOCAB_SIZE = 8000
MAX_LEN = 25


def _fit_tokenizer(texts):
    """Return a Tokenizer (num_words=MAX_VOCAB_SIZE, '<unk>' as OOV) fitted on ``texts``."""
    tok = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token='<unk>')
    tok.fit_on_texts(texts)
    return tok


# Encoder side: context + question strings -> id sequences of length MAX_LEN
tokenizer_enc = _fit_tokenizer(questions)
enc_seqs = tokenizer_enc.texts_to_sequences(questions)
pad_questions = pad_sequences(enc_seqs, padding='post', maxlen=MAX_LEN)

# Decoder side: answer strings -> id sequences of length MAX_LEN
tokenizer_dec = _fit_tokenizer(answers)
dec_seqs = tokenizer_dec.texts_to_sequences(answers)
pad_answers = pad_sequences(dec_seqs, padding='post', maxlen=MAX_LEN)

# Decoder input drops the last column; the target is the same matrix shifted
# one step to the left, with a trailing axis of size 1 added.
decoder_input_data = pad_answers[:, :-1]
decoder_target_data = pad_answers[:, 1:, np.newaxis]

# Vocabulary sizes: word_index size plus one, capped at MAX_VOCAB_SIZE.
vocab_size = min(MAX_VOCAB_SIZE, 1 + len(tokenizer_enc.word_index))
vocab_size_dec = min(MAX_VOCAB_SIZE, 1 + len(tokenizer_dec.word_index))


# Cargar FastText y crear matrices de embedding

In [8]:
EMBED_DIM = 300


def build_embedding_matrix(tokenizer, vocab_size):
    """Assemble a ``(vocab_size, EMBED_DIM)`` matrix of pretrained vectors.

    Row ``i`` receives the ``fasttext`` vector of the word whose tokenizer
    index is ``i``. Rows stay at zero when the index is not below
    ``vocab_size`` (the word is skipped) or the word is not in ``fasttext``.

    Args:
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.
        vocab_size: Number of rows of the returned matrix.

    Returns:
        The filled NumPy array.
    """
    weights = np.zeros((vocab_size, EMBED_DIM))
    for token, row in tokenizer.word_index.items():
        # Indices outside the matrix are skipped before any vector lookup.
        if row >= vocab_size:
            continue
        if token in fasttext:
            weights[row] = fasttext[token]
    return weights

# One pretrained-embedding matrix per vocabulary: the encoder tokenizer and
# the decoder tokenizer each get their own, sized by their own vocab size.
embedding_matrix_enc = build_embedding_matrix(tokenizer_enc, vocab_size)
embedding_matrix_dec = build_embedding_matrix(tokenizer_dec, vocab_size_dec)

# Definir y compilar el modelo Seq2Seq

In [11]:
# Seq2seq model: LSTM encoder, LSTM decoder initialised with the encoder's
# final states, additive attention over the encoder outputs, softmax output.
from tensorflow.keras.layers import Masking

n_units = 256
# NOTE(review): lstm_dropout is defined but not passed to any layer in this cell.
lstm_dropout = 0.2

from tensorflow.keras.layers import AdditiveAttention, Concatenate

# — Encoder —
encoder_inputs = Input(shape=(MAX_LEN,), name='encoder_inputs')
# Frozen embedding initialised from the pretrained encoder matrix.
enc_emb = Embedding(vocab_size, EMBED_DIM, weights=[embedding_matrix_enc],
                    trainable=False, name='encoder_embedding')(encoder_inputs)
# NOTE(review): enc_masked is computed but never used; the LSTM below is fed
# enc_emb directly, so this Masking layer is not part of the model graph.
enc_masked    = Masking(mask_value=0.0, name='encoder_masking')(enc_emb)
enc_outputs, state_h, state_c = LSTM(
    n_units, return_sequences=True, return_state=True, name='encoder_lstm'
)(enc_emb)
encoder_states = [state_h, state_c]

# — Decoder —
# Decoder input is one step shorter than MAX_LEN.
decoder_inputs = Input(shape=(MAX_LEN-1,), name='decoder_inputs')
dec_emb = Embedding(vocab_size_dec, EMBED_DIM, weights=[embedding_matrix_dec],
                    trainable=False, name='decoder_embedding')(decoder_inputs)
# NOTE(review): dec_masked is likewise unused; the decoder LSTM consumes dec_emb.
dec_masked     = Masking(mask_value=0.0, name='decoder_masking')(dec_emb)
dec_outputs, _, _ = LSTM(
    n_units, return_sequences=True, return_state=True, name='decoder_lstm'
)(dec_emb, initial_state=encoder_states)

# — Attention —
# Decoder outputs are passed first (query) and encoder outputs second (value).
attn = AdditiveAttention(name='attention_layer')(
    [dec_outputs, enc_outputs]
)
# Combine the decoder LSTM output with the attention context vector
dec_concat = Concatenate(axis=-1, name='concat_layer')([dec_outputs, attn])

# — Output —
# Per-step softmax over the decoder vocabulary.
decoder_outputs = Dense(vocab_size_dec, activation='softmax',
                        name='decoder_dense')(dec_concat)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

# Entrenamiento

In [12]:
# Training hyperparameters
epochs = 70
batch_size = 64

# Fit on (encoder input, decoder input) -> decoder target, with a 20%
# validation split. The call is the cell's last expression, so its return
# value is displayed as the cell output.
model.fit(
    x=[pad_questions, decoder_input_data],
    y=decoder_target_data,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)


Epoch 1/70
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 45ms/step - accuracy: 0.6480 - loss: 5.0343 - val_accuracy: 0.7992 - val_loss: 1.5197
Epoch 2/70
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8372 - loss: 1.2177 - val_accuracy: 0.8018 - val_loss: 1.5058
Epoch 3/70
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8378 - loss: 1.1932 - val_accuracy: 0.8048 - val_loss: 1.4898
Epoch 4/70
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8330 - loss: 1.2191 - val_accuracy: 0.8129 - val_loss: 1.4304
Epoch 5/70
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8430 - loss: 1.2108 - val_accuracy: 0.8102 - val_loss: 1.4236
Epoch 6/70
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8498 - loss: 1.0712 - val_accuracy: 0.8167 - val_loss: 1.3796
Epoch 7/70
[1m15/15[0m [32m━━━━

<keras.src.callbacks.history.History at 0x7a5df41aef50>

# Guardar modelo y tokenizers

In [13]:
# Persist the trained model, then pickle each tokenizer to its own file.
model.save('qa_seq2seq_fasttext.h5')
for pkl_path, tok in (
    ('tokenizer_enc.pkl', tokenizer_enc),
    ('tokenizer_dec.pkl', tokenizer_dec),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(tok, f)



# Preparar modelos de inferencia

In [23]:
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Layer

# 1) Load the trained model. compile=False because it is only used for
#    inference here.
model = load_model('qa_seq2seq_fasttext.h5', compile=False)

# 2) Encoder model: maps the encoder input to the full encoder output
#    sequence plus the final hidden and cell states.
encoder_inputs = model.input[0]  # encoder input of the trained model
encoder_lstm = model.get_layer('encoder_lstm')
enc_outputs, state_h_enc, state_c_enc = encoder_lstm.output
encoder_model = Model(
    inputs=encoder_inputs,
    outputs=[enc_outputs, state_h_enc, state_c_enc]
)

# 3) Step-wise decoder model with attention.
# a) Placeholders for its four inputs.
#    The token input is a NEW Input with a variable number of timesteps.
#    The training-time decoder input (model.input[1]) is fixed at MAX_LEN-1
#    steps, while decode_sequence feeds a single token per call; reusing it
#    would make the declared shape disagree with the data actually passed.
decoder_inputs      = Input(shape=(None,), name='dec_token_in')
enc_outputs_input   = Input(shape=enc_outputs.shape[1:], name='enc_out_in')
decoder_state_h_in  = Input(shape=state_h_enc.shape[1:], name='dec_h_in')
decoder_state_c_in  = Input(shape=state_c_enc.shape[1:], name='dec_c_in')
dec_states_inputs   = [decoder_state_h_in, decoder_state_c_in]

# b) Reuse the trained layers so the step model shares their weights.
dec_emb_layer   = model.get_layer('decoder_embedding')
dec_lstm_layer  = model.get_layer('decoder_lstm')
attn_layer      = model.get_layer('attention_layer')    # AdditiveAttention
concat_layer    = model.get_layer('concat_layer')       # Concatenate
dense_layer     = model.get_layer('decoder_dense')

# c) Wire one decoding step: embed -> LSTM (from the given states) ->
#    attention over the encoder outputs -> concat -> softmax.
dec_emb2     = dec_emb_layer(decoder_inputs)
dec_out2, h2, c2 = dec_lstm_layer(
    dec_emb2,
    initial_state=dec_states_inputs
)
# Attention: query = dec_out2, value = enc_outputs_input
attn_out     = attn_layer([dec_out2, enc_outputs_input])
dec_concat2  = concat_layer([dec_out2, attn_out])
dec_preds    = dense_layer(dec_concat2)

# Returns the token distribution together with the updated LSTM states so
# the caller can feed them back in on the next step.
decoder_model = Model(
    inputs=[decoder_inputs, enc_outputs_input] + dec_states_inputs,
    outputs=[dec_preds, h2, c2]
)

# Función decode_sequence y pruebas

In [28]:
# 8. Función de decodificación mejorada
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Length of axis 1 of the loaded model's first (encoder) input shape.
# NOTE(review): decode_sequence below pads with MAX_LEN rather than this
# value — confirm the two always agree, or use this one there.
trained_maxlen = model.input_shape[0][1]

# 4) Función de inferencia actualizada:
def decode_sequence(input_text):
    """Greedily decode an answer for ``input_text`` with the inference models.

    The text is cleaned, tokenized and post-padded to ``MAX_LEN``, run
    through ``encoder_model``, and then ``decoder_model`` is called one token
    at a time, always taking the most probable next token.

    Decoding stops when the end marker is produced, when the padding index 0
    is produced, when the same word is produced twice in a row, or after
    ``MAX_LEN`` steps.

    Args:
        input_text: Raw input string (the callers in this notebook pass
            either "context + question" or a bare question).

    Returns:
        The decoded words joined by single spaces (possibly empty).

    Raises:
        KeyError: If none of the known start-token spellings is present in
            ``tokenizer_dec.word_index``.
    """
    # a) Prepare the encoder input.
    seq = tokenizer_enc.texts_to_sequences([clean_text(input_text)])
    seq = pad_sequences(seq, maxlen=MAX_LEN, padding='post')

    # b) Encoder outputs and initial decoder states. verbose=0 suppresses the
    #    progress bar Keras would otherwise print for every predict call.
    enc_outs, h, c = encoder_model.predict(seq, verbose=0)

    # c) Locate the start token. Several spellings are tried, presumably
    #    because the tokenizer may have stripped the angle brackets.
    candidates = ['<start>', 'start', 'startseq', '<startseq>']
    start_idx = next(
        (tokenizer_dec.word_index[t] for t in candidates
         if t in tokenizer_dec.word_index),
        None,
    )

    # Fail with an informative message when no start token exists.
    if start_idx is None:
        valid = list(tokenizer_dec.word_index.keys())[:20]
        raise KeyError(
            f"No hallé ningún token de inicio en tokenizer_dec.word_index. "
            f"Probables claves del tokenizer (primeras 20): {valid}"
        )

    target_seq = np.array([[start_idx]])
    decoded = []
    prev_word = None

    for _ in range(MAX_LEN):
        # Step the decoder with its 4 inputs; h and c carry the LSTM state
        # from one step to the next.
        preds, h, c = decoder_model.predict(
            [target_seq, enc_outs, h, c], verbose=0
        )

        sampled_idx = int(np.argmax(preds[0, -1, :]))

        # Index 0 is the padding value of the training targets and has no
        # word attached; treat it as end of answer instead of emitting a
        # spurious '<unk>'.
        if sampled_idx == 0:
            break

        sampled_word = tokenizer_dec.index_word.get(sampled_idx, '<unk>')

        # Stop on the end marker, or on an immediate repetition (guards
        # against the decoder getting stuck on one word).
        if sampled_word in ('<end>', 'end') or sampled_word == prev_word:
            break

        decoded.append(sampled_word)
        prev_word = sampled_word

        # Feed the chosen token back in as the next decoder input.
        target_seq = np.array([[sampled_idx]])

    return ' '.join(decoded)

In [30]:
import random

# Show 5 dataset examples (shuffled with a fixed seed): context, question,
# reference answer and the bot's decoded answer.
for ex in ds.shuffle(seed=123).select(range(5)):
    print("Contexto:", ex["context"])
    print("Pregunta:", ex["question"])
    print("Respuesta esperada:", ex["answers"]["text"][0])
    print("Respuesta del bot:", decode_sequence(ex["context"] + " " + ex["question"]))
    print("-"*80)

Contexto: El Partido Laborista Australiano (ALP), de centro-izquierda, el Partido Liberal de Australia, de centro-derecha, el Partido Nacional de Australia, del medio rural, y los Verdes australianos, ecologistas, son los principales partidos políticos de Victoria. Tradicionalmente, los laboristas son más fuertes en la periferia oeste y norte de clase trabajadora de Melbourne y en las ciudades de Ballarat, Bendigo y Geelong, situadas en la región. El principal apoyo a los liberales radica en la periferia este y exterior de Melbourne, más acomodada, y en algunos centros rurales y regionales. Los nacionales tienen más apoyo en las áreas regionales rurales del noroeste y el este de Victoria. Los Verdes, que ganaron sus primeros escaños en la cámara baja en 2014, tienen más apoyo en el centro de Melbourne.
Pregunta: ¿Qué partido político tiene más apoyo en la periferia de clase trabajadora de Melbourne?
Respuesta esperada: Partido Laborista Australiano
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━

# Prueba interactiva usando ipywidgets

In [29]:
!pip install --quiet ipywidgets
from ipywidgets import Text, Button, Output
from IPython.display import display

# Create the widgets: a text box for the question, a send button, and an
# output area where the bot's reply is printed.
caja_pregunta = Text(placeholder='Escribe tu pregunta aquí', description='Pregunta:')
boton = Button(description='Enviar')
salida = Output()

# Click handler for the send button
def on_click(b):
    """Decode the text currently in the question box and print the reply.

    The output area is cleared first, so only the latest reply is shown.

    Args:
        b: Argument passed by the button callback; not used.
    """
    with salida:
        salida.clear_output()
        respuesta = decode_sequence(caja_pregunta.value)
        print("Bot:", respuesta)

# Register the handler on the button, then render the three widgets.
boton.on_click(on_click)
display(caja_pregunta, boton, salida)

Text(value='', description='Pregunta:', placeholder='Escribe tu pregunta aquí')

Button(description='Enviar', style=ButtonStyle())

Output()