# Exploración — Ejercicio 3 (RAG + Agente Crítico)

Este notebook te ayuda a:
- Cargar el CSV de laptops
- Preparar un subset de 200–400 filas
- Crear chunks (campo:valor) y construir índice BM25
- Probar consultas (retrieve top-k)
- Generar respuesta con citas
- Pasar respuesta por el agente crítico


In [1]:
import os
import re
import json
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi

# Widen the pandas display so wide spec tables are readable.
pd.set_option('display.max_columns', 60)
pd.set_option('display.width', 140)

# Adjust if your file lives somewhere else.
# The path is assembled with os.path.join so the separator is correct on every
# OS; the former raw string r"data\\..." contained two literal backslashes,
# which is not a valid relative path on POSIX systems.
DATA_PATH = os.path.join("data", "Laptops_with_technical_specifications.csv")
SUBSET_N = 300          # number of rows sampled into the working subset
RANDOM_SEED = 42        # random_state used when sampling the subset
TOP_K = 5               # number of chunks retrieved per query
MAX_ANSWER_WORDS = 120  # word budget passed to generate_answer

# Check once and reuse the result for both the report and the size lookup.
data_found = os.path.exists(DATA_PATH)
print('Exists?', data_found)
if data_found:
    print('File size (MB):', round(os.path.getsize(DATA_PATH)/1024/1024, 2))

ModuleNotFoundError: No module named 'rank_bm25'

In [None]:
# Load the raw CSV, then trim stray whitespace around every header name.
df = pd.read_csv(DATA_PATH)
df.columns = df.columns.map(str.strip)
print('Shape:', df.shape)
df.head()

In [None]:
# Reproducible subset selection: sample SUBSET_N rows with a fixed seed when
# the frame is larger than that, otherwise work on a copy of the whole frame.
df_sub = (
    df.sample(n=SUBSET_N, random_state=RANDOM_SEED).reset_index(drop=True)
    if len(df) > SUBSET_N
    else df.copy()
)

print('Subset shape:', df_sub.shape)
df_sub.head()

## Preparar columna de ID
El enunciado pide citas `[laptop_id:campo]`. Si el CSV no trae `laptop_id`, usamos la primera columna como ID (se asume que sus valores identifican de forma única a cada laptop).

In [None]:
# Citations are keyed by 'laptop_id'; when the CSV has no such column, the
# first column is promoted to that role.
if 'laptop_id' not in df_sub.columns:
    first_col = df_sub.columns[0]
    print('Renombrado', first_col, '-> laptop_id')
    df_sub = df_sub.rename(columns={first_col: 'laptop_id'})

# Quick look at the resulting ID column.
df_sub[['laptop_id']].head()

## Chunking (campo: valor)
Creamos chunks por campo con metadatos de cita.

In [None]:
def tokenize(text: str):
    """Split *text* into lowercase tokens for BM25.

    Every character that is not a lowercase ASCII letter, a digit, one of the
    Spanish accented letters listed in the pattern, or whitespace is replaced
    by a space. Tokens of a single character are discarded.
    """
    cleaned = re.sub(r"[^a-z0-9áéíóúüñ\s]", " ", text.lower())
    tokens = []
    for piece in cleaned.split():
        if len(piece) > 1:
            tokens.append(piece)
    return tokens

def make_chunks(row: dict, laptop_id_field: str = 'laptop_id'):
    """Turn one record into a list of per-field "field: value" chunks.

    The ID field itself is never chunked. Fields whose value is None, blank
    after stripping, or whose string form is 'nan' (case-insensitive) are
    skipped. Each chunk carries its own citation pair (laptop_id, field), and
    chunk ids are numbered consecutively over the chunks actually emitted.
    """
    laptop_id = str(row.get(laptop_id_field, ''))
    chunks = []
    for field, raw in row.items():
        if field == laptop_id_field or raw is None:
            continue
        value = str(raw).strip()
        if not value or value.lower() == 'nan':
            continue
        # len(chunks) is the running count of emitted chunks, so skipped
        # fields do not leave gaps in the numbering.
        chunks.append(dict(
            chunk_id=f"{laptop_id}_{len(chunks)}",
            laptop_id=laptop_id,
            field=field,
            text=f"{field}: {value}",
            citations=[(laptop_id, field)],
        ))
    return chunks

# Flatten the subset into one list of field-level chunks, record by record.
rows = df_sub.to_dict(orient='records')
chunks = [chunk for record in rows for chunk in make_chunks(record)]

print('Chunks:', len(chunks))
pd.DataFrame(chunks[:8])

## BM25 Index + búsqueda top-k

In [None]:
# Tokenize each chunk's text and build the BM25 index over the token lists.
corpus_tokens = list(map(tokenize, (chunk['text'] for chunk in chunks)))
bm25 = BM25Okapi(corpus_tokens)

def search(query: str, top_k: int = 5):
    """Return the *top_k* highest-scoring chunks for *query*.

    The query is tokenized with ``tokenize`` and scored against the
    module-level ``bm25`` index. The result is a list of ``(chunk, score)``
    tuples in descending score order, with the score converted to ``float``.
    """
    scores = bm25.get_scores(tokenize(query))
    ranked = np.argsort(scores)[::-1]  # ascending sort, reversed
    hits = []
    for pos in ranked[:top_k]:
        hits.append((chunks[pos], float(scores[pos])))
    return hits

# Smoke-test the retriever and preview the text and score of the top hits.
query = 'laptop con 16gb ram y ssd'
results = search(query, TOP_K)
[(chunk['text'], score) for chunk, score in results[:3]]

## Generación de respuesta (MVP) + citas
MVP: compone respuesta a partir de los chunks recuperados y agrega citas `[laptop_id:campo]`.

In [None]:
def generate_answer(query: str, retrieved, max_words: int = 120):
    """Compose an extractive answer from retrieved chunks and append citations.

    Args:
        query: The user query. Unused by this MVP; kept so the signature stays
            stable for callers.
        retrieved: Iterable of ``(chunk, score)`` pairs, where each chunk is a
            dict with a ``'text'`` string and a ``'citations'`` list of
            ``(laptop_id, field)`` pairs.
        max_words: Word budget for the answer text (citations excluded).
            Values below zero are treated as zero.

    Returns:
        A tuple ``(answer, context_text, cites)``:
        ``answer`` is the truncated text followed by the citation markers,
        ``context_text`` is the full text of every retrieved chunk joined by
        newlines, and ``cites`` is the de-duplicated list of
        ``"[laptop_id:field]"`` markers.

    Only chunks that contribute at least one word to the truncated answer are
    cited, so the answer never cites evidence whose text was cut off by the
    word budget. With nothing to say the answer is an empty string rather
    than a lone space.
    """
    lines = [ch['text'] for ch, _ in retrieved]

    budget = max(max_words, 0)
    answer_words = []
    cites = []
    seen = set()
    for ch, _ in retrieved:
        remaining = budget - len(answer_words)
        if remaining <= 0:
            break
        taken = ch['text'].split()[:remaining]
        if not taken:
            continue
        answer_words.extend(taken)
        for lid, field in ch['citations']:
            marker = f"[{lid}:{field}]"
            if marker not in seen:
                seen.add(marker)
                cites.append(marker)

    answer_core = ' '.join(answer_words)
    # Join only the non-empty parts to avoid stray leading/trailing spaces.
    answer = ' '.join(part for part in (answer_core, ' '.join(cites)) if part)
    context_text = '\n'.join(lines)
    return answer, context_text, cites

# Build the draft answer, its supporting context and the citation markers.
answer, context, cites = generate_answer(query, results, max_words=MAX_ANSWER_WORDS)
answer

## Agente crítico
Valida soporte: si detecta afirmaciones sin evidencia (heurística), reescribe/recorta.

In [None]:
def normalize(s: str) -> str:
    """Lowercase *s*, collapse each whitespace run to one space, and trim."""
    return re.sub(r"\s+", " ", s.lower()).strip()


def critic_review(answer: str, context: str, citations: list[str]):
    """Heuristically check that *answer* is supported by *context*.

    Args:
        answer: Draft answer, possibly ending with citation markers.
        context: Text of the retrieved evidence.
        citations: Citation markers (e.g. ``"[12:ram]"``) attached to the
            answer.

    Returns:
        A dict with keys ``'ok'`` (True only when no issue was found),
        ``'revised_answer'`` (the original answer when every sentence is
        supported, otherwise the supported sentences followed by the
        citations, or a fallback message when nothing is supported) and
        ``'issues'`` (list of human-readable problems).

    Support heuristic: sentences shorter than four words are accepted as-is.
    A longer sentence counts as supported when its first 60 normalized
    characters occur in the normalized context, or when any of its first six
    tokens occurs in the context as a substring.
    NOTE(review): the substring test is very permissive; short tokens match
    almost any context.
    """
    issues = []
    if not citations:
        issues.append('No citations found.')

    ctx = normalize(context)

    # Citation markers are metadata, not claims. Remove them before sentence
    # splitting so they are neither judged as sentences nor duplicated when
    # the citations are re-appended to a revised answer.
    body = answer
    for cite in citations:
        body = body.replace(cite, ' ')

    # Split on sentence punctuation, but keep a period that sits between two
    # digits (decimal values such as "15.6") inside its sentence.
    parts = re.split(r"[!?]|(?<!\d)\.|\.(?!\d)", body)
    sentences = [p.strip() for p in parts if p.strip()]

    supported = []
    for s in sentences:
        s_norm = normalize(s)
        words = s_norm.split()
        if len(words) < 4:
            supported.append(True)
            continue
        supported.append(s_norm[:60] in ctx or any(tok in ctx for tok in words[:6]))

    if all(supported):
        # Previously a missing-citations issue was dropped on this path; it is
        # now reported and makes the review fail.
        return {'ok': not issues, 'revised_answer': answer, 'issues': issues}

    issues.append('Some statements may be unsupported by retrieved context.')
    kept = [s for s, ok in zip(sentences, supported) if ok]
    revised = '. '.join(kept)
    cite_text = ' '.join(citations)
    if revised:
        revised_answer = revised + '. ' + cite_text
    else:
        revised_answer = 'No puedo responder con confianza con la evidencia recuperada. ' + cite_text
    # rstrip removes the dangling space left when there are no citations.
    return {'ok': False, 'revised_answer': revised_answer.rstrip(), 'issues': issues}

# Run the critic over the draft answer using the retrieved context as evidence.
review = critic_review(answer=answer, context=context, citations=cites)
review

In [None]:
# Show the draft answer, the critic-approved answer and any issues raised.
print('Answer (raw):', answer, sep='\n')

print('\nAnswer (final):', review['revised_answer'], sep='\n')

print('\nIssues:', review['issues'])

## Siguiente paso
1) Curar `data/eval_queries.json` con `relevant_laptop_ids` reales
2) Ejecutar evaluación automática (Precision@k, Recall@k, faithfulness, coverage)
3) (Opcional) conectar un LLM real manteniendo el contrato de salida y citas
