In [61]:

import time
import re
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union, Set
import requests
from bs4 import BeautifulSoup, Tag
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse, unquote,quote

# PART 1: PROJECT

## Question 1: Data Ingestion 

The project involves extracting information from public bids, initially sourced from HTML and PDF formats. The HTML files contain bid and contract details, covering fields such as contract duration, contract type, and contract value. Meanwhile, the PDF documents specify the technical and legal requirements essential for securing the contract. The context 

In [85]:

HEADERS = {"User-Agent": "html-fetcher/1.0"}
BASE_URL = "https://contratos-publicos.comunidad.madrid"


def fetch_html(url: str, timeout: int = 30) -> str:
    """Fetch raw HTML from a URL using requests."""
    r = requests.get(url, headers=HEADERS, timeout=timeout)
    r.raise_for_status()
    r.encoding = r.apparent_encoding or "utf-8"
    return r.text


def extract_contract_links(html: str) -> List[str]:
    """
    Extract links starting with /contrato-publico/.
    Normalize to absolute URLs. Return unique list.
    """
    pattern = r'href="(/contrato-publico/[^"]+)"'
    matches = re.findall(pattern, html)
    return sorted({BASE_URL + path for path in matches})


def update_page(url: str, page: int) -> str:
    """Replace the `page` parameter in the URL with a new value."""
    parts = urlparse(url)
    q = parse_qs(parts.query, keep_blank_values=True)
    q["page"] = [str(page)]
    new_query = urlencode(q, doseq=True)
    return urlunparse(parts._replace(query=new_query))


def paginate_contract_links(
    base_url: str,
    max_pages: int = 20,
    sleep_secs: float = 0.8,
    stop_when_empty: bool = True,
) -> List[str]:
    """
    Crawl pages ?page=0..N and collect all contract links.
    """
    seen: Set[str] = set()

    for page in range(max_pages):
        url = update_page(base_url, page)
        html = fetch_html(url)
        links = extract_contract_links(html)

        before = len(seen)
        seen.update(links)
        added = len(seen) - before
        print(f"[page {page}] found={len(links)} added={added} total={len(seen)}")

        if stop_when_empty and len(links) == 0:
            print("No links found → stopping early.")
            break

        time.sleep(sleep_secs)

    return sorted(seen)


In [86]:


url = 'https://contratos-publicos.comunidad.madrid/contratos?t=&tipo_publicacion=All&createddate=&createddate_1=&fin_presentacion=&fin_presentacion_1=&ss_buscador_estado_situacion=All&numero_expediente=&referencia=&ss_identificador_ted=&entidad_adjudicadora=All&tipo_contrato=All&codigo_cpv=&ss_field_contrato_lote_reservado=All&bs_regulacion_armonizada=All&ss_sist_de_contratacion=All&modalidad_compra_publica=All&ss_financiacion_ue=All&ss_field_pcon_codigo_referencia=&procedimiento_adjudicacion=All&ss_tipo_de_tramitacion=All&ss_metodo_presentacion=All&bs_subasta_electronica=All&presupuesto_base_licitacion_total=&presupuesto_base_licitacion_total_1=&ds_field_pcon_fecha_desierto=&ds_field_pcon_fecha_desierto_1=&nif_adjudicatario=&nombre_adjudicatario=&importacion_adjudicacion_con_impuestos=&importacion_adjudicacion_con_impuestos_1=&ds_fecha_encargo=&ds_fecha_encargo_1=&ds_field_pcon_fecha_publi_anun_form=&ds_field_pcon_fecha_publi_anun_form_1=&bs_field_pcon_compra_publica=All&page=0'

In [87]:
# Get the links
search_url = "https://contratos-publicos.comunidad.madrid/contratos?t=&tipo_publicacion=All&createddate=&createddate_1=&fin_presentacion=&fin_presentacion_1=&ss_buscador_estado_situacion=All&numero_expediente=&referencia=&ss_identificador_ted=&entidad_adjudicadora=All&tipo_contrato=All&codigo_cpv=&ss_field_contrato_lote_reservado=All&bs_regulacion_armonizada=All&ss_sist_de_contratacion=All&modalidad_compra_publica=All&ss_financiacion_ue=All&ss_field_pcon_codigo_referencia=&procedimiento_adjudicacion=All&ss_tipo_de_tramitacion=All&ss_metodo_presentacion=All&bs_subasta_electronica=All&presupuesto_base_licitacion_total=&presupuesto_base_licitacion_total_1=&ds_field_pcon_fecha_desierto=&ds_field_pcon_fecha_desierto_1=&nif_adjudicatario=&nombre_adjudicatario=&importacion_adjudicacion_con_impuestos=&importacion_adjudicacion_con_impuestos_1=&ds_fecha_encargo=&ds_fecha_encargo_1=&ds_field_pcon_fecha_publi_anun_form=&ds_field_pcon_fecha_publi_anun_form_1=&bs_field_pcon_compra_publica=All&page=0"

contract_urls = paginate_contract_links(search_url, max_pages=10)
print(f"\nCollected {len(contract_urls)} unique contract links ✅")

for link in contract_urls[:5]:
    print(link)



[page 0] found=10 added=10 total=10
[page 1] found=10 added=10 total=20
[page 2] found=10 added=10 total=30
[page 3] found=10 added=10 total=40
[page 4] found=10 added=10 total=50
[page 5] found=10 added=10 total=60
[page 6] found=10 added=10 total=70
[page 7] found=10 added=10 total=80
[page 8] found=10 added=10 total=90
[page 9] found=10 added=10 total=100

Collected 100 unique contract links ✅
https://contratos-publicos.comunidad.madrid/contrato-publico/2025-0-78-suministro-instalacion-puesta-funcionamiento-equipamiento-laboratorio
https://contratos-publicos.comunidad.madrid/contrato-publico/2025-0-80-suministro-instalacion-puesta-funcionamiento-cunas-termicas-fototerapia
https://contratos-publicos.comunidad.madrid/contrato-publico/acuerdo-marco-servicio-asistencia-tecnica-materia-seguridad-salud-proyectos-0
https://contratos-publicos.comunidad.madrid/contrato-publico/acuerdo-marco-servicios-mantenimiento-correctivo-segundo-nivel-equipos-0
https://contratos-publicos.comunidad.madrid

In [None]:

def parse_convocatoria_from_url(url: str, timeout: int = 30) -> Dict[str, Optional[str]]:
    """
    Downloads a contract detail page, parses <ul class='pcon-convocatoria'>,
    and returns a dictionary {label: value}.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=timeout)
        resp.raise_for_status()
    except Exception as e:
        print(f"Request error for {url}: {e}")
        return {}

    soup = BeautifulSoup(resp.text, "html.parser")
    data: Dict[str, Optional[str]] = {}

    ul = soup.select_one("ul.pcon-convocatoria")
    if not ul:
        return data

    for li in ul.find_all("li", recursive=False):
        label_el = li.select_one(".field__label")
        if not label_el:
            continue

        label = label_el.get_text(strip=True)

        # Prefer .field__item but fallback to .field-content
        value_el = li.select_one(".field__item") or li.select_one(".field-content")

        if value_el:
            value = " ".join(value_el.stripped_strings)
        else:
            value = None

        data[label] = value

    return data


In [91]:
for i in contract_urls[0:3]:
    print(f'Readings: {i} .... ')
    print()
    print(parse_convocatoria_from_url(i))

Readings: https://contratos-publicos.comunidad.madrid/contrato-publico/2025-0-78-suministro-instalacion-puesta-funcionamiento-equipamiento-laboratorio .... 

{'Tipo de publicación': 'Convocatoria anunciada a licitación', 'Situación': 'En plazo', 'Número de expediente': '2025-0-78', 'Referencia': 'C11145', 'Objeto del contrato': 'Suministro, instalación y puesta en funcionamiento de equipamiento de laboratorio para la unidad de reproducción humana del Hospital Universitario 12 de Octubre.', 'Tipo de contrato': 'Suministros', 'Contrato mixto': 'No', 'Código CPV': '33100000-1', 'Contrato/lote reservado': 'No aplica', 'Legislación nacional aplicable': 'Ley 9/2017', 'Sujeto a regulación armonizada': 'No', 'Sistema de contratación': 'No aplica', 'Código NUTS': 'ES300', 'Compra pública de innovación': 'No', 'Financiación de la Unión Europea': 'No hay financiación de la Unión Europea', 'Procedimiento de adjudicación': 'Abierto simplificado', 'Tipo de tramitación': 'Ordinaria', 'Método de prese

In [26]:
parse_convocatoria_from_url(contract_urls[0])

{'Tipo de publicación': 'Convocatoria anunciada a licitación',
 'Situación': 'En plazo',
 'Número de expediente': '2025000044',
 'Referencia': 'C10616',
 'Identificador del expediente en TED': 'a541a1eb-aef4-44e1-864d-7ffabe4d144c',
 'Objeto del contrato': 'Adquisición de dos Torres de Laparoscopia con destino a los Servicios de Ginecología y Cirugía Cardíaca del Hospital Universitario Ramón y Cajal.',
 'Tipo de contrato': 'Suministros',
 'Contrato mixto': 'No',
 'Código CPV': '33100000-1',
 'Contrato/lote reservado': 'No aplica',
 'Legislación nacional aplicable': 'Ley 9/2017',
 'Sujeto a regulación armonizada': 'Sí',
 'Sistema de contratación': 'No aplica',
 'Código NUTS': 'ES300',
 'Compra pública de innovación': 'No',
 'Financiación de la Unión Europea': 'No hay financiación de la Unión Europea',
 'Procedimiento de adjudicación': 'Abierto',
 'Tipo de tramitación': 'Ordinaria',
 'Método de presentación de ofertas': 'Electrónica',
 'Subasta electrónica': 'No',
 'Valor estimado sin im

## Question 2: Chunking strategy 

There are two types of documents.Each type reqiures a different chunking strategy

**Contracts** 

Each contract record already comes as structured JSON (publication type, procedure, CPV, amounts, deadlines, etc.), so no chunking is needed. The strategy is then to treat each contract as a single atomic document — one JSON object per contract. Use the contract ID as the primary key. 

**PDF Attachments** 

The goal is to extract structured and textual information from the PDF pliegos for Q&A, risk detection, and table reconstruction. For this a hybrid “page-aware + table-aware” strategy to handle text sections and tables efficiently. In particular, split by sections or headings, not blindly by page count. Some rules that can apply: 

- Use 2–3 pages per chunk (CHUNK_SIZE=2–3, STEP=1) to keep semantic context.

- Always include a small line overlap (2–3 lines) between consecutive chunks.

- Define chunks around ~12,000 characters (≈1–1.5k tokens).



## Question 3: Database

The characteristics of the project suggest using persisting storage. It will be used for the following 

- Raw PDFs/HTML and text/LLM outputs (expensive to recreate).

- Stable embeddings and chunk boundaries.

- Any curated field you’ll query later (contract value, duration, lots).

- Run logs & costs (for observability and regression checks).

In addition, there will be vectors searches and these vectors could be stored in a database like Qdrant. 

# PART 2: BUILD A SUMMARY AGENT

## Question 5 

Build an agent that does a couple of things:

- Use the `Fetch Web Page` tool to get the content of the page

- Use the `Save Summary` tool to save the summary

**Rationale of the Agent** 

The agent is composed of three key layers working together: Pydantic Agent, the Tools and the Pydantic Models

- **Pydantic Agent** -> It is the controller that receives natural language requests from the user. Interprets the language and decides what tool to use.

- **Tool(s)** -> These are functions that actually create the capabilities of the agents.

- **Models** -> Define exactly what each tool returns. They ensure the agent provides a structured and coherent output




In [None]:

import os
import re
import json
import datetime as dt
from typing import Optional, Literal
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel, HttpUrl, Field
from openai import OpenAI
from pydantic_ai import Agent


# ---------------- Models ----------------

class FetchOutput(BaseModel):
    url: HttpUrl
    title: Optional[str]
    text: str
    fetched_at: dt.datetime = Field(default_factory=lambda: dt.datetime.now(dt.timezone.utc))

class SummaryOutput(BaseModel):
    url: HttpUrl
    title: Optional[str]
    summary: str
    saved_to: str
    created_at: dt.datetime = Field(default_factory=lambda: dt.datetime.now(dt.timezone.utc))

# ---------------- Helpers ----------------

def _slug_from_url(url: str) -> str:
    """
    Build a filesystem-safe slug from the URL:
    - use the last path component if present; otherwise use the hostname
    - strip query/fragment
    - lowercase and keep only [a-z0-9_-], replacing others with '-'
    """
    p = urlparse(url)
    last = (p.path or "").strip("/").split("/")[-1]
    base = last or p.netloc
    base = base.split("?")[0].split("#")[0]
    slug = re.sub(r"[^a-zA-Z0-9_-]+", "-", base).strip("-").lower()
    if not slug:
        slug = re.sub(r"[^a-zA-Z0-9]+", "-", p.netloc).strip("-").lower() or "page"
    return slug

def _fetch_text_and_title(url: str, timeout=(5, 20)) -> tuple[str, Optional[str]]:
    r = requests.get(url, headers={"User-Agent": "WebFetch/0.3"}, timeout=timeout, allow_redirects=True)
    r.raise_for_status()
    html = r.text

    # Fast path for Wikipedia (REST plain)
    if "wikipedia.org" in r.url:
        try:
            slug = r.url.rsplit("/", 1)[-1]
            rr = requests.get(
                f"https://en.wikipedia.org/api/rest_v1/page/plain/{slug}",
                headers={"User-Agent": "WebFetch/0.3"},
                timeout=(5, 15),
            )
            rr.raise_for_status()
            text = rr.text
            soup = BeautifulSoup(html, "html.parser")
            title = soup.title.string.strip() if (soup.title and soup.title.string) else None
            return text, title
        except Exception:
            pass

    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    title = soup.title.string.strip() if (soup.title and soup.title.string) else None
    text = "\n".join(t.strip() for t in soup.stripped_strings)
    return text, title

# ---------------- Tool: fetch_content ----------------

def fetch_content(url: HttpUrl) -> FetchOutput:
    """Fetch full text from a webpage."""
    resp = requests.get(url, headers={"User-Agent": "WebFetch/0.1"}, timeout=20)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    title = soup.title.string.strip() if soup.title and soup.title.string else None
    text = "\n".join(t.strip() for t in soup.stripped_strings)

    return FetchOutput(url=url, title=title, text=text)

# ---------------- Tool: save_summary ----------------

_SUMMARY_KEYS_SEEN: set[tuple[str, str, str]] = set()  # (url, style, saved_path)

def save_summary(
    url: HttpUrl,
    style: Literal["bullet", "exec"] = "bullet",
    out_dir: str = "summaries",
    model_name: Optional[str] = None,
) -> SummaryOutput:
    """
    One-shot: fetch full text from URL, summarize with OpenAI, save to a single JSON file
    named {slug}_summary.json under `out_dir`, and return the saved path.

    Idempotent within the process: avoids duplicate summaries for the same (url, style, out_dir/slug_summary.json).
    """
    # Fetch content
    html_clean, title = _fetch_text_and_title(str(url))
    text = " ".join(BeautifulSoup(html_clean, "html.parser").stripped_strings)


    # Compose filename
    slug = _slug_from_url(str(url))
    os.makedirs(out_dir or ".", exist_ok=True)
    saved_path = os.path.join(out_dir, f"{slug}_summary.json")

    key = (str(url), style, saved_path)
    if key in _SUMMARY_KEYS_SEEN:
        # Already summarized in this run
        return SummaryOutput(url=url, title=title, summary="(already summarized in this run)", saved_to=saved_path)

    # OpenAI client
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY must be set")
    client = OpenAI(api_key=api_key)
    model = model_name or os.getenv("AGENT_MODEL", "gpt-4o-mini")

    style_instr = (
        "Return 5–8 concise bullets with key facts, numbers, and dates. End with a one-sentence takeaway."
        if style == "bullet"
        else "Write a tight 2–3 paragraph executive summary followed by 3 concrete action items."
    )

    # Truncate long inputs (latency/cost control)
    max_chars = 120_000
    body = text if len(text) <= max_chars else (text[:max_chars] + "\n...[truncated]")

    prompt = f"Title: {title or 'N/A'}\nURL: {url}\n\n{style_instr}\n\nCONTENT:\n{body}"

    resp = client.chat.completions.create(
        model=model,
        temperature=0.2,
        max_tokens=500,
        messages=[
            {"role": "system", "content": "You are a precise summarizer. Stay faithful to the source."},
            {"role": "user", "content": prompt},
        ],
    )
    summary = (resp.choices[0].message.content or "").strip()

    # Save a single JSON file named {slug}_summary.json
    record = {
        "url": str(url),
        "title": title,
        "summary": summary,
        "model": model,
        "created_at": dt.datetime.now(dt.timezone.utc).isoformat(),
        "saved_to": saved_path,
    }
    with open(saved_path, "w", encoding="utf-8") as f:
        json.dump(record, f, ensure_ascii=False, indent=2)

    _SUMMARY_KEYS_SEEN.add(key)
    return SummaryOutput(url=url, title=title, summary=summary, saved_to=saved_path)

# ---------------- Agent ----------------

SYSTEM_PROMPT = """
You have two tools:
- fetch_content(url): fetch full webpage content.
- save_summary(url, style?, out_dir?, model_name?): fetch, summarize, and save to {slug}_summary.json.

Behavioral rules:
• If the user asks to fetch AND save a summary in one request, call ONLY save_summary ONCE.
• Never call save_summary more than once per user request.
• If you already produced a summary for the same (url, style, out_dir) in this run, do not run it again.
• Return only the final tool result for the user’s request.
• Always output the saved summary details clearly, including:
    - url
    - title
    - summary
    - saved_to path (should end with {slug}_summary.json)
    - timestamp
""".strip()

agent = Agent(
    model="gpt-4o-mini",
    tools=[fetch_content, save_summary],
    system_prompt=SYSTEM_PROMPT,
)

In [None]:
res = await agent.run("What is this page about https://en.wikipedia.org/wiki/Capybara and save a bullet summary")


[index_links] Error procesando https://en.wikipedia.org/wiki/Capybara: Embedding dimension mismatch: index has 1536, new has 3072
[index_links] Error procesando https://en.wikipedia.org/wiki/Lesser_capybara: Embedding dimension mismatch: index has 1536, new has 3072
[index_links] Error procesando https://en.wikipedia.org/wiki/Hydrochoerus: Embedding dimension mismatch: index has 1536, new has 3072
[index_links] Error procesando https://en.wikipedia.org/wiki/Neochoerus: Embedding dimension mismatch: index has 1536, new has 3072
[index_links] Error procesando https://en.wikipedia.org/wiki/Caviodon: Embedding dimension mismatch: index has 1536, new has 3072
[index_links] Error procesando https://en.wikipedia.org/wiki/Neochoerus_aesopi: Embedding dimension mismatch: index has 1536, new has 3072


In [92]:
print(res)

AgentRunResult(data='Here is the summary of the Wikipedia page about Capybaras:\n\n- **URL**: [Capybara - Wikipedia](https://en.wikipedia.org/wiki/Capybara)\n- **Title**: Capybara - Wikipedia\n- **Summary**:\n  - The capybara (Hydrochoerus hydrochaeris) is the largest living rodent, native to South America, and can weigh between 35 to 66 kg (77 to 146 lb).\n  - Adult capybaras typically measure 106 to 134 cm (3.48 to 4.40 ft) in length and stand 50 to 62 cm (20 to 24 in) tall at the withers.\n  - They are highly social animals, often found in groups of 10-20, but can gather in larger groups of up to 100 during the dry season.\n  - Capybaras are herbivores, primarily grazing on grasses and aquatic plants, and exhibit autocoprophagy, consuming their own feces to aid digestion.\n  - Their maximum lifespan is 8 to 10 years in captivity, but they usually live only about four years in the wild due to predation.\n  - Capybaras are not considered threatened, with a stable population across mos

## Question 6. Index 

In [79]:
import os
import re
import json
import hashlib
import datetime as dt
from typing import Optional, Literal, List, Dict, Tuple, Iterable
from urllib.parse import urlparse, quote, unquote

import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel, HttpUrl, Field
from openai import OpenAI
from pydantic_ai import Agent

import numpy as np

# Optional BM25 (nice boost for lexical exact matches)
try:
    from rank_bm25 import BM25Okapi
    USE_BM25 = True
except Exception:
    BM25Okapi = None
    USE_BM25 = False

# ---------------- Models ----------------

class FetchOutput(BaseModel):
    url: HttpUrl
    title: Optional[str]
    text: str
    fetched_at: dt.datetime = Field(default_factory=lambda: dt.datetime.now(dt.timezone.utc))

class SummaryOutput(BaseModel):
    url: HttpUrl
    title: Optional[str]
    summary: str
    saved_to: str
    created_at: dt.datetime = Field(default_factory=lambda: dt.datetime.now(dt.timezone.utc))

class IndexStats(BaseModel):
    added_docs: int
    total_docs: int
    index_dir: str
    embedding_model: str
    bm25_enabled: bool
    updated_at: dt.datetime = Field(default_factory=lambda: dt.datetime.now(dt.timezone.utc))


class IndexLinksParams(BaseModel):
    urls: List[str]
    index_dir: str = "index"
    embed_model: str = "text-embedding-ada-002" 
    chunk_tokens: int = 350
    overlap_tokens: int = 50
    connect_timeout: int = Field(5, ge=1)
    read_timeout: int = Field(20, ge=1)


class SearchHit(BaseModel):
    url: HttpUrl
    title: Optional[str]
    chunk_id: str
    score: float
    snippet: str

class AnswerOutput(BaseModel):
    question: str
    answer: str
    citations: List[Dict[str, str]]  # {"url":..., "title":..., "chunk_id":...}
    used_model: str
    generated_at: dt.datetime = Field(default_factory=lambda: dt.datetime.now(dt.timezone.utc))


# ---------------- Helpers ----------------

def _slug_from_url(url: str) -> str:
    """
    Build a filesystem-safe slug from the URL:
    - use the last path component if present; otherwise use the hostname
    - strip query/fragment
    - lowercase and keep only [a-z0-9_-], replacing others with '-'
    """
    p = urlparse(url)
    last = (p.path or "").strip("/").split("/")[-1]
    base = last or p.netloc
    base = base.split("?")[0].split("#")[0]
    slug = re.sub(r"[^a-zA-Z0-9_-]+", "-", base).strip("-").lower()
    if not slug:
        slug = re.sub(r"[^a-zA-Z0-9]+", "-", p.netloc).strip("-").lower() or "page"
    return slug

def _atomic_write(path: str, data: bytes) -> None:
    """
    Atomically write bytes to `path`. Ensures parent directory exists.
    """
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    tmp = f"{path}.tmp"
    with open(tmp, "wb") as f:
        f.write(data)
        f.flush()
        os.fsync(f.fileno())
    os.replace(tmp, path)

def _read_jsonl(path: str) -> Iterable[dict]:
    if not os.path.exists(path):
        return []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

def _append_jsonl(path: str, records: List[dict]) -> None:
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "a", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

def _hash_id(*parts: str) -> str:
    h = hashlib.sha256()
    for p in parts:
        h.update(p.encode("utf-8", errors="ignore"))
    return h.hexdigest()[:16]




def _clean_text(html: str) -> Tuple[str, Optional[str]]:
    soup = BeautifulSoup(html, "html.parser")
    # Try to focus on the main article content
    main = soup.find(attrs={"role": "main"}) or soup.find("main") or soup.find("article")
    root = main or soup

    # Remove obvious non-content
    for tag in root(["script", "style", "noscript", "header", "footer", "nav", "aside", "form"]):
        tag.decompose()

    title_el = soup.title
    title = title_el.string.strip() if (title_el and title_el.string) else None

    # Keep headings to preserve section boundaries
    texts: List[str] = []
    for el in root.find_all(["h1","h2","h3","h4","h5","h6","p","li","table"]):
        t = el.get_text(" ", strip=True)
        if not t:
            continue
        # Add simple heading markers for later section-aware chunking
        if el.name in {"h1","h2","h3"}:
            texts.append(f"\n\n## {t}\n")
        else:
            texts.append(t)

    text = "\n".join(texts)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text, title


def _fetch_text_and_title(url: str, timeout=(5, 20)) -> tuple[str, Optional[str]]:
    r = requests.get(url, headers={"User-Agent": "WebFetch/0.4"}, timeout=timeout, allow_redirects=True)
    r.raise_for_status()

    # Fast path for Wikipedia (REST "plain" or "mobile-html")
    if "wikipedia.org" in r.url:
        slug = r.url.rsplit("/", 1)[-1]
        # try /page/plain (no UI, just article text)
        try:
            rr = requests.get(
                f"https://en.wikipedia.org/api/rest_v1/page/plain/{slug}",
                headers={"User-Agent": "WebFetch/0.4"},
                timeout=(5, 15),
            )
            rr.raise_for_status()
            title = BeautifulSoup(r.text, "html.parser").title
            title = title.string.strip() if (title and title.string) else None
            return rr.text, title
        except Exception:
            pass
        # fallback: mobile-html (less chrome)
        try:
            rr = requests.get(
                f"https://en.wikipedia.org/api/rest_v1/page/mobile-html/{slug}",
                headers={"User-Agent": "WebFetch/0.4"},
                timeout=(5, 15),
            )
            rr.raise_for_status()
            text, title = _clean_text(rr.text)
            return text, title
        except Exception:
            pass

    # Generic sites
    return _clean_text(r.text)



def _tokenize(text: str) -> List[str]:
    return re.findall(r"[A-Za-z0-9_]+", (text or "").lower())


def _build_snippet(full_text: str, query: str, window: int = 220) -> str:
    txt = (full_text or "").replace("\n", " ")
    if not txt:
        return ""
    q_terms = [w for w in _tokenize(query) if len(w) > 2]
    if not q_terms:
        return txt[:window] + ("…" if len(txt) > window else "")
    # center around first matching term
    locs = []
    low = txt.lower()
    for t in q_terms:
        m = re.search(rf"\b{re.escape(t)}\b", low)
        if m:
            locs.append(m.start())
    if not locs:
        return txt[:window] + ("…" if len(txt) > window else "")
    pos = min(locs)
    half = window // 2
    start = max(0, pos - half)
    end = min(len(txt), start + window)
    snippet = txt[start:end]
    if start > 0:
        snippet = "…" + snippet
    if end < len(txt):
        snippet = snippet + "…"
    return snippet


def _mmr_rerank(cand_idx: List[int], query_vec: np.ndarray, emb_matrix: np.ndarray, k: int, alpha: float = 0.7) -> List[int]:
    """
    MMR over the candidate pool.
    cand_idx: list of GLOBAL row ids (len = C)
    emb_matrix: full [N, D] or already sliced; we’ll slice here for clarity
    Returns a list of GLOBAL row ids in selected order (length <= k).
    """
    C = len(cand_idx)
    if C == 0:
        return []
    if C == 1:
        return cand_idx[:1]

    # Build local matrix [C, D]
    M = emb_matrix[cand_idx].astype(np.float32)
    # Normalize
    M /= (np.linalg.norm(M, axis=1, keepdims=True) + 1e-9)
    Q = query_vec.astype(np.float32)
    Q /= (np.linalg.norm(Q) + 1e-9)

    sim_to_q = M @ Q  # [C]

    # Work strictly in LOCAL positions [0..C-1]
    remaining = list(range(C))
    selected_local: List[int] = []

    while remaining and len(selected_local) < k:
        if not selected_local:
            j_in_rem = int(np.argmax(sim_to_q[remaining]))
            selected_local.append(remaining.pop(j_in_rem))
            continue

        # Similarity to already selected
        S = M[selected_local] @ M.T              # [|S|, C]
        max_sim_to_S = np.max(S[:, remaining], axis=0)  # [|remaining|]
        mmr_scores = alpha * sim_to_q[remaining] - (1.0 - alpha) * max_sim_to_S
        j_in_rem = int(np.argmax(mmr_scores))
        selected_local.append(remaining.pop(j_in_rem))

    # Map back to GLOBAL row ids
    return [cand_idx[i] for i in selected_local]


def _chunk_text(text: str, target_tokens: int = 350, overlap_tokens: int = 50) -> List[str]:
    if not text:
        return []
    # split by our synthetic headings to keep sections cohesive
    sections = re.split(r"\n{2,}## ", text)
    sections = [s.strip() for s in sections if s and s.strip()]
    chunks: List[str] = []
    # token ≈ 0.75 words
    chunk_words = max(50, int(target_tokens / 0.75))
    overlap_words = int(overlap_tokens / 0.75)

    for sec in sections:
        words = sec.split()
        if not words:
            continue
        start = 0
        while start < len(words):
            end = min(len(words), start + chunk_words)
            ch = " ".join(words[start:end])
            if len(ch.split()) >= 12:  # avoid tiny fragments
                chunks.append(ch)
            if end == len(words):
                break
            start = max(end - overlap_words, start + 1)
    return chunks

def _index_links_impl(
    urls: List[str],
    index_dir: str = "index",
    embed_model: str = "text-embedding-ada-002",
    chunk_tokens: int = 350,
    overlap_tokens: int = 50,
    connect_timeout: int = 5,
    read_timeout: int = 20,
) -> IndexStats:
    """
    Implementación principal que realiza:
    1. Descarga y limpieza del contenido.
    2. Segmentación (chunking).
    3. Cálculo de embeddings.
    4. Escritura incremental en el índice persistente.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY must be set")
    client = OpenAI(api_key=api_key)

    idx = VectorIndex(index_dir=index_dir, embedding_model=embed_model)
    added = 0

    for u in urls:
        try:
            if not re.match(r"^https?://", u):
                print(f"[WARN] URL inválida: {u}")
                continue
            text, title = _fetch_text_and_title(u, timeout=(connect_timeout, read_timeout))
            chunks = _chunk_text(text, target_tokens=chunk_tokens, overlap_tokens=overlap_tokens)
            chunks = [c for c in chunks if len(c.split()) >= 5]
            if not chunks:
                 print(f"[index_links] No usable chunks for {u} (after filtering).")
                 continue
            added += idx.add_document(client, u, title, chunks)
        except Exception as e:
            print(f"[index_links] Error procesando {u}: {e}")

    stats = IndexStats(
        added_docs=added,
        total_docs=idx.get_total(),
        index_dir=index_dir,
        embedding_model=embed_model,
        bm25_enabled=USE_BM25,
    )

    os.makedirs(index_dir, exist_ok=True)
    _atomic_write(
        os.path.join(index_dir, "stats.json"),
        json.dumps(stats.model_dump(mode='json'), ensure_ascii=False, indent=2).encode("utf-8"),
    )
    return stats


def index_links(params: IndexLinksParams) -> IndexStats:
    """
    Wrapper JSON-schema-friendly (para usar como tool del agente).
    Convierte el modelo pydantic en argumentos posicionales seguros.
    """
    return _index_links_impl(**params.model_dump())


# ---------------- Embeddings + Index storage ----------------

class VectorIndex:
    """
    Persistent index for embeddings + metadata.

    Files (under index_dir):
      - meta.jsonl         (per-chunk metadata: {chunk_id,url,title,order,created_at})
      - embeddings.npy     (float32 matrix [N x D])
      - embeddings.npy.shape  (json sidecar: {"n": N, "d": D})
      - docstore.jsonl     (per-chunk text payload: {chunk_id,text})
      - bm25_tokens.jsonl  (optional; tokens list per chunk for BM25)
      - stats.json         (IndexStats)
    """
    KNOWN_DIMS = {
        "text-embedding-3-large": 3072,
        "text-embedding-3-small": 1536,
        "text-embedding-ada-002": 1536,
    }

    def __init__(self, index_dir: str = "index", embedding_model: str = "text-embedding-ada-002"):
        self.index_dir = index_dir
        self.meta_path = os.path.join(index_dir, "meta.jsonl")
        self.vec_path = os.path.join(index_dir, "embeddings.npy")
        self.shape_path = self.vec_path + ".shape"
        self.doc_path = os.path.join(index_dir, "docstore.jsonl")
        self.bm25_path = os.path.join(index_dir, "bm25_tokens.jsonl")
        self.stats_path = os.path.join(index_dir, "stats.json")

        self.embedding_model = embedding_model
        self._meta: List[dict] = list(_read_jsonl(self.meta_path))
        self._docs: Dict[str, str] = {d["chunk_id"]: d["text"] for d in _read_jsonl(self.doc_path)}
        self._bm25_tokens: Dict[str, List[str]] = (
            {d["chunk_id"]: d["tokens"] for d in _read_jsonl(self.bm25_path)} if USE_BM25 else {}
        )

        # --- Read stored metadata to reconcile model + dims ---
        stored_model = None
        stored_dim = None
        if os.path.exists(self.stats_path):
            try:
                _stats = json.loads(open(self.stats_path, "r", encoding="utf-8").read())
                stored_model = _stats.get("embedding_model")
            except Exception:
                pass

        if os.path.exists(self.shape_path):
            try:
                shp = json.loads(open(self.shape_path, "r", encoding="utf-8").read())
                stored_dim = int(shp.get("d")) if "d" in shp else None
            except Exception:
                pass

        # If index already exists with a different model, ADOPT the stored model to avoid mismatch.
        if stored_model and stored_model != self.embedding_model:
            print(
                f"[VectorIndex] WARNING: index at '{self.index_dir}' was built with '{stored_model}', "
                f"but '{self.embedding_model}' was requested. Adopting stored model."
            )
            self.embedding_model = stored_model

        # Set expected dimension for the (possibly adopted) model
        self._dim_expected: Optional[int] = self.KNOWN_DIMS.get(self.embedding_model)

        # If shape sidecar missing but vec file exists, infer N,D using expected dim
        self._emb: Optional[np.memmap] = None
        if os.path.exists(self.vec_path):
            n = d = None
            if os.path.exists(self.shape_path):
                try:
                    shp = json.loads(open(self.shape_path, "r", encoding="utf-8").read())
                    n, d = int(shp.get("n", 0)), int(shp.get("d", 0))
                except Exception:
                    n = d = None
            if not n or not d:
                if self._dim_expected:
                    size_bytes = os.path.getsize(self.vec_path)
                    if size_bytes > 0 and size_bytes % (4 * self._dim_expected) == 0:
                        n = size_bytes // (4 * self._dim_expected)
                        d = self._dim_expected
                        with open(self.shape_path, "w", encoding="utf-8") as f:
                            json.dump({"n": int(n), "d": int(d)}, f)
            if n and d:
                # Validate against expected (if both known)
                if stored_dim and self._dim_expected and stored_dim != self._dim_expected:
                    raise ValueError(
                        f"Embedding dimension mismatch: index has dim={stored_dim}, "
                        f"but model '{self.embedding_model}' expects {self._dim_expected}. "
                        f"Rebuild the index or remove '{self.index_dir}'."
                    )
                self._emb = np.memmap(self.vec_path, dtype=np.float32, mode="r", shape=(n, d))

        # Build BM25 index in-memory if enabled
        self._bm25 = None
        if USE_BM25 and self._bm25_tokens:
            corpus = [self._bm25_tokens[cid] for cid in self._iter_chunk_ids()]
            if corpus and BM25Okapi:
                self._bm25 = BM25Okapi(corpus)

    def _iter_chunk_ids(self) -> Iterable[str]:
        for m in self._meta:
            yield m["chunk_id"]

    def _append(self, records: List[dict], embeds: np.ndarray, docs: List[dict], bm25_tokens: Optional[List[dict]]) -> None:
        # Sanity
        if embeds is None or embeds.ndim != 2 or embeds.shape[0] == 0:
            return
        if len(records) != embeds.shape[0] or len(docs) != embeds.shape[0]:
            raise ValueError("records/docs count must match number of embedding rows")

        # Append JSONL
        _append_jsonl(self.meta_path, records)
        _append_jsonl(self.doc_path, docs)
        if USE_BM25 and bm25_tokens:
            _append_jsonl(self.bm25_path, bm25_tokens)

        shape_path = self.vec_path + ".shape"

        if self._emb is None:
            # First write
            _atomic_write(self.vec_path, embeds.astype(np.float32).tobytes())
            _atomic_write(shape_path, json.dumps({"n": int(embeds.shape[0]), "d": int(embeds.shape[1])}).encode("utf-8"))
            self._emb = np.memmap(self.vec_path, dtype=np.float32, mode="r", shape=(embeds.shape[0], embeds.shape[1]))
        else:
            # Concatenate to existing matrix
            if os.path.exists(shape_path):
                shp = json.loads(open(shape_path, "r", encoding="utf-8").read())
                n, d = int(shp["n"]), int(shp["d"])
            else:
                n, d = self._emb.shape

            if embeds.shape[1] != d:
                raise ValueError(f"Embedding dimension mismatch: index has {d}, new has {embeds.shape[1]}")

            new_all = np.empty((n + embeds.shape[0], d), dtype=np.float32)
            # read existing into RAM (memmap -> ndarray)
            new_all[:n] = np.array(self._emb, dtype=np.float32)
            new_all[n:] = embeds.astype(np.float32)

            _atomic_write(self.vec_path, new_all.tobytes())
            _atomic_write(shape_path, json.dumps({"n": int(new_all.shape[0]), "d": int(d)}).encode("utf-8"))
            self._emb = np.memmap(self.vec_path, dtype=np.float32, mode="r", shape=(new_all.shape[0], d))

        # Update in-memory structures
        self._meta.extend(records)
        self._docs.update({d["chunk_id"]: d["text"] for d in docs})
        if USE_BM25 and bm25_tokens:
            for rec in bm25_tokens:
                self._bm25_tokens[rec["chunk_id"]] = rec["tokens"]

        # Refresh BM25 if enabled
        if USE_BM25 and BM25Okapi:
            corpus = [self._bm25_tokens[cid] for cid in self._iter_chunk_ids()]
            self._bm25 = BM25Okapi(corpus) if corpus else None

        # Persist stats
        stats = IndexStats(
            added_docs=len(records),
            total_docs=len(self._meta),
            index_dir=self.index_dir,
            embedding_model=self.embedding_model,
            bm25_enabled=USE_BM25,
        )
        _atomic_write(self.stats_path, json.dumps(stats.model_dump(mode="json"), ensure_ascii=False, indent=2).encode("utf-8"))


    # ---------- Public APIs ----------

    def contains_url(self, url: str) -> bool:
        return any(m["url"] == url for m in self._meta)

    def get_total(self) -> int:
        return len(self._meta)

    def embed_texts(self, client: OpenAI, texts: List[str]) -> np.ndarray:
        if not texts:
            return np.empty((0, 0), dtype=np.float32)
        resp = client.embeddings.create(model=self.embedding_model, input=texts)
        data = getattr(resp, "data", None) or []
        if not data:
            return np.empty((0, 0), dtype=np.float32)
        vecs = np.array([d.embedding for d in data], dtype=np.float32)
        if vecs.ndim == 1:
            vecs = vecs.reshape(1, -1)

        # Validate against sidecar / known dims if available
        shape_path = self.vec_path + ".shape"
        if os.path.exists(shape_path):
            try:
                shp = json.loads(open(shape_path, "r", encoding="utf-8").read())
                d_expected = int(shp.get("d"))
                if vecs.shape[1] != d_expected:
                    raise ValueError(f"Embedding dimension mismatch at embed time: got {vecs.shape[1]} vs index {d_expected}")
            except Exception:
                pass
        return vecs

    def add_document(self, client: OpenAI, url: str, title: Optional[str], chunks: List[str]) -> int:
        """
        Add a single document (split into chunks) to the index.
        Idempotent at chunk level via stable chunk_id hash.
        """
        if not chunks:
            return 0
        
        STOP = set("""
                the a an and or of for to in on by with without as at from into over under within between among across
                main menu navigation contents donate help portal page pages login logout edit talk search jump sidebar
                """.split())

        records, texts, bm25_tok = [], [], []
        for i, ch in enumerate(chunks):
            chunk_id = _hash_id(url, str(i), ch[:64])
            if any(m["chunk_id"] == chunk_id for m in self._meta):
                continue
            records.append({
                "chunk_id": chunk_id,
                "url": url,
                "title": title,
                "order": i,
                "created_at": dt.datetime.now(dt.timezone.utc).isoformat(),
            })
            texts.append(ch)
            if USE_BM25:
                raw = re.findall(r"[A-Za-z0-9_]+", ch.lower())
                tokens = [w for w in raw if len(w) > 2 and w not in STOP]
                bm25_tok.append({"chunk_id": chunk_id, "tokens": tokens})

        if not records:
            return 0

        embeds = self.embed_texts(client, texts)
        if embeds.ndim != 2 or embeds.shape[0] == 0:
            return 0

        docs = [{"chunk_id": r["chunk_id"], "text": t} for r, t in zip(records, texts)]
        self._append(records, embeds, docs, bm25_tok if USE_BM25 else None)
        return len(records)

    def _semantic_scores(self, query_vec: np.ndarray, top_k: int = 12) -> List[Tuple[int, float]]:
        if self._emb is None or self._emb.shape[0] == 0:
            return []
        M = np.array(self._emb, dtype=np.float32)
        q = query_vec / (np.linalg.norm(query_vec) + 1e-9)
        M_norm = M / (np.linalg.norm(M, axis=1, keepdims=True) + 1e-9)
        sims = (M_norm @ q.T).squeeze()
        idx = np.argpartition(-sims, kth=min(top_k, len(sims)-1))[:top_k]
        return sorted([(int(i), float(sims[int(i)])) for i in idx], key=lambda x: -x[1])

    def _bm25_scores(self, query: str, top_k: int = 12) -> List[Tuple[int, float]]:
        if not (USE_BM25 and self._bm25):
            return []
        q_tokens = re.findall(r"[A-Za-z0-9_]+", query.lower())
        scores = self._bm25.get_scores(q_tokens)  # type: ignore
        if len(scores) == 0:
            return []
        idx = np.argpartition(-scores, kth=min(top_k, len(scores)-1))[:top_k]
        return sorted([(int(i), float(scores[int(i)])) for i in idx], key=lambda x: -x[1])

    def search(self, client: OpenAI, query: str, k: int = 8) -> List[SearchHit]:
        """
        Hybrid search (semantic + BM25 if available) with RRF + optional lexical gate,
        MMR rerank, and query-centered snippets. Returns top-k hits.
        """
    # 0) guards
        if self._emb is None or self._emb.shape[0] == 0:
            return []

        # 1) query embedding
        q_vec = self.embed_texts(client, [query])
        if q_vec is None or getattr(q_vec, "ndim", 0) != 2 or q_vec.shape[0] == 0:
            return []

        # 2) candidates (semantic + lexical)
        pool_size = max(k * 5, 40)
        sem_top = self._semantic_scores(q_vec[0], top_k=pool_size)  # [(row_idx, sim)]
        lex_top = self._bm25_scores(query, top_k=pool_size) if USE_BM25 else []

        # 3) RRF fusion
        scores: Dict[int, float] = {}
        def fuse(items: List[Tuple[int, float]]) -> None:
            for rank, (row_idx, _) in enumerate(items, start=1):
                scores[row_idx] = scores.get(row_idx, 0.0) + 1.0 / (60.0 + rank)
        fuse(sem_top)
        fuse(lex_top)

        if not scores:
            return []

        # 4) lexical gate (general): keep if any token overlap; if empty, fall back
        q_tokens = set(_tokenize(query))
        def _has_overlap(row_idx: int) -> bool:
            # use precomputed tokens if available; else from text
            cid = self._meta[row_idx]["chunk_id"]
            toks = set(self._bm25_tokens.get(cid, [])) if USE_BM25 else set()
            if not toks:
                toks = set(_tokenize(self._docs.get(cid, "")))
            return len(q_tokens & toks) > 0

        filtered_scores = {i: s for i, s in scores.items() if _has_overlap(i)}
        if not filtered_scores:
            filtered_scores = scores  # don’t drop everything

        # 5) candidate pool sorted by fused score
        pool = sorted(filtered_scores.items(), key=lambda x: -x[1])[:max(k * 3, 24)]
        cand_idx = [i for i, _ in pool]

        # 6) MMR rerank (fallback to fused order if anything goes wrong)
        try:
            M = np.array(self._emb, dtype=np.float32)
            mmr_ordered = _mmr_rerank(cand_idx, q_vec[0], M, k=k)  # returns list[int]
        except Exception:
            mmr_ordered = cand_idx[:k]

        # 7) Build hits with query-centered snippets
        hits: List[SearchHit] = []
        for row_idx in mmr_ordered[:k]:
            if row_idx < 0 or row_idx >= len(self._meta):
                continue
            meta = self._meta[row_idx]
            chunk_id = meta["chunk_id"]
            text = self._docs.get(chunk_id, "")
            snippet = _build_snippet(text, query)
            # score = fused score for transparency (not strictly needed for ranking now)
            fused_score = float(filtered_scores.get(row_idx, 0.0))
            hits.append(
                SearchHit(
                    url=meta["url"],
                    title=meta.get("title"),
                    chunk_id=chunk_id,
                    score=fused_score,
                    snippet=snippet,
                )
            )
        return hits



# ---------------- Tools: fetch_content, save_summary, index_links, search_links and answer_question ----------------

_SUMMARY_KEYS_SEEN: set[tuple[str, str, str]] = set()  # (url, style, saved_path)

def fetch_content(url: HttpUrl) -> FetchOutput:
    """Fetch full text from a webpage."""
    text, title = _fetch_text_and_title(str(url))
    return FetchOutput(url=url, title=title, text=text)

def save_summary(
    url: HttpUrl,
    style: Literal["bullet", "exec"] = "bullet",
    out_dir: str = "summaries",
    model_name: Optional[str] = None,
) -> SummaryOutput:
    """
    One-shot: fetch full text from URL, summarize with OpenAI, save to a single JSON file
    named {slug}_summary.json under `out_dir`, and return the saved path.
    """
    # Fetch content
    text, title = _fetch_text_and_title(str(url))

    # Compose filename
    slug = _slug_from_url(str(url))
    os.makedirs(out_dir or ".", exist_ok=True)
    saved_path = os.path.join(out_dir, f"{slug}_summary.json")

    key = (str(url), style, saved_path)
    if key in _SUMMARY_KEYS_SEEN:
        return SummaryOutput(url=url, title=title, summary="(already summarized in this run)", saved_to=saved_path)

    # OpenAI client
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY must be set")
    client = OpenAI(api_key=api_key)
    model = model_name or os.getenv("AGENT_MODEL", "gpt-4o-mini")

    style_instr = (
        "Return 5–8 concise bullets with key facts, numbers, and dates. End with a one-sentence takeaway."
        if style == "bullet"
        else "Write a tight 2–3 paragraph executive summary followed by 3 concrete action items."
    )

    max_chars = 120_000
    body = text if len(text) <= max_chars else (text[:max_chars] + "\n...[truncated]")

    prompt = f"Title: {title or 'N/A'}\nURL: {url}\n\n{style_instr}\n\nCONTENT:\n{body}"

    resp = client.chat.completions.create(
        model=model,
        temperature=0.2,
        max_tokens=500,
        messages=[
            {"role": "system", "content": "You are a precise summarizer. Stay faithful to the source."},
            {"role": "user", "content": prompt},
        ],
    )
    summary = (resp.choices[0].message.content or "").strip()

    record = {
        "url": str(url),
        "title": title,
        "summary": summary,
        "model": model,
        "created_at": dt.datetime.now(dt.timezone.utc).isoformat(),
        "saved_to": saved_path,
    }
    with open(saved_path, "w", encoding="utf-8") as f:
        json.dump(record, f, ensure_ascii=False, indent=2)

    _SUMMARY_KEYS_SEEN.add(key)
    return SummaryOutput(url=url, title=title, summary=summary, saved_to=saved_path)


def index_links(params: IndexLinksParams) -> IndexStats:
    return _index_links_impl(**params.model_dump())

def search_links(
    query: str,
    index_dir: str = "index",
    embed_model: str = "text-embedding-ada-002",
    k: int = 8,
) -> List[SearchHit]:
    """
    Search the link index and return top-k chunk hits with snippets and scores.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY must be set")
    client = OpenAI(api_key=api_key)

    idx = VectorIndex(index_dir=index_dir, embedding_model=embed_model)
    return idx.search(client, query, k=k)

def answer_question(
    question: str,
    index_dir: str = "index",
    gen_model: str = None,
    embed_model: str = "text-embedding-ada-002",
    k: int = 8,
    max_context_chars: int = 18_000,
) -> AnswerOutput:
    """
    RAG-style answer grounded on the indexed links, with lightweight citations.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY must be set")
    client = OpenAI(api_key=api_key)
    gen_model = gen_model or os.getenv("AGENT_MODEL", "gpt-4o-mini")

    idx = VectorIndex(index_dir=index_dir, embedding_model=embed_model)
    hits = idx.search(client, question, k=k)
    if not hits:
        return AnswerOutput(
            question=question,
            answer="I couldn't find anything relevant in the current link index.",
            citations=[],
            used_model=gen_model,
        )

    # Build context (trim to max_context_chars)
    ctx_blocks = []
    citations = []
    used_chars = 0
    for i, h in enumerate(hits, start=1):
        block = f"[{i}] Title: {h.title or 'N/A'}\nURL: {h.url}\nCHUNK_ID: {h.chunk_id}\nSNIPPET:\n{h.snippet}\n"
        if used_chars + len(block) > max_context_chars:
            break
        ctx_blocks.append(block)
        citations.append({"url": str(h.url), "title": h.title or "N/A", "chunk_id": h.chunk_id})
        used_chars += len(block)

    system = (
        "You are a careful research assistant. Answer ONLY using the provided context. "
        "If the context is insufficient, say so. Include bracketed citation numbers [1], [2] inline "
        "corresponding to the provided context blocks."
    )
    user = f"QUESTION:\n{question}\n\nCONTEXT:\n" + "\n---\n".join(ctx_blocks)

    resp = client.chat.completions.create(
        model=gen_model,
        temperature=0.2,
        max_tokens=600,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
    )
    answer = (resp.choices[0].message.content or "").strip()
    return AnswerOutput(
        question=question,
        answer=answer,
        citations=citations,
        used_model=gen_model,
    )

def answer_question_debug(
    question: str,
    index_dir: str = "index",
    gen_model: str = None,
    embed_model: str = "text-embedding-ada-002",
    k: int = 8,
    max_context_chars: int = 18_000,
):
    """Same as answer_question but also returns the retrieved hits."""
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY must be set")
    client = OpenAI(api_key=api_key)
    gen_model = gen_model or os.getenv("AGENT_MODEL", "gpt-4o-mini")

    idx = VectorIndex(index_dir=index_dir, embedding_model=embed_model)
    hits = idx.search(client, question, k=k)

    if not hits:
        return AnswerOutput(
            question=question,
            answer="I couldn't find anything relevant in the current link index.",
            citations=[],
            used_model=gen_model,
        ), []

    # same logic as before
    ctx_blocks, citations, used_chars = [], [], 0
    for i, h in enumerate(hits, start=1):
        block = f"[{i}] Title: {h.title or 'N/A'}\nURL: {h.url}\nCHUNK_ID: {h.chunk_id}\nSNIPPET:\n{h.snippet}\n"
        if used_chars + len(block) > max_context_chars:
            break
        ctx_blocks.append(block)
        citations.append({"url": str(h.url), "title": h.title or "N/A", "chunk_id": h.chunk_id})
        used_chars += len(block)

    system = (
        "You are a careful research assistant. Answer ONLY using the provided context. "
        "If the context is insufficient, say so. Include bracketed citation numbers [1], [2] inline "
        "corresponding to the provided context blocks."
    )
    user = f"QUESTION:\n{question}\n\nCONTEXT:\n" + "\n---\n".join(ctx_blocks)

    resp = client.chat.completions.create(
        model=gen_model,
        temperature=0.2,
        max_tokens=600,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
    )
    answer = (resp.choices[0].message.content or "").strip()
    return AnswerOutput(
        question=question,
        answer=answer,
        citations=citations,
        used_model=gen_model,
    ), hits

# ---------------- Agent wiring ----------------

SYSTEM_PROMPT = """
You have tools to (a) fetch/summarize a single URL and (b) build/search an index of links and answer questions.

Tools:
- fetch_content(url)
- save_summary(url, style?, out_dir?, model_name?)
- index_links(params)  # params: {urls, index_dir, embed_model, chunk_tokens, overlap_tokens, connect_timeout, read_timeout}
- search_links(query, index_dir?, embed_model?, k?)
- answer_question_debug(question, index_dir?, gen_model?, embed_model?, k?, max_context_chars?)

Behavioral rules:
• If the user asks to fetch AND save a summary in one request, call ONLY save_summary ONCE.
• Never call save_summary more than once per user request.
• For questions about already-indexed links, call answer_question.
• If the user provides new links to add, call index_links first, then answer_question if they asked a question.
• Always surface citations (URLs) in answers that rely on the index.
• Be precise and avoid hallucinations; if the context lacks an answer, say so.
""".strip()

agent = Agent(
    model=os.getenv("AGENT_MODEL", "gpt-4o-mini"),
    tools=[fetch_content, save_summary, index_links, search_links, answer_question_debug],
    system_prompt=SYSTEM_PROMPT,
)

In [80]:
import shutil; shutil.rmtree("index", ignore_errors=True)

stats = index_links(IndexLinksParams(
    urls=[
        "https://en.wikipedia.org/wiki/Capybara",
        "https://en.wikipedia.org/wiki/Lesser_capybara",
        "https://en.wikipedia.org/wiki/Hydrochoerus",
        "https://en.wikipedia.org/wiki/Neochoerus",
        "https://en.wikipedia.org/wiki/Caviodon",
        "https://en.wikipedia.org/wiki/Neochoerus_aesopi",
    ],
    index_dir="index",
    embed_model="text-embedding-ada-002",  # keep consistent
))
print(stats)


added_docs=41 total_docs=41 index_dir='index' embedding_model='text-embedding-ada-002' bm25_enabled=True updated_at=datetime.datetime(2025, 11, 1, 22, 31, 20, 464084, tzinfo=datetime.timezone.utc)


In [81]:
question = "What are threats to capybara populations?"
ans, hits = answer_question_debug(question,
                                  index_dir="index",
                                  embed_model="text-embedding-ada-002",
                                  k=5)

print("\n=== ANSWER ===")
print(ans.answer)
print("\n=== TOP HITS ===")
for h in hits:
    print(round(h.score, 3), "|", h.title, "|", h.url)
    print(h.snippet[:300] + "…")
    print("---")


=== ANSWER ===
The threats to capybara populations include hunting, which has reduced their numbers in some areas. However, overall, capybaras are not considered a threatened species, and their population is stable throughout most of their South American range [1].

=== TOP HITS ===
0.033 | Capybara | https://en.wikipedia.org/wiki/Capybara
Conservation and human interaction Capybaras are not considered a threatened species; [ 1 ] their population is stable throughout most of their South American range, though in some areas hunting has reduced their numbers……
---
0.029 | Lesser capybara | https://en.wikipedia.org/wiki/Lesser_capybara
…/IUCN.UK.2016-2.RLTS.T136277A22189896.en . Retrieved 19 November 2021 . [2] canopytower (2021-03-12). "Lesser Capybara | The Canopy Family" . Retrieved 2024-10-21 . [3] Schmidt, Amanda (2023-08-14). "Capybara Fact Sheet ……
---
0.027 | Hydrochoerus | https://en.wikipedia.org/wiki/Hydrochoerus
Behavior Capybaras are highly social, living in groups of up to 

In [71]:
hits

[]

In [None]:
URLS = [
  "https://en.wikipedia.org/wiki/Capybara",
  "https://en.wikipedia.org/wiki/Lesser_capybara",
  "https://en.wikipedia.org/wiki/Hydrochoerus",
  "https://en.wikipedia.org/wiki/Neochoerus",
  "https://en.wikipedia.org/wiki/Caviodon",
  "https://en.wikipedia.org/wiki/Neochoerus_aesopi",
]
EMBED = "text-embedding-ada-002"
NEW_DIR = "index_ada"

stats = index_links(IndexLinksParams(urls=URLS, index_dir=NEW_DIR, embed_model=EMBED))
print(stats)



added_docs=24 total_docs=24 index_dir='index_ada' embedding_model='text-embedding-ada-002' bm25_enabled=True updated_at=datetime.datetime(2025, 11, 1, 20, 59, 14, 299471, tzinfo=datetime.timezone.utc)


In [55]:
q = "What are threats to capybara populations?"
ans, hits = answer_question_debug(q, index_dir="index", embed_model="text-embedding-ada-002", k=5)

print("=== ANSWER ===")
print(ans.answer)
print("\n=== SEARCH HITS ===")
for h in hits:
    print(f"{round(h.score, 3)} | {h.title} | {h.url}")
    print(h.snippet[:300] + "…")
    print("---")


=== ANSWER ===
I couldn't find anything relevant in the current link index.

=== SEARCH HITS ===
