# Multiagente Extractor (PDF/Imagem)
Este módulo cria:
- Tools de extração (PDF texto, PDF OCR, Imagem OCR)
- Heurística para detectar PDF "scanned" (sem texto real)
- 3 agentes:
  - RouterAgent: decide o caminho (extensão + scanned)
  - TextPDF_Agent: tenta extrair texto do PDF
  - OCR_Agent: faz OCR (PDF scanned ou imagem)
- 1 função `run_extraction(path)` para usar como pipeline


In [1]:
# Célula 1 - Inicialização
import os, json, time
from pathlib import Path
from typing import Dict, Any, List

DADOS_DIR = Path(r"C:\Users\fepac\Unicamp_Project\dados")
assert DADOS_DIR.exists(), f"Pasta não existe: {DADOS_DIR}"
print("OK:", DADOS_DIR)

OUT_DIR = DADOS_DIR / "out_extract_multi_agent"
OUT_DIR.mkdir(parents=True, exist_ok=True)

TRACE_JSONL = OUT_DIR / "trace_log.jsonl"
TRACE_MD    = OUT_DIR / "trace_pretty.md"

# limpa logs
for p in [TRACE_JSONL, TRACE_MD]:
    if p.exists():
        p.unlink()

def trace_event(event: Dict[str, Any]) -> None:
    event["ts"] = time.strftime("%Y-%m-%d %H:%M:%S")
    TRACE_JSONL.open("a", encoding="utf-8").write(json.dumps(event, ensure_ascii=False) + "\n")

def trace_md(line: str) -> None:
    TRACE_MD.open("a", encoding="utf-8").write(line.rstrip() + "\n")

print("OK: OUT_DIR =", OUT_DIR)
print("OK: TRACE_JSONL =", TRACE_JSONL)
print("OK: TRACE_MD =", TRACE_MD)


OK: C:\Users\fepac\Unicamp_Project\dados
OK: OUT_DIR = C:\Users\fepac\Unicamp_Project\dados\out_extract_multi_agent
OK: TRACE_JSONL = C:\Users\fepac\Unicamp_Project\dados\out_extract_multi_agent\trace_log.jsonl
OK: TRACE_MD = C:\Users\fepac\Unicamp_Project\dados\out_extract_multi_agent\trace_pretty.md


In [None]:
# Célula 2 - Helpers (ToolResponse + _resp_text + list/build + batch runner)
import time
from pathlib import Path
from typing import Dict, Any, List

from agentscope.message import Msg
from agentscope.tool import ToolResponse

SUPPORTED_IMAGES = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tif", ".tiff"}
SUPPORTED_PDFS   = {".pdf"}
SUPPORTED_EXTS   = SUPPORTED_IMAGES | SUPPORTED_PDFS

def _resp_text(text: str, **meta) -> ToolResponse:
    return ToolResponse(
        content=[{"type": "text", "text": text}],
        metadata=meta or None,
        stream=False,
        is_last=True,
        is_interrupted=False,
        id=None,
    )

def list_supported_files(data_dir: Path) -> list[Path]:
    return sorted([p for p in data_dir.rglob("*") if p.is_file() and p.suffix.lower() in SUPPORTED_EXTS])

def build_extract_msg(file_path: Path, lang: str = "por") -> Msg:
    return Msg(
        name="user",
        role="user",
        content=(
            "Extraia texto do arquivo.\n"
            f"file_path={str(file_path)}\n"
            f"lang={lang}\n"
            "Retorne APENAS o texto extraído (sem inventar conteúdo)."
        ),
    )


# Router agora será um ORQUESTRADOR Python 

async def run_folder_extraction(
    data_dir: Path,
    out_dir: Path,
    lang: str = "por",
    limit: int = 0,
) -> Path:
    out_dir.mkdir(parents=True, exist_ok=True)

    files = list_supported_files(data_dir)
    if limit and limit > 0:
        files = files[:limit]

    print(f"Arquivos suportados encontrados: {len(files)}")
    for p in files[:10]:
        print(" -", p.name)
    if len(files) > 10:
        print(" ...")

    log_rows = ["file,status,output_or_error"]

    for p in files:
        print("\n=== EXTRAINDO:", p.name, "===")
        try:
            text = await extract_one_with_router(p, lang=lang)  # função na Célula 8

            out_txt = out_dir / f"{p.stem}.txt"
            out_txt.write_text(text, encoding="utf-8", errors="ignore")

            log_rows.append(f"{p.name},OK,{out_txt}")
            print("OK ->", out_txt)
        except Exception as e:
            log_rows.append(f"{p.name},ERR,{str(e).replace(',', ';')}")
            print("ERRO:", e)

    log_path = out_dir / "batch_log.csv"
    log_path.write_text("\n".join(log_rows), encoding="utf-8", errors="ignore")
    print("\nLOG salvo em:", log_path)
    return log_path


In [None]:
# Célula 3 - Model
import os
from dotenv import load_dotenv
from agentscope.model import OpenAIChatModel

ENV_PATH = r"C:\Users\fepac\Unicamp_Project\.env"
load_dotenv(ENV_PATH, override=True)

GROQ_API_KEY  = os.getenv("GROQ_API_KEY")
GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1")
GROQ_MODEL    = os.getenv("GROQ_MODEL", "llama-3.3-70b-versatile")

assert GROQ_API_KEY, "GROQ_API_KEY não carregou do .env"
assert GROQ_BASE_URL, "GROQ_BASE_URL não carregou do .env"

model = OpenAIChatModel(
    model_name=GROQ_MODEL,
    api_key=GROQ_API_KEY,
    client_kwargs={"base_url": GROQ_BASE_URL},  # <<< FIX DEFINITIVO
    stream=True,
)

print("OK: model criado")
print("GROQ_BASE_URL:", GROQ_BASE_URL)
print("GROQ_MODEL:", GROQ_MODEL)
print("GROQ_API_KEY prefix:", GROQ_API_KEY[:4], "len:", len(GROQ_API_KEY))


OK: model criado
GROQ_BASE_URL: https://api.groq.com/openai/v1
GROQ_MODEL: llama-3.3-70b-versatile
GROQ_API_KEY prefix: gsk_ len: 56


In [None]:
# Célula 4 - Tools PDF 
from pathlib import Path
import json
import pdfplumber
from agentscope.tool import ToolResponse
from agentscope.tool import Toolkit

# Se toolkit já existir (executou a célula antes), reaproveita
try:
    toolkit
except NameError:
    toolkit = Toolkit()

def register_once(fn):
    """Evita duplicar tool quando a célula roda mais de uma vez no Jupyter."""
    name = fn.__name__
    if hasattr(toolkit, "tools") and name in toolkit.tools:
        return
    toolkit.register_tool_function(fn)

def pdf_is_scanned(pdf_path: str) -> ToolResponse:
    stats = _pdf_text_stats(pdf_path, max_pages=3)
    trace_event({"type": "tool", "name": "pdf_is_scanned", "file": pdf_path, "stats": stats})
    trace_md(
        f"### TOOL pdf_is_scanned | {Path(pdf_path).name}\n"
        f"```json\n{json.dumps(stats, ensure_ascii=False, indent=2)}\n```\n"
    )
    return _resp_text(json.dumps(stats, ensure_ascii=False), stats=stats, success=True)

def extract_pdf_text(pdf_path: str) -> ToolResponse:
    p = Path(pdf_path)
    if not p.exists():
        trace_event({"type": "tool", "name": "extract_pdf_text", "file": pdf_path, "success": False, "error": "not_found"})
        trace_md(f"### TOOL extract_pdf_text | {p.name}\n- ERROR: not_found\n")
        return _resp_text("Arquivo não existe.", success=False)

    texts = []
    with pdfplumber.open(str(p)) as pdf:
        for page in pdf.pages:
            t = page.extract_text() or ""
            if t.strip():
                texts.append(t)

    out = "\n\n".join(texts).strip()
    ok = bool(out)

    trace_event({
        "type": "tool",
        "name": "extract_pdf_text",
        "file": pdf_path,
        "success": ok,
        "chars": len(out),
        "preview": out[:1200],
    })
    trace_md(
        f"### TOOL extract_pdf_text | {p.name}\n"
        f"- success: {ok}\n"
        f"- chars: {len(out)}\n"
        f"```text\n{out[:6000]}\n```\n"
    )

    if not out:
        return _resp_text("Nenhum texto extraído via pdfplumber.", success=False)
    return _resp_text(out, success=True)

register_once(pdf_is_scanned)
register_once(extract_pdf_text)

print("OK: tools PDF registradas (sem duplicar)")


OK: tools PDF registradas (sem duplicar)


In [None]:
# Célula 5 - Tools OCR 
from pathlib import Path
from PIL import Image
import urllib.request

import pytesseract
from pdf2image import convert_from_path

TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"  # ou None
TESSDATA_DIR  = Path(r"C:\Program Files\Tesseract-OCR\tessdata")
DEFAULT_LANG  = "por"

if TESSERACT_CMD and Path(TESSERACT_CMD).exists():
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

def _ensure_tess_lang(lang: str, tessdata_dir: Path = TESSDATA_DIR) -> str:
    tessdata_dir.mkdir(parents=True, exist_ok=True)
    trained = tessdata_dir / f"{lang}.traineddata"
    if trained.exists():
        return lang

    if lang == "por":
        url = "https://github.com/UB-Mannheim/tesseract/raw/main/tessdata/por.traineddata"
        try:
            trace_event({"type": "tess_lang_download_start", "lang": "por", "dest": str(trained)})
            urllib.request.urlretrieve(url, str(trained))
            trace_event({"type": "tess_lang_download_end", "lang": "por", "dest": str(trained), "success": True})
            return "por"
        except Exception as e:
            trace_event({"type": "tess_lang_download_end", "lang": "por", "dest": str(trained), "success": False, "error": str(e)})
            return "eng"

    return "eng"

def ocr_image(image_path: str, lang: str = DEFAULT_LANG, psm: int = 6, oem: int = 3) -> ToolResponse:
    p = Path(image_path)
    if not p.exists():
        trace_event({"type": "tool", "name": "ocr_image", "file": image_path, "success": False, "error": "not_found"})
        trace_md(f"### TOOL ocr_image | {p.name}\n- ERROR: not_found\n")
        return _resp_text("Arquivo não existe.", success=False)

    lang_ok = _ensure_tess_lang(lang)
    cfg = f"--oem {oem} --psm {psm}"

    try:
        img = Image.open(str(p)).convert("RGB")
        text = pytesseract.image_to_string(img, lang=lang_ok, config=cfg).strip()
        ok = bool(text)

        trace_event({"type": "tool", "name": "ocr_image", "file": image_path, "success": ok, "lang_used": lang_ok, "chars": len(text), "preview": text[:1200]})
        trace_md(
            f"### TOOL ocr_image | {p.name}\n"
            f"- success: {ok}\n- lang_used: {lang_ok}\n- chars: {len(text)}\n"
            f"```text\n{text[:6000]}\n```\n"
        )

        if not text:
            return _resp_text(f"OCR não retornou texto (lang={lang_ok}).", success=False, lang_used=lang_ok)
        return _resp_text(text, success=True, lang_used=lang_ok)

    except Exception as e:
        trace_event({"type": "tool", "name": "ocr_image", "file": image_path, "success": False, "error": str(e)})
        trace_md(f"### TOOL ocr_image | {p.name}\n- ERROR: {e}\n")
        return _resp_text(f"Erro no OCR da imagem: {e}", success=False, lang_used=lang_ok)

def ocr_pdf(pdf_path: str, lang: str = DEFAULT_LANG, first_n_pages: int = 5, dpi: int = 250, psm: int = 6, oem: int = 3) -> ToolResponse:
    p = Path(pdf_path)
    if not p.exists():
        trace_event({"type": "tool", "name": "ocr_pdf", "file": pdf_path, "success": False, "error": "not_found"})
        trace_md(f"### TOOL ocr_pdf | {p.name}\n- ERROR: not_found\n")
        return _resp_text("Arquivo não existe.", success=False)

    lang_ok = _ensure_tess_lang(lang)
    cfg = f"--oem {oem} --psm {psm}"

    try:
        pages = convert_from_path(str(p), dpi=dpi, first_page=1, last_page=first_n_pages)
    except Exception as e:
        trace_event({"type": "tool", "name": "ocr_pdf", "file": pdf_path, "success": False, "error": f"pdf2image: {e}"})
        trace_md(f"### TOOL ocr_pdf | {p.name}\n- ERROR: pdf2image failed\n- detail: {e}\n")
        return _resp_text(
            "Falha ao converter PDF em imagens (pdf2image). Instale Poppler e adicione o bin ao PATH. "
            f"Detalhe: {e}",
            success=False,
        )

    chunks = []
    try:
        for i, pil_img in enumerate(pages, start=1):
            txt = pytesseract.image_to_string(pil_img.convert("RGB"), lang=lang_ok, config=cfg).strip()
            if txt:
                chunks.append(f"--- page {i} ---\n{txt}")

        out = "\n\n".join(chunks).strip()
        ok = bool(out)

        trace_event({"type": "tool", "name": "ocr_pdf", "file": pdf_path, "success": ok, "lang_used": lang_ok, "pages": len(pages), "chars": len(out), "preview": out[:1200]})
        trace_md(
            f"### TOOL ocr_pdf | {p.name}\n"
            f"- success: {ok}\n- lang_used: {lang_ok}\n- pages: {len(pages)}\n- chars: {len(out)}\n"
            f"```text\n{out[:6000]}\n```\n"
        )

        if not out:
            return _resp_text(f"OCR no PDF não retornou texto (lang={lang_ok}).", success=False, lang_used=lang_ok)
        return _resp_text(out, success=True, lang_used=lang_ok)

    except Exception as e:
        trace_event({"type": "tool", "name": "ocr_pdf", "file": pdf_path, "success": False, "error": str(e)})
        trace_md(f"### TOOL ocr_pdf | {p.name}\n- ERROR: {e}\n")
        return _resp_text(f"Erro no OCR do PDF: {e}", success=False, lang_used=lang_ok)

# registra SEM duplicar
register_once(ocr_image)
register_once(ocr_pdf)

print("OK: tools OCR registradas (sem duplicar)")


OK: tools OCR registradas (sem duplicar)


In [None]:
# Célula 6 - Agentes separados + Router
from agentscope.agent import ReActAgent
from agentscope.memory import InMemoryMemory
from agentscope.formatter import OpenAIChatFormatter
from agentscope.message import Msg

formatter = OpenAIChatFormatter()

# --- Tool: trace_decision (sem duplicar) ---
def trace_decision(file_path: str, agent: str, decision: str, extra: str = "") -> ToolResponse:
    trace_event({"type": "agent_decision", "agent": agent, "file": file_path, "decision": decision, "extra": extra})
    trace_md(
        f"## decision | {Path(file_path).name}\n"
        f"- agent: {agent}\n- decision: {decision}\n- extra: {extra}\n"
    )
    return _resp_text("decision logged", success=True)

register_once(trace_decision)

# --------------------------
# Agent 1: Imagem -> OCR
# --------------------------
ImageAgent = ReActAgent(
    name="ImageAgent",
    sys_prompt=(
        "Você extrai texto de IMAGENS.\n"
        "Sempre chame a tool ocr_image(image_path, lang).\n"
        "Retorne APENAS o texto extraído."
    ),
    model=model,
    formatter=formatter,
    memory=InMemoryMemory(),
    toolkit=toolkit,
    max_iters=8,
)

# --------------------------
# Agent 2: PDF com texto -> pdfplumber
# --------------------------
PDFTextAgent = ReActAgent(
    name="PDFTextAgent",
    sys_prompt=(
        "Você extrai texto de PDFs com TEXTO EMBUTIDO.\n"
        "Sempre chame extract_pdf_text(pdf_path).\n"
        "Retorne APENAS o texto extraído (ou vazio se falhar)."
    ),
    model=model,
    formatter=formatter,
    memory=InMemoryMemory(),
    toolkit=toolkit,
    max_iters=8,
)

# --------------------------
# Agent 3: PDF scanned -> OCR
# --------------------------
PDFOCRAgent = ReActAgent(
    name="PDFOCRAgent",
    sys_prompt=(
        "Você extrai texto de PDFs SCANNED.\n"
        "Sempre chame ocr_pdf(pdf_path, lang, first_n_pages=5).\n"
        "Retorne APENAS o texto extraído (ou vazio se falhar)."
    ),
    model=model,
    formatter=formatter,
    memory=InMemoryMemory(),
    toolkit=toolkit,
    max_iters=10,
)

# --------------------------
# TOOL: call_agent (para a LLM poder 'delegar' chamando tool)
# --------------------------
async def _call_agent_impl(agent_name: str, file_path: str, lang: str = "por") -> str:
    if agent_name == "ImageAgent":
        msg = Msg("user", f"image_path={file_path}\nlang={lang}\nExtraia e retorne só o texto.", role="user")
        rep = await ImageAgent.reply(msg)
        return rep.get_text_content()
    if agent_name == "PDFTextAgent":
        msg = Msg("user", f"pdf_path={file_path}\nExtraia e retorne só o texto.", role="user")
        rep = await PDFTextAgent.reply(msg)
        return rep.get_text_content()
    if agent_name == "PDFOCRAgent":
        msg = Msg("user", f"pdf_path={file_path}\nlang={lang}\nExtraia e retorne só o texto.", role="user")
        rep = await PDFOCRAgent.reply(msg)
        return rep.get_text_content()
    return ""

def call_agent(agent_name: str, file_path: str, lang: str = "por") -> ToolResponse:

    import asyncio
    loop = asyncio.get_running_loop()
    t0 = time.time()

    trace_event({"type": "agent_delegate_start", "by": "RouterAgent", "to": agent_name, "file": file_path})
    trace_md(f"### DELEGATE RouterAgent -> {agent_name} | {Path(file_path).name}\n")

    text = loop.run_until_complete(_call_agent_impl(agent_name, file_path, lang=lang))
    elapsed = round(time.time() - t0, 3)

    trace_event({"type": "agent_delegate_end", "by": "RouterAgent", "to": agent_name, "file": file_path, "elapsed_s": elapsed, "chars": len(text), "preview": text[:1200]})
    trace_md(
        f"### DELEGATE END | {agent_name} | elapsed={elapsed}s chars={len(text)}\n"
        f"```text\n{text[:6000]}\n```\n"
    )
    return _resp_text(text, success=True, agent=agent_name, elapsed_s=elapsed)

register_once(call_agent)

# --------------------------
# Agent 4: Router (LLM orquestra tools e delegações via tool)
# --------------------------
RouterAgent = ReActAgent(
    name="RouterAgent",
    sys_prompt=(
        "Você é um ORQUESTRADOR.\n"
        "Entrada sempre vem como linhas: file_path=... e lang=...\n\n"
        "Regras de orquestração (obrigatórias):\n"
        "1) Se extensão for imagem: chame trace_decision(file_path,'RouterAgent','delegate->ImageAgent') e então call_agent('ImageAgent', file_path, lang).\n"
        "2) Se extensão for .pdf:\n"
        "   - chame pdf_is_scanned(pdf_path=file_path)\n"
        "   - se likely_scanned=True: trace_decision(...,'delegate->PDFOCRAgent') e call_agent('PDFOCRAgent', file_path, lang)\n"
        "   - senão: trace_decision(...,'delegate->PDFTextAgent') e call_agent('PDFTextAgent', file_path, lang)\n"
        "3) Fallback: se vier vazio, trace_decision(...,'fallback->PDFOCRAgent') e call_agent('PDFOCRAgent', file_path, lang)\n\n"
        "Saída: devolva APENAS o texto final extraído."
    ),
    model=model,
    formatter=formatter,
    memory=InMemoryMemory(),
    toolkit=toolkit,
    max_iters=15,
)

print("OK: Multi-agente criado + RouterAgent (delegação via TOOL call_agent)")


OK: Multi-agente criado + RouterAgent (delegação via TOOL call_agent)


In [None]:
# Célula 7 - Batch runner 
from pathlib import Path
from agentscope.message import Msg

SUPPORTED_IMAGES = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tif", ".tiff"}
SUPPORTED_PDFS   = {".pdf"}
SUPPORTED_EXTS   = SUPPORTED_IMAGES | SUPPORTED_PDFS

def list_supported_files(data_dir: Path) -> list[Path]:
    return sorted([p for p in data_dir.rglob("*") if p.is_file() and p.suffix.lower() in SUPPORTED_EXTS])

def build_extract_msg(file_path: Path, lang: str = "por") -> Msg:
    return Msg(
        name="user",
        role="user",
        content=(f"file_path={str(file_path)}\nlang={lang}\nExtraia e retorne APENAS o texto."),
    )

async def extract_one_with_router(file_path: Path, lang: str = "por") -> str:
    trace_event({"type": "agent_call_start", "agent": "RouterAgent", "file": str(file_path)})
    trace_md(f"\n# FILE: {file_path.name}\n")

    msg = build_extract_msg(file_path, lang=lang)
    t0 = time.time()
    reply = await RouterAgent.reply(msg)
    elapsed = round(time.time() - t0, 3)
    text = reply.get_text_content()

    trace_event({"type": "agent_call_end", "agent": "RouterAgent", "file": str(file_path), "elapsed_s": elapsed, "chars": len(text)})
    trace_event({"type": "final_output", "file": str(file_path), "elapsed_s": elapsed, "chars": len(text), "preview": text[:1200]})
    trace_md(
        f"## FINAL OUTPUT | {file_path.name}\n"
        f"- elapsed: {elapsed}s\n- chars: {len(text)}\n"
        f"```text\n{text[:6000]}\n```\n"
    )
    return text

async def run_folder_extraction(data_dir: Path, out_dir: Path, lang: str = "por", limit: int = 0) -> Path:
    out_dir.mkdir(parents=True, exist_ok=True)

    files = list_supported_files(data_dir)
    if limit and limit > 0:
        files = files[:limit]

    print(f"Arquivos suportados encontrados: {len(files)}")
    for p in files:
        print(" -", p.name)

    log_rows = ["file,status,output_or_error"]
    for p in files:
        print("\n=== EXTRAINDO:", p.name, "===")
        try:
            text = await extract_one_with_router(p, lang=lang)
            out_txt = out_dir / f"{p.stem}.txt"
            out_txt.write_text(text, encoding="utf-8", errors="ignore")
            log_rows.append(f"{p.name},OK,{out_txt}")
            print("OK ->", out_txt)
        except Exception as e:
            trace_event({"type": "file_error", "file": str(p), "error": str(e)})
            trace_md(f"## ERROR | {p.name}\n- {e}\n")
            log_rows.append(f"{p.name},ERR,{str(e).replace(',', ';')}")
            print("ERRO:", e)

    log_path = out_dir / "batch_log.csv"
    log_path.write_text("\n".join(log_rows), encoding="utf-8", errors="ignore")
    print("\nLOG salvo em:", log_path)
    return log_path


In [None]:
# Célula 8 - Executar 
log_path = await run_folder_extraction(DADOS_DIR, OUT_DIR, lang="por", limit=0)
print("Finalizado. Log:", log_path)
print("Trace JSONL:", TRACE_JSONL)
print("Trace MD:", TRACE_MD)


Arquivos suportados encontrados: 2
 - nota2.png
 - relatorio_financeiro.pdf

=== EXTRAINDO: nota2.png ===
RouterAgent: {
    "type": "tool_use",
    "id": "tez6t3rvx",
    "name": "trace_decision",
    "input": {
        "agent": "RouterAgent",
        "decision": "delegate->ImageAgent",
        "extra": "",
        "file_path": "C:\\\\Users\\\\fepac\\\\Unicamp_Project\\\\dados\\\\nota2.png"
    }
}
RouterAgent: {
    "type": "tool_use",
    "id": "v5yxzw2hw",
    "name": "call_agent",
    "input": {
        "agent_name": "ImageAgent",
        "file_path": "C:\\\\Users\\\\fepac\\\\Unicamp_Project\\\\dados\\\\nota2.png",
        "lang": "por"
    }
}
system: {
    "type": "tool_result",
    "id": "tez6t3rvx",
    "name": "trace_decision",
    "output": [
        {
            "type": "text",
            "text": "decision logged"
        }
    ]
}
system: {
    "type": "tool_result",
    "id": "v5yxzw2hw",
    "name": "call_agent",
    "output": [
        {
            "type": "text",
  

  res = ToolResponse(


ERRO: Failed to call a function. Please adjust your prompt. See 'failed_generation' for more details.

=== EXTRAINDO: relatorio_financeiro.pdf ===
RouterAgent: {
    "type": "tool_use",
    "id": "yfyaz3yda",
    "name": "pdf_is_scanned",
    "input": {
        "pdf_path": "C:\\\\Users\\\\fepac\\\\Unicamp_Project\\\\dados\\\\relatorio_financeiro.pdf"
    }
}
system: {
    "type": "tool_result",
    "id": "yfyaz3yda",
    "name": "pdf_is_scanned",
    "output": [
        {
            "type": "text",
            "text": "Error: name '_pdf_text_stats' is not defined"
        }
    ]
}
RouterAgent: {
    "type": "tool_use",
    "id": "1bm514h42",
    "name": "trace_decision",
    "input": {
        "agent": "RouterAgent",
        "decision": "delegate->PDFOCRAgent",
        "extra": "",
        "file_path": "C:\\\\Users\\\\fepac\\\\Unicamp_Project\\\\dados\\\\relatorio_financeiro.pdf"
    }
}
RouterAgent: {
    "type": "tool_use",
    "id": "6jek9e0vx",
    "name": "call_agent",
    "inpu

In [9]:
# Célula 9 - Sumário do trace (AJUSTADA)
import json
from collections import defaultdict
from pathlib import Path

def summarize_trace(path: Path):
    events = []
    for ln in path.read_text(encoding="utf-8", errors="ignore").splitlines():
        try:
            events.append(json.loads(ln))
        except Exception:
            pass

    per_file = defaultdict(lambda: {
        "decisions": [],
        "delegations": [],
        "tools": [],
        "final_preview": None,
        "errors": [],
    })

    for e in events:
        f = e.get("file")
        if not f:
            continue
        f = str(f)

        if e.get("type") == "agent_decision":
            per_file[f]["decisions"].append(e.get("decision"))

        if e.get("type") == "agent_delegate_start":
            per_file[f]["delegations"].append(f"Router -> {e.get('to')} (start)")

        if e.get("type") == "agent_delegate_end":
            per_file[f]["delegations"].append(f"Router -> {e.get('to')} (end) chars={e.get('chars')}")

        if e.get("type") == "tool":
            per_file[f]["tools"].append({
                "name": e.get("name"),
                "success": e.get("success"),
                "chars": e.get("chars"),
            })

        if e.get("type") == "final_output":
            per_file[f]["final_preview"] = (e.get("preview") or "")[:300]

        if e.get("type") == "file_error":
            per_file[f]["errors"].append(e.get("error"))

    print("=== SUMMARY ===")
    for f, info in per_file.items():
        name = Path(f).name
        print(f"\n{name}")
        print("  decisions:", info["decisions"])
        print("  delegations:", info["delegations"])
        print("  tools:", info["tools"])
        print("  final_preview:", info["final_preview"])
        if info["errors"]:
            print("  errors:", info["errors"])

summarize_trace(TRACE_JSONL)


=== SUMMARY ===

nota2.png
  decisions: ['delegate->ImageAgent']
  delegations: ['Router -> ImageAgent (start)']
  tools: []
  final_preview: None

nota2.png
  decisions: []
  delegations: []
  tools: []
  final_preview: None
  errors: ["Failed to call a function. Please adjust your prompt. See 'failed_generation' for more details."]

relatorio_financeiro.pdf
  decisions: ['delegate->PDFOCRAgent', 'delegate->PDFTextAgent']
  delegations: ['Router -> PDFOCRAgent (start)', 'Router -> PDFTextAgent (start)']
  tools: []
  final_preview: None

relatorio_financeiro.pdf
  decisions: []
  delegations: []
  tools: []
  final_preview: None
  errors: ["Failed to call a function. Please adjust your prompt. See 'failed_generation' for more details."]
