In [1]:
from dotenv import load_dotenv
from portia import Config, Portia, PortiaToolRegistry, open_source_tool_registry, InMemoryToolRegistry
from portia.cli import CLIExecutionHooks
from portia.config import default_config
from portia.open_source_tools.registry import example_tool_registry
from portia.clarification import MultipleChoiceClarification
from portia.plan_run import PlanRunState
from portia import LLMModel
from portia import Portia
from my_custom_tools.PDFReaderTool import PDFReaderTool


# Load variables from a local .env file into the process environment.
# `override=True` lets values from .env replace variables that are already set.
load_dotenv(override=True)

  from .autonotebook import tqdm as notebook_tqdm


True

In [3]:
from pathlib import Path
import fitz  # PyMuPDF
from pydantic import BaseModel
from portia import Tool, ToolHardError, ToolRunContext
from typing import ClassVar


# class PDFReaderToolSchema(BaseModel):
#     """No input needed. Reads all PDFs from the papers folder."""
#     pass


# class PDFReaderTool(Tool[dict[str, str]]):
#     """Reads and returns full text from all PDFs in the ./papers/ folder."""

#     id: ClassVar[str] = "pdf_reader_tool"
#     name: ClassVar[str] = "PDF reader tool"
#     description: ClassVar[str] = "Reads all PDFs from the local 'papers' folder and returns their full text"
#     args_schema = PDFReaderToolSchema
#     output_schema: ClassVar[tuple[str, str]] = ("dict", "Dictionary of filename -> full text")

#     def run(self, ctx: ToolRunContext) -> dict[str, str]:
#         """Extracts and returns full text from all PDFs in the ./papers folder."""
#         base_dir = Path(__file__).parent if "__file__" in globals() else Path.cwd()
#         papers_dir = base_dir / "fake_papers"
#         if not papers_dir.exists() or not papers_dir.is_dir():
#             raise ToolHardError("The 'papers/' folder does not exist.")

#         pdf_files = list(papers_dir.glob("*.pdf"))
#         if not pdf_files:
#             raise ToolHardError("No PDF files found in the 'papers/' folder.")

#         texts = {}
#         for file_path in pdf_files:
#             try:
#                 full_text = self.read_pdf(file_path)
#                 texts[file_path.name] = full_text
#             except Exception as e:
#                 texts[file_path.name] = f"Error reading file: {str(e)}"

#         return texts

#     def read_pdf(self, file_path: Path) -> str:
#         """Extracts and cleans text from a PDF file, stopping before References/Bibliography."""
#         text = []
#         with fitz.open(file_path) as doc:
#             for page_num, page in enumerate(doc):
#                 page_text = page.get_text("text")
#                 cleaned_text = self._remove_arxiv_footer(page_text)

#                 # Check for 'References' or 'Bibliography' section header
#                 if self._is_bibliography_page(cleaned_text):
#                     print(f"Stopping at page {page_num + 1} (found References section).")
#                     break

#                 text.append(f"--- Page {page_num + 1} ---\n{cleaned_text.strip()}")
#         return "\n\n".join(text)

#     def _remove_arxiv_footer(self, text: str) -> str:
#         """Removes common arXiv-style footers."""
#         lines = text.splitlines()
#         return "\n".join(
#             line for line in lines
#             if "arxiv" not in line.lower() and "preprint" not in line.lower()
#         )
    
#     def _is_bibliography_page(self, text: str) -> bool:
#         """Returns True if the page looks like it's starting the bibliography or references."""
#         lowered = text.lower()
#         # Check if 'references' or 'bibliography' is a standalone word early in the text
#         return (
#             "references\n" in lowered
#             or lowered.strip().startswith("references")
#             or lowered.strip().startswith("bibliography")
#         )


In [2]:
from portia.open_source_tools.registry import example_tool_registry

# Start from the default configuration and select GPT-4o under the
# 'planning_default_model_name' key of the model mapping.
my_config = Config.from_default()
my_config.models['planning_default_model_name'] = LLMModel.GPT_4_O

# Wrap the custom PDF tool in a registry of its own instead of calling
# `example_tool_registry.register_tool(...)`. That call mutates the shared,
# module-level registry, so re-running this cell would try to register the
# same tool id a second time.
pdf_tool_registry = InMemoryToolRegistry.from_local_tools([PDFReaderTool()])

# Instantiate Portia with the config above and a single registry that combines
# the PortiaToolRegistry tools, the example tools and the custom PDF tool.
portia = Portia(
    config=my_config,
    tools=PortiaToolRegistry(my_config) + example_tool_registry + pdf_tool_registry,
)

# Plan and execute the user query.
plan_run = portia.run(
    'Run the PDFReaderTool and summarize the content of the PDF files returned by the tool.'
)

# Dump the finished plan run as pretty-printed JSON for inspection.
print(plan_run.model_dump_json(indent=2))

[32m2025-04-11 20:41:19.487[0m | [1mINFO[0m | [38;5;39mportia.portia[0m:[38;5;39mplan[0m:[38;5;39m197[0m - [1mRunning planning_agent for query - Run the PDFReaderTool and summarize the content of the PDF files returned by the tool.[0m
[32m2025-04-11 20:41:24.193[0m | [1mINFO[0m | [38;5;39mportia.portia[0m:[38;5;39mplan[0m:[38;5;39m222[0m - [1mPlan created with 2 steps[0m | {'plan': 'plan-987fdce9-c112-4165-b972-0a935649b236'}
[32m2025-04-11 20:41:28.418[0m | [1mINFO[0m | [38;5;129mportia.portia[0m:[38;5;129m_execute_plan_run[0m:[38;5;129m528[0m - [1mPlan Run State is updated to PlanRunState.IN_PROGRESS. View in your Portia AI dashboard: https://app.portialabs.ai/dashboard/plan-runs?plan_run_id=prun-696d3d18-9af7-4c88-8dd9-c1b409f7f801[0m
[32m2025-04-11 20:41:28.419[0m | [1mINFO[0m | [38;5;129mportia.portia[0m:[38;5;129m_execute_plan_run[0m:[38;5;129m551[0m - [1mExecuting step 0: Read all PDFs from the local 'papers' folder and return their

In [6]:
def read_pdf(file_path: Path) -> str:
    """Return the text of every page of a PDF, each under a page banner.

    Each page is extracted as plain text, passed through
    `_remove_arxiv_footer`, stripped of surrounding whitespace and prefixed
    with a ``--- Page N ---`` line (N starts at 1). The per-page sections are
    separated by a blank line.

    Args:
        file_path: Location of the PDF to open.

    Returns:
        The concatenated page sections as one string.
    """
    page_sections = []
    with fitz.open(file_path) as pdf:
        for page_no, pdf_page in enumerate(pdf, start=1):
            body = _remove_arxiv_footer(pdf_page.get_text("text")).strip()

            # Stopping at the References/Bibliography page is currently
            # disabled; every page is kept:
            # if _is_bibliography_page(body):
            #     print(f"Stopping at page {page_no} (found References section).")
            #     break

            page_sections.append(f"--- Page {page_no} ---\n{body}")
    return "\n\n".join(page_sections)

def _remove_arxiv_footer(text: str) -> str:
    """Removes common arXiv-style footers."""
    lines = text.splitlines()
    return "\n".join(
        line for line in lines
        if "arxiv" not in line.lower() and "preprint" not in line.lower()
     )

def _is_bibliography_page(text: str) -> bool:
    """Returns True if the page looks like it's starting the bibliography or references."""
    lowered = text.lower()
    # Check if 'references' or 'bibliography' is a standalone word early in the text
    return (
        "references\n" in lowered
        or lowered.strip().startswith("references")
        or lowered.strip().startswith("bibliography")
    )


# Resolve the PDF folder relative to this file when `__file__` is defined,
# otherwise (e.g. in a notebook kernel) relative to the working directory.
base_dir = Path(__file__).parent if "__file__" in globals() else Path.cwd()
papers_dir = base_dir / "fake_papers"

# `is_dir()` is already False for a missing path, so one check covers both
# "missing" and "not a directory". The message names the folder actually used.
if not papers_dir.is_dir():
    raise ToolHardError(f"The '{papers_dir.name}/' folder does not exist.")

# Sort the matches so files are processed in the same order on every run.
pdf_files = sorted(papers_dir.glob("*.pdf"))
if not pdf_files:
    raise ToolHardError(f"No PDF files found in the '{papers_dir.name}/' folder.")

# Map each file name to its extracted text. A file that cannot be read is
# kept in the mapping with an error message instead of aborting the loop.
texts = {}
for file_path in pdf_files:
    try:
        full_text = read_pdf(file_path)
        texts[file_path.name] = full_text
    except Exception as e:
        texts[file_path.name] = f"Error reading file: {e}"




In [8]:
# Sanity check: number of characters stored for Poster.pdf in `texts`.
len(texts['Poster.pdf'])

6906