In [1]:
from dotenv import load_dotenv
from portia import Config, Portia, PortiaToolRegistry, open_source_tool_registry, InMemoryToolRegistry
from portia.cli import CLIExecutionHooks
from portia.config import default_config
from portia.open_source_tools.registry import example_tool_registry
from portia.clarification import MultipleChoiceClarification
from portia.plan_run import PlanRunState
from portia import LLMModel
from portia import Portia
from my_custom_tools.PDFReaderTool import PDFReaderTool
from my_custom_tools.TopicSelectorTool import TopicSelectorTool


# Load environment variables (e.g. API keys) from a local .env file.
# override=True lets values from .env replace variables already set in the
# process environment, so editing .env and re-running this cell takes effect.
load_dotenv(override=True)

  from .autonotebook import tqdm as notebook_tqdm


True

In [3]:
from pathlib import Path
import fitz  # PyMuPDF
from pydantic import BaseModel
from portia import Tool, ToolHardError, ToolRunContext
from typing import ClassVar


# class PDFReaderToolSchema(BaseModel):
#     """No input needed. Reads all PDFs from the papers folder."""
#     pass


# class PDFReaderTool(Tool[dict[str, str]]):
#     """Reads and returns full text from all PDFs in the ./papers/ folder."""

#     id: ClassVar[str] = "pdf_reader_tool"
#     name: ClassVar[str] = "PDF reader tool"
#     description: ClassVar[str] = "Reads all PDFs from the local 'papers' folder and returns their full text"
#     args_schema = PDFReaderToolSchema
#     output_schema: ClassVar[tuple[str, str]] = ("dict", "Dictionary of filename -> full text")

#     def run(self, ctx: ToolRunContext) -> dict[str, str]:
#         """Extracts and returns full text from all PDFs in the ./papers folder."""
#         base_dir = Path(__file__).parent if "__file__" in globals() else Path.cwd()
#         papers_dir = base_dir / "fake_papers"
#         if not papers_dir.exists() or not papers_dir.is_dir():
#             raise ToolHardError("The 'papers/' folder does not exist.")

#         pdf_files = list(papers_dir.glob("*.pdf"))
#         if not pdf_files:
#             raise ToolHardError("No PDF files found in the 'papers/' folder.")

#         texts = {}
#         for file_path in pdf_files:
#             try:
#                 full_text = self.read_pdf(file_path)
#                 texts[file_path.name] = full_text
#             except Exception as e:
#                 texts[file_path.name] = f"Error reading file: {str(e)}"

#         return texts

#     def read_pdf(self, file_path: Path) -> str:
#         """Extracts and cleans text from a PDF file, stopping before References/Bibliography."""
#         text = []
#         with fitz.open(file_path) as doc:
#             for page_num, page in enumerate(doc):
#                 page_text = page.get_text("text")
#                 cleaned_text = self._remove_arxiv_footer(page_text)

#                 # Check for 'References' or 'Bibliography' section header
#                 if self._is_bibliography_page(cleaned_text):
#                     print(f"Stopping at page {page_num + 1} (found References section).")
#                     break

#                 text.append(f"--- Page {page_num + 1} ---\n{cleaned_text.strip()}")
#         return "\n\n".join(text)

#     def _remove_arxiv_footer(self, text: str) -> str:
#         """Removes common arXiv-style footers."""
#         lines = text.splitlines()
#         return "\n".join(
#             line for line in lines
#             if "arxiv" not in line.lower() and "preprint" not in line.lower()
#         )
    
#     def _is_bibliography_page(self, text: str) -> bool:
#         """Returns True if the page looks like it's starting the bibliography or references."""
#         lowered = text.lower()
#         # Check if 'references' or 'bibliography' is a standalone word early in the text
#         return (
#             "references\n" in lowered
#             or lowered.strip().startswith("references")
#             or lowered.strip().startswith("bibliography")
#         )


In [2]:
from portia.open_source_tools.registry import example_tool_registry

# Start from Portia's default configuration, then pin the model stored under the
# 'planning_default_model_name' key to GPT-4o.
# NOTE(review): presumably this key selects the planning agent's model - confirm
# against the installed Portia version's Config documentation.
my_config = Config.from_default()
my_config.models['planning_default_model_name'] = LLMModel.GPT_4_O

# Add the project's custom tools to the shared example registry so that the
# Portia instance built from this registry can use them. Each tool is
# instantiated immediately before it is registered.
for tool_cls in (PDFReaderTool, TopicSelectorTool):
    example_tool_registry.register_tool(tool_cls())

In [4]:
# Build a Portia instance from the custom config, combining Portia's own tool
# registry with the example registry (which is expected to already contain the
# custom tools registered in an earlier cell), then run the research query and
# interactively answer any clarifications the plan raises.

from portia import ActionClarification, InputClarification, MultipleChoiceClarification, PlanRunState

portia = Portia(config=my_config,
                tools=PortiaToolRegistry(my_config)+example_tool_registry)

# Execute the plan from the user query
plan_run = portia.run("""You are a research assistant running tasks: 
                    - Run the PDFReaderTool to extract the full text from the pdfs in the local folder
                    - From the full text, extract the core mathematical and scientific concepts required to understand the paper. Focus only on generalizable topics that could be included in a learning pathway or curriculum—avoid content specific to the study's location, data, or outcomes. List only the overarching topics, with no explanations or extra text.
                    - From the extracted topics, allow for the user to choose which topics they want to learn about.
                    """,)

# Keep answering clarifications until the run stops asking for them. After each
# round the run must be resumed explicitly: resolving a clarification only
# records the answer, it does not continue executing the plan.
while plan_run.state == PlanRunState.NEED_CLARIFICATION:
    print("\n⚠️ Clarification needed to continue the plan...\n")
    has_unhandled_clarification = False

    for clarification in plan_run.get_outstanding_clarifications():
        print(f"🔹 {clarification.user_guidance}")

        if isinstance(clarification, MultipleChoiceClarification):
            options = clarification.options
            print("Options:")
            for option_number, option in enumerate(options, 1):
                print(f"{option_number}. {option}")

            # Re-prompt until every entry is a number within 1..len(options).
            # The explicit range check matters: without it, "0" or a negative
            # number would silently pick an option from the end of the list
            # through Python's negative indexing.
            selected = None
            while selected is None:
                user_input = input("Enter your choice(s), separated by commas if multiple:\n")
                try:
                    chosen_numbers = [int(part.strip()) for part in user_input.split(",")]
                except ValueError:
                    chosen_numbers = []
                if chosen_numbers and all(1 <= number <= len(options) for number in chosen_numbers):
                    selected = [options[number - 1] for number in chosen_numbers]
                else:
                    print("❌ Invalid selection. Please enter valid option numbers.")
            plan_run = portia.resolve_clarification(clarification, selected, plan_run)

        elif isinstance(clarification, InputClarification):
            user_input = input("Please enter your response:\n")
            plan_run = portia.resolve_clarification(clarification, user_input, plan_run)

        elif isinstance(clarification, ActionClarification):
            # The guidance text was already printed above; only the link is new here.
            print(f"🔗 {clarification.action_url}")
            input("Press Enter after you've completed the action...")
            # Action clarifications are completed outside this notebook, so
            # wait for Portia to report the run as ready instead of resolving
            # the clarification locally.
            plan_run = portia.wait_for_ready(plan_run)

        else:
            print("❌ Unknown clarification type. Skipping.")
            has_unhandled_clarification = True

    if has_unhandled_clarification:
        # Nothing in this loop can resolve that clarification, so looping again
        # would prompt forever. Stop and let the user inspect the run instead.
        print("❌ Stopping: at least one clarification could not be handled here.")
        break

    # Continue executing the plan with the answers collected above. The run may
    # come back needing further clarifications, which the while loop picks up.
    plan_run = portia.resume(plan_run)

# Report the final state truthfully rather than always claiming success.
if plan_run.state == PlanRunState.COMPLETE:
    print("\n✅ Plan complete!\n")
else:
    print(f"\n⚠️ Plan stopped in state: {plan_run.state}\n")
print(plan_run.model_dump_json(indent=2))

[32m2025-04-11 23:10:51.460[0m | [1mINFO[0m | [38;5;39mportia.portia[0m:[38;5;39mplan[0m:[38;5;39m197[0m - [1mRunning planning_agent for query - You are a research assistant running tasks: 
                    - Run the PDFReaderTool to extract the full text from the pdfs in the local folder
                    - From the full text, extract the core mathematical and scientific concepts required to understand the paper. Focus only on generalizable topics that could be included in a learning pathway or curriculum—avoid content specific to the study's location, data, or outcomes. List only the overarching topics, with no explanations or extra text.
                    - From the extracted topics, allow for the user to choose which topics they want to learn about.
                    [0m
[32m2025-04-11 23:10:56.879[0m | [1mINFO[0m | [38;5;39mportia.portia[0m:[38;5;39mplan[0m:[38;5;39m222[0m - [1mPlan created with 3 steps[0m | {'plan': 'plan-3c01fe39-0d2d-4f5d-9cd9-8

In [6]:
def read_pdf(file_path: Path, stop_at_references: bool = False) -> str:
    """Extract the text of a PDF page by page, with arXiv-style footer lines removed.

    Args:
        file_path: Path of the PDF file to open.
        stop_at_references: When True, stop at the first page that
            `_is_bibliography_page` flags as the start of the
            References/Bibliography section. That page and everything after it
            are left out, including any body text that precedes the heading on
            the same page. Defaults to False, which reads every page.

    Returns:
        The cleaned text of each included page, prefixed with a
        "--- Page N ---" marker (N is 1-based) and separated by blank lines.
        An empty string if no page was included.
    """
    pages = []
    with fitz.open(file_path) as doc:
        # Number pages from 1 so the markers match what a PDF viewer shows.
        for page_number, page in enumerate(doc, start=1):
            cleaned_text = _remove_arxiv_footer(page.get_text("text"))

            if stop_at_references and _is_bibliography_page(cleaned_text):
                break

            pages.append(f"--- Page {page_number} ---\n{cleaned_text.strip()}")
    return "\n\n".join(pages)

def _remove_arxiv_footer(text: str) -> str:
    """Removes common arXiv-style footers."""
    lines = text.splitlines()
    return "\n".join(
        line for line in lines
        if "arxiv" not in line.lower() and "preprint" not in line.lower()
     )

def _is_bibliography_page(text: str) -> bool:
    """Returns True if the page looks like it's starting the bibliography or references."""
    lowered = text.lower()
    # Check if 'references' or 'bibliography' is a standalone word early in the text
    return (
        "references\n" in lowered
        or lowered.strip().startswith("references")
        or lowered.strip().startswith("bibliography")
    )


# Locate the folder holding the PDFs: next to this file when run as a script,
# otherwise the current working directory (in a notebook `__file__` is normally
# not defined, so the fallback applies).
base_dir = Path(__file__).parent if "__file__" in globals() else Path.cwd()
papers_dir = base_dir / "fake_papers"
# is_dir() is already False for a missing path, so no separate exists() check is needed.
if not papers_dir.is_dir():
    # Name the folder that was actually checked so the message is actionable.
    raise ToolHardError(f"The '{papers_dir.name}/' folder does not exist in {base_dir}.")

# Sort the matches: glob order is not guaranteed, and a stable order keeps the
# contents of `texts` reproducible between runs.
pdf_files = sorted(papers_dir.glob("*.pdf"))
if not pdf_files:
    raise ToolHardError(f"No PDF files found in the '{papers_dir.name}/' folder.")

# Map each PDF's file name to its extracted text. Reading is best-effort: a file
# that fails to parse gets an error message as its value, so one bad PDF does
# not abort the whole batch.
texts = {}
for file_path in pdf_files:
    try:
        full_text = read_pdf(file_path)
        texts[file_path.name] = full_text
    except Exception as e:
        texts[file_path.name] = f"Error reading file: {str(e)}"


