In [1]:
from dotenv import load_dotenv
from portia.cli import CLIExecutionHooks
from portia import *
import requests
import xml.etree.ElementTree as ET
from pydantic import BaseModel, Field
from typing import Generic, TypeVar, List, ClassVar, Dict
from notion_client import Client
import openai
import os
from my_custom_tools.utils import truncate_at_sentence
# Populate the process environment from the local .env file before any key is read.
load_dotenv(override=True)

# Credentials / identifiers pulled from the environment; each is None when unset.
notion_api_key = os.environ.get("NOTION_API_KEY")
notion_parent_id = os.environ.get("NOTION_PARENT_ID")
# The YouTube key is stored under the GOOGLE_API_KEY variable.
youtube_api_key = os.environ.get("GOOGLE_API_KEY")

# Module-level Notion client authenticated with the key read above.
notion = Client(auth=notion_api_key)


In [5]:

topics = [{"topic": "SIRD Modelling", "page_id": "1d36ccbb-ecba-81cc-8114-f393076a8570", "content": "[[Introduction]]\nSIRD modeling is a mathematical approach used to understand the spread of infectious diseases within a population. It divides the population into four compartments: Susceptible, Infected, Recovered, and Deceased. This model helps in predicting the course of an epidemic and evaluating the impact of intervention strategies.\n\n[[Key Definitions]]\n- **Susceptible (S):** The group of individuals who are not yet infected with the disease but are at risk of becoming infected.\n- **Infected (I):** The group of individuals who have been infected with the disease and are capable of spreading it to susceptible individuals.\n- **Recovered (R):** The group of individuals who have recovered from the disease and are assumed to have gained immunity, thus no longer susceptible.\n- **Deceased (D):** The group of individuals who have died from the disease.\n\n[[Relevant Formulas]]\n- **Rate of Change of Susceptible Individuals**\n  \\( \\frac{dS}{dt} = -\\beta \\frac{SI}{N} \\)\n\n- **Rate of Change of Infected Individuals**\n  \\( \\frac{dI}{dt} = \\beta \\frac{SI}{N} - \\gamma I - \\mu I \\)\n\n- **Rate of Change of Recovered Individuals**\n  \\( \\frac{dR}{dt} = \\gamma I \\)\n\n- **Rate of Change of Deceased Individuals**\n  \\( \\frac{dD}{dt} = \\mu I \\)\n\nHere, \\( \\beta \\) is the transmission rate, \\( \\gamma \\) is the recovery rate, \\( \\mu \\) is the mortality rate, and \\( N \\) is the total population.\n\n[[Examples]]\n1. **Seasonal Flu Epidemic:** Consider a city experiencing a seasonal flu outbreak. The SIRD model can be used to predict how the flu will spread through the population, how many people will recover, and how many may unfortunately succumb to the illness. By adjusting parameters like the transmission rate \\( \\beta \\), health officials can simulate the impact of interventions such as vaccination or social distancing.\n\n2. 
**COVID-19 Pandemic:** During the COVID-19 pandemic, SIRD models were extensively used to project the number of cases and deaths. By inputting real-time data, health authorities could estimate the peak of infections and plan resource allocation like hospital beds and ventilators accordingly.\n\n[[Reflective Questions]]\n1. How does the SIRD model differ from the simpler SIR model?\n2. What are some limitations of using the SIRD model in predicting real-world epidemics?\n3. How can the parameters \\( \\beta \\), \\( \\gamma \\), and \\( \\mu \\) be estimated from real-world data?\n4. In what ways can the SIRD model be adjusted to account for vaccination?\n5. How might public health policies be informed by the predictions of a SIRD model?"}, {"topic": "Least Squares Regression", "page_id": "1d36ccbb-ecba-81ef-9f53-f70c0b331f6f", "content": "[[Introduction]]\nLeast squares regression is a statistical method used to determine the best-fitting line through a set of data points. This technique minimizes the sum of the squares of the vertical distances of the points from the line, providing a way to predict the value of a dependent variable based on the value of an independent variable.\n\n[[Key Definitions]]\n- **Regression Line**: A line that best fits a set of data points according to the least squares criterion. It is often used to predict values.\n- **Residual**: The difference between the observed value and the value predicted by the regression line. 
It is the vertical distance from a data point to the regression line.\n- **Coefficient of Determination (\\( R^2 \\))**: A statistical measure that explains how much of the variability of the dependent variable can be explained by the independent variable.\n\n[[Relevant Formulas]]\n- **Equation of the Regression Line**\n  \\( y = mx + b \\)\n\n- **Slope of the Regression Line**\n  \\( m = \\frac{n(\\sum xy) - (\\sum x)(\\sum y)}{n(\\sum x^2) - (\\sum x)^2} \\)\n\n- **Intercept of the Regression Line**\n  \\( b = \\frac{(\\sum y)(\\sum x^2) - (\\sum x)(\\sum xy)}{n(\\sum x^2) - (\\sum x)^2} \\)\n\n[[Examples]]\n1. **Predicting House Prices**: Suppose you have data on house sizes (in square feet) and their corresponding prices. By applying least squares regression, you can create a model to predict the price of a house based on its size. This model helps real estate agents provide price estimates for clients.\n\n2. **Forecasting Sales**: A company collects data on advertising expenditure and sales revenue. Using least squares regression, they can establish a relationship between the two variables, allowing them to forecast future sales based on planned advertising budgets. This assists in strategic planning and budgeting.\n\n[[Reflective Questions]]\n1. How does least squares regression help in making predictions based on data?\n2. What is the significance of the slope in the regression line equation?\n3. Why is minimizing the sum of the squares of the residuals important in regression analysis?\n4. How can the coefficient of determination (\\( R^2 \\)) be interpreted in the context of regression analysis?\n5. In what scenarios might least squares regression not be the best method to use?"}, {"topic": "Paper Summary", "page_id": "1d36ccbb-ecba-8123-abdf-efc91dffa39a", "content": "summary of paper"}]

pdf_texts = {"Poster.pdf":"--- Page 1 ---\nForecasting coronavirus in Italy with SIRD modelling\nGabrielle Littlefair\nOral: https://imperial.cloud.panopto.eu/Panopto/Pages/Viewer.aspx?id=7d1f1304-438b-40e3-b586-abd7011a8903\nObjectives\n• Find estimates for the contact, death and recovery rates\nof coronavirus in Italy using least squares regression.\n• Identify any issues with the model and data.\n• Extrapolate the model to see how coronavirus may\nprogress in the upcoming months.\nIntroduction\nThe ﬁrst case of coronavirus in Italy was reported on the 31st\nJanuary 2020. Between then and the 6th June 2020, 33,846\npeople died and the total number of cases rose to 234,801 [8].\n23.3% of Italy’s population is over the age of 65 [5], making it\nthe second oldest population in the world. This may explain\nwhy Italy seems especially vulnerable. I have modelled Italy’s\noutbreak using SIRD modelling. Modelling accurate rates of\ninfection, recovery and death, along with the basic reproduc-\ntion number, R0, can be quite challenging due to the high\nproportion of infections that are undetected. The number of\ninfections has been estimated to be as high as 63 times as large\nas the number recorded [3].\nSIRD Modelling\nSIRD modelling is based on four diﬀerent groups within the\npopulation: those who are susceptible (S); those who are in-\nfected (I); those who have recovered (R); and those who have\ndied (D).\nThe governing equations of my model are as follows:\nS + I\nβ\n−→2I\nI\nγ\n−→R\nI\nδ\n−→D\nWhere β is the contact rate, γ is the rate of recovery, and δ is\nthe rate of death. From these equations, the following system\nof ODEs can be found and solved [6]:\ndS\ndt = −β\nNSI\ndI\ndt = β\nNSI −(γ + δ)I\ndR\ndt = γI\ndD\ndt = δI\nFitting the Model to the Data\nI used least squares regression from python’s lmﬁt module to\nﬁt my SIRD model to my data [7]. I began by assuming that\nmy rates β, γ and δ were all constant. 
However, after plotting\nthe results of this model, I quickly realised that my value of β\nneeded to decrease with time. Therefore, I decided to deﬁne\nβ(t) as a function instead. I found this function intuitively\nusing a negative exponential model multiplied by the β value\nfound. I also assumed that the initial number of susceptible in\nItaly was equal to the population: 60,461,828 [1]. The results\nfor recovery and death rates were as follows:\nγ = 0.0234\nδ = 0.0064\nThe following values of β correspond to the start of the data\nand the end of lockdown:\n18/02/2020\nβ = 0.1473\n14/05/2020\nβ = 0.0080\nFrom these values of β we can see that at the end of lockdown,\nthe infection rate was much lower, as expected, due to much\nfewer contacts between those in the susceptible group and those\nin the infected group.\nR0\nAn important feature of modelling epidemics is the basic repro-\nduction number R0. This number is the number of secondary\ninfections resulting from a single primary infection. The reason\nthis value is so important is that it is an indication of whether\nthe disease will die out (R0 < 1), or if it will become an en-\ndemic (R0 > 1) [4]. This value changes when measures are\nimplemented that reduce the rate of infection, like lockdown.\nR0 can be found using the following equation [7]:\nR0 = β(t)\nγ\nUsing my model, I have found values of R0 at diﬀerent times:\n18/02/2020\nR0 = 6.2929\n22/04/2020\nR0 = 1.0052\n23/04/2020\nR0 = 0.9542\n14/05/2020\nR0 = 0.3405\nR0 is ﬁrst below 1, meaning that the disease has begun dying\nout, on the 23rd April 2020, in the middle of the lockdown pe-\nriod. This suggests that lockdown was eﬀective. When Italy\nbegan relaxing its lockdown measures, R0 was very small, how-\never as the lockdown eases and contact rates increase that value\ncould easily rise once again.\nAssumptions\nSIRD modelling has many drawbacks. 
First of all, the assumes that once you have been infected you are then immune to the virus,\nwhich has not been proved or disproved for coronavirus yet. If this assumption is proved wrong, the model would be unreliable\nuntil another category is added in. My model assumes that you become infectious when you become infected (i.e. when you test\npositive), however, according to Harvard Medical School [2], you actually become infectious up to 72 hours before you show any\nsymptoms. Another assumption is that there are no births in the population. There are also problems with the accuracy and\nreliability of the data [8] being used. Many cases of coronavirus go unreported and untested. There could also be a high proportion\nof asymptomatic cases [3], which will also remain unrecorded.\nThe Model\nFigure 1:Data vs Model plotted between 18/02/2020 and 07/06/2020\nForecast\nFigure 2:Model plotted between 18/02/2020 and 06/08/2020\nConclusion\nAlthough I do not believe that the model I have gener-\nated is very accurate, it seems that Italy is very much on\nthe way towards eradicating coronavirus. However, the un-\nknown proportion of the population that has coronavirus\nand is asymptomatic, along with the unrecorded cases,\ncould mean that we may begin to see an upward trend\nin the number of cases. Potentially even a second peak.\nGoing forward, I would like to add in more compartments\ninto my model (such as the exposed compartment), in order\nto increase the accuracy of the model. This exposed cate-\ngory would remove the need for a dampener on the rate of\ninfection, as a contact rate would then also be estimated,\nmaking a much more reliable model. I would also like to\ncompare various countries which have now left lockdown\nin order to see how exiting the lockdown has aﬀected their\ninfection rates and whether a second spike seems likely.\nReferences\n[1] “Our world in data coronavirus.” [Online]. 
Available:\nhttps://ourworldindata.org/coronavirus\n[2] “Harvard health coronavirus.” [Online]. Available:\nhttps://www.health.harvard.edu/diseases-and-conditions/\nif-youve-been-exposed-to-the-coronavirus\n[3] G. C. Calaﬁore, C. Novara, and C. Possieri, “A modiﬁed sir model for\nthe covid-19 contagion in italy,” Mar 31, 2020. [Online]. Available:\n[4] K. Nixon and L. Servitje, Endemic.\nLondon: Palgrave Macmillan\nLimited, 2016. [Online]. Available: https://ebookcentral.proquest.\ncom/lib/[SITE_ID]/detail.action?docID=4720003\n[5] J. B. Dowd, L. Andriano, D. M. Brazel, V. Rotondi, P. Block,\nX. Ding, Y. Liu, and M. C. Mills, “Demographic science aids in\nunderstanding the spread and fatality rates of covid-19,” Proceedings\nof the National Academy of Sciences of the United States of\nAmerica, vol. 117, no. 18, pp. 9696–9698, May 5, 2020. [Online].\nAvailable: https://www.ncbi.nlm.nih.gov/pubmed/32300018\n[6] . K. S. E. Model, “Introduction to epidemic modeling.”\n[7] J. Fernández-Villaverde, “Estimating and simulating a sird model of\ncovid-19 for many countries, states, and cities,” 2020. [Online].\nAvailable: http://www.econis.eu/PPNSET?PPN=1698547927\n[8] “John hopkins university coronavirus data.” [Online]. Available:\nhttps://data.humdata.org/dataset/novel-coronavirus-2019-ncov-cases"}

In [6]:

class PaperSummaryToolSchema(BaseModel):
    """Input schema for PaperSummaryTool."""

    # Topic records; the tool looks for the entry whose 'topic' is "Paper Summary"
    # and writes into the page identified by that entry's 'page_id'.
    topics: list[dict[str, str]] = Field(
        default=...,
        description="List of dictionaries (output from the NotionTool), each with 'topic', 'page_id', and 'content'. The 'Paper Summary' topic is required.",
    )

    # Candidate papers; the tool reads 'title', 'summary' and 'link' from the first entry.
    papers: list[dict[str, str]] = Field(
        default=...,
        description="A list of dictionaries, each with 'title', 'link', and 'summary' fields.",
    )

    # Extracted PDF text keyed by filename; the tool summarizes the first value.
    pdf_texts: dict[str, str] = Field(
        default=...,
        description="A dictionary mapping PDF filenames to their full extracted text content.",
    )


# Notion API request limits: one text object may carry at most 2000 characters of
# content, and one "append block children" call accepts at most 100 blocks.
_NOTION_TEXT_LIMIT = 2000
_NOTION_MAX_CHILDREN = 100

# Maps the lower-cased [[Section Name]] markers requested from the model to the
# headings written to the Notion page.
_SECTION_HEADINGS = {
    "intuitive understanding": "🧠 Intuitive Understanding",
    "method breakdown": "⚙️ Method Breakdown",
    "novelties / contributions": "🌟 Novelties / Contributions",
    "critiques": "🧪 Critiques",
    "related reading": "📚 Related Reading",
}


def _rich_text(content: str) -> List[dict]:
    """Split `content` into Notion text objects that each respect the size limit."""
    pieces = [
        content[start:start + _NOTION_TEXT_LIMIT]
        for start in range(0, len(content), _NOTION_TEXT_LIMIT)
    ] or [""]  # keep a single empty text object for empty input
    return [{"type": "text", "text": {"content": piece}} for piece in pieces]


def _text_block(block_type: str, content: str) -> dict:
    """Build a Notion block of `block_type` (e.g. 'paragraph') holding `content`."""
    return {
        "object": "block",
        "type": block_type,
        block_type: {"rich_text": _rich_text(content)},
    }


def _sections_to_blocks(generated_content: str) -> List[dict]:
    """Turn model output into blocks: [[Name]] lines become heading_2, others paragraphs."""
    blocks = []
    for raw_line in generated_content.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith("[[") and line.endswith("]]"):
            section_key = line[2:-2].strip().lower()
            # Unknown section names fall back to a title-cased version of the marker.
            heading = _SECTION_HEADINGS.get(section_key, section_key.title())
            blocks.append(_text_block("heading_2", heading))
        else:
            blocks.append(_text_block("paragraph", line))
    return blocks


def _list_child_ids(notion, page_id: str) -> List[str]:
    """Collect the ids of every direct child block of `page_id`, following pagination."""
    child_ids = []
    cursor = None
    while True:
        query = {"block_id": page_id, "page_size": _NOTION_MAX_CHILDREN}
        if cursor:
            query["start_cursor"] = cursor
        page = notion.blocks.children.list(**query)
        child_ids.extend(block["id"] for block in page.get("results", []))
        cursor = page.get("next_cursor")
        if not page.get("has_more") or not cursor:
            return child_ids


def _generate_analysis(pdf_text: str) -> str:
    """Ask the chat model for the sectioned analysis of `pdf_text`; '' if it returns nothing."""
    client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    response = client.chat.completions.create(
        model="gpt-4o",
        temperature=0.3,
        messages=[
            {"role": "system", "content": (
                "You are a scientific writing assistant helping summarize and analyze academic papers. "
                "Structure the output into clearly labeled sections. Each section MUST begin with [[Section Name]] on its own line. "
                "Sections: Intuitive Understanding, Method Breakdown, Novelties / Contributions, Critiques, Related Reading. "
                "Write clearly for an advanced undergraduate audience. Use bullet points if appropriate. DO NOT include LaTeX or math equations."
            )},
            {"role": "user", "content": (
                f"Summarize and explain the following academic paper content:\n\n{pdf_text}\n\n"
                "Use the following structure exactly, and ensure each section starts with [[Section Name]]:\n\n"
                "[[Intuitive Understanding]]\n"
                "Describe the core idea of the paper with conceptual depth and clarity. "
                "Explain the reasoning or motivation behind the approach in a way that reveals why it works, not just what it does. "
                "Write as if speaking to an intelligent undergraduate student — assume curiosity, not prior technical knowledge. "
                "Avoid metaphors and analogies, but aim to build real understanding through clear logic and plain language.\n\n"
                "[[Method Breakdown]]\n"
                "Describe the main techniques or pipeline steps used in the paper.\n\n"
                "[[Novelties / Contributions]]\n"
                "List what makes this work new, better, or different.\n\n"
                "[[Critiques]]\n"
                "Note assumptions, weaknesses, or areas for improvement.\n\n"
                "[[Related Reading]]\n"
                "Suggest 2-3 related topics or papers worth reading next."
            )}
        ]
    )

    # The SDK types message.content as optional, so guard before stripping.
    content = response.choices[0].message.content
    return (content or "").strip()


class PaperSummaryTool(Tool[str]):
    """Creates a summary Notion subpage for the full paper with explanation and novelties."""

    id: ClassVar[str] = "paper_summary_tool"
    name: ClassVar[str] = "Paper Summary Tool"
    description: ClassVar[str] = "Creates a Notion subpage summarizing the full paper and its novelties."
    args_schema = PaperSummaryToolSchema
    output_schema: ClassVar[tuple[str, str]] = (
        "str",
        "Confirmation of the summary page creation."
    )

    def run(self, context: ToolRunContext, topics: List[Dict[str, str]], papers: List[Dict[str, str]], pdf_texts: Dict[str, str]) -> str:
        """Replace the content of the 'Paper Summary' Notion page with a paper write-up.

        The page receives a title heading, the paper's summary, a bookmark to its
        link, and then the model-generated analysis split into sections.

        Args:
            context: Run context supplied by the caller; not used by this tool.
            topics: Topic records; the one whose 'topic' equals "paper summary"
                (case-insensitive) supplies the target 'page_id'.
            papers: Paper records; only the first is used and it must contain
                'title', 'summary' and 'link'.
            pdf_texts: Extracted PDF text keyed by filename; only the first value
                is summarized (empty text is used when the dict is empty).

        Returns:
            A "✅ ..." confirmation on success (with a "⚠️" suffix when some old
            blocks could not be deleted), or a "❌ ..." message when the inputs
            are unusable or the model returned nothing.

        Raises:
            Errors from the OpenAI call or from Notion list/append requests
            propagate to the caller.
        """
        # Validate everything up front so bad input never reaches a network call.
        if not papers:
            return "❌ No paper provided in input."
        paper = papers[0]
        missing_fields = [key for key in ("title", "summary", "link") if key not in paper]
        if missing_fields:
            return f"❌ Paper entry is missing required field(s): {', '.join(missing_fields)}."

        paper_topic = next(
            (t for t in topics if t.get("topic", "").lower() == "paper summary"),
            None,
        )
        if not paper_topic:
            return "❌ 'Paper Summary' page not found in input."
        parent_id = paper_topic.get("page_id")
        if not parent_id:
            return "❌ 'Paper Summary' topic has no page_id."

        pdf_text = next(iter(pdf_texts.values()), "")

        # Generate the analysis BEFORE touching the page: if the model call fails
        # or returns nothing, the existing page content is left intact.
        generated_content = _generate_analysis(pdf_text)
        if not generated_content:
            return "❌ The model returned no summary content; page left unchanged."
        section_blocks = _sections_to_blocks(generated_content)

        notion = Client(auth=os.getenv("NOTION_API_KEY"))

        # Best-effort clear of the existing page. Ids are collected first so that
        # deletions cannot disturb pagination; failures are counted, not hidden.
        failed_deletes = 0
        for block_id in _list_child_ids(notion, parent_id):
            try:
                notion.blocks.delete(block_id)
            except Exception:
                failed_deletes += 1

        header_blocks = [
            _text_block("heading_1", f"📄 {paper['title']}"),
            _text_block("paragraph", paper["summary"]),
            {
                "object": "block",
                "type": "bookmark",
                "bookmark": {"url": paper["link"]},
            },
        ]

        # Header first, then the generated sections, sent in batches that stay
        # within Notion's per-request child limit.
        page_blocks = header_blocks + section_blocks
        for start in range(0, len(page_blocks), _NOTION_MAX_CHILDREN):
            notion.blocks.children.append(
                block_id=parent_id,
                children=page_blocks[start:start + _NOTION_MAX_CHILDREN],
            )

        message = "✅ 'Paper Summary' page updated with paper info and layout."
        if failed_deletes:
            message += f" ⚠️ {failed_deletes} existing block(s) could not be removed."
        return message

In [7]:
# Invoke the tool directly; `run` never reads its context argument, so None is passed.
# NOTE(review): `papers` is not defined in any cell shown here — it must be provided
# by an earlier cell for this to work after Restart & Run All.
tool = PaperSummaryTool()
tool.run(context=None, topics=topics, papers=papers, pdf_texts=pdf_texts)

"✅ 'Paper Summary' page updated with paper info and layout."