# Example DeepResearchAgent

In [62]:
import os
import time
from os.path import join, exists
from os import listdir, makedirs
from datetime import datetime
from google import genai
from google.genai import types
from openai import OpenAI
import requests
import json
from pydantic import BaseModel, Field
import asyncio
import nest_asyncio 
from crawl4ai import *
from pydantic_ai import Agent, RunContext
from dataclasses import dataclass
# Add this line to allow nested event loops
nest_asyncio.apply()

### Perplexity Search API Function

In [52]:

def get_perplexity_search_results(query):
    """Ask the Perplexity chat-completions endpoint a question and collect its citations.

    Args:
        query (str): The user question sent to the `sonar-pro` model.

    Returns:
        tuple[str, list]: The answer text followed by one `[k] citation` line
        per citation, and the raw list of citations (empty when the response
        carries none).

    Raises:
        RuntimeError: If `PERPLEXITY_API_KEY` is not set in the environment.
    """
    api_key = os.environ.get("PERPLEXITY_API_KEY")

    if not api_key:
        # Fail fast instead of building a client with `api_key=None`, which
        # would only surface later as a less obvious client/authentication error.
        raise RuntimeError("PERPLEXITY_API_KEY not found in environment variables.")

    messages = [
        {
            "role": "system",
            "content": (
                "You are an artificial intelligence assistant and you need to "
                "engage in a helpful, detailed, polite conversation with a user."
            ),
        },
        {
            "role": "user",
            "content": query,
        },
    ]

    # Perplexity is reached through the OpenAI client by overriding the base URL.
    client = OpenAI(api_key=api_key, base_url="https://api.perplexity.ai")

    # chat completion without streaming
    response = client.chat.completions.create(
        model="sonar-pro",
        messages=messages,
    )

    # `content` may be None and `citations` is not a standard field of the
    # response object, so both are read defensively.
    answer = response.choices[0].message.content or ""
    citations = getattr(response, "citations", None) or []

    message = answer + "\n\n"
    message += "".join(f"[{k+1}] {citation}\n" for k, citation in enumerate(citations))

    return message, citations


In [None]:
# Smoke test: send one question to Perplexity and show the answer and its sources.
perplexity_query = "How many user queries can be done with the H100 und a 13B parameter LLM model?"
message, citations = get_perplexity_search_results(perplexity_query)
print(message)
print(citations)

### Google Search API Function

The API key is stored in the environment variable `SERPER_API_KEY`.  
An account can be created [here](https://serper.dev/).


In [61]:
def get_google_search_results(query, num_results=10, timeout=30):
    """Run a Google search through the Serper API.

    Args:
        query (str): The search query.
        num_results (int): Number of results requested from Serper.
        timeout (float): Seconds to wait for the HTTP request before giving
            up, so that a stalled connection cannot hang the notebook.

    Returns:
        requests.Response | None: The raw HTTP response (call `.json()` on it),
        or None when `SERPER_API_KEY` is not set in the environment.
    """
    api_key = os.environ.get("SERPER_API_KEY")

    if not api_key:
        print("SERPER_API_KEY not found in environment variables.")
        return None

    url = "https://google.serper.dev/search"
    payload = json.dumps({
        "q": query,
        "num": num_results,
    })
    headers = {
        'X-API-KEY': api_key,
        'Content-Type': 'application/json',
    }

    return requests.post(url, headers=headers, data=payload, timeout=timeout)


### Search Webpages

In [None]:
topic = "Test-time compute and test-time training for Large Language Models."
response = get_google_search_results(topic)
if response is None:
    # The helper returns None (after printing a message) when SERPER_API_KEY is
    # missing; stop here instead of failing on `None.json()`.
    raise RuntimeError("Google search returned no response; is SERPER_API_KEY set?")
# Surface HTTP errors (bad key, quota, ...) before the payload is used
response.raise_for_status()
# Convert to JSON
json_response = response.json()
print(json_response)

### Crawl Webpages

In [55]:
async def crawl_website_async(url_webpage):
    """Crawl a single web page and return its content as markdown.

    Args:
        url_webpage (str): URL of the page to fetch.

    Returns:
        str: The crawl result's `markdown`, or an empty string when the result
        carries no markdown (e.g. `None`), so callers can always write the
        value to a text file.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url_webpage,
        )
    # A crawl without usable content yields a falsy `markdown`; normalise it to "".
    return result.markdown or ""

def crawl_website(url_webpage):
    """Blocking wrapper around `crawl_website_async`.

    Args:
        url_webpage (str): URL of the page to fetch.

    Returns:
        Whatever `crawl_website_async` returns for that URL.
    """
    crawl_coroutine = crawl_website_async(url_webpage)
    return asyncio.run(crawl_coroutine)

In [58]:
# Try the crawler on a single arXiv HTML page.
sample_page_url = "https://arxiv.org/html/2501.12895"
res = crawl_website(sample_page_url)

[INIT].... → Crawl4AI 0.4.247
[FETCH]... ↓ https://arxiv.org/html/2501.12895... | Status: True | Time: 1.77s
[SCRAPE].. ◆ Processed https://arxiv.org/html/2501.12895... | Time: 300ms
[COMPLETE] ● https://arxiv.org/html/2501.12895... | Status: True | Total: 2.07s


In [None]:
query_folder = "test-time-compute"
# Create the output folder; `exist_ok` avoids the check-then-create race
makedirs(query_folder, exist_ok=True)


def _safe_filename(title, max_length=150):
    """Turn a page title into a file name: unsafe characters become '_' and the length is capped."""
    cleaned = "".join(ch if ch.isalnum() or ch in " ._-" else "_" for ch in title)
    return cleaned.strip(" .")[:max_length] or "untitled"


document_data = {}

# An error payload has no 'organic' list; treat it as "no results".
for result in json_response.get('organic', []):
    title = result['title']
    # Normalise a missing crawl result to "" so it can always be written to disk
    markdown = crawl_website(result['link']) or ""
    # Titles often contain '/', ':' or '?', which are not valid in file names
    filename = _safe_filename(title) + ".md"

    document_data[title] = {
        'topic': topic,
        'link': result['link'],
        # Not every search result carries these fields, so read them leniently
        'snippet': result.get('snippet', ''),
        'date': result.get('date'),
        'position': result.get('position'),
        'markdown': markdown,
        'filename': filename
    }

    # Explicit UTF-8: crawled pages regularly contain non-ASCII characters
    with open(join(query_folder, filename), "w", encoding="utf-8") as f:
        f.write(markdown)

In [None]:
# Minimal pydantic_ai demo: one agent queried synchronously, asynchronously and via streaming.
agent = Agent('openai:gpt-4o')

result_sync = agent.run_sync('What is the capital of Italy?')
print(result_sync.data)
#> Rome


async def run_agent_async():
    """Query the shared `agent` with an awaited run and a streamed run, printing both answers."""
    france_run = await agent.run('What is the capital of France?')
    print(france_run.data)
    #> Paris

    async with agent.run_stream('What is the capital of the UK?') as uk_stream:
        uk_answer = await uk_stream.get_data()
        print(uk_answer)
        #> London


def run_agent():
    """Drive `run_agent_async` to completion from synchronous code."""
    demo_coroutine = run_agent_async()
    asyncio.run(demo_coroutine)


run_agent()

### Create an Agent that rates the quality of generated content

In [67]:
# Only run this block for Gemini Developer API
client = genai.Client()

flash_thinking_model = "gemini-2.0-flash-thinking-exp-01-21"
flash2_model = "gemini-2.0-flash-exp"
flash1_model = "gemini-1.5-flash"

In [80]:
from pydantic_ai.models.gemini import GeminiModel
from datetime import date
# `model_name` is not defined anywhere in the notebook; build the model from
# the `flash2_model` constant, which is also what the agent below runs on.
model = GeminiModel(flash2_model)

@dataclass
class ResearchDeps:
    """Dependencies handed to the table-of-content agent for one run.

    All three fields are required. They are deliberately plain dataclass
    fields: the standard-library `dataclass` decorator does not understand
    pydantic's `Field(...)`, so using it here would store the pydantic field
    object as the attribute's default value instead of attaching a description.
    """
    # The research topic of the document.
    research_topic: str
    # The type of document for the table of content (paper/report/general document/webpage).
    document_type: str
    # A rough estimate of how many pages the report will have. The table of content needs to reflect that.
    document_number_of_pages: int

# Structured answer of the table-of-content agent (passed to it as `result_type`).
# Each field's `description` text is part of the schema definition, so it is
# left untouched; the notes here are plain comments only.
class TableOfContentResult(BaseModel):
    # One list entry per heading of the generated table of content.
    table_of_content: list[str] = Field(description="The generated table of content for the scientific report/document.")
    # Free-form commentary from the model.
    additional_notes: str = Field(description="If you want to add commentary based on things you've observed during the processing of the data you can add it here.")
    # Summary text of the research topic.
    text_summary: str = Field(description="A one page summary of the given research topic based on the search results.")

# Agent that drafts the table of content: it runs on `flash2_model`, takes a
# `ResearchDeps` instance as dependencies and answers with a `TableOfContentResult`.
# The prompt literal below is runtime text and is kept exactly as written.
table_of_content_agent = Agent(
    flash2_model,
    deps_type=ResearchDeps,  
    result_type=TableOfContentResult,
    system_prompt="""Your goal is to create the table of content for a scientific report based on given topic and given input text. 
    You don't need to generate the report (except the one page summary).
    Use the following tools to collect information about the topic:
    - google search tool 
    - perplexity search tool
    
    Use both tools to get an overview about the topic, then create the table of content for the report/paper/document.
    The table of content needs to contain the most interessting and important parts of the topic based on the desired page count of the report (more details for more report pages).
    """,  
)

@table_of_content_agent.system_prompt  
async def get_system_prompt(ctx: RunContext[ResearchDeps]) -> str:  
    """Build the run-specific part of the system prompt from the dependencies.

    The static prompt asks the model to size the table of content by the
    desired page count, so the document type and page count are passed along
    with the topic rather than the topic alone.

    Args:
        ctx: Run context whose `deps` is the `ResearchDeps` of the current run.

    Returns:
        str: Topic, document type and desired page count, one per line.
    """
    deps = ctx.deps
    return (
        f"Research topic: {deps.research_topic}\n"
        f"Document type: {deps.document_type}\n"
        f"Desired number of pages: {deps.document_number_of_pages}"
    )

@table_of_content_agent.tool_plain  
def perplexity_search(search_query: str) -> dict:
    """Uses the Perplexity API to answer a query with an LLM that searches and reads webpages.
    Certain citations could be faulty or irrelevant, please ignore these results.

    Args:
        search_query (str): The Perplexity query string. Perplexity uses an LLM to do the processing of webpages. Use a query text that is most suitable for this.
    
    Returns:
        search_result: A dictionary with the following fields:
            - 'test_response' (str): The text response from the Perplexity LLM with citations in the form [1], [2], etc.
            - 'citations' (list[str]): A list of citations used in the test_response

    """
    # The docstring above previously described the Serper/Google tool (copy-paste
    # slip); it now describes what this tool actually calls.
    # The key name 'test_response' is kept as-is because it is part of the
    # returned structure.
    test_response, citations = get_perplexity_search_results(search_query)

    return {
        'test_response': test_response,
        'citations': citations,
    }

@table_of_content_agent.tool_plain  
def google_search(search_query: str, topic_folder_name: str) -> dict:
    """Uses the Serper API to retrieve google results based on a string query.
    Certain search results could be faulty or irrelevant, please ignore these results.

    Args:
        search_query (str): The google query string. It needs to be suited for the given research topic.
        topic_folder_name (str): A suitable name for a folder (all search results are saved in this folder). It needs to be a valid folder name.
    
    Returns:
        document_data: A dictionary keyed by webpage title. It is empty when the search could not be run. Each value has the following fields:
            - 'topic': The research topic (=search_query)
            - 'link': The webpage link
            - 'snippet': A short snippet of the webpage (empty if not available)
            - 'date': The date of the webpage (None if not available)
            - 'position': Element position
            - 'markdown': The markdown text (webpage content)
            - 'filename': filename (saved in folder with name=topic_folder_name)
    """
    num_results = 10

    # Reuse the shared helper instead of duplicating the request code.
    response = get_google_search_results(search_query, num_results)
    if response is None:
        # Missing SERPER_API_KEY (already reported by the helper): keep the
        # declared `dict` return type instead of returning None.
        return {}

    json_response = response.json()

    # Create the folder; `exist_ok` avoids the check-then-create race.
    # NOTE(review): `topic_folder_name` is chosen by the model and used as a
    # path without validation — consider restricting it to a base directory.
    makedirs(topic_folder_name, exist_ok=True)

    document_data = {}

    # An error payload has no 'organic' list; treat it as "no results".
    for result in json_response.get('organic', []):
        title = result['title']
        # Normalise a missing crawl result to "" so it can always be written
        markdown = crawl_website(result['link']) or ""
        # Titles often contain '/', ':' or '?', which are not valid in file names
        safe_title = "".join(
            ch if ch.isalnum() or ch in " ._-" else "_" for ch in title
        ).strip(" .")[:150] or "untitled"
        filename = safe_title + ".md"

        document_data[title] = {
            'topic': search_query,
            'link': result['link'],
            # Not every search result carries these fields
            'snippet': result.get('snippet', ''),
            'date': result.get('date'),
            'position': result.get('position'),
            'markdown': markdown,
            'filename': filename
        }

        # Explicit UTF-8: crawled pages regularly contain non-ASCII characters
        with open(join(topic_folder_name, filename), "w", encoding="utf-8") as f:
            f.write(markdown)

    return document_data

# Run the agent once for the example topic.
research_deps = ResearchDeps(
    research_topic="Test-time compute and training for LLMs",
    document_type="Scientific Report",
    # Declared as `int` on ResearchDeps; the original passed the string "20".
    document_number_of_pages=20,
)
result = table_of_content_agent.run_sync('Write a table of content for a scientific report.', deps=research_deps)
print(result.data)

INFO:httpx:HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent "HTTP/1.1 200 OK"


table_of_content=['1. Introduction to Test-Time Compute', '    1.1. The Need for Test-Time Compute', '    1.2. How Test-Time Compute Differs from Pre-training', '2. Core Concepts and Mechanisms', '    2.1. Inference-Time Optimization', '    2.2. Adaptive Resource Allocation', '    2.3. Iterative Refinement and Self-Verification', '    2.4. Reward Modeling and Verifiers', '    2.5. Search Methods and Exploration Strategies', '3. Test-Time Compute Techniques', '    3.1. Best-of-N Sampling', '    3.2. Iterative Refinement', '    3.3. Tree of Thoughts and Search Algorithms', '    3.4. Self-Critique and Revision', '    3.5. Active Fine-Tuning at Test-Time', '4. Compute-Optimal Scaling Strategies', '    4.1. Adaptive Compute Allocation Based on Task Difficulty', '    4.2. Dynamic Resource Management', '    4.3. Trading Off Pre-training and Test-Time Compute', '5. Impact on LLM Performance', '    5.1. Enhanced Reasoning Abilities', '    5.2. Improved Accuracy on Complex Tasks', '    5.3. Miti

In [91]:
text_summary = result.data.text_summary
additional_notes = result.data.additional_notes
# One markdown bullet per table-of-content entry
table_of_content = "".join(f"- {item}\n" for item in result.data.table_of_content)

document_text = f"""Text Summary:
{text_summary}

Additional Notes:
{additional_notes}

Table of Content:
{table_of_content}
"""

# Save document to file (explicit UTF-8: model output may contain non-ASCII
# characters that the platform default encoding cannot represent)
with open("document.md", "w", encoding="utf-8") as f:
    f.write(document_text)

In [28]:
# Structured rating for one crawled document (passed as `response_schema` in
# `rate_document`). Every score is an integer constrained to 0..10 through
# `ge`/`le`. The `description` texts are runtime strings and are left untouched.
class DocumentQuality(BaseModel):
    # Name of the rated file.
    filename: str = Field(description="The name of the file")   
    # Score for how short or long the document is.
    document_length: int = Field(description="The length of the document (0 to 10). Is the document short or long.", ge=0, le=10)
    # Score for how well the document matches the main topic.
    relevance: int = Field(description="How relevant is the document with the main topic (0 to 10).", ge=0, le=10)
    # Estimated overall quality score.
    document_quality: int = Field(description="Guess the quality of the document (0 to 10).", ge=0, le=10)
    # Score for the document's age. NOTE(review): "current data" in the
    # description presumably means "current date" — confirm before changing.
    document_age: int = Field(description="The age of the document relative to the current data (0 to 10).", ge=0, le=10)
    # Free-text remarks, phrased as an instruction for other agents.
    additional_observations: str = Field(description="If you noticed something strange about the document. Write it in form of an instruction for another LLM agents.")

def rate_document(document, model=None):
    """Ask Gemini to rate one crawled document against its research topic.

    Args:
        document (dict): Entry of `document_data`. Reads the keys 'topic',
            'link' and 'markdown' (required) and 'date' / 'filename' (optional).
        model (str | None): Gemini model name; defaults to `flash2_model`.

    Returns:
        dict: The same `document` dict, with the raw API response stored under
        the key 'response' (the rating is requested as JSON matching
        `DocumentQuality`).
    """
    # The original referenced `model_name`, which is not defined in the notebook.
    model = model or flash2_model

    # The file name is part of the requested schema, so it is given to the
    # model instead of leaving it to guess.
    system_instruction = f"""
    You are an professional scientific journalist. 

    You will receive research related documents (markdown format). 
    Your goal is to estimate the relevance and quality of this document (based on a given topic).
    The document will later be used for writing a reasearch report/document.
    If the quality of the text isn't good, this will lead to an overall bad outcome of the report. 

    Topic: {document['topic']}    
    Current Date: {datetime.now().date()}
    Document Date: {document.get('date') or 'unknown'}
    Link: {document['link']}
    Filename: {document.get('filename', '')}
    """

    # Rate THIS document's content. The original read the global `markdown`,
    # i.e. whichever page happened to be crawled last, for every document.
    markdown_content = document['markdown']

    response = client.models.generate_content(
        model=model,
        contents=markdown_content,
        config=types.GenerateContentConfig(
            response_mime_type="application/json",
            response_schema=DocumentQuality,
            system_instruction=system_instruction,
            temperature=0.3,
        ),
    )
    document['response'] = response
    return document

### Process Files (Quality Assessment)

In [None]:
# List the files saved for this query (empty when the folder does not exist)
files = listdir(query_folder) if exists(query_folder) else []

for title, document in document_data.items():
    # `document` is a dict, so the file name is read by key, not by attribute
    print(f"File: {document['filename']}")
    document = rate_document(document)
    # Print the rating of THIS document, not whatever the global `response` holds
    print(f"Reponse Text: {document['response'].text}")
    document_data[title] = document
    time.sleep(5) # sleep for 5 seconds (rate limit is 10 RPM)

In [None]:
# System prompt that asks the model to mark up its answer with `rich`-style
# square-bracket tags. The text is sent as-is, so it is left unchanged.
system_instruction = """You are an AI assistant designed to produce output that is visually appealing and easily readable in a terminal. When formatting your responses, utilize the syntax of the Python `rich` library. This involves using square brackets to enclose formatting tags.
        Here are some examples of how to apply formatting:

        * **Emphasis:** Instead of "This is important", output "[bold]This is important[/]".
        * **Headers/Titles:** Instead of "Section Title:", output "[bold blue]Section Title:[/]".
        * **Warnings:** Instead of "Warning!", output "[bold red]Warning![/]".
        * **Success Messages:** Instead of "Operation successful.", output "[green]Operation successful.[/]".
        * **Lists:** You can use colors for list items like "[cyan]*[/] Item 1".

        Always use the `rich` library's syntax for formatting terminal output to enhance readability."""

# `model_name_thinking` is not defined anywhere in the notebook; use the
# thinking-model constant defined next to the Gemini client instead.
response = client.models.generate_content(
    model=flash_thinking_model,
    contents="Show me the proof for the euler identity?",
    config=types.GenerateContentConfig(
        system_instruction=system_instruction,
        temperature=0.3,
    ),
)
print(response.text)

In [None]:
# Upload a local file and ask the model to summarise it.
# `model_name` is not defined anywhere in the notebook; use `flash2_model`.
file = client.files.upload(path="a11.text")
response = client.models.generate_content(
    model=flash2_model, contents=["Summarize this file", file]
)
print(response.text)

In [26]:
# Stub tool for the function-calling demo: it ignores `location` and always
# answers "sunny". The function is handed to the model via `tools=[...]`.
# NOTE(review): the SDK presumably derives the tool declaration from this
# signature and docstring, so the docstring is left as written — confirm.
def get_current_weather(location: str) -> str:
    """Returns the current weather.

    Args:
      location: The city and state, e.g. San Francisco, CA
    """
    return "sunny"


# Function-calling demo: expose `get_current_weather` as a tool for this request.
weather_config = types.GenerateContentConfig(tools=[get_current_weather])
response = client.models.generate_content(
    model="gemini-2.0-flash-exp",
    contents="What is the weather like in Boston?",
    config=weather_config,
)

print(response.text)

  Expected `enum` but got `str` with value `'STRING'` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


The weather in Boston is sunny.



In [34]:
# Schema for the structured-output demo: the model is asked to answer with
# JSON matching these fields (passed as `response_schema`).
class CountryInfo(BaseModel):
    name: str
    population: int
    capital: str
    continent: str
    # NOTE(review): unit/currency of `gdp` is not specified anywhere — confirm.
    gdp: int
    official_language: str
    # Area in square miles, as the field name indicates.
    total_area_sq_mi: int


# Structured-output demo: request JSON that conforms to `CountryInfo`.
country_config = types.GenerateContentConfig(
    response_mime_type="application/json",
    response_schema=CountryInfo,
)
response = client.models.generate_content(
    model="gemini-2.0-flash-exp",
    contents="Give me information for the United States.",
    config=country_config,
)
print(response.text)

{
"capital": "Washington, D.C.",
"continent": "North America",
"gdp": 25460000000000,
"name": "United States",
"official_language": "English",
"population": 331002651,
"total_area_sq_mi": 3796742
}
