In [1]:
import os
import time
from os.path import join, exists
from os import listdir, makedirs
from datetime import datetime
from google import genai
from google.genai import types
from openai import OpenAI
from openai import AsyncOpenAI
import requests
import json
from pydantic import BaseModel, Field
from crawl4ai import *
from pydantic_ai import Agent, RunContext
from pydantic_ai.models.gemini import GeminiModel
from rich import print as rprint
from rich.console import Console
from rich.markdown import Markdown
from queue import Queue, Empty
from dataclasses import dataclass, field
from uuid import UUID, uuid4
from typing import Dict, Optional, List
from markitdown import MarkItDown
import asyncio
import nest_asyncio 
# Add this line to allow nested event loops
nest_asyncio.apply()

from agent_tools import *
from agent_utils import *

from loguru import logger

config = Config()

console = Console()
# Log to a file with custom timestamp format
logger.add("chain_of_thougth_agent_system.log", format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}")
model = GeminiModel(config.FLASH2_MODEL)

/home/dplaia/Projekte/deepresearchagent/.venv/lib/python3.13/site-packages/pydantic/fields.py:1042: PydanticDeprecatedSince20: Using extra keyword arguments on `Field` is deprecated and will be removed. Use `json_schema_extra` instead. (Extra keys: 'env'). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  warn(
/home/dplaia/Projekte/deepresearchagent/.venv/lib/python3.13/site-packages/pydantic/_internal/_config.py:295: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/


### Read documents

In [2]:
documents = {}
folder_name = 'input_files/'

# Create directory if it doesn't exist
if not exists(folder_name):
    makedirs(folder_name, exist_ok=True)

# Process each file in the input directory
for filename in listdir(folder_name):
    filepath = join(folder_name, filename)
    
    if not os.path.isfile(filepath):
        continue
        
    try:
        md = MarkItDown()
        result = md.convert(filepath)
        filename = os.path.basename(filepath)
        documents[filename] = result.text_content
    except Exception as e:
        print(f"Error processing {filepath}: {str(e)}")
        continue

In [3]:
doc = ""
for filename in documents:
    print(f"Filename: {filename}")
    doc = documents[filename]
    count = word_count(doc)
    print(f"Number of Words in the document: {count}")
    #console.print(Markdown(doc))
    break


Filename: Vorhabenbeschreibung_InvestBW_Praxissprints_NeuroTrust_v06 (final).pdf
Number of Words in the document: 6512


### Setup Agent that reads the document

In [83]:
class DocumentStore:
    """Thread-safe FIFO string storage with non-blocking retrieval
    
    Usage:
        store = DocumentStore()
        store.put('data')
        item = store.get()  # returns None if empty
    """
    def __init__(self):
        self._items = deque()
        self._lock = threading.Lock()

    def put(self, item: str):
        """Add string to storage"""
        with self._lock:
            self._items.append(item)

    def get(self) -> str | None:
        """Retrieve and remove oldest string, returns None if empty"""
        with self._lock:
            return self._items.popleft() if self._items else None

    @property
    def count(self) -> int:
        return len(self._items)

store = DocumentStore()


In [39]:
system_prompt = """
You are a research agent and you are part of a chain-of-agent system that process multiple documents in sequential order.
Instead of processing all the documents at once (doesn't fit in the context window), we try to process each file individually.
The goal is to create a useful and high quality report over time by processing multiple document in sequential order.

The chain process can only be successfull, if you pass the right information to the next agent. 

Tools: 
- You have access to multiple search tools.
- You can use these tool multple times to collect more information.
- Google Search: An overview of google results.
- Google Schoolar Search: To find relevant papers.

Inputs: 
- User instruction + document(s): You might get an instruction and one (or multiple) document from the user. The main research that you will do is based on this input.
- Previous agent instruction: An instruction given by the predecessor agent (might be yourself from the past). 
- Search results documents: The search results document are based on weblinks (that were picked in the past). You should use the documents for collecting the most valueble information that is passed to the next agent.


Output:
- An instruction for the next agent in the chain: This instruction should help the next agent to know what to do next. Address this instruction directly to the agent (it will see the text that you write). Help improving the quality of the final report. You can also specify what the agent should not do. Maybe search something that was searched before. Be creative!
- A list of weblink based on search results: The content of each link will be saved into a document store, where each will be processed over time.
- main_findings (written in English): Here you have to collect the most important information for the next agent that is relevant to the input query or problem. Based on this information, the final report can be produced. Write everything strictly in Markdown format. Use references if the link to the text content is available.
- links: The weblinks (html/PDFs) that should be added to the document store and will be processed later in the chain.

"""

class AgentResponse(BaseModel):
    main_findings: str = Field(description="The summary/results/findings/notes based on the input. This will be past to the next agent.")
    agent_instruction: str = Field(description="The instruction that you want to give the next agent.")
    links: list[str] = Field(description="A list with links that should be evaluated next.")
    suggestions_for_improvements: str = Field(description="Here you can write down how the chain-of-agent process could be improved. Maybe the system prompt isn't optimal? -> mention it here.")

agent = Agent(
    model,
    deps_type=store,
    result_type=AgentResponse,
    tools=[google_general_search_async, google_scholar_search_async, google_news_search_async],
    system_prompt=system_prompt)


@agent.system_prompt
def add_the_date() -> str:  
    return f'Number of stored documents {store.count}. Not use web search tools if larger than 3!'


In [None]:
user_input = f"""
User: The following document (in German) is a preview of a research proposal that we want to submit. Please help us to improve the document.
Use all the tools available to you to find interessting ideas/research papers and possible improvements.

# Research proposal document:

{doc}
"""

result = await agent.run(user_input)

In [None]:
print(f"Number of Links: {len(data.links)}")

temp_folder = "temp/"

#makedirs(temp_folder)

headers = {"User-Agent": "Mozilla/5.0"}

files = []

for (k, link) in enumerate(data.links):
    # Inside your loop before the md.convert() call:
    filename = os.path.join(temp_folder, f"page_{k}.html")
    try:
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        response = requests.get(link, headers=headers)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(response.text)
        
        print(f"Saved: {filename}")
    
        files.append(filename)

    except IOError as e:
        print(f"Failed to save {filename}: {e}")
        continue
    

In [92]:
md = MarkItDown()
for file in files:
    result = md.convert(filename) # reads file again (needed for convertion)
    store.put(result.text_content)    

print(store.count)

56


In [97]:
#result = md.convert(files[2])

print(store.count)
console.print(Markdown(store.get()))

52


#### Reference / Citation Manager

In [15]:
class ReferenceManager():
    def __init__(self, save_directory: str):
        self.refs: dict[str, str] = {}
        self.save_directory: str = save_directory
        os.makedirs(save_directory, exist_ok=True)

    def add_reference(self, link: str, markdown_text: str, filename: str) -> None:
        """Adds a reference with validation of inputs and error handling."""
        if not link.startswith(('http://', 'https://')):
            raise ValueError(f"Invalid link format: {link}")

        if not filename.endswith('.md'):
            filename += '.md'

        if not self.check_ref(link):
            try:
                self.save_markdown(filename, markdown_text)
                self.refs[link] = filename
            except IOError as e:
                print(f"Failed to save {filename}: {str(e)}")

    def check_ref(self, link: str) -> bool:
        return link in self.refs

    def get_markdown_text(self, link: str) -> Optional[str]:

        if filename := self.refs.get(link):
            try:
                return self.read_markdown_file(filename)
            except FileNotFoundError:
                return None
        return None

    def save_markdown(self, filename: str, markdown_text: str) -> None:
        full_path = join(self.save_directory, filename)
        with open(full_path, 'w', encoding='utf-8') as f:
            f.write(markdown_text)

    def read_markdown_file(self, filename: str) -> str:
        full_path = join(self.save_directory, filename)
        try:
            with open(full_path, 'r', encoding='utf-8') as f:
                return f.read()
        except FileNotFoundError:
            raise

@dataclass
class DocumentInfo:
    id: UUID
    title: str
    note_count: int
    created_at: Optional[datetime]
    last_updated: Optional[datetime]

class NoteManager:
    def __init__(self):
        self.documents: Dict[UUID, DocumentNotes] = {}
        self.document_titles: Dict[str, UUID] = {}  # Prevent duplicate titles

    def create_document(self, title: str) -> UUID:
        """Creates a new document with unique title validation"""
        if title in self.document_titles:
            raise ValueError(f"Document title '{title}' already exists")
            
        doc_id = uuid4()
        self.documents[doc_id] = DocumentNotes(title)
        self.document_titles[title] = doc_id
        return doc_id

    def get_all_documents(self) -> List[DocumentInfo]:
        """Returns metadata about all documents with temporal statistics"""
        return [
            DocumentInfo(
                id=doc_id,
                title=doc.title,
                note_count=len(doc.notes),
                created_at=min(note.created_at for note in doc.notes.values()) if doc.notes else None,
                last_updated=max(note.updated_at for note in doc.notes.values()) if doc.notes else None
            )
            for doc_id, doc in self.documents.items()
        ]

@dataclass
class Note:
    id: UUID # Unique identifier using UUID
    title: str
    content: str
    created_at: datetime
    updated_at: datetime

@dataclass
class DocumentNotes:
    title: str
    notes: Dict[UUID, Note] = field(default_factory=dict)
    
    def add_note(self, title: str, content: str) -> UUID:
        """Adds a new note with creation timestamp"""
        note_id = uuid4()
        now = datetime.now()
        self.notes[note_id] = Note(
            id=note_id,
            title=title,
            content=content,
            created_at=now,
            updated_at=now
        )
        return note_id

    def update_note(self, note_id: UUID, new_content: str) -> None:
        """Updates existing note with conflict checking"""
        if note_id not in self.notes:
            raise KeyError(f"Note {note_id} not found")
            
        self.notes[note_id].content += f"\n\n--- Update {datetime.now()} ---\n{new_content}"
        self.notes[note_id].updated_at = datetime.now()

    def get_note(self, note_id: UUID) -> Optional[Note]:
        """Safer retrieval with explicit None return"""
        return self.notes.get(note_id)

    def get_all_notes(self) -> Dict[UUID, Note]:
        """Returns direct reference to notes dict"""
        return self.notes.copy()  # Return copy to prevent accidental mutation


In [25]:
# Initialize the note management system
manager = NoteManager()

# Create two documents
research_id = manager.create_document("Research Paper")
meeting_id = manager.create_document("Meeting Minutes")

# Add notes to first document
methodology_note_id = manager.documents[research_id].add_note(
    "Methodology",
    "Need to revise sampling methodology section"
)

methodology_note_id = manager.documents[research_id].add_note(
    "Shows creation time",
    "Creation time should be shown."
)

# Add notes to second document
action_items_id = manager.documents[meeting_id].add_note(
    "Action Items",
    "1. Schedule follow-up meeting\n2. Prepare Q2 budget"
)

# Update a note in the first document
manager.documents[research_id].update_note(
    methodology_note_id,
    "Added new randomization procedure details"
)

# Retrieve and print a specific note
def print_note(doc_id: UUID, note_id: UUID):
    doc = manager.documents[doc_id]
    note = doc.get_note(note_id)
    if note:
        print(f"\n--- Note: {note.title} ---")
        print(f"Created: {note.created_at}")
        print(f"Last Updated: {note.updated_at}")
        print(f"Content:\n{note.content}\n")
    else:
        print("Note not found")

# Print updated methodology note
print_note(research_id, methodology_note_id)

# Example of failed update (non-existent note)
try:
    manager.documents[meeting_id].update_note(
        UUID('00000000-0000-0000-0000-000000000000'),
        "This shouldn't work"
    )
except KeyError as e:
    print(f"\nError: {str(e)}")

# Retrieve all notes from meeting document
print("\nAll meeting notes:")
for note_id, note in manager.documents[meeting_id].get_all_notes().items():
    print(f" - {note.title}: {note.content[:50]}...")


--- Note: Shows creation time ---
Created: 2025-01-31 17:19:21.168653
Last Updated: 2025-01-31 17:19:21.168781
Content:
Creation time should be shown.

--- Update 2025-01-31 17:19:21.168770 ---
Added new randomization procedure details


Error: 'Note 00000000-0000-0000-0000-000000000000 not found'

All meeting notes:
 - Action Items: 1. Schedule follow-up meeting
2. Prepare Q2 budget...


In [28]:
id = manager.get_all_documents()[0].id

notes = manager.documents[id].get_all_notes()
for note_id in notes:
    print(notes[note_id].title)
    print(notes[note_id].content)
    print()

Methodology
Need to revise sampling methodology section

Shows creation time
Creation time should be shown.

--- Update 2025-01-31 17:19:21.168770 ---
Added new randomization procedure details



In [None]:
page_content_markdown = {}
for link in result.data.links:
    print(f"Link: {link}")
    markdown = await crawl4ai_website_async(link)
    page_content_markdown[link] = markdown

#### Response Agent or Summary Agent

In [41]:
system_prompt = """
You are an expert at writing professional technical writer (articles, blogs, books, etc.).

After receiving a user query and some files, your goal is to write an report about the user query.
This writen report should be technically detailed but comprehensive for normal readers.

Please use references in the report (e.g. [1]). You can find the link of a given input text above the text with "From link ([1] http ...)".

Always use References at the end of the report.
  
Write the output strictly in Markdown format. 
"""

summary_agent = Agent(
    model,
    result_type=str,
    system_prompt=system_prompt)

In [42]:
result = await summary_agent.run(combined_markdown)

INFO:httpx:HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent "HTTP/1.1 200 OK"


In [None]:
console.print(Markdown(result.data))