In [2]:
import os
import time
from os.path import join, exists
from os import listdir, makedirs
from datetime import datetime
from google import genai
from google.genai import types
from openai import OpenAI
from openai import AsyncOpenAI
import requests
import json
from pydantic import BaseModel, Field
from crawl4ai import *
from pydantic_ai import Agent, RunContext
from pydantic_ai.models.gemini import GeminiModel
from pydantic_ai.exceptions import UsageLimitExceeded
from pydantic_ai.usage import UsageLimits
from rich import print as rprint
from rich.console import Console
from rich.markdown import Markdown
from queue import Queue, Empty
from dataclasses import dataclass, field
from uuid import UUID, uuid4
from typing import Dict, Optional, List
from markitdown import MarkItDown
import asyncio
import nest_asyncio 
# Add this line to allow nested event loops
nest_asyncio.apply()
import copy
from agent_tools import *
from agent_utils import *

from loguru import logger

config = Config()

# Log to a file with custom timestamp format
logger.add("logs/chain_of_thougth_agent_system.log", format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}")
model = GeminiModel(config.FLASH2_MODEL)

### Read documents

In [3]:
documents = {}
folder_name = 'input_files/'

# Create directory if it doesn't exist
if not exists(folder_name):
    makedirs(folder_name, exist_ok=True)

# Process each file in the input directory
for filename in listdir(folder_name):
    filepath = join(folder_name, filename)
    
    if not os.path.isfile(filepath):
        continue
        
    try:
        md = MarkItDown()
        result = md.convert(filepath)
        filename = os.path.basename(filepath)
        documents[filename] = result.text_content
    except Exception as e:
        print(f"Error processing {filepath}: {str(e)}")
        continue

doc = ""
for filename in documents:
    print(f"Filename: {filename}")
    doc = documents[filename]
    count = word_count(doc)
    print(f"Number of Words in the document: {count}")

    break


Filename: Vorhabenbeschreibung_InvestBW_Praxissprints_NeuroTrust_v06 (final).pdf
Number of Words in the document: 6512


### Setup Agent that reads the document

In [4]:
class DocumentStore:
    """Thread-safe FIFO string storage with non-blocking retrieval
    
    Usage:
        store = DocumentStore()
        store.put('data')
        item = store.get()  # returns None if empty
    """
    def __init__(self):
        self._items = deque()
        self._lock = threading.Lock()

    def put(self, item: str):
        """Add string to storage"""
        with self._lock:
            self._items.append(item)

    def get(self) -> str | None:
        """Retrieve and remove oldest string, returns None if empty"""
        with self._lock:
            return self._items.popleft() if self._items else None

    @property
    def count(self) -> int:
        return len(self._items)
        
    def __copy__(self):
        """Create a shallow copy of the DocumentStore instance."""
        new_store = DocumentStore()
        with self._lock:
            new_store._items = deque(self._items)
        return new_store

store = DocumentStore()


In [5]:
import threading
from functools import wraps

class FunctionCallLimiter:
    def __init__(self, num: int):
        """
        Initializes the FunctionCallLimiter.

        Args:
            num: The maximum number of times the decorated function can be called in total.
        """
        self.num = num
        self.call_count = 0
        self.lock = threading.Lock()

    def __call__(self, func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            with self.lock:
                if self.call_count < self.num:
                    self.call_count += 1
                    print(f"Call count: {self.call_count}/{self.num}")  # Optional: For demonstration
                    return func(*args, **kwargs)
                else:
                    print(f"Function call limit ({self.num}) reached. Function not executed.")
                    #return None  # Or you could raise an exception or return a specific value
                    return f"Function call limit ({self.num}) reached. Function not executed."

        return wrapper


class AsyncFunctionCallLimiter:
    def __init__(self, num: int):
        """
        Initializes the AsyncFunctionCallLimiter for async functions.

        Args:
            num: The maximum number of times EACH decorated async function can be called in total.
        """
        self.num = num
        self.function_call_counts = {}  # Dictionary to track call counts per function
        self.lock = asyncio.Lock()

    def __call__(self, func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            func_name = func.__name__  # Get the function name

            async with self.lock:
                if func_name not in self.function_call_counts:
                    self.function_call_counts[func_name] = 0  # Initialize count if not seen before

                if self.function_call_counts[func_name] < self.num:
                    self.function_call_counts[func_name] += 1
                    print(f"Call count (async) for '{func_name}': {self.function_call_counts[func_name]}/{self.num}")
                    return await func(*args, **kwargs)
                else:
                    return f"Function call limit ({self.num}) reached for async function '{func.__name__}'."

        return wrapper

In [6]:
system_prompt = read_system_prompt('chain_of_agents')

class AgentResponse(BaseModel):
    main_findings: str = Field(description="The summary/results/findings/notes based on the input. This will be past to the next agent.")
    agent_instruction: str = Field(description="The instruction that you want to give the next agent.")
    links: list[str] = Field(description="A list with links that should be evaluated next.")
    suggestions_for_improvements: str = Field(description="Here you can write down how the chain-of-agent process could be improved. Maybe the system prompt isn't optimal? -> mention it here.")

agent = Agent(
    model,
    deps_type=store,
    result_type=AgentResponse,
    system_prompt=system_prompt)


# @agent.system_prompt
# def add_document_info() -> str:  
#     return f'Number of stored documents {store.count}.'

# Create a RateLimiter instance allowing 5 requests per 10 seconds
rate_limiter = RateLimiter(rpm=10, window=60.0)

async_call_limiter = AsyncFunctionCallLimiter(num=3)  # Allow only 3 calls in total


@async_call_limiter
@agent.tool_plain
async def google_search(search_query: str, 
                        time_span: Optional[TimeSpan] = None, 
                        web_domain: Optional[str] = None) -> Optional[dict] | str:
    """
    Perform a Google search using the Serper API.
    
    Args:
        search_query (str): The search query string.
        time_span (Optional[TimeSpan], optional): The time span. Defaults to None.
            - Allowed:
                - "qdr:h" (for hour)
                - "qdr:d" (for day)
                - "qdr:w" (for week)
                - "qdr:m" (for month)
                - "qdr:y" (for year)
        web_domain (Optional[str], optional): Search inside a web domain (e.g., web_domain="brainchip.com" -> searches only pages with this domain)
    Returns:
        Optional[dict]: The search results.
    """
    response = await google_general_search_async(search_query, time_span, web_domain)

    return response

@async_call_limiter
@agent.tool_plain
async def google_scholar_search_async(search_query: str, num_pages: int = 1) -> dict  | str:
    """Google scholar search using an API.

        Args:
            search_query (str): The search query string.
            num_pages (int): The amount of page results that should be returned (more pages=more results).
        Returns:
            dict: The search results.
        
    """
    response = await google_scholar_search_async(search_query, num_pages)
    return response



In [None]:
user_input = f"""
User: The following document (in German) is a preview of a research proposal that we want to submit. Please help us to improve the document.
Use all the tools available to you to find interessting ideas/research papers and possible improvements.

# Research proposal document:

{doc}
"""

result = await agent.run(user_input, deps=store)

INFO:httpx:HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent "HTTP/1.1 200 OK"


Call count (async) for 'google_scholar_search_async': 1/3


INFO:httpx:HTTP Request: POST https://google.serper.dev/search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://google.serper.dev/search "HTTP/1.1 200 OK"


In [None]:
data = result.data
print(f"Number of Links: {len(data.links)}")

temp_folder = "temp/"

#makedirs(temp_folder)

headers = {"User-Agent": "Mozilla/5.0"}

files = {}

for (k, link) in enumerate(data.links):
    # Inside your loop before the md.convert() call:
    filename = os.path.join(temp_folder, f"page_{k}.html")
    try:
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        response = requests.get(link, headers=headers)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(response.text)
        
        print(f"Saved: {filename}")
        filename = os.path.basename(filename)
        files[filename] = link

    except IOError as e:
        print(f"Failed to save {filename}: {e}")
        continue

In [None]:
md = MarkItDown()
min_word_threshold = 1000

for filename in files:
    link = files[filename]
    with open(filename, "r", encoding="utf-8") as f:
        file_content = f.read()
        count = word_count(file_content)
        print(count)
        if count < min_word_threshold:
            markdown_output = await crawl_website_async(link)
    #converted_output = md.convert(filename) # reads file again (needed for convertion)
    #store.put(converted_output.text_content)    
    