In [None]:
from agent_utils import *
from agent_tools import *

# Project configuration object. `Config` is not imported by name in this cell,
# so it presumably comes from one of the star imports above.
config = Config()

# Log to a file with custom timestamp format
# NOTE(review): "thougth" in the log file name looks like a typo for "thought";
# it is left untouched because renaming it would change where logs are written.
logger.add("logs/chain_of_thougth_agent_system.log", format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}")
# Model handle built from the configured FLASH2_MODEL value; the Agent
# definitions further down the notebook are constructed with it.
model = GeminiModel(config.FLASH2_MODEL)

# Configure logfire with the project's scrubbing callback. This runs before
# the httpx instrumentation below, so keep the two statements in this order.
logfire.configure(scrubbing=logfire.ScrubbingOptions(callback=scrubbing_callback))

# Instrument httpx through logfire (presumably so outgoing HTTP calls are traced).
logfire.instrument_httpx()

In [None]:
# Create the chat handler. No arguments are passed here, so any credentials or
# model selection must be resolved inside ChatHandler itself.
chat = ChatHandler()
    
# Trivial question used as a smoke test that the chat backend answers at all.
response = chat.send_question("What is 2*2?")
print(response)

### Read documents

In [3]:
# Load the research proposal text into `doc`.
#
# The converted text is cached in `main_input_file`. When the cache exists it is
# read back directly; otherwise every regular file in `folder_name` is converted
# with MarkItDown, the first successfully converted one becomes `doc`, and the
# cache is written for later runs.
documents = {}
folder_name = 'input_files/'

# exist_ok=True makes this a no-op when the directory is already there.
makedirs(folder_name, exist_ok=True)

main_input_file = join(folder_name, "Vorhabenbeschreibung_NeuroTrust.txt")

if exists(main_input_file):
    # Cached conversion available: reuse it instead of converting again.
    with open(main_input_file, "r", encoding="utf-8") as f:
        doc = f.read()
else:
    print("file does not exists")

    # One converter instance is enough for all files.
    converter = MarkItDown()

    # Sorted so that "the first document" is the same on every run;
    # listdir() itself returns entries in arbitrary order.
    for entry in sorted(listdir(folder_name)):
        filepath = join(folder_name, entry)

        # Skip sub-directories and anything else that is not a regular file.
        if not os.path.isfile(filepath):
            continue

        try:
            documents[entry] = converter.convert(filepath).text_content
        except Exception as e:
            # Best effort: report the file that failed and keep going.
            print(f"Error processing {filepath}: {str(e)}")

    doc = ""
    if documents:
        # Only the first converted document is used as the proposal text.
        filename, doc = next(iter(documents.items()))
        print(f"Filename: {filename}")
        print(f"Number of Words in the document: {word_count(doc)}")

    if doc:
        with open(main_input_file, "w", encoding="utf-8") as f:
            f.write(doc)
    else:
        # Do not write an empty cache file: it would be picked up as a valid
        # (empty) document on every later run and conversion would never be retried.
        print(f"No convertible document found in {folder_name}; nothing was cached.")

In [3]:
# Ask the chat model which web / scholar searches would improve the proposal.
# The complete text of `doc` is embedded at the end of the prompt.
# NOTE(review): "folling" in the prompt is presumably a typo for "following";
# the string is left as-is because it is sent to the model verbatim.
query = f"""Let's assume you have access to the Google Search API and the Google Scholar API. 

Based on the folling document, which search queries would you do, to improve the quality of the document?

# Document: 

{doc}"""

# Overwrites `response` from the smoke-test cell with the model's answer to the query.
response = chat.send_question(query)
console_print(response)

In [None]:
# Second pass: hand the free-text answer in `response` to a structured-output
# agent that extracts the concrete search queries from it.
# NOTE(review): within the visible notebook `AgentResponse` is only defined in
# later cells, and `BaseAgent` is not defined here at all (presumably a star
# import). Confirm this cell runs on a fresh kernel in top-to-bottom order.
system_prompt = "Your goal is to extract search queries (e.g., google search, google scholar, etc.) that are mention in a text."
user_input = f"Please extract all search queries from this text: {response}"

baseAgent = BaseAgent(AgentResponse, system_prompt)
# The agent instance is awaited directly as a callable.
result = await baseAgent(user_input)

print(result.data)

In [None]:

# Structured result schema for query extraction: one list of queries per search backend.
# NOTE(review): a second, unrelated `AgentResponse` is defined further down the
# notebook and replaces this class as soon as that cell runs.
class AgentResponse(BaseModel):
    google_search_queries: list[str] = Field(description="The extracted google search queries.")
    google_scholar_queries: list[str] = Field(description="The extracted google scholar queries.")

system_prompt = "Your goal is to extract search queries (e.g., google search, google scholar, etc.) that are mention in a text."
# NOTE(review): `response_thinking` is not defined anywhere in the visible
# notebook; confirm where it comes from, otherwise this line raises NameError
# on a fresh kernel.
user_input = f"Please extract all search queries from this text: {response_thinking.final_answer}"

# Agent that returns its result validated against AgentResponse.
agent = Agent(
    model,
    result_type=AgentResponse,
    system_prompt=system_prompt)

result = await agent.run(user_input)
# Parsed AgentResponse instance with the extracted query lists.
data = result.data

### Setup Agent that reads the document

In [29]:
class DocumentStore:
    """Thread-safe FIFO string storage with non-blocking retrieval
    
    Usage:
        store = DocumentStore()
        store.put('data')
        item = store.get()  # returns None if empty
    """
    def __init__(self):
        self._items = deque()
        self._lock = threading.Lock()

    def put(self, item: str):
        """Add string to storage"""
        with self._lock:
            self._items.append(item)

    def get(self) -> str | None:
        """Retrieve and remove oldest string, returns None if empty"""
        with self._lock:
            return self._items.popleft() if self._items else None

    def clear(self):
        self._items.clear()

    @property
    def count(self) -> int:
        return len(self._items)
        
    def __copy__(self):
        """Create a shallow copy of the DocumentStore instance."""
        new_store = DocumentStore()
        with self._lock:
            new_store._items = deque(self._items)
        return new_store

store = DocumentStore()

In [30]:
# Load the system prompt stored under the name 'chain_of_agents'
# (read_system_prompt is defined outside this notebook).
system_prompt = read_system_prompt('chain_of_agents')

# Structured output schema for one step of the agent chain.
# NOTE(review): this reuses the name `AgentResponse` from the earlier
# query-extraction cell and shadows that class from here on.
class AgentResponse(BaseModel):
    # Required fields: `...` as the default marks them as mandatory.
    main_findings: str = Field(..., description="Markdown summary with inline references (e.g., [Source 1]).")
    agent_instruction: str = Field(..., description="Specific, actionable steps for the next agent (e.g., 'Analyze X in Document 5').")
    links: list[str] = Field(..., description="High-priority links for further research.")
    search_queries_used: list[str] = Field(..., description="Queries executed in this step to avoid repetition.")
    processed_docs: list[str] = Field(..., description="IDs of documents analyzed (e.g., doc_3, doc_7).")
    # Optional fields: confidence defaults to 1.0, suggestions to an empty string.
    confidence_score: float = Field(1.0, description="0-1 score indicating confidence in findings (1=high).")
    suggestions_for_improvements: str = Field("", description="Feedback on system/prompt issues.")

# Research agent for the chain-of-agents step.
# `deps_type` declares the *type* of the dependency object, so the class
# DocumentStore is passed here; the concrete instance is supplied per run via
# `agent.run(..., deps=store)`.
agent = Agent(
    model,
    deps_type=DocumentStore,
    result_type=AgentResponse,
    system_prompt=system_prompt)

# Disabled: dynamic system-prompt hook that would report how many documents
# are currently held in `store`.
# @agent.system_prompt
# def add_document_info() -> str:  
#     return f'Number of stored documents {store.count}.'

# Disabled: rate limiter configured with rpm=10 over a 60.0 second window.
# NOTE(review): the previous description ("5 requests per 10 seconds") did not
# match these arguments.
#rate_limiter = RateLimiter(rpm=10, window=60.0)

# Shared limiter applied as a decorator to both search tools below. The exact
# meaning of num=1 depends on AsyncFunctionCallLimiter, which is defined
# outside this notebook — TODO confirm.
async_call_limiter = AsyncFunctionCallLimiter(num=1)  # Restrict the number of function calls

# Search tool attached to `agent` via tool_plain and wrapped by the shared call limiter.
# NOTE(review): the docstring below is kept unchanged on purpose — it presumably
# doubles as the tool description shown to the model; confirm before rewording.
# NOTE(review): the annotation also allows a `str` return, which the docstring's
# "Returns" section does not mention.
@agent.tool_plain
@async_call_limiter
async def google_search(search_query: str, 
                        time_span: Optional[TimeSpan] = None, 
                        web_domain: Optional[str] = None) -> Optional[dict] | str:
    """
    Perform a Google search using the Serper API.
    
    Args:
        search_query (str): The search query string.
        time_span (Optional[TimeSpan], optional): The time span. Defaults to None.
            - Allowed:
                - "qdr:h" (for hour)
                - "qdr:d" (for day)
                - "qdr:w" (for week)
                - "qdr:m" (for month)
                - "qdr:y" (for year)
        web_domain (Optional[str], optional): Search inside a web domain (e.g., web_domain="brainchip.com" -> searches only pages with this domain)
    Returns:
        Optional[dict]: The search results.
    """
    # Trace every tool invocation together with its arguments.
    print(f"google_search with arguments = ({search_query}, {time_span}, {web_domain})")
    # Delegate to the async helper; its result is returned to the caller unchanged.
    response = await google_general_search_async(search_query, time_span, web_domain)

    return response

# Scholar search tool attached to `agent` via tool_plain and wrapped by the shared call limiter.
# NOTE(review): the docstring below is kept unchanged on purpose — it presumably
# doubles as the tool description shown to the model; confirm before rewording.
@agent.tool_plain
@async_call_limiter
async def scholar_search(search_query: str, num_pages: int = 1) -> dict  | str:
    """Google scholar search using an API.

        Args:
            search_query (str): The search query string.
            num_pages (int): The amount of page results that should be returned (more pages=more results).
        Returns:
            dict: The search results.
        
    """
    # Trace every tool invocation together with its arguments.
    print(f"scholar_search with arguments = ({search_query}, {num_pages})")
    # Delegate to the async helper; its result is returned to the caller unchanged.
    response = await google_scholar_search_async(search_query, num_pages)
    return response


In [None]:
# Task prompt for the research agent; the complete proposal text in `doc` is appended at the end.
# NOTE(review): "relavant" and "valueble" are presumably typos for "relevant"
# and "valuable"; the string is left as-is because it is sent to the model verbatim.
user_input = f"""
The following document (in German) is a preview of a research proposal that we want to submit. 

Please help us to improve the content of the document in the following ways:
- Do research about the topics mentioned in the document.
- Output things that might be relavant and valueble to the report.
- Include information that might add value to the report.
- Analyse new released papers and see if certain things might have changed.

# Research proposal document:

{doc}
"""

pickle_file = "chain_of_agent_output.pkl"
os.system(f'rm {pickle_file}')

result = await agent.run(user_input, deps=store)
data = result.data
print(f"Number of Links: {len(data.links)}")
with open(pickle_file, "wb") as f:
    pickle.dump(result, f)

#### Load pickle file (instead of running model)

In [15]:
# Reload a previously cached agent result instead of calling the model again.
# NOTE: unpickling can execute arbitrary code — only load a file that this
# notebook wrote itself, never one from an untrusted source.
if exists(pickle_file):
    with open(pickle_file, "rb") as f:
        result = pickle.load(f)
    # Keep `data` in sync with the reloaded result so that later cells do not
    # depend on the run cell having been executed in this kernel session.
    data = result.data

In [None]:
# Render the main findings of the current `result` (freshly computed or reloaded from the pickle file).
console_print(result.data.main_findings)

In [None]:
# Download every link suggested by the agent into `temp_folder` as page_<k>.html.
# `files` maps the saved file name to the link it was fetched from.
temp_folder = "temp/"

# exist_ok=True makes this safe whether or not the folder already exists,
# so it only has to be done once, before the loop.
makedirs(temp_folder, exist_ok=True)

headers = {"User-Agent": "Mozilla/5.0"}

files = {}

# Read the links from `result.data`, like the findings cell does, so this cell
# also works when the result was reloaded from the pickle file.
for k, link in enumerate(result.data.links):
    filename = join(temp_folder, f"page_{k}.html")

    # Fetch. A separate name is used for the HTTP response so the model answer
    # stored in `response` by an earlier cell is not overwritten.
    try:
        page_response = requests.get(link, headers=headers, timeout=10)
    except requests.exceptions.Timeout:
        print(f"Timeout occurred for {link}.")
        continue
    except requests.exceptions.RequestException as e:
        # Connection errors, invalid URLs, too many redirects, ... — these are
        # network problems, not problems with writing the file.
        print(f"Request failed for {link}: {e}")
        continue

    # Save. The body is stored whatever the HTTP status is; very short pages are
    # re-fetched by the conversion cell, which uses the word count to decide.
    try:
        with open(filename, "w", encoding="utf-8") as f:
            f.write(page_response.text)
    except IOError as e:
        print(f"Failed to save {filename}: {e}")
        continue

    print(f"Saved: {filename}")
    files[basename(filename)] = link

In [None]:
md = MarkItDown()
min_word_threshold = 1000
max_word_threshold = 250000

for filename in files:
    link = files[filename]
    filepath = join(temp_folder, filename)
    with open(filepath, "r", encoding="utf-8") as f:
        file_content = f.read()
        count = word_count(file_content)

        if count < max_word_threshold:
            if count < min_word_threshold:
                print(f"Word count: {count} -> Use crawl4ai")
                markdown_output = await crawl4ai_website_async(link)
            else:
                markdown_output = md.convert(filepath).text_content

            store.put(markdown_output)    
        

In [None]:
# Number of documents currently queued in the store.
print(store.count)
# Shallow copy with its own queue, so items can be taken from the copy
# without draining the original store.
store2 = copy.copy(store)

In [None]:
# Size of the copy before an item is taken from it.
print(store2.count)
# get() removes and returns the oldest entry (None when the copy is empty);
# only `store2` is consumed here.
console_print(store2.get())