In [1]:
import os
import time
from pprint import pprint
from os.path import join, exists
from os import listdir, makedirs
from datetime import datetime
from google import genai
from google.genai import types
from openai import OpenAI
import requests
import json
from pydantic import BaseModel, Field
import asyncio
import nest_asyncio 
from crawl4ai import *
from pydantic_ai import Agent, RunContext
from pydantic_ai.models.gemini import GeminiModel
from dataclasses import dataclass
# Add this line to allow nested event loops
nest_asyncio.apply()

In [5]:
flash_thinking_model = "gemini-2.0-flash-thinking-exp-01-21"
flash2_model = "gemini-2.0-flash-exp"
flash1_model = "gemini-1.5-flash"
model = GeminiModel(flash2_model)

# @dataclass
# class MainDependencies(BaseModel):
#     document: str
#     response: str

class ModelRating(BaseModel):
    rating: int = Field(description="The rating of the model (0 to 10).", ge=0, le=10)
    model_name: str = Field(description="The name of the model (if accessable).")
    evaluation_result: str = Field(description="Your assessment for this model. How good is the model? Describe it in detail.")
    
class Response(BaseModel):
    model_rating: list[ModelRating] = Field(description="A list of model ratings.")

system_prompt="""You goal is to test two reasoning models (available as tools). 
Come up with a chellenging task that is relatively easy to verify to you but difficult to solve for other models if you provide only the task.
You can try different tests, if you're not sure how good the models are. 
Provide a system instruction to the model that seems optimial for reasoning models. 
Avoid chain-of-though instructions. 
The models tend to perform better if they have more time to think.
Come up with a system prompt that makes them think longer.
"""

agent = Agent(
    model,
    result_type=Response,
    system_prompt=system_prompt
)


In [6]:
@agent.tool
def run_deepseek_r1_agent(query, system_instruction="You are a helpful assistent") -> str:
    """Uses the DeepSeek API to retrieve information based on a string query.
    This model is a reasoner model and uses thinking tokens to generate better results.
    Reasoning models tend to be better accross the board but especially in math, reasoning and coding.
    This model is very powerful, open-source and cheap to use (2$ per 1M output tokens).

    Args:
        query (str): The LLM query string.
        system_instruction (str, optional): The system instruction to use for the query. Defaults to "You are a helpful assistent". Adjust it, if it necessary.

    Returns:
        str: The output string of the DeepSeek R1 model. The returned string is the final answer of the AI model.
    
    """

    api_key = os.environ.get("OPENROUTER_API_KEY")

    if not api_key:
        print("OPENROUTER_API_KEY not found in environment variables.")
        return "OPENROUTER_API_KEY not found in environment variables."
    
    client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key,
    )
    
    completion = client.chat.completions.create(
    model="deepseek/deepseek-r1",
    messages=[
        {
            "role": "system",
            "content": system_instruction
        },        
        {
        "role": "user",
        "content": query
        }
    ]
    )
    return completion.choices[0].message.content      

In [7]:
@agent.tool
def run_genai_agent(query, system_instruction="You are a helpful assistent") -> str:
    """Uses the Gemini API to use the Gemini-2.0 Flash Thinking model.
    This model is a reasoner model and uses thinking tokens to generate better results.
    Reasoning models tend to be better accross the board but especially in math, reasoning and coding.
    This model is very powerful and free (1500 Req./day), has a context window of 1M tokens and can generate up to 65k tokens in one go.

    Args:
        query (str): The LLM query string.
        system_instruction (str, optional): The system instruction to use for the query. Defaults to "You are a helpful assistent". Adjust it, if it necessary.

    Returns:
        str: The output string of the Gemini-2.0 Flash Thinking model. The returned string is the final answer of the AI model.
    """
    # Only run this block for Gemini Developer API
    client = genai.Client()    
    flash_thinking_model = "gemini-2.0-flash-thinking-exp-01-21"
    response = client.models.generate_content(
        model=flash_thinking_model,
        contents=query,
        config=types.GenerateContentConfig(
            system_instruction=system_instruction,
            temperature=0.3,
        ),
    )
    return response.text

In [8]:
async def run_agent():
    result = await agent.run('Test the two models on chellenging tasks and give a rating.')
    return result
 
result = asyncio.run(run_agent())
print(result)

INFO:httpx:HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp:generateContent "HTTP/1.1 400 Bad Request"


UnexpectedModelBehavior: Unexpected response from gemini 400, body:
{
  "error": {
    "code": 400,
    "message": "* GenerateContentRequest.tools[0].function_declarations[0].parameters.properties[system_instruction].type: must be specified when not using one_of\n* GenerateContentRequest.tools[0].function_declarations[1].parameters.properties[system_instruction].type: must be specified when not using one_of\n",
    "status": "INVALID_ARGUMENT"
  }
}