In [1]:
!gcloud auth login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=PQQrVSNVxh0pI6M4Botw6XtdMgRhrP&access_type=offline&code_challenge=mkaFsQA8NMnJ8JWIBhXIpvqHOfi10QVmdqHRxG0QspU&code_challenge_method=S256


You are now logged in as [duyguider@google.com].
Your current project is [claims-assistant-fsa].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


Updates are available for some Google Cloud CLI components.  To install them,
please run:
  $ gcloud components update





In [None]:
import os
import re
import requests
import json
from agents import Runner, Agent
from datetime import datetime
from agents.sessions.in_memory_session_service import InMemorySessionService
from agents.artifacts.in_memory_artifact_service import InMemoryArtifactService
from google.generativeai import GenerativeModel, Part, Content, GenerationConfig  # Vertex AI Gemini
import google.cloud.aiplatform as aiplatform


# --- Utility Functions ---

def extract_responses_from_code(agent_code: str) -> list:
    """
    Extracts sample responses from the agent's code (Simplified Placeholder).

    Statically scans ``agent_code`` for ``text='...'`` / ``text="..."``
    occurrences and returns the quoted contents in order of appearance.
    The code is never executed, so this is only a heuristic: responses that
    are built dynamically (f-strings, concatenation, variables) are not found,
    and escaped quotes inside a literal still end the match early.

    Args:
        agent_code: Source code of the agent to scan.

    Returns:
        A list with the text found between the quotes of every match
        (empty when nothing matches).
    """
    # The closing quote must be the same character as the opening one (\1).
    # Accepting either quote there would cut "it's fine" down to "it".
    pattern = r"text=(['\"])(.*?)\1"
    return [found.group(2) for found in re.finditer(pattern, agent_code, re.DOTALL)]


def parse_evaluation(evaluation_text: str) -> dict:
    """Parses the evaluation text from the Gemini model into a score dict.

    Parsing is attempted in this order:

    1. The whole text as JSON.
    2. The outermost ``{...}`` span as JSON, which covers replies wrapped in
       Markdown code fences or surrounded by prose.
    3. A regex that looks for the four labels ``fluency``, ``coherence``,
       ``relevance`` and ``helpfulness`` (in that order, case-insensitive,
       possibly on separate lines), each followed by an integer.

    Args:
        evaluation_text: Raw text returned by the model.

    Returns:
        The parsed JSON object when one is found (keys are whatever the model
        produced), otherwise a dict with the four integer scores from the
        regex fallback, otherwise an empty dict.
    """
    text = (evaluation_text or "").strip()

    candidates = [text]
    embedded = re.search(r"\{.*\}", text, re.DOTALL)
    if embedded:
        candidates.append(embedded.group(0))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # A bare number or list is valid JSON but not a score mapping.
        if isinstance(parsed, dict):
            return parsed

    # Fallback: \W* tolerates quotes, colons, asterisks and whitespace between
    # a label and its value; DOTALL lets the labels sit on separate lines.
    match = re.search(
        r"fluency\W*(\d+).*?coherence\W*(\d+)"
        r".*?relevance\W*(\d+).*?helpfulness\W*(\d+)",
        text,
        re.IGNORECASE | re.DOTALL,
    )
    if not match:
        return {}

    labels = ("fluency", "coherence", "relevance", "helpfulness")
    return {label: int(value) for label, value in zip(labels, match.groups())}


def google_search(query: str) -> dict:
    """Performs a Google Search (using the Custom Search JSON API).

    Reads the API key from ``GOOGLE_SEARCH_API_KEY`` and the search engine id
    from ``SEARCH_ENGINE_ID``. Failures are reported with ``print`` and turned
    into an empty result rather than raised.

    Args:
        query: Free-text search query.

    Returns:
        The decoded JSON response, or an empty dict when an environment
        variable is missing, the request fails, or the body is not valid JSON.
    """
    try:
        api_key = os.environ['GOOGLE_SEARCH_API_KEY']
        engine_id = os.environ['SEARCH_ENGINE_ID']
    except KeyError as e:
        print(f"Missing environment variable: {e}")
        return {}

    try:
        # Passing the values through `params` URL-encodes them, so a query
        # containing '&', '#' or spaces can no longer corrupt the request.
        # The timeout keeps a stalled connection from hanging the evaluation.
        response = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={"key": api_key, "cx": engine_id, "q": query},
            timeout=10,
        )
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors on older `requests` versions.
        print(f"Error during Google Search: {e}")  # Log the error
        return {}


def extract_claims(text: str) -> list:
    """Extracts factual claims from the agent's response (Placeholder).

    Each sentence is treated as one claim. Sentences are split at whitespace
    that follows ``.``, ``?`` or ``!``; the two negative lookbehinds avoid
    splitting after patterns such as ``e.g.`` or ``Mr.``. Real-world claim
    extraction is much harder than this.

    Args:
        text: The agent response to split.

    Returns:
        The non-empty sentences with surrounding whitespace removed
        (an empty list for empty input).
    """
    if not text:
        return []
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s', text)
    # Drop blank fragments (e.g. from trailing whitespace): they carry no
    # claim, and an empty string is a substring of every search result.
    return [sentence.strip() for sentence in sentences if sentence.strip()]


def verify_claim(claim: str, search_results: dict) -> bool:
    """Verifies a claim against search results (Placeholder).

    The check is a case-insensitive substring match of the claim against the
    ``title`` and ``snippet`` of every entry in ``search_results['items']``.
    Real-world verification is far more complex.

    Args:
        claim: The sentence to look for.
        search_results: Search response dict; only its ``items`` list is read.

    Returns:
        True when the claim text occurs in any title or snippet. False when
        there are no results, no ``items`` key, the claim is blank, or
        nothing matches.
    """
    if not search_results or 'items' not in search_results:
        return False  # Unable to verify

    # Normalise once. Trailing sentence punctuation is dropped so that
    # "Paris is big." still matches a snippet that continues the sentence.
    needle = (claim or "").strip().rstrip('.?!').strip().lower()
    if not needle:
        # "" is a substring of every string, so a blank claim would
        # otherwise always count as verified.
        return False

    for item in search_results['items']:
        title = (item.get('title') or '').lower()
        snippet = (item.get('snippet') or '').lower()
        if needle in title or needle in snippet:
            return True
    return False


# --- Tool Definitions ---

def generation_evaluator(agent_code: str, sample_interactions: list = None) -> dict:
    """Scores agent responses for fluency, coherence, relevance and helpfulness.

    The responses to grade are the ``'output'`` values of
    ``sample_interactions`` when that argument is given; when it is ``None``
    they are scraped from ``agent_code`` with ``extract_responses_from_code``.
    Each response is sent to the Gemini model in its own prompt and the reply
    is handed to ``parse_evaluation``.

    Args:
        agent_code: Source code of the agent under evaluation.
        sample_interactions: Optional list of dicts with an ``'output'`` key.

    Returns:
        A dict keyed ``response_<index>``. Each value is the result of
        ``parse_evaluation``, or all-zero scores when the model call or the
        parsing raised.
    """
    model = GenerativeModel("gemini-1.5-pro-002")

    if sample_interactions is not None:
        sample_responses = [turn['output'] for turn in sample_interactions]
    else:
        sample_responses = extract_responses_from_code(agent_code)

    results = {}
    for index, reply in enumerate(sample_responses):
        key = f"response_{index}"
        prompt = f"""Evaluate the following response based on fluency, coherence, relevance, and helpfulness.  Provide a score from 0 to 10 for each, where 10 is best.

        Response:
        {reply}

        Scores (in JSON format):
        """
        try:
            evaluation = model.generate_content(prompt)
            results[key] = parse_evaluation(evaluation.text)
        except Exception as e:
            # Best effort: a failed call is reported and scored as zero so
            # that the remaining responses are still evaluated.
            print(f"Error during generation evaluation: {e}")
            results[key] = {"fluency": 0, "coherence": 0, "relevance": 0, "helpfulness": 0}
    return results


def context_evaluator(agent_code: str, sample_interactions: list) -> float:
    """Evaluates the agent's ability to maintain and use context.

    Placeholder implementation: the agent code is not executed. The recorded
    interactions are replayed in order; for each turn the Gemini model is
    shown the conversation so far plus the current input/output pair and is
    asked for a 0-10 rating.

    Args:
        agent_code: Source code of the agent (currently unused).
        sample_interactions: List of dicts with ``'input'`` and ``'output'``.

    Returns:
        The mean rating over all interactions, or 0.0 when there are none.
        A turn whose model call fails or whose reply contains no number
        contributes 0.
    """
    if not sample_interactions:
        return 0.0

    model = GenerativeModel("gemini-1.5-pro-002")

    conversation_history = ""
    total_context_score = 0.0

    for interaction in sample_interactions:
        user_input = interaction['input']
        agent_response = interaction['output']

        prompt = f"""
        You are evaluating an AI agent's ability to maintain context in a conversation.
        Here is the conversation history so far:
        {conversation_history}

        The user's latest input is:
        {user_input}

        The agent's response is:
        {agent_response}

        Rate the agent's response on a scale of 0 to 10 (10 being best) based on how well it uses and maintains context from the previous conversation. Return a single number.
        """

        try:
            evaluation = model.generate_content(prompt)
            reply = evaluation.text
            # Take the first number in the reply instead of int(reply):
            # answers such as "8.5", "8/10" or "Score: 8" are common and
            # would otherwise be discarded as parse errors.
            rating = re.search(r"\d+(?:\.\d+)?", reply)
            if rating is None:
                raise ValueError(f"no numeric score in model reply: {reply!r}")
            # Clamp to the 0-10 scale requested in the prompt.
            total_context_score += min(10.0, max(0.0, float(rating.group())))
        except Exception as e:
            # Best effort: the turn scores 0 and evaluation continues.
            print(f"Error during evaluation: {e}")

        # Update conversation history
        conversation_history += f"User: {user_input}\nAgent: {agent_response}\n"

    return total_context_score / len(sample_interactions)



def groundness_evaluator(agent_code: str, sample_interactions: list) -> float:
    """Evaluates the factual accuracy of the agent's responses.

    Every ``'output'`` in ``sample_interactions`` is split into claims with
    ``extract_claims``; each claim is searched with ``google_search`` and
    checked with ``verify_claim``. A verified claim scores 10, anything else
    (including a claim whose check raised) scores 0.

    Args:
        agent_code: Source code of the agent (currently unused).
        sample_interactions: List of dicts with an ``'output'`` key.

    Returns:
        The mean claim score, or 0.0 when there are no interactions or no
        non-blank claims.
    """
    if not sample_interactions:
        return 0.0

    scores = []
    for interaction in sample_interactions:
        for claim in extract_claims(interaction['output']):
            # Skip blank fragments: searching for them wastes an API call,
            # and an empty string matches every result, inflating the score.
            if not claim or not claim.strip():
                continue
            try:
                grounded = verify_claim(claim, google_search(claim))
            except Exception as e:
                print(f"Error during groundness evaluation: {e}")
                grounded = False  # Assume ungrounded if error
            scores.append(10 if grounded else 0)

    return sum(scores) / len(scores) if scores else 0.0


def function_caller_evaluator(agent_code: str, sample_interactions: list) -> float:
    """Evaluates the agent's ability to select and use the correct functions.

    Placeholder implementation: the agent code is not executed. For each
    recorded interaction the Gemini model is shown the input/output pair and
    asked for a 0-10 rating of the function usage.

    Args:
        agent_code: Source code of the agent (currently unused).
        sample_interactions: List of dicts with ``'input'`` and ``'output'``.

    Returns:
        The mean rating over all interactions, or 0.0 when there are none.
        An interaction whose model call fails or whose reply contains no
        number contributes 0.
    """
    if not sample_interactions:
        return 0.0

    model = GenerativeModel("gemini-1.5-pro-002")
    total_function_score = 0.0

    for interaction in sample_interactions:
        user_input = interaction['input']
        agent_response = interaction['output']

        prompt = f"""You are evaluating an AI agent's ability to use function calls.
        User Input: {user_input}
        Agent response: {agent_response}
        Based on the user input, analyze agent's response and rate from 0 to 10 if used the correct functions to get the response (where 10 is best). Return a single number.
        """
        try:
            evaluation = model.generate_content(prompt)
            reply = evaluation.text
            # Take the first number in the reply instead of int(reply):
            # answers such as "8.5", "8/10" or "Score: 8" are common and
            # would otherwise be discarded as parse errors.
            rating = re.search(r"\d+(?:\.\d+)?", reply)
            if rating is None:
                raise ValueError(f"no numeric score in model reply: {reply!r}")
            # Clamp to the 0-10 scale requested in the prompt.
            total_function_score += min(10.0, max(0.0, float(rating.group())))
        except Exception as e:
            # Best effort: the interaction scores 0 and evaluation continues.
            print(f"Error during evaluation: {e}")

    return total_function_score / len(sample_interactions)




# --- Agent Definition ---
# Callables handed to the judge agent as tools. groundness_evaluator already
# calls google_search directly; listing it here as well presumably lets the
# model invoke search on its own (confirm against the agents library).
_judge_tools = [
    generation_evaluator,
    context_evaluator,
    groundness_evaluator,
    function_caller_evaluator,
    google_search,
]

# The judge: an LLM agent that is instructed to run the evaluators and report
# per-dimension scores plus their average.
judge_agent = Agent(
    name="AgentEvaluator",
    model="gemini-1.5-pro-002",  # Use a strong model
    instruction="""
        You are an agent designed to evaluate the quality and correctness of other AI agents.
        You will receive the source code of a target agent as input, and optionally, sample interactions.
        Analyze the code and use the provided tools to assess its:
        - Generation quality (fluency, coherence, relevance, helpfulness)
        - Context handling (ability to maintain conversation state)
        - Groundness (factual accuracy)
        - Function selection (correct use of tools)

        Report the individual scores from each tool and the final score. The final score should be the average of generation, context, groundness and function caller scores.  Each of those scores should be a single number between 0 and 10.
    """,
    tools=_judge_tools,
    flow='sequential',
)

# --- Agent Initialization ---
# In-memory backends: sessions and artifacts live only in this process.
session_service = InMemorySessionService()
artifact_service = InMemoryArtifactService()

# The runner ties the judge agent to those services under one app name.
runner = Runner(
    agent=judge_agent,
    app_name="AgentEvaluator",
    session_service=session_service,
    artifact_service=artifact_service,
)

# Single session shared by every run_prompt call in this module.
session = session_service.create(user_id="1", app_name="AgentEvaluator")


def run_prompt(agent_code: str, sample_interactions: list = None):
    """Runs the judge agent on ``agent_code`` and returns its last content.

    The code is wrapped in a user-role ``Content`` message and sent through
    the module-level ``runner`` using the module-level ``session``.
    ``sample_interactions`` is forwarded under ``tool_args``.

    Args:
        agent_code: Source code of the agent to evaluate.
        sample_interactions: Optional recorded interactions for the tools.

    Returns:
        The ``content`` of the last event that had any, or ``None`` when no
        event carried content.
    """
    user_message = Content(role='user', parts=[Part.from_text(agent_code)])

    events = runner.run(
        session=session,
        new_message=user_message,
        tool_args={"sample_interactions": sample_interactions},
    )

    last_content = None
    for event in events:
        # Events without content are skipped; the latest one with content wins.
        if not event.content:
            continue
        last_content = event.content

    return last_content


# --- Main Execution ---
# --- Main Execution ---
# Script entry point: configure credentials, load the target agent's source,
# run the judge agent on it with a few sample interactions and print the
# report (plus the individual scores when they can be parsed out of it).
if __name__ == "__main__":
    # --- Setup API Keys and Vertex AI ---
    # Best practice: load from environment variables. setdefault keeps any
    # value that is already exported; the placeholders are only fallbacks and
    # must be replaced (or the variables exported) before a real run.
    os.environ.setdefault("GOOGLE_API_KEY", "YOUR_GOOGLE_API_KEY")  # For Gemini API
    os.environ.setdefault("GOOGLE_SEARCH_API_KEY", "YOUR_GOOGLE_SEARCH_API_KEY")  # For Google Custom Search API
    os.environ.setdefault("SEARCH_ENGINE_ID", "YOUR_SEARCH_ENGINE_ID")  # Your Custom Search Engine ID
    os.environ.setdefault("GOOGLE_CLOUD_PROJECT", "YOUR_GOOGLE_CLOUD_PROJECT")  # Your project id
    os.environ.setdefault("GOOGLE_CLOUD_LOCATION", "us-central1")  # The region

    # Initialize Vertex AI
    aiplatform.init(project=os.environ["GOOGLE_CLOUD_PROJECT"], location=os.environ["GOOGLE_CLOUD_LOCATION"])

    # --- Load Target Agent's Code and sample interactions ---
    # Example usage (replace with the path to your agent's code):
    try:
        with open("target_agent.py", "r", encoding="utf-8") as f:
            target_agent_code = f.read()
    except FileNotFoundError:
        print("Error: target_agent.py not found. Please provide the correct path.")
        # SystemExit works everywhere; the exit() helper is only installed by
        # the site module / interactive shells.
        raise SystemExit(1)

    sample_interactions = [
        {'input': "What's the weather like today?",
         'output': "I'm sorry, I don't have the ability to look up weather information."},
        {'input': "What's 2 + 2?", 'output': "2 + 2 = 4"},
        {'input': "Can you tell me the capital of France?", 'output': "The capital of France is Paris."},
        {'input': "And what is the population of Paris?", 'output': "The population of Paris is about 2.1 million people."},  # Context question
        {'input': "What is the exchange rate from USD to EUR?", 'output': "The exchange rate of USD to EUR is bla bla."},  # Test the function caller
    ]

    # --- Run the Evaluation ---
    evaluation_result = run_prompt(target_agent_code, sample_interactions)

    # --- Process and Print Results ---
    if evaluation_result and evaluation_result.parts:
        # Extract the final text response
        final_text = evaluation_result.parts[0].text
        print("Evaluation Results:")
        print(final_text)

        # Attempt to extract scores. DOTALL is required because the report
        # normally puts each score on its own line and '.' does not match a
        # newline by default.
        score_pattern = (
            r"Generation quality score:\s*([\d\.]+)"
            r".*Context handling score:\s*([\d\.]+)"
            r".*Groundness score:\s*([\d\.]+)"
            r".*Function selection score:\s*([\d\.]+)"
            r".*Final score:\s*([\d\.]+)"
        )
        try:
            match = re.search(score_pattern, final_text, re.IGNORECASE | re.DOTALL)
            if match:
                # Convert everything first so a malformed number does not
                # leave a half-printed score table.
                generation, context, groundness, function, final = (
                    float(value) for value in match.groups()
                )
                print("\nExtracted Scores:")
                print(f"  Generation Quality: {generation}")
                print(f"  Context Handling: {context}")
                print(f"  Groundness: {groundness}")
                print(f"  Function Selection: {function}")
                print(f"  Final Score: {final}")
            else:
                print("\nCould not extract individual scores: response did not match the expected format.")
        except (TypeError, ValueError) as e:
            print(f"\nCould not extract individual scores: {e}")
    else:
        print("No evaluation results returned.")