## Getting Started

Before running the sample:
```bash
# python-dotenv, packaging and pandas are imported directly by the notebook cells below.
pip install azure-ai-projects azure-identity azure-ai-evaluation azure-ai-agents python-dotenv packaging pandas
```

Set these environment variables with your own values:
1. **PROJECT_ENDPOINT** - The project endpoint, as found in the overview page of your Azure AI Foundry project.
2. **MODEL_DEPLOYMENT_NAME** - The deployment name of the AI model.
3. **AZURE_OPENAI_ENDPOINT** - Azure OpenAI Endpoint to be used for evaluation.
4. **AZURE_OPENAI_API_KEY** - Azure OpenAI Key to be used for evaluation.
5. **AZURE_OPENAI_API_VERSION** - Azure OpenAI API version.
6. **AZURE_SUBSCRIPTION_ID** - Azure Subscription Id.
7. **PROJECT_NAME** - Azure AI Project Name.
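8. **RESOURCE_GROUP_NAME** - Azure AI Project Resource Group Name.
9. **PROJECT_CONNECTION_STRING** - The project connection string. Only required with `azure-ai-projects` 1.0.0b10 or earlier, where it is used instead of **PROJECT_ENDPOINT** to create the project client.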

# Create Agent

In [2]:
import os
import json
import pandas as pd
import time
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential
from user_functions import user_functions
from dotenv import load_dotenv

load_dotenv()

from azure.ai.projects import __version__ as projects_version
from packaging.version import Version

# Pick the API style from the installed azure-ai-projects version: anything newer
# than 1.0.0b10, or any version string starting with "1.0.0a", uses the
# endpoint-based client together with the models from azure.ai.agents.
updated_agents = Version(projects_version) > Version("1.0.0b10") or projects_version.startswith("1.0.0a")

if not updated_agents:
    # Older releases: tool models come from azure.ai.projects and the client is
    # built from a connection string.
    from azure.ai.projects.models import FunctionTool, ToolSet

    project_client = AIProjectClient.from_connection_string(
        credential=DefaultAzureCredential(),
        conn_str=os.environ["PROJECT_CONNECTION_STRING"],
    )
else:
    # Newer releases: tool models come from azure.ai.agents and the client is
    # built directly from the project endpoint.
    from azure.ai.agents.models import FunctionTool, ToolSet

    project_client = AIProjectClient(
        endpoint=os.environ["PROJECT_ENDPOINT"],
        credential=DefaultAzureCredential(),
    )

# Display name passed to `create_agent` when the agent is created.
AGENT_NAME = "Foundry-Evaluations-PythonSDK"

# Adding Tools to be used by Agent
# `functions` wraps whatever the local `user_functions` module exports; the same
# object is used later to execute the tool calls requested by a run.
functions = FunctionTool(user_functions)

# The toolset is what gets handed to `create_agent`.
toolset = ToolSet()
toolset.add(functions)

### Create Agent

In [3]:
# Create the agent on the configured model deployment, with the function toolset attached.
agent = project_client.agents.create_agent(
    name=AGENT_NAME,
    model=os.environ["MODEL_DEPLOYMENT_NAME"],
    toolset=toolset,
    instructions="You are a helpful assistant",
)

print(f"Created agent, ID: {agent.id}")

Created agent, ID: asst_uqmmsG13l3Rn1VBjA8bDxzG6


### Create Thread

In [4]:
# Open a new conversation thread through whichever API style is in use.
thread = (
    project_client.agents.threads.create()
    if updated_agents
    else project_client.agents.create_thread()
)
print(f"Created thread, ID: {thread.id}")

Created thread, ID: thread_bJyv1iqqrvFbUkZaZq668la3


## Conversation with Agent

Use the cells below to have a conversation with the agent:
- `Create Message` - Creates a message in the thread
- `Execute` - Runs the agent on the thread

### Create Message

In [5]:
# Create message to thread

MESSAGE = "Can you send me an email with weather information for Seattle?"

# Only the entry point differs between the two API styles; the arguments are the same.
if updated_agents:
    post_message = project_client.agents.messages.create
else:
    post_message = project_client.agents.create_message

message = post_message(thread_id=thread.id, role="user", content=MESSAGE)

print(f"Created message, ID: {message.id}")

Created message, ID: msg_2DnDVK0IuFf47m5IfQGexTSZ


### Execute

In [6]:
# Run the agent on the thread and service its function-tool requests until the run
# leaves the queued / in_progress / requires_action states.
#
# The two API styles differ only in where the models are imported from and in the
# names of the run operations, so those are selected up front and a single polling
# loop is shared by both.
if updated_agents:
    from azure.ai.agents.models import (
        FunctionTool,
        ListSortOrder,
        RequiredFunctionToolCall,
        SubmitToolOutputsAction,
        ToolOutput,
    )

    create_run = project_client.agents.runs.create
    get_run = project_client.agents.runs.get
    cancel_run = project_client.agents.runs.cancel
    submit_tool_outputs = project_client.agents.runs.submit_tool_outputs
else:
    from azure.ai.projects.models import (
        FunctionTool,
        ListSortOrder,
        RequiredFunctionToolCall,
        SubmitToolOutputsAction,
        ToolOutput,
    )

    create_run = project_client.agents.create_run
    get_run = project_client.agents.get_run
    cancel_run = project_client.agents.cancel_run
    submit_tool_outputs = project_client.agents.submit_tool_outputs_to_run

run = create_run(thread_id=thread.id, agent_id=agent.id)

while run.status in ["queued", "in_progress", "requires_action"]:
    time.sleep(1)
    run = get_run(thread_id=thread.id, run_id=run.id)

    # Nothing to do on this poll unless the run is waiting for tool outputs.
    if not (run.status == "requires_action" and isinstance(run.required_action, SubmitToolOutputsAction)):
        continue

    tool_calls = run.required_action.submit_tool_outputs.tool_calls
    if not tool_calls:
        print("No tool calls provided - cancelling run")
        cancel_run(thread_id=thread.id, run_id=run.id)
        break

    tool_outputs = []
    for tool_call in tool_calls:
        if not isinstance(tool_call, RequiredFunctionToolCall):
            continue
        try:
            print(f"Executing tool call: {tool_call}")
            output = functions.execute(tool_call)
        except Exception as e:
            # Report the failure back as the tool's output instead of dropping the
            # call: a call without an output would leave the run waiting in
            # requires_action and this loop polling indefinitely.
            print(f"Error executing tool_call {tool_call.id}: {e}")
            output = json.dumps({"error": f"Error executing tool_call {tool_call.id}: {e}"})
        tool_outputs.append(
            ToolOutput(
                tool_call_id=tool_call.id,
                output=output,
            )
        )

    print(f"Tool outputs: {tool_outputs}")
    if not tool_outputs:
        # None of the requested calls was a function call this cell can execute,
        # so the run cannot make progress; cancel rather than poll forever.
        print("No tool outputs produced - cancelling run")
        cancel_run(thread_id=thread.id, run_id=run.id)
        break

    submit_tool_outputs(thread_id=thread.id, run_id=run.id, tool_outputs=tool_outputs)

print(f"Run status: {run.status}")

print(f"Run finished with status: {run.status}")

if run.status == "failed":
    print(f"Run failed: {run.last_error}")

print(f"Run ID: {run.id}")

Executing tool call: {'id': 'call_MqydQYErhXM2yLMZefwlazJi', 'type': 'function', 'function': {'name': 'fetch_weather', 'arguments': '{"location":"Seattle"}'}}
Tool outputs: [{'tool_call_id': 'call_MqydQYErhXM2yLMZefwlazJi', 'output': '{"weather": "Rainy, 14\\u00b0C"}'}]
Executing tool call: {'id': 'call_HYm4LSNVDyqLaj7WLGUu8gQ0', 'type': 'function', 'function': {'name': 'convert_temperature', 'arguments': '{"celsius":14}'}}
Tool outputs: [{'tool_call_id': 'call_HYm4LSNVDyqLaj7WLGUu8gQ0', 'output': '{"fahrenheit": 57.2}'}]
Executing tool call: {'id': 'call_h9kKAz5yV9x7DxxS6539zayK', 'type': 'function', 'function': {'name': 'send_email', 'arguments': '{"recipient":"user@example.com","subject":"Weather Update for Seattle","body":"Hello,\\n\\nHere\'s the current weather update for Seattle:\\n\\n- Condition: Rainy\\n- Temperature: 14°C (57.2°F)\\n\\nStay prepared for the rain!\\n\\nBest regards,\\nYour Virtual Assistant"}'}}
Sending email to user@example.com...
Subject: Weather Update for S

### List Messages

In [7]:
# Fetch the thread's messages in ascending order; the older API style wraps them in `.data`.
if updated_agents:
    thread_messages = project_client.agents.messages.list(thread.id, order="asc")
else:
    thread_messages = project_client.agents.list_messages(thread.id, order="asc").data

# Print each message's role and the text of its first content item.
for message in thread_messages:
    print(f"Role: {message.role}")
    print(f"Content: {message.content[0].text.value}")
    print("-" * 40)

Role: MessageRole.USER
Content: Can you send me an email with weather information for Seattle?
----------------------------------------
Role: MessageRole.AGENT
Content: I have sent you an email with the current weather information for Seattle. Let me know if you need anything else!
----------------------------------------


# Evaluate

### Get data from agent

In [8]:
import json
from azure.ai.evaluation import AIAgentConverter

# Build the converter on top of the project client.
converter = AIAgentConverter(project_client)

# Identify the conversation and the run whose data should be converted.
thread_id, run_id = thread.id, run.id

converted_data = converter.convert(thread_id=thread_id, run_id=run_id)

# Show the converted payload as indented JSON.
print(json.dumps(converted_data, indent=4))

Class AIAgentConverter: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class FDPAgentDataRetriever: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AIAgentDataRetriever: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


{
    "query": [
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "createdAt": "2025-12-04T12:12:28Z",
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Can you send me an email with weather information for Seattle?"
                }
            ]
        }
    ],
    "response": [
        {
            "createdAt": "2025-12-04T12:12:30Z",
            "run_id": "run_cPnE25x7lHnZJxNq5rlGBfwg",
            "role": "assistant",
            "content": [
                {
                    "type": "tool_call",
                    "tool_call_id": "call_MqydQYErhXM2yLMZefwlazJi",
                    "name": "fetch_weather",
                    "arguments": {
                        "location": "Seattle"
                    }
                }
            ]
        },
        {
            "createdAt": "2025-12-04T12:12:32Z",
    

In [9]:
# Write the thread's evaluation data to a JSONL file; `file_name` is reused as the
# `data` argument of evaluate() later on.
file_name = "evaluation_data.jsonl"
evaluation_data = converter.prepare_evaluation_data(
    thread_ids=thread.id,
    filename=file_name,
)

In [10]:
# NOTE(review): load_dotenv() was already called in the setup cell; presumably it is
# repeated here so the evaluation cells pick up the Azure OpenAI settings when run
# on their own — confirm.
load_dotenv()

True

### Setting up evaluators

In [11]:
from azure.ai.evaluation import ToolCallAccuracyEvaluator, AzureOpenAIModelConfiguration, IntentResolutionEvaluator, TaskAdherenceEvaluator
from pprint import pprint

# Azure OpenAI settings shared by all three evaluators.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["MODEL_DEPLOYMENT_NAME"],
)

# Azure AI Project configuration (needed for content safety evaluators)
azure_ai_project = dict(
    subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
    project_name=os.environ["PROJECT_NAME"],
    resource_group_name=os.environ["RESOURCE_GROUP_NAME"],
)

# Initialize evaluators: one instance per metric, all using the same model configuration.
tool_call_accuracy, intent_resolution, task_adherence = (
    evaluator_cls(model_config=model_config)
    for evaluator_cls in (
        ToolCallAccuracyEvaluator,
        IntentResolutionEvaluator,
        TaskAdherenceEvaluator,
    )
)

Class ToolCallAccuracyEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntentResolutionEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class TaskAdherenceEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


In [12]:
# Test single evaluation: score the converted run directly, without going through evaluate().
tool_call_accuracy(
    query=converted_data["query"],
    response=converted_data["response"],
    tool_definitions=converted_data["tool_definitions"],
)

{'tool_call_accuracy': 5.0,
 'gpt_tool_call_accuracy': 5.0,
 'tool_call_accuracy_result': 'pass',
 'tool_call_accuracy_threshold': 3,
 'tool_call_accuracy_reason': "Let's think step by step: The user's last query was to send an email with weather information for Seattle. Based on the tool definitions, the relevant tools to address this query are 'fetch_weather' to get the weather information, 'convert_temperature' to convert the temperature from Celsius to Fahrenheit (if needed for the email content), and 'send_email' to send the email with the weather details. The agent made three tool calls: 'fetch_weather' with the correct parameter 'Seattle', 'convert_temperature' with the correct parameter '14' (extracted from the weather result), and 'send_email' with the correct parameters for recipient, subject, and body. All tool calls were executed successfully without errors, and no unnecessary or excessive tool calls were made. Therefore, the tool calls were fully relevant, efficient, and o

### Run Evaluators

In [13]:
from azure.ai.evaluation import evaluate

# Run the three evaluators over the JSONL file written earlier, passing the project
# endpoint as `azure_ai_project`.
response = evaluate(
    data=file_name,
    evaluators={
        "tool_call_accuracy": tool_call_accuracy,
        "intent_resolution": intent_resolution,
        "task_adherence": task_adherence
    },
    azure_ai_project=os.environ["PROJECT_ENDPOINT"],
)
# Use print rather than pprint: pprint emits the repr of the string (quoted and
# wrapped across lines), which makes the URL awkward to read and copy.
print(f'AI Foundry URL: {response.get("studio_url")}')

2025-12-04 17:43:23 +0530   25980 execution.bulk     INFO     Finished 2 / 8 lines.
2025-12-04 17:43:23 +0530   25980 execution.bulk     INFO     Average execution time for completed lines: 2.71 seconds. Estimated time for incomplete lines: 16.26 seconds.
2025-12-04 17:43:23 +0530   25980 execution.bulk     INFO     Finished 3 / 8 lines.
2025-12-04 17:43:23 +0530   25980 execution.bulk     INFO     Average execution time for completed lines: 1.82 seconds. Estimated time for incomplete lines: 9.1 seconds.
2025-12-04 17:43:23 +0530   25980 execution.bulk     INFO     Finished 4 / 8 lines.
2025-12-04 17:43:23 +0530   25980 execution.bulk     INFO     Average execution time for completed lines: 1.4 seconds. Estimated time for incomplete lines: 5.6 seconds.
2025-12-04 17:43:23 +0530   47656 execution.bulk     INFO     Finished 1 / 8 lines.
2025-12-04 17:43:23 +0530   47656 execution.bulk     INFO     Average execution time for completed lines: 5.68 seconds. Estimated time for incomplete lin

Aggregated metrics for evaluator is not a dictionary will not be logged as metrics
Aggregated metrics for evaluator is not a dictionary will not be logged as metrics



Run name: "tool_call_accuracy_20251204_121317_788381"
Run status: "Completed"
Start time: "2025-12-04 12:13:17.788381+00:00"
Duration: "0:00:09.507436"

2025-12-04 17:43:28 +0530   47656 execution.bulk     INFO     Finished 8 / 8 lines.
2025-12-04 17:43:28 +0530   47656 execution.bulk     INFO     Average execution time for completed lines: 1.33 seconds. Estimated time for incomplete lines: 0.0 seconds.


Aggregated metrics for evaluator is not a dictionary will not be logged as metrics



Run name: "task_adherence_20251204_121317_795183"
Run status: "Completed"
Start time: "2025-12-04 12:13:17.795183+00:00"
Duration: "0:00:11.519890"


{
    "tool_call_accuracy": {
        "status": "Completed",
        "duration": "0:00:09.507436",
        "completed_lines": 8,
        "failed_lines": 0,
        "log_path": null,
        "error_message": null,
        "error_code": null
    },
    "intent_resolution": {
        "status": "Completed",
        "duration": "0:00:07.543643",
        "completed_lines": 8,
        "failed_lines": 0,
        "log_path": null,
        "error_message": null,
        "error_code": null
    },
    "task_adherence": {
        "status": "Completed",
        "duration": "0:00:11.519890",
        "completed_lines": 8,
        "failed_lines": 0,
        "log_path": null,
        "error_message": null,
        "error_code": null
    }
}


('AI Foundry URL: '
 'https://ai.azure.com/resource/build/evaluation/bb21ee10-c2c9-422e-9ac0-d659cfe17d7d?wsid=/su

In [None]:
# Dump the full result object returned by evaluate().
print(response)

### Cleanup

In [15]:
# Optional: Delete the agent when done
# project_client.agents.delete_agent(agent.id)
# print(f"Deleted agent: {agent.id}")