## Getting Started

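Before running the sample, make sure the `user_functions` module imported by the first code cell is importable from the notebook, and install the required packages: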
```bash
# python-dotenv, packaging and pandas are imported directly by the notebook's first code cell.
pip install azure-ai-projects azure-identity azure-ai-evaluation azure-ai-agents python-dotenv packaging pandas
```

Set these environment variables with your own values:
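1. **PROJECT_ENDPOINT** - The project endpoint, as found in the overview page of your Azure AI Foundry project. With `azure-ai-projects` 1.0.0b10 or an earlier beta, the client is created from **PROJECT_CONNECTION_STRING** instead, so set that variable as well.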
2. **MODEL_DEPLOYMENT_NAME** - The deployment name of the AI model.
3. **AZURE_OPENAI_ENDPOINT** - Azure OpenAI Endpoint to be used for evaluation.
4. **AZURE_OPENAI_API_KEY** - Azure OpenAI Key to be used for evaluation.
5. **AZURE_OPENAI_API_VERSION** - Azure OpenAI API version.
6. **AZURE_SUBSCRIPTION_ID** - Azure Subscription Id.
7. **PROJECT_NAME** - Azure AI Project Name.
8. **RESOURCE_GROUP_NAME** - Azure AI Project Resource Group Name.

# Create Agent

In [1]:
import os
import json
import pandas as pd
import time
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential
from user_functions import user_functions
from dotenv import load_dotenv

load_dotenv()

from azure.ai.projects import __version__ as projects_version
from packaging.version import Version

# Pick the API style from the installed azure-ai-projects version: anything
# newer than 1.0.0b10, or any "1.0.0a" build, takes the azure.ai.agents path.
installed_version = Version(projects_version)
updated_agents = installed_version > Version("1.0.0b10") or projects_version.startswith("1.0.0a")

AGENT_NAME = "Foundry-Evaluations-PythonSDK"

if not updated_agents:
    # Older style: tool models come from azure.ai.projects and the client is
    # built from a connection string.
    from azure.ai.projects.models import FunctionTool, ToolSet
    project_client = AIProjectClient.from_connection_string(
        credential=DefaultAzureCredential(),
        conn_str=os.environ["PROJECT_CONNECTION_STRING"],
    )
else:
    # Newer style: tool models come from azure.ai.agents and the client is
    # built from the project endpoint.
    from azure.ai.agents.models import FunctionTool, ToolSet
    project_client = AIProjectClient(
        endpoint=os.environ["PROJECT_ENDPOINT"],
        credential=DefaultAzureCredential(),
    )

# Wrap the user-defined functions as a tool and register it in the toolset
# that is handed to the agent at creation time.
functions = FunctionTool(user_functions)
toolset = ToolSet()
toolset.add(functions)

### Create Agent

In [2]:
# Collect the agent definition first, then create the agent from it.
agent_settings = {
    "model": os.environ["MODEL_DEPLOYMENT_NAME"],
    "name": AGENT_NAME,
    "instructions": "You are a helpful assistant",
    "toolset": toolset,
}
agent = project_client.agents.create_agent(**agent_settings)

print(f"Created agent, ID: {agent.id}")

Created agent, ID: asst_Em2vdzoimDXnQmrjimSZmuhJ


### Create Thread

In [3]:
# Create the conversation thread; only the method location differs between
# the two API styles selected by `updated_agents`.
thread = (
    project_client.agents.threads.create()
    if updated_agents
    else project_client.agents.create_thread()
)
print(f"Created thread, ID: {thread.id}")

Created thread, ID: thread_4Q57RJsauv6kMfmTACUgD8JU


## Conversation with Agent

Use the cells below to have a conversation with the agent:
- `Create Message` - Creates a message in the thread
- `Execute` - Runs the agent on the thread

### Create Message

In [4]:
# Post the user's request to the thread.

MESSAGE = "Can you send me an email with weather information for Seattle?"

# Both API styles take the same keyword arguments here; only the method
# location differs, so resolve the method once and call it a single time.
create_message_fn = (
    project_client.agents.messages.create
    if updated_agents
    else project_client.agents.create_message
)
message = create_message_fn(thread_id=thread.id, role="user", content=MESSAGE)

print(f"Created message, ID: {message.id}")

Created message, ID: msg_6yrrFL7d09z5is48PkbUAMOn


### Execute

In [5]:
# Run the agent on the thread and service the function-tool calls it requests.
#
# The two API styles differ only in where the model classes are imported from
# and in the names of the run operations, so both are resolved once here and a
# single polling loop is shared by the two styles.
if updated_agents:
    from azure.ai.agents.models import (
        FunctionTool,
        ListSortOrder,
        RequiredFunctionToolCall,
        SubmitToolOutputsAction,
        ToolOutput,
    )
    create_run = project_client.agents.runs.create
    get_run = project_client.agents.runs.get
    cancel_run = project_client.agents.runs.cancel
    submit_tool_outputs = project_client.agents.runs.submit_tool_outputs
else:
    from azure.ai.projects.models import (
        FunctionTool,
        ListSortOrder,
        RequiredFunctionToolCall,
        SubmitToolOutputsAction,
        ToolOutput,
    )
    create_run = project_client.agents.create_run
    get_run = project_client.agents.get_run
    cancel_run = project_client.agents.cancel_run
    submit_tool_outputs = project_client.agents.submit_tool_outputs_to_run

run = create_run(thread_id=thread.id, agent_id=agent.id)

# Poll once per second until the run leaves its active states.
while run.status in ["queued", "in_progress", "requires_action"]:
    time.sleep(1)
    run = get_run(thread_id=thread.id, run_id=run.id)

    if run.status == "requires_action" and isinstance(run.required_action, SubmitToolOutputsAction):
        tool_calls = run.required_action.submit_tool_outputs.tool_calls
        if not tool_calls:
            print("No tool calls provided - cancelling run")
            cancel_run(thread_id=thread.id, run_id=run.id)
            break

        tool_outputs = []
        for tool_call in tool_calls:
            if isinstance(tool_call, RequiredFunctionToolCall):
                print(f"Executing tool call: {tool_call}")
                try:
                    output = functions.execute(tool_call)
                except Exception as e:
                    print(f"Error executing tool_call {tool_call.id}: {e}")
                    # Report the failure as this call's output instead of
                    # dropping it. With no output for the call, the run stays
                    # in "requires_action" and every poll would execute the
                    # tools again (including ones with side effects).
                    output = json.dumps(
                        {"error": f"Error executing function '{tool_call.function.name}': {e}"}
                    )
                tool_outputs.append(
                    ToolOutput(
                        tool_call_id=tool_call.id,
                        output=output,
                    )
                )

        print(f"Tool outputs: {tool_outputs}")
        if tool_outputs:
            submit_tool_outputs(thread_id=thread.id, run_id=run.id, tool_outputs=tool_outputs)
        else:
            # None of the requested calls were function tool calls, so there is
            # nothing to submit and the run cannot make progress from here.
            print("No function tool outputs to submit - cancelling run")
            cancel_run(thread_id=thread.id, run_id=run.id)
            break

print(f"Run status: {run.status}")

print(f"Run finished with status: {run.status}")

if run.status == "failed":
    print(f"Run failed: {run.last_error}")

print(f"Run ID: {run.id}")

Executing tool call: {'id': 'call_rhF5roni4yfF9kIc10EKwy6A', 'type': 'function', 'function': {'name': 'fetch_weather', 'arguments': '{"location":"Seattle"}'}}
Tool outputs: [{'tool_call_id': 'call_rhF5roni4yfF9kIc10EKwy6A', 'output': '{"weather": "Rainy, 14\\u00b0C"}'}]
Executing tool call: {'id': 'call_S1gDCCLLq28l2QsZFhDWf951', 'type': 'function', 'function': {'name': 'send_email', 'arguments': '{"recipient":"your-email@example.com","subject":"Seattle Weather Update","body":"Today\'s weather in Seattle is Rainy with a temperature of 14°C."}'}}
Sending email to your-email@example.com...
Subject: Seattle Weather Update
Body:
Today's weather in Seattle is Rainy with a temperature of 14°C.
Tool outputs: [{'tool_call_id': 'call_S1gDCCLLq28l2QsZFhDWf951', 'output': '{"message": "Email successfully sent to your-email@example.com."}'}]
Run status: RunStatus.COMPLETED
Run finished with status: RunStatus.COMPLETED
Run ID: run_hPSjao52YDIv5XpXxXYUlUMx


### List Messages

In [6]:
# Fetch the thread's messages in ascending order. The newer API style is
# iterated directly, while the older one exposes the messages under `.data`.
if updated_agents:
    thread_messages = project_client.agents.messages.list(thread.id, order="asc")
else:
    thread_messages = project_client.agents.list_messages(thread.id, order="asc").data

# A dedicated loop variable keeps `message` (the user message created in the
# "Create Message" cell) from being overwritten by this listing.
for thread_message in thread_messages:
    # Indexing content[0].text directly raises when a message has no content
    # parts or its first part carries no text, so take the first part that has text.
    text_parts = [
        part for part in (thread_message.content or [])
        if getattr(part, "text", None) is not None
    ]
    content_text = text_parts[0].text.value if text_parts else "<no text content>"

    print(f"Role: {thread_message.role}")
    print(f"Content: {content_text}")
    print("-" * 40)

Role: MessageRole.USER
Content: Can you send me an email with weather information for Seattle?
----------------------------------------
Role: MessageRole.AGENT
Content: I sent you an email with the weather information for Seattle. It shows today’s weather is rainy with a temperature of 14°C. Let me know if you need anything else!
----------------------------------------


# Evaluate

### Get data from agent

In [7]:
import json
from azure.ai.evaluation import AIAgentConverter

# Identify the conversation and the specific run to convert.
thread_id, run_id = thread.id, run.id

# Build a converter backed by the project client and convert that run.
converter = AIAgentConverter(project_client)
converted_data = converter.convert(thread_id=thread_id, run_id=run_id)

# Show the converted data as indented JSON.
converted_json = json.dumps(converted_data, indent=4)
print(converted_json)

Class AIAgentConverter: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class FDPAgentDataRetriever: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AIAgentDataRetriever: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


{
    "query": [
        {
            "role": "system",
            "content": "You are a helpful assistant"
        },
        {
            "createdAt": "2025-12-04T11:14:22Z",
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Can you send me an email with weather information for Seattle?"
                }
            ]
        }
    ],
    "response": [
        {
            "createdAt": "2025-12-04T11:14:25Z",
            "run_id": "run_hPSjao52YDIv5XpXxXYUlUMx",
            "role": "assistant",
            "content": [
                {
                    "type": "tool_call",
                    "tool_call_id": "call_rhF5roni4yfF9kIc10EKwy6A",
                    "name": "fetch_weather",
                    "arguments": {
                        "location": "Seattle"
                    }
                }
            ]
        },
        {
            "createdAt": "2025-12-04T11:14:26Z",
    

In [8]:
# Write the thread's evaluation data to a JSONL file for the batch evaluation.

file_name = "evaluation_data.jsonl"
evaluation_data = converter.prepare_evaluation_data(
    filename=file_name,
    thread_ids=thread.id,
)

In [9]:
# load_dotenv() was already called in the first code cell; this repeats the
# call, and the cell displays its return value.
load_dotenv()

True

### Setting up evaluators

In [10]:
from azure.ai.evaluation import ToolCallAccuracyEvaluator, AzureOpenAIModelConfiguration, IntentResolutionEvaluator, TaskAdherenceEvaluator
from pprint import pprint

# Azure OpenAI settings handed to each evaluator below, read from the environment.
model_config = AzureOpenAIModelConfiguration(
    **{
        "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
        "api_key": os.environ["AZURE_OPENAI_API_KEY"],
        "api_version": os.environ["AZURE_OPENAI_API_VERSION"],
        "azure_deployment": os.environ["MODEL_DEPLOYMENT_NAME"],
    }
)

# Azure AI Project configuration (needed for content safety evaluators).
# It is not passed to any of the evaluators created in this cell.
azure_ai_project = {
    key: os.environ[env_var]
    for key, env_var in (
        ("subscription_id", "AZURE_SUBSCRIPTION_ID"),
        ("project_name", "PROJECT_NAME"),
        ("resource_group_name", "RESOURCE_GROUP_NAME"),
    )
}

# Create the three evaluators, all sharing the same model configuration.
tool_call_accuracy, intent_resolution, task_adherence = (
    evaluator_cls(model_config=model_config)
    for evaluator_cls in (
        ToolCallAccuracyEvaluator,
        IntentResolutionEvaluator,
        TaskAdherenceEvaluator,
    )
)

Class ToolCallAccuracyEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntentResolutionEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class TaskAdherenceEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


In [11]:
# Try one evaluator directly on the converted run before the batch evaluation.
single_eval_inputs = {
    key: converted_data[key] for key in ("query", "response", "tool_definitions")
}
tool_call_accuracy(**single_eval_inputs)

{'tool_call_accuracy': 5.0,
 'gpt_tool_call_accuracy': 5.0,
 'tool_call_accuracy_result': 'pass',
 'tool_call_accuracy_threshold': 3,
 'tool_call_accuracy_reason': "Let's think step by step: The user's last query was to send an email with weather information for Seattle. Based on the tool definitions, the relevant tools to address this query are 'fetch_weather' to retrieve the weather information for Seattle and 'send_email' to send the email with the retrieved weather information. The agent made two tool calls: one to 'fetch_weather' with the correct parameter 'location: Seattle', which returned the correct weather information, and another to 'send_email' with the correct parameters 'recipient', 'subject', and 'body', all grounded in the conversation. Both tool calls were relevant, efficient, and executed successfully without errors. No unnecessary or excessive tool calls were made, and no tool calls were missing. Therefore, the tool calls made by the agent fully addressed the user's 

### Run Evaluators

In [12]:
from azure.ai.evaluation import evaluate

# Run all three evaluators over the JSONL file written earlier. The project
# endpoint is passed as `azure_ai_project`, and the result's "studio_url"
# entry is printed afterwards.
response = evaluate(
    data=file_name,
    evaluators={
        "tool_call_accuracy": tool_call_accuracy,
        "intent_resolution": intent_resolution,
        "task_adherence": task_adherence
    },
    azure_ai_project=os.environ["PROJECT_ENDPOINT"],
)
# Use print rather than pprint: pprint shows the string's repr and wraps a long
# URL into separate quoted fragments, which makes it awkward to read or copy.
print(f'AI Foundry URL: {response.get("studio_url")}')

2025-12-04 16:45:01 +0530   25944 execution.bulk     INFO     Finished 1 / 7 lines.
2025-12-04 16:45:01 +0530   25944 execution.bulk     INFO     Average execution time for completed lines: 5.46 seconds. Estimated time for incomplete lines: 32.76 seconds.
2025-12-04 16:45:01 +0530   25944 execution.bulk     INFO     Finished 2 / 7 lines.
2025-12-04 16:45:01 +0530   25944 execution.bulk     INFO     Average execution time for completed lines: 2.77 seconds. Estimated time for incomplete lines: 13.85 seconds.
2025-12-04 16:45:01 +0530   43052 execution.bulk     INFO     Finished 1 / 7 lines.
2025-12-04 16:45:01 +0530   43052 execution.bulk     INFO     Average execution time for completed lines: 5.72 seconds. Estimated time for incomplete lines: 34.32 seconds.
2025-12-04 16:45:01 +0530   43052 execution.bulk     INFO     Finished 2 / 7 lines.
2025-12-04 16:45:01 +0530   43052 execution.bulk     INFO     Average execution time for completed lines: 2.87 seconds. Estimated time for incomplet

Aggregated metrics for evaluator is not a dictionary will not be logged as metrics
Aggregated metrics for evaluator is not a dictionary will not be logged as metrics
Aggregated metrics for evaluator is not a dictionary will not be logged as metrics



Run name: "tool_call_accuracy_20251204_111455_775601"
Run status: "Completed"
Start time: "2025-12-04 11:14:55.775601+00:00"
Duration: "0:00:11.503661"


{
    "tool_call_accuracy": {
        "status": "Completed",
        "duration": "0:00:11.503661",
        "completed_lines": 7,
        "failed_lines": 0,
        "log_path": null,
        "error_message": null,
        "error_code": null
    },
    "intent_resolution": {
        "status": "Completed",
        "duration": "0:00:08.494525",
        "completed_lines": 7,
        "failed_lines": 0,
        "log_path": null,
        "error_message": null,
        "error_code": null
    },
    "task_adherence": {
        "status": "Completed",
        "duration": "0:00:07.468260",
        "completed_lines": 7,
        "failed_lines": 0,
        "log_path": null,
        "error_message": null,
        "error_code": null
    }
}


('AI Foundry URL: '
 'https://ai.azure.com/resource/build/evaluation/e9176d4f-7293-4fa0-abb5-30c4eac21351?wsid

{'metrics': {'intent_resolution.binary_aggregate': 1.0,
             'intent_resolution.gpt_intent_resolution': 3.7142857142857144,
             'intent_resolution.intent_resolution': 3.7142857142857144,
             'intent_resolution.intent_resolution_completion_tokens': 54.285714285714285,
             'intent_resolution.intent_resolution_prompt_tokens': 1921.5714285714287,
             'intent_resolution.intent_resolution_threshold': 3.0,
             'intent_resolution.intent_resolution_total_tokens': 1975.857142857143,
             'task_adherence.binary_aggregate': 1.0,
             'task_adherence.task_adherence': 1.0,
             'tool_call_accuracy.binary_aggregate': 1.0,
             'tool_call_accuracy.gpt_tool_call_accuracy': 5.0,
             'tool_call_accuracy.tool_call_accuracy': 5.0,
             'tool_call_accuracy.tool_call_accuracy_completion_tokens': 407.14285714285717,
             'tool_call_accuracy.tool_call_accuracy_prompt_tokens': 3026.8571428571427,
      

In [15]:
# Pretty-print the full result returned by evaluate() in the "Run Evaluators"
# cell; `pprint` is imported in the "Setting up evaluators" cell.
pprint(response)

### Cleanup

In [14]:
# Optional: Delete the agent when done
# project_client.agents.delete_agent(agent.id)
# print(f"Deleted agent: {agent.id}")