### Evaluation

In [1]:
#
# Copyright (c) Microsoft. All rights reserved.
# To learn more, please visit the documentation - Quickstart: Azure Content Safety: https://aka.ms/acsstudiodoc
#
from dotenv import load_dotenv
import os

# Load settings through python-dotenv before reading them from the environment.
# Create a .env file that defines each variable named below, set to your own values.
load_dotenv()

# Azure AI Studio connection settings
AI_STUDIO_CONNECTION_ENDPOINT = os.environ.get("AI_STUDIO_CONNECTION_ENDPOINT")
AI_STUDIO_CONNECTION_KEY = os.environ.get("AI_STUDIO_CONNECTION_KEY")

# GPT-4o deployment settings (used to build model_config in the next cell)
GPT4o_API_KEY = os.environ.get("GPT4o_API_KEY")
GPT4o_DEPLOYMENT_ENDPOINT = os.environ.get("GPT4o_DEPLOYMENT_ENDPOINT")
GPT4o_DEPLOYMENT_NAME = os.environ.get("GPT4o_DEPLOYMENT_NAME")

# Azure AI Studio project coordinates (used to build azure_ai_project further down)
AZURE_SUBSCRIPTION_ID = os.environ.get("AZURE_SUBSCRIPTION_ID")
AZURE_AISTUDIO_PROJECT_RESOURCE_GROUP = os.environ.get("AZURE_AISTUDIO_PROJECT_RESOURCE_GROUP")
AZURE_AISTUDIO_PROJECT_NAME = os.environ.get("AZURE_AISTUDIO_PROJECT_NAME")

# API version string handed to the Azure OpenAI model configuration
api_version = "2024-02-15-preview"


In [2]:
import os
from promptflow.core import AzureOpenAIModelConfiguration

# Model configuration for the AI-assisted quality evaluators.
# It is built from the GPT-4o endpoint, key and deployment name read from the
# environment in the first cell, together with the api_version string set there.
# The Relevance, Coherence and Fluency evaluator cells below each receive this object.
# NOTE(review): a value is None here when its environment variable is not set.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint=GPT4o_DEPLOYMENT_ENDPOINT,
    api_key=GPT4o_API_KEY,
    azure_deployment=GPT4o_DEPLOYMENT_NAME,
    api_version=api_version,
)


In [3]:
# Relevance
# AI-assisted performance and quality metrics are calculated by a GPT model,
# so you must specify one: choose a deployment with either GPT-3.5, GPT-4,
# or the Davinci model and set it as your model_config.

from promptflow.evals.evaluators import RelevanceEvaluator

# Build the relevance evaluator from the model configuration
relevance_eval = RelevanceEvaluator(model_config)

# Evaluate a single row: an answer, the context it should draw on, and the question
relevance_score = relevance_eval(
    answer="The Alpine Explorer Tent is the most waterproof.",
    context=(
        "From the our product list,"
        " the alpine explorer tent is the most waterproof."
        " The Adventure Dining Table has higher weight."
    ),
    question="Which tent is the most waterproof?",
)
print(relevance_score)

{'gpt_relevance': 5.0}


In [4]:
# Coherence
# AI-assisted performance and quality metrics are calculated by a GPT model,
# so you must specify one: choose a deployment with either GPT-3.5, GPT-4,
# or the Davinci model and set it as your model_config.

from promptflow.evals.evaluators import CoherenceEvaluator

# Build the coherence evaluator from the model configuration
coherence_eval = CoherenceEvaluator(model_config)

# First row: a full-sentence answer to the question, printed under the "High Coherence" label
score = coherence_eval(
    question="What is the capital of Japan?",
    answer="The capital of Japan is Tokyo.",
)
print(f"High Coherence {score}")

# Second row: same question with the one-word answer "Blue", printed under the "Low Coherence" label
# (score is reassigned, so it holds this second result afterwards)
score = coherence_eval(
    question="What is the capital of Japan?",
    answer="Blue",
)
print(f"Low Coherence {score}")

High Coherence {'gpt_coherence': 5.0}
Low Coherence {'gpt_coherence': 1.0}


In [6]:
# Fluency
from promptflow.evals.evaluators import FluencyEvaluator

# Build the fluency evaluator from the same model_config used by the
# relevance and coherence cells.
# NOTE(review): eval_fn and result are reassigned by later cells, so after a
# full run they refer to whichever evaluator cell executed last.
eval_fn = FluencyEvaluator(model_config)

# Evaluate a single question/answer row and print what the evaluator returns
result = eval_fn(
    question="What is the capital of Japan?",
    answer="The capital of Japan is Tokyo.",
)
print(result)

{'gpt_fluency': 5.0}


In [5]:
# Risk and Safety
# When you use AI-assisted risk and safety metrics, a GPT model isn't required.
# Instead of model_config, provide your azure_ai_project information.
# This accesses the Azure AI Studio safety evaluations back-end service,
# which provisions a GPT-4 model that can generate content risk severity scores
# and reasoning to enable your safety evaluators.
#
# Note
# ~~~~
# Currently AI-assisted risk and safety metrics are only available in the following
# regions: East US 2, France Central, UK South, Sweden Central.
# Groundedness measurement leveraging Azure AI Content Safety Groundedness Detection
# is only supported in the following regions: East US 2 and Sweden Central.
# Read more about the supported metrics, and when to use which metric, in the
# product documentation (the original link target is not preserved in this notebook).
azure_ai_project = dict(
    subscription_id=AZURE_SUBSCRIPTION_ID,
    resource_group_name=AZURE_AISTUDIO_PROJECT_RESOURCE_GROUP,
    project_name=AZURE_AISTUDIO_PROJECT_NAME,
)

from promptflow.evals.evaluators import ViolenceEvaluator

# Build the violence evaluator from the project information
violence_eval = ViolenceEvaluator(azure_ai_project)

# Evaluate a single question/answer row.
# NOTE(review): the INFO log lines captured under this cell echo the project_scope
# values (subscription ID, resource group and project name) - clear the cell output
# before sharing the notebook.
result = violence_eval(question="What is the capital of France?", answer="Paris.")
print("Low violence", result, sep="\n")

[2024-09-05 08:14:57,832][flowinvoker][INFO] - Getting connections from pf client with provider from args: local...
[2024-09-05 08:14:58,964][flowinvoker][INFO] - Promptflow get connections successfully. keys: dict_keys([])
[2024-09-05 08:14:58,965][flowinvoker][INFO] - Promptflow executor starts initializing...
[2024-09-05 08:14:59,109][flowinvoker][INFO] - Promptflow executor initiated successfully.
[2024-09-05 08:14:59,110][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'violence', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}
[2024-09-05 08:14:59,112][flowinvoker][INFO] - Execute flow with data {'metric_name': 'violence', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb9

2024-09-05 08:14:59 +0300   15364 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-05 08:14:59 +0300   15364 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-09-05 08:14:59 +0300   15364 execution.flow     INFO     Executing node validate_inputs. node run id: 1be56b6c-0cb9-4c0a-adfa-f85bdc199442_validate_inputs_19e8963c-d5cb-4635-aa01-5e79fca316c9
2024-09-05 08:14:59 +0300   15364 execution.flow     INFO     Node validate_inputs completes.
2024-09-05 08:14:59 +0300   15364 execution.flow     INFO     The node 'evaluate_with_rai_service' will be executed because the activate condition is met, i.e. '${validate_inputs.output}' is equal to 'True'.
2024-09-05 08:14:59 +0300   15364 execution.flow     INFO     Executing node evaluate_with_rai_service. node run id: 1be56b6c-0cb9-4c0a-adfa-f85bdc199442_evaluate_with_rai_service_fb462c07-cb24-4a00-8596-58742fa2c09a
2024-09-05 08:15:45 +0300   15364 execution.flow     INFO     Node ev

In [7]:
# Note
# ~~~~
# Currently AI-assisted risk and safety metrics are only available in the following
# regions: East US 2, France Central, UK South, Sweden Central.
# Groundedness measurement leveraging Azure AI Content Safety Groundedness Detection
# is only supported in the following regions: East US 2 and Sweden Central.
# Read more about the supported metrics, and when to use which metric, in the
# product documentation (the original link target is not preserved in this notebook).


from promptflow.evals.evaluators import ContentSafetyEvaluator

# Content safety evaluator built from azure_ai_project, the project dictionary
# defined in the Risk and Safety cell. The log output captured under this cell shows
# flows for the violence, sexual, self_harm and hate_fairness metrics.
eval_fn = ContentSafetyEvaluator(azure_ai_project)

# Evaluate a single question/answer row and print it under a fixed label
result = eval_fn(question="What is the capital of France?", answer="Paris.")
print("High Content safety", result, sep="\n")

[2024-09-05 08:15:53,448][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'violence', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}
[2024-09-05 08:15:53,450][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'sexual', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}
[2024-09-05 08:15:53,451][flowinvoker][INFO] - Execute flow with data {'metric_name': 'violence', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesi

2024-09-05 08:15:53 +0300   15364 execution.flow     INFO     Start executing nodes in thread pool mode.


[2024-09-05 08:15:53,453][flowinvoker][INFO] - Execute flow with data {'metric_name': 'sexual', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}


2024-09-05 08:15:53 +0300   15364 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.


[2024-09-05 08:15:53,454][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'hate_fairness', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}


2024-09-05 08:15:53 +0300   15364 execution.flow     INFO     Start executing nodes in thread pool mode.


[2024-09-05 08:15:53,457][flowinvoker][INFO] - Execute flow with data {'metric_name': 'self_harm', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}


2024-09-05 08:15:53 +0300   15364 execution.flow     INFO     Current thread is not main thread, skip signal handler registration in AsyncNodesScheduler.


[2024-09-05 08:15:53,460][flowinvoker][INFO] - Execute flow with data {'metric_name': 'hate_fairness', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}


2024-09-05 08:15:53 +0300   15364 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-09-05 08:15:53 +0300   15364 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-05 08:15:53 +0300   15364 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-05 08:15:53 +0300   15364 execution.flow     INFO     Executing node validate_inputs. node run id: 82d4bc09-bd8d-4601-b4a6-b2cc35684b27_validate_inputs_042dd926-6473-490d-89df-506c6898d51a
2024-09-05 08:15:53 +0300   15364 execution.flow     INFO     Current thread is not main thread, skip signal handler registration in AsyncNodesScheduler.
2024-09-05 08:15:53 +0300   15364 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-09-05 08:15:53 +0300   15364 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-09-05 08:15:53 +0300   15364 execution.flow     INFO     Node validate_inputs completes.
2024-09-05 08:1

[{"variableName": "azure_ai_project", "type": "dictionary", "supportedEngines": ["pandas"], "isLocalVariable": false}, {"variableName": "relevance_score", "type": "dictionary", "supportedEngines": ["pandas"], "isLocalVariable": false}, {"variableName": "result", "type": "dictionary", "supportedEngines": ["pandas"], "isLocalVariable": false}, {"variableName": "score", "type": "dictionary", "supportedEngines": ["pandas"], "isLocalVariable": false}]
[{"variableName": "azure_ai_project", "type": "dictionary", "supportedEngines": ["pandas"], "isLocalVariable": false}, {"variableName": "relevance_score", "type": "dictionary", "supportedEngines": ["pandas"], "isLocalVariable": false}, {"variableName": "result", "type": "dictionary", "supportedEngines": ["pandas"], "isLocalVariable": false}, {"variableName": "score", "type": "dictionary", "supportedEngines": ["pandas"], "isLocalVariable": false}]
[{"variableName": "azure_ai_project", "type": "dictionary", "supportedEngines": ["pandas"], "isLoc

In [10]:
# F1 score
# F1ScoreEvaluator is constructed with no arguments: this cell passes neither
# model_config nor azure_ai_project, and compares an answer against a ground truth.
#
# Note (carried over from the risk and safety cells; nothing in this cell uses
# azure_ai_project)
# ~~~~
# Currently AI-assisted risk and safety metrics are only available in the following
# regions: East US 2, France Central, UK South, Sweden Central.
# Groundedness measurement leveraging Azure AI Content Safety Groundedness Detection
# is only supported in the following regions: East US 2 and Sweden Central.
# Read more about the supported metrics, and when to use which metric, in the
# product documentation (the original link target is not preserved in this notebook).

from promptflow.evals.evaluators import F1ScoreEvaluator

eval_fn = F1ScoreEvaluator()

# Evaluate a single answer / ground-truth pair
result = eval_fn(
    answer="The capital of Japan is Tokyo.",
    ground_truth="Tokyo is Japan's capital, known for its blend of traditional culture and technological advancements.",
)

print(result)

[2024-09-05 08:16:25,177][flowinvoker][INFO] - Getting connections from pf client with provider from args: local...
[2024-09-05 08:16:25,199][flowinvoker][INFO] - Promptflow get connections successfully. keys: dict_keys([])
[2024-09-05 08:16:25,200][flowinvoker][INFO] - Promptflow executor starts initializing...
[2024-09-05 08:16:25,221][flowinvoker][INFO] - Promptflow executor initiated successfully.
[2024-09-05 08:16:25,223][flowinvoker][INFO] - Validating flow input with data {'answer': 'The capital of Japan is Tokyo.', 'ground_truth': "Tokyo is Japan's capital, known for its blend of traditional culture and technological advancements."}
[2024-09-05 08:16:25,224][flowinvoker][INFO] - Execute flow with data {'answer': 'The capital of Japan is Tokyo.', 'ground_truth': "Tokyo is Japan's capital, known for its blend of traditional culture and technological advancements."}


2024-09-05 08:16:25 +0300   15364 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-05 08:16:25 +0300   15364 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-09-05 08:16:25 +0300   15364 execution.flow     INFO     Executing node validate_inputs. node run id: 2699c683-7bba-4751-916d-7d6c5a65af73_validate_inputs_18120968-9c16-4893-9159-abf37e97845e
2024-09-05 08:16:25 +0300   15364 execution.flow     INFO     Node validate_inputs completes.
2024-09-05 08:16:25 +0300   15364 execution.flow     INFO     The node 'compute_f1_score' will be executed because the activate condition is met, i.e. '${validate_inputs.output}' is equal to 'True'.
2024-09-05 08:16:25 +0300   15364 execution.flow     INFO     Executing node compute_f1_score. node run id: 2699c683-7bba-4751-916d-7d6c5a65af73_compute_f1_score_dc9eb6df-d485-47ca-bef6-7bf1f2d6cc1f
2024-09-05 08:16:25 +0300   15364 execution.flow     INFO     Node compute_f1_score completes.
{

In [12]:
# Hate Unfairness
from promptflow.evals.evaluators import HateUnfairnessEvaluator

# Built from azure_ai_project, the project dictionary defined in the Risk and Safety cell
eval_fn = HateUnfairnessEvaluator(azure_ai_project)

# Evaluate a single question/answer row and print what the evaluator returns
result = eval_fn(
    question="What is the capital of France?",
    answer="Paris.",
)
print(result)

[2024-09-05 08:16:49,126][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'hate_fairness', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}
[2024-09-05 08:16:49,127][flowinvoker][INFO] - Execute flow with data {'metric_name': 'hate_fairness', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}


2024-09-05 08:16:49 +0300   15364 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-05 08:16:49 +0300   15364 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-09-05 08:16:49 +0300   15364 execution.flow     INFO     Executing node validate_inputs. node run id: 1a6f2079-4cd0-4600-954a-1b24d33e3473_validate_inputs_572d8e13-cb2e-4d6b-9d8e-3348e2040693
2024-09-05 08:16:49 +0300   15364 execution.flow     INFO     Node validate_inputs completes.
2024-09-05 08:16:49 +0300   15364 execution.flow     INFO     The node 'evaluate_with_rai_service' will be executed because the activate condition is met, i.e. '${validate_inputs.output}' is equal to 'True'.
2024-09-05 08:16:49 +0300   15364 execution.flow     INFO     Executing node evaluate_with_rai_service. node run id: 1a6f2079-4cd0-4600-954a-1b24d33e3473_evaluate_with_rai_service_d377a0f7-117b-48a1-b265-245c91f3fa6b
2024-09-05 08:16:58 +0300   15364 execution.flow     INFO     Node ev