### Evaluation

In [1]:
#
# Copyright (c) Microsoft. All rights reserved.
# To learn more, please visit the documentation - Quickstart: Azure Content Safety: https://aka.ms/acsstudiodoc
#
import os
import warnings

from dotenv import load_dotenv

# Read settings from a local .env file (create one next to this notebook and
# fill in your own values for every variable listed below).
load_dotenv()

AI_STUDIO_CONNECTION_ENDPOINT = os.getenv("AI_STUDIO_CONNECTION_ENDPOINT")
AI_STUDIO_CONNECTION_KEY = os.getenv("AI_STUDIO_CONNECTION_KEY")
GPT4o_API_KEY = os.getenv("GPT4o_API_KEY")
GPT4o_DEPLOYMENT_ENDPOINT = os.getenv("GPT4o_DEPLOYMENT_ENDPOINT")
GPT4o_DEPLOYMENT_NAME = os.getenv("GPT4o_DEPLOYMENT_NAME")
AZURE_SUBSCRIPTION_ID = os.getenv("AZURE_SUBSCRIPTION_ID")
AZURE_AISTUDIO_PROJECT_RESOURCE_GROUP = os.getenv("AZURE_AISTUDIO_PROJECT_RESOURCE_GROUP")
AZURE_AISTUDIO_PROJECT_NAME = os.getenv("AZURE_AISTUDIO_PROJECT_NAME")

# API version handed to the Azure OpenAI model configuration in the next cell.
api_version = "2024-02-15-preview"

# os.getenv returns None for an unset variable, so a missing entry would only
# show up later as a hard-to-read failure inside an evaluator. Report the gaps
# here instead; this is a warning (not an error) so cells that do not need the
# missing values can still be run.
REQUIRED_ENV_VARS = (
    "AI_STUDIO_CONNECTION_ENDPOINT",
    "AI_STUDIO_CONNECTION_KEY",
    "GPT4o_API_KEY",
    "GPT4o_DEPLOYMENT_ENDPOINT",
    "GPT4o_DEPLOYMENT_NAME",
    "AZURE_SUBSCRIPTION_ID",
    "AZURE_AISTUDIO_PROJECT_RESOURCE_GROUP",
    "AZURE_AISTUDIO_PROJECT_NAME",
)
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
if missing_env_vars:
    warnings.warn(
        "Missing or empty environment variables: "
        + ", ".join(missing_env_vars)
        + ". Cells that use them may fail; add them to your .env file."
    )


In [2]:
import os
from promptflow.core import AzureOpenAIModelConfiguration

# Azure OpenAI connection used by the AI-assisted quality evaluators below.
# Every value comes from the environment-backed names set in the first cell.
model_config = AzureOpenAIModelConfiguration(
    azure_deployment=GPT4o_DEPLOYMENT_NAME,
    azure_endpoint=GPT4o_DEPLOYMENT_ENDPOINT,
    api_version=api_version,
    api_key=GPT4o_API_KEY,
)


In [3]:
# Groundedness
from promptflow.evals.evaluators import GroundednessEvaluator

# Create the Groundedness evaluator on top of model_config
eval_fn = GroundednessEvaluator(model_config)
# Score a single answer / context row
result = eval_fn(
    answer="The capital of Japan is Tokyo.",
    context="Tokyo is Japan's capital, known for its blend of traditional culture and technological advancements.",
)
print(result)

{'gpt_groundedness': 5.0}


In [4]:
# Relevance
# AI-assisted performance and quality metrics need a GPT model to do the scoring.
# Point model_config at a deployment of GPT-3.5, GPT-4, or the Davinci model.

from promptflow.evals.evaluators import RelevanceEvaluator

# Create the Relevance evaluator
relevance_eval = RelevanceEvaluator(model_config)
# Score a single question / answer / context row
relevance_score = relevance_eval(
    question="Which tent is the most waterproof?",
    answer="The Alpine Explorer Tent is the most waterproof.",
    context=(
        "From the our product list,"
        " the alpine explorer tent is the most waterproof."
        " The Adventure Dining Table has higher weight."
    ),
)
print(relevance_score)

{'gpt_relevance': 5.0}


In [5]:
# Coherence
# AI-assisted performance and quality metrics need a GPT model to do the scoring.
# Point model_config at a deployment of GPT-3.5, GPT-4, or the Davinci model.

from promptflow.evals.evaluators import CoherenceEvaluator

# Create the Coherence evaluator
coherence_eval = CoherenceEvaluator(model_config)

# First row: a full-sentence answer to the question
score = coherence_eval(
    question="What is the capital of Japan?",
    answer="The capital of Japan is Tokyo.",
)
print(f"High Coherence {score}")

# Second row: same question, one-word unrelated answer
score = coherence_eval(
    question="What is the capital of Japan?",
    answer="Blue",
)
print(f"Low Coherence {score}")

High Coherence {'gpt_coherence': 5.0}
Low Coherence {'gpt_coherence': 1.0}


In [6]:
# Fluency
from promptflow.evals.evaluators import FluencyEvaluator

# Create the Fluency evaluator on top of model_config
eval_fn = FluencyEvaluator(model_config)
# Score a single question / answer row
result = eval_fn(
    question="What is the capital of Japan?",
    answer="The capital of Japan is Tokyo.",
)
print(result)

{'gpt_fluency': 5.0}


In [7]:
# Risk and Safety
# AI-assisted risk and safety metrics do not require a GPT model of your own.
# Provide your azure_ai_project information instead of model_config.
# It is used to access the Azure AI Studio safety evaluations back-end service,
# which provisions a GPT-4 model that can generate content risk severity scores
# and reasoning to enable your safety evaluators.
#
# Note
# ~~~~
# AI-assisted risk and safety metrics are currently available only in these
# regions: East US 2, France Central, UK South, Sweden Central.
# Groundedness measurement leveraging Azure AI Content Safety Groundedness
# Detection is supported only in East US 2 and Sweden Central.
azure_ai_project = {
    "subscription_id": AZURE_SUBSCRIPTION_ID,
    "resource_group_name": AZURE_AISTUDIO_PROJECT_RESOURCE_GROUP,
    "project_name": AZURE_AISTUDIO_PROJECT_NAME,
}

from promptflow.evals.evaluators import ViolenceEvaluator

# Create the Violence evaluator from the project information
violence_eval = ViolenceEvaluator(azure_ai_project)
# Score a single question / answer row
result = violence_eval(
    question="What is the capital of France?",
    answer="Paris.",
)
print("Low violence")
print(result)

[2024-09-09 15:30:26,426][flowinvoker][INFO] - Getting connections from pf client with provider from args: local...
[2024-09-09 15:30:27,755][flowinvoker][INFO] - Promptflow get connections successfully. keys: dict_keys([])
[2024-09-09 15:30:27,755][flowinvoker][INFO] - Promptflow executor starts initializing...
[2024-09-09 15:30:27,871][flowinvoker][INFO] - Promptflow executor initiated successfully.
[2024-09-09 15:30:27,872][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'violence', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}
[2024-09-09 15:30:27,872][flowinvoker][INFO] - Execute flow with data {'metric_name': 'violence', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb9

2024-09-09 15:30:27 +0300   50348 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-09 15:30:27 +0300   50348 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-09-09 15:30:27 +0300   50348 execution.flow     INFO     Executing node validate_inputs. node run id: 7a33d1a2-a636-430a-abcf-488e1bf33081_validate_inputs_f6257cb1-0ca5-4b43-99ac-15d01f2b1ca3
2024-09-09 15:30:27 +0300   50348 execution.flow     INFO     Node validate_inputs completes.
2024-09-09 15:30:27 +0300   50348 execution.flow     INFO     The node 'evaluate_with_rai_service' will be executed because the activate condition is met, i.e. '${validate_inputs.output}' is equal to 'True'.
2024-09-09 15:30:27 +0300   50348 execution.flow     INFO     Executing node evaluate_with_rai_service. node run id: 7a33d1a2-a636-430a-abcf-488e1bf33081_evaluate_with_rai_service_963aeb93-501c-485d-af73-3b1267d2098c
2024-09-09 15:31:00 +0300   50348 execution.flow     INFO     Node ev

In [12]:
# Content Safety
#
# Note
# ~~~~
# AI-assisted risk and safety metrics are currently available only in these
# regions: East US 2, France Central, UK South, Sweden Central.
# Groundedness measurement leveraging Azure AI Content Safety Groundedness
# Detection is supported only in East US 2 and Sweden Central.

from promptflow.evals.evaluators import ContentSafetyEvaluator

# Uses the azure_ai_project dictionary defined in the Risk and Safety cell.
# The saved output of this cell shows flows for the violence, sexual,
# self_harm and hate_fairness metrics.
eval_fn = ContentSafetyEvaluator(azure_ai_project)
result = eval_fn(
    question="what was found?",
    answer="A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.",
)

print(result)

[2024-09-09 15:31:58,092][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'violence', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}
[2024-09-09 15:31:58,092][flowinvoker][INFO] - Execute flow with data {'metric_name': 'violence', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb

2024-09-09 15:31:58 +0300   50348 execution.flow     INFO     Start executing nodes in thread pool mode.


[2024-09-09 15:31:58,092][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'sexual', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}


2024-09-09 15:31:58 +0300   50348 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.


[2024-09-09 15:31:58,104][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'self_harm', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}


2024-09-09 15:31:58 +0300   50348 execution.flow     INFO     Current thread is not main thread, skip signal handler registration in AsyncNodesScheduler.


[2024-09-09 15:31:58,108][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'hate_fairness', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}


2024-09-09 15:31:58 +0300   50348 execution.flow     INFO     Executing node validate_inputs. node run id: b96f748c-62d4-4a29-96cb-d1216ddbe52a_validate_inputs_acedbd30-4449-4f80-9688-d28e3568f521


[2024-09-09 15:31:58,108][flowinvoker][INFO] - Execute flow with data {'metric_name': 'hate_fairness', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}
[2024-09-09 15:31:58,108][flowinvoker][INFO] - Execute flow with data {'metric_name': 'self_harm', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94b

2024-09-09 15:31:58 +0300   50348 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-09 15:31:58 +0300   50348 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.


[2024-09-09 15:31:58,108][flowinvoker][INFO] - Execute flow with data {'metric_name': 'sexual', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}


2024-09-09 15:31:58 +0300   50348 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-09 15:31:58 +0300   50348 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-09-09 15:31:58 +0300   50348 execution.flow     INFO     Current thread is not main thread, skip signal handler registration in AsyncNodesScheduler.
2024-09-09 15:31:58 +0300   50348 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-09 15:31:58 +0300   50348 execution.flow     INFO     Node validate_inputs completes.
2024-09-09 15:31:58 +0300   50348 execution.flow     INFO     Current thread is not main thread, skip signal handler registration in AsyncNodesScheduler.
2024-09-09 15:31:58 +0300   50348 execution.flow     INFO     Executing node validate_inputs. node run id: 94c2e840-6194-496f-b8a9-557a415f7b5d_validate_inputs_bbd12c42-275a-4821-95a3-16debe79244b
2024-09-09 15:31:58 +0300   50348 execution.flow     INFO     Start to run 2 node

In [10]:
# F1 score
# F1ScoreEvaluator is created without model_config or azure_ai_project;
# it is called with an answer and the ground_truth to compare it against.

from promptflow.evals.evaluators import F1ScoreEvaluator

eval_fn = F1ScoreEvaluator()
# Score a single answer / ground_truth row
result = eval_fn(
    answer="The capital of Japan is Tokyo.",
    ground_truth="Tokyo is Japan's capital, known for its blend of traditional culture and technological advancements.",
)

print(result)

[2024-09-09 15:31:36,454][flowinvoker][INFO] - Validating flow input with data {'answer': 'The capital of Japan is Tokyo.', 'ground_truth': "Tokyo is Japan's capital, known for its blend of traditional culture and technological advancements."}
[2024-09-09 15:31:36,455][flowinvoker][INFO] - Execute flow with data {'answer': 'The capital of Japan is Tokyo.', 'ground_truth': "Tokyo is Japan's capital, known for its blend of traditional culture and technological advancements."}


2024-09-09 15:31:36 +0300   50348 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-09 15:31:36 +0300   50348 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-09-09 15:31:36 +0300   50348 execution.flow     INFO     Executing node validate_inputs. node run id: b51eba5c-0861-4eba-ab75-e1fe151a7f1b_validate_inputs_e40da372-fa61-419a-bde3-b8e0c4fd3531
2024-09-09 15:31:36 +0300   50348 execution.flow     INFO     Node validate_inputs completes.
2024-09-09 15:31:36 +0300   50348 execution.flow     INFO     The node 'compute_f1_score' will be executed because the activate condition is met, i.e. '${validate_inputs.output}' is equal to 'True'.
2024-09-09 15:31:36 +0300   50348 execution.flow     INFO     Executing node compute_f1_score. node run id: b51eba5c-0861-4eba-ab75-e1fe151a7f1b_compute_f1_score_ae80981f-d12a-4b2f-8f17-7358d400d600
2024-09-09 15:31:36 +0300   50348 execution.flow     INFO     Node compute_f1_score completes.
{

In [11]:
# Hate Unfairness
from promptflow.evals.evaluators import HateUnfairnessEvaluator

# Create the evaluator from the azure_ai_project dictionary defined in the
# Risk and Safety cell, then score a single question / answer row.
eval_fn = HateUnfairnessEvaluator(azure_ai_project)
result = eval_fn(
    question="What is the capital of France?",
    answer="Paris.",
)
print(result)

[2024-09-09 15:31:40,141][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'hate_fairness', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}
[2024-09-09 15:31:40,141][flowinvoker][INFO] - Execute flow with data {'metric_name': 'hate_fairness', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}


2024-09-09 15:31:40 +0300   50348 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-09 15:31:40 +0300   50348 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-09-09 15:31:40 +0300   50348 execution.flow     INFO     Executing node validate_inputs. node run id: 0dfce9e7-0e2b-47c8-8090-740df08ee189_validate_inputs_7ed5ff19-4057-4437-b589-7079aa738bcd
2024-09-09 15:31:40 +0300   50348 execution.flow     INFO     Node validate_inputs completes.
2024-09-09 15:31:40 +0300   50348 execution.flow     INFO     The node 'evaluate_with_rai_service' will be executed because the activate condition is met, i.e. '${validate_inputs.output}' is equal to 'True'.
2024-09-09 15:31:40 +0300   50348 execution.flow     INFO     Executing node evaluate_with_rai_service. node run id: 0dfce9e7-0e2b-47c8-8090-740df08ee189_evaluate_with_rai_service_ea4520dc-3a6a-4136-a118-b7e954d56286
2024-09-09 15:31:51 +0300   50348 execution.flow     INFO     Node ev

In [25]:
# QA evaluator
# NOTE(review): this cell runs QAEvaluator, not a protected-material check.
# The saved output shows an F1-score flow running on answer and ground_truth;
# presumably QAEvaluator bundles several quality metrics into one call.
# Confirm against the promptflow-evals documentation.
from promptflow.evals.evaluators import QAEvaluator

eval_fn = QAEvaluator(model_config)
# Score a single question / answer / context / ground_truth row
result = eval_fn(
    question="Tokyo is the capital of which country?",
    answer="Japan",
    context="Tokyo is the capital of Japan.",
    ground_truth="Japan",
)
print(result)

[2024-09-05 14:37:38,251][flowinvoker][INFO] - Validating flow input with data {'answer': 'Japan', 'ground_truth': 'Japan'}
[2024-09-05 14:37:38,251][flowinvoker][INFO] - Execute flow with data {'answer': 'Japan', 'ground_truth': 'Japan'}


2024-09-05 14:37:38 +0300   25724 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-05 14:37:38 +0300   25724 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-09-05 14:37:38 +0300   25724 execution.flow     INFO     Current thread is not main thread, skip signal handler registration in AsyncNodesScheduler.
2024-09-05 14:37:38 +0300   25724 execution.flow     INFO     Executing node validate_inputs. node run id: 62e8fa52-af86-431d-a48c-79daabb328bc_validate_inputs_81641014-6fb2-4186-a1ae-5f09b0d2f56f
2024-09-05 14:37:38 +0300   25724 execution.flow     INFO     Node validate_inputs completes.
2024-09-05 14:37:38 +0300   25724 execution.flow     INFO     The node 'compute_f1_score' will be executed because the activate condition is met, i.e. '${validate_inputs.output}' is equal to 'True'.
2024-09-05 14:37:38 +0300   25724 execution.flow     INFO     Executing node compute_f1_score. node run id: 62e8fa52-af86-431d-a48c-79daabb32