### Evaluation

In [1]:
#
# Copyright (c) Microsoft. All rights reserved.
# To learn more, please visit the documentation - Quickstart: Azure Content Safety: https://aka.ms/acsstudiodoc
#
from dotenv import load_dotenv
import os

load_dotenv()

# Create a .env file that defines the variables below, replacing the
# placeholders with your own values.
AI_STUDIO_CONNECTION_ENDPOINT = os.getenv("AI_STUDIO_CONNECTION_ENDPOINT")
AI_STUDIO_CONNECTION_KEY = os.getenv("AI_STUDIO_CONNECTION_KEY")
GPT4o_API_KEY = os.getenv("GPT4o_API_KEY")
GPT4o_DEPLOYMENT_ENDPOINT = os.getenv("GPT4o_DEPLOYMENT_ENDPOINT")
GPT4o_DEPLOYMENT_NAME = os.getenv("GPT4o_DEPLOYMENT_NAME")
AZURE_SUBSCRIPTION_ID = os.getenv("AZURE_SUBSCRIPTION_ID")
AZURE_AISTUDIO_PROJECT_RESOURCE_GROUP = os.getenv("AZURE_AISTUDIO_PROJECT_RESOURCE_GROUP")
AZURE_AISTUDIO_PROJECT_NAME = os.getenv("AZURE_AISTUDIO_PROJECT_NAME")

# API version handed to AzureOpenAIModelConfiguration when model_config is built.
api_version = "2024-02-15-preview"

# os.getenv() returns None for an unset variable, so a missing .env entry would
# otherwise only surface later as an opaque failure inside an evaluator call.
# Warn instead of raising: not every cell needs every variable.
_unset_settings = [
    name
    for name in (
        "AI_STUDIO_CONNECTION_ENDPOINT",
        "AI_STUDIO_CONNECTION_KEY",
        "GPT4o_API_KEY",
        "GPT4o_DEPLOYMENT_ENDPOINT",
        "GPT4o_DEPLOYMENT_NAME",
        "AZURE_SUBSCRIPTION_ID",
        "AZURE_AISTUDIO_PROJECT_RESOURCE_GROUP",
        "AZURE_AISTUDIO_PROJECT_NAME",
    )
    if not os.getenv(name)
]
if _unset_settings:
    import warnings

    warnings.warn(
        "Missing or empty environment variables: "
        + ", ".join(_unset_settings)
        + ". Cells that depend on them will fail; check your .env file."
    )


In [2]:
import os
from promptflow.core import AzureOpenAIModelConfiguration

# Azure OpenAI connection used by the model-assisted evaluators in this
# notebook; every value comes from the environment variables read in the
# first cell.
model_config = AzureOpenAIModelConfiguration(
    api_version=api_version,
    azure_deployment=GPT4o_DEPLOYMENT_NAME,
    azure_endpoint=GPT4o_DEPLOYMENT_ENDPOINT,
    api_key=GPT4o_API_KEY,
)


In [5]:
from promptflow.evals.evaluators import GroundednessEvaluator

# Groundedness
# The evaluator is built from model_config and is called with the answer to
# judge plus the context it should be grounded in.
eval_fn = GroundednessEvaluator(model_config)
result = eval_fn(
    context="Tokyo is Japan's capital, known for its blend of traditional culture and technological advancements.",
    answer="The capital of Japan is Tokyo.",
)
print(result)

{'gpt_groundedness': 5.0}


In [18]:
# Relevance
# AI-assisted performance and quality metrics need a GPT model to do the
# scoring. Choose a deployment with either GPT-3.5, GPT-4, or the Davinci
# model and set it as your model_config.

from promptflow.evals.evaluators import RelevanceEvaluator

# Initializing Relevance Evaluator
relevance_eval = RelevanceEvaluator(model_config)
# Running Relevance Evaluator on a single input row
relevance_score = relevance_eval(
    question="Which tent is the most waterproof?",
    answer="The Alpine Explorer Tent is the most waterproof.",
    # Adjacent string literals are joined into a single context string.
    context=(
        "From the our product list,"
        " the alpine explorer tent is the most waterproof."
        " The Adventure Dining Table has higher weight."
    ),
)
print(relevance_score)

In [19]:
# Coherence
# AI-assisted performance and quality metrics need a GPT model to do the
# scoring. Choose a deployment with either GPT-3.5, GPT-4, or the Davinci
# model and set it as your model_config.

from promptflow.evals.evaluators import CoherenceEvaluator

# Initializing Coherence Evaluator
coherence_eval = CoherenceEvaluator(model_config)
# Running Coherence Evaluator on two answers to the same question: one that
# answers it and one that does not. Each score is printed behind its label.
for label, sample_answer in (
    ("High Coherence ", "The capital of Japan is Tokyo."),
    ("Low Coherence ", "Blue"),
):
    score = coherence_eval(
        question="What is the capital of Japan?",
        answer=sample_answer,
    )
    print(f"{label}{score!s}")

In [20]:
# Fluency
from promptflow.evals.evaluators import FluencyEvaluator

# Built from model_config, then called with one question/answer pair.
eval_fn = FluencyEvaluator(model_config)
result = eval_fn(
    answer="The capital of Japan is Tokyo.",
    question="What is the capital of Japan?",
)
print(result)

In [21]:
# Risk and Safety
# When you use AI-assisted risk and safety metrics, a GPT model isn't required.
# Instead of model_config, provide your azure_ai_project information.
# This accesses the Azure AI Studio safety evaluations back-end service,
# which provisions a GPT-4 model that can generate content risk severity
# scores and reasoning to enable your safety evaluators.
#
# Note
# ~~~~
# Currently AI-assisted risk and safety metrics are only available in the
# following regions: East US 2, France Central, UK South, Sweden Central.
# Groundedness measurement leveraging Azure AI Content Safety Groundedness
# Detection is only supported in the following regions: East US 2 and
# Sweden Central. Refer to the Azure AI Studio evaluation documentation for
# the supported metrics and when to use which metric.
azure_ai_project = {
    "subscription_id": AZURE_SUBSCRIPTION_ID,
    "resource_group_name": AZURE_AISTUDIO_PROJECT_RESOURCE_GROUP,
    "project_name": AZURE_AISTUDIO_PROJECT_NAME,
}

from promptflow.evals.evaluators import ViolenceEvaluator

# Initializing Violence Evaluator with project information
violence_eval = ViolenceEvaluator(azure_ai_project)
# Running Violence Evaluator on a single input row
result = violence_eval(
    question="What is the capital of France?",
    answer="Paris.",
)
# One print call, newline-separated: same two output lines as before.
print("Low violence", result, sep="\n")

[2024-09-05 14:36:47,537][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'violence', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}
[2024-09-05 14:36:47,537][flowinvoker][INFO] - Execute flow with data {'metric_name': 'violence', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}


2024-09-05 14:36:47 +0300   25724 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-05 14:36:47 +0300   25724 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-09-05 14:36:47 +0300   25724 execution.flow     INFO     Executing node validate_inputs. node run id: 8006972b-c4c4-4fbe-b453-3f2987254ea6_validate_inputs_6bf4f9b8-b66c-441b-8c32-ad11af993304
2024-09-05 14:36:47 +0300   25724 execution.flow     INFO     Node validate_inputs completes.
2024-09-05 14:36:47 +0300   25724 execution.flow     INFO     The node 'evaluate_with_rai_service' will be executed because the activate condition is met, i.e. '${validate_inputs.output}' is equal to 'True'.
2024-09-05 14:36:47 +0300   25724 execution.flow     INFO     Executing node evaluate_with_rai_service. node run id: 8006972b-c4c4-4fbe-b453-3f2987254ea6_evaluate_with_rai_service_e900d431-66cf-4d89-aa4e-8a6a1590b54c
2024-09-05 14:36:59 +0300   25724 execution.flow     INFO     Node ev

In [22]:
# Content Safety
#
# Note
# ~~~~
# Currently AI-assisted risk and safety metrics are only available in the
# following regions: East US 2, France Central, UK South, Sweden Central.
# Groundedness measurement leveraging Azure AI Content Safety Groundedness
# Detection is only supported in the following regions: East US 2 and
# Sweden Central. Refer to the Azure AI Studio evaluation documentation for
# the supported metrics and when to use which metric.

from promptflow.evals.evaluators import ContentSafetyEvaluator

# Built from the project information rather than model_config. The captured
# log for this cell shows flows for the violence, sexual, self_harm and
# hate_fairness metrics.
eval_fn = ContentSafetyEvaluator(azure_ai_project)
result = eval_fn(
    answer="A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.",
    question="what was found?",
)

print(result)

[2024-09-05 14:37:13,506][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'violence', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}
[2024-09-05 14:37:13,534][flowinvoker][INFO] - Execute flow with data {'metric_name': 'violence', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb

2024-09-05 14:37:13 +0300   25724 execution.flow     INFO     Start executing nodes in thread pool mode.


[2024-09-05 14:37:13,533][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'sexual', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}


2024-09-05 14:37:13 +0300   25724 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.


[2024-09-05 14:37:13,534][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'self_harm', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}


2024-09-05 14:37:13 +0300   25724 execution.flow     INFO     Current thread is not main thread, skip signal handler registration in AsyncNodesScheduler.


[2024-09-05 14:37:13,539][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'hate_fairness', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}


2024-09-05 14:37:13 +0300   25724 execution.flow     INFO     Executing node validate_inputs. node run id: e58a7d04-b9d5-4863-9311-741a82870d8e_validate_inputs_9f6868da-6de3-4878-8858-94e89c5b0127
2024-09-05 14:37:13 +0300   25724 execution.flow     INFO     Node validate_inputs completes.


[2024-09-05 14:37:13,539][flowinvoker][INFO] - Execute flow with data {'metric_name': 'sexual', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}


2024-09-05 14:37:13 +0300   25724 execution.flow     INFO     The node 'evaluate_with_rai_service' will be executed because the activate condition is met, i.e. '${validate_inputs.output}' is equal to 'True'.


[2024-09-05 14:37:13,542][flowinvoker][INFO] - Execute flow with data {'metric_name': 'self_harm', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94bbacc', 'resource_group_name': 'rg-dschlesingerai-eastus2', 'project_name': 'dschlesinger-eastus2'}, 'credential': None}
[2024-09-05 14:37:13,544][flowinvoker][INFO] - Execute flow with data {'metric_name': 'hate_fairness', 'question': 'what was found?', 'answer': 'A 51-year-old man was found dead in his car. There were blood stains on the dashboard and windscreen. At autopsy, a deep, oblique, long incised injury was found on the front of the neck. It turns out that he died by suicide.', 'project_scope': {'subscription_id': 'ec967cb5-f2b0-43c2-9ba2-4a2eb94b

2024-09-05 14:37:13 +0300   25724 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-05 14:37:13 +0300   25724 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-05 14:37:13 +0300   25724 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-09-05 14:37:13 +0300   25724 execution.flow     INFO     Current thread is not main thread, skip signal handler registration in AsyncNodesScheduler.
2024-09-05 14:37:13 +0300   25724 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-09-05 14:37:13 +0300   25724 execution.flow     INFO     Executing node evaluate_with_rai_service. node run id: e58a7d04-b9d5-4863-9311-741a82870d8e_evaluate_with_rai_service_4ec45857-bfb7-431d-8976-cacd23b71d2e
2024-09-05 14:37:13 +0300   25724 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-05 14:37:13 +0300   25724 execution.flow     INFO     Executing node validate_inputs. 

In [23]:
# F1 score
# F1ScoreEvaluator is constructed with neither model_config nor
# azure_ai_project; it compares the answer with a ground-truth string.
#
# Note (concerns the AI-assisted risk and safety evaluators)
# ~~~~
# Currently AI-assisted risk and safety metrics are only available in the
# following regions: East US 2, France Central, UK South, Sweden Central.
# Groundedness measurement leveraging Azure AI Content Safety Groundedness
# Detection is only supported in the following regions: East US 2 and
# Sweden Central. Refer to the Azure AI Studio evaluation documentation for
# the supported metrics and when to use which metric.

from promptflow.evals.evaluators import F1ScoreEvaluator

eval_fn = F1ScoreEvaluator()
result = eval_fn(
    ground_truth="Tokyo is Japan's capital, known for its blend of traditional culture and technological advancements.",
    answer="The capital of Japan is Tokyo.",
)

print(result)

In [24]:
# Hate Unfairness
from promptflow.evals.evaluators import HateUnfairnessEvaluator

# Built from azure_ai_project (no model_config), then run on a single
# question/answer pair.
eval_fn = HateUnfairnessEvaluator(azure_ai_project)
result = eval_fn(
    question="What is the capital of France?",
    answer="Paris.",
)
print(result)

In [25]:
# QA evaluator
# QAEvaluator is built from model_config and takes the question, answer,
# context and ground truth in a single call. The captured log for this cell
# shows an F1-score flow running on the answer/ground_truth pair.
from promptflow.evals.evaluators import QAEvaluator

eval_fn = QAEvaluator(model_config)
result = eval_fn(
    question="Tokyo is the capital of which country?",
    context="Tokyo is the capital of Japan.",
    answer="Japan",
    ground_truth="Japan",
)
print(result)

[2024-09-05 14:37:38,251][flowinvoker][INFO] - Validating flow input with data {'answer': 'Japan', 'ground_truth': 'Japan'}
[2024-09-05 14:37:38,251][flowinvoker][INFO] - Execute flow with data {'answer': 'Japan', 'ground_truth': 'Japan'}


2024-09-05 14:37:38 +0300   25724 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-09-05 14:37:38 +0300   25724 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-09-05 14:37:38 +0300   25724 execution.flow     INFO     Current thread is not main thread, skip signal handler registration in AsyncNodesScheduler.
2024-09-05 14:37:38 +0300   25724 execution.flow     INFO     Executing node validate_inputs. node run id: 62e8fa52-af86-431d-a48c-79daabb328bc_validate_inputs_81641014-6fb2-4186-a1ae-5f09b0d2f56f
2024-09-05 14:37:38 +0300   25724 execution.flow     INFO     Node validate_inputs completes.
2024-09-05 14:37:38 +0300   25724 execution.flow     INFO     The node 'compute_f1_score' will be executed because the activate condition is met, i.e. '${validate_inputs.output}' is equal to 'True'.
2024-09-05 14:37:38 +0300   25724 execution.flow     INFO     Executing node compute_f1_score. node run id: 62e8fa52-af86-431d-a48c-79daabb32