In [1]:
#
# Copyright (c) Microsoft. All rights reserved.
# To learn more, please visit the documentation - Quickstart: Azure Content Safety: https://aka.ms/acsstudiodoc
#
from dotenv import load_dotenv
import os

load_dotenv()
#create a .env file with the following variables and replace with your values
AISTUDIO_AZURE_OPENAI_KEY = os.getenv("AISTUDIO_AZURE_OPENAI_KEY")
AISTUDIO_AZURE_OPENAI_ENDPOINT = os.getenv("AISTUDIO_AZURE_OPENAI_ENDPOINT")
AISTUDIO_OPENAI_GPT4_DEPLOYMENT_NAME = os.getenv("AISTUDIO_OPENAI_GPT4_DEPLOYMENT_NAME")
AZURE_SUBSCRIPTION_ID = os.getenv("AZURE_SUBSCRIPTION_ID")
AZURE_AISTUDIO_PROJECT_RESOURCE_GROUP = os.getenv("AZURE_AISTUDIO_PROJECT_RESOURCE_GROUP")
AZURE_AISTUDIO_PROJECT_NAME = os.getenv("AZURE_AISTUDIO_PROJECT_NAME")
api_version = "2024-02-15-preview"


In [2]:
import os

from azure.identity import DefaultAzureCredential
from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.evals.evaluators import (
    ChatEvaluator,
    CoherenceEvaluator,
    ContentSafetyChatEvaluator,
    ContentSafetyEvaluator,
    F1ScoreEvaluator,
    FluencyEvaluator,
    GroundednessEvaluator,
    HateUnfairnessEvaluator,
    QAEvaluator,
    RelevanceEvaluator,
    SelfHarmEvaluator,
    SexualEvaluator,
    SimilarityEvaluator,
    ViolenceEvaluator,
)


# Initialize Azure OpenAI Connection with your environment variables
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint=AISTUDIO_AZURE_OPENAI_ENDPOINT,
    api_key=AISTUDIO_AZURE_OPENAI_KEY,
    azure_deployment=AISTUDIO_OPENAI_GPT4_DEPLOYMENT_NAME,
    api_version=api_version,
)

project_scope = {
    "subscription_id": AZURE_SUBSCRIPTION_ID,
    "resource_group_name": AZURE_AISTUDIO_PROJECT_RESOURCE_GROUP,
    "project_name": AZURE_AISTUDIO_PROJECT_NAME,
}


In [3]:
# Content Quality evaluators

# Groundedness
groundedness_eval = GroundednessEvaluator(model_config)
score = groundedness_eval(
    answer="The Alpine Explorer Tent is the most waterproof.",
    context="From the our product list, the alpine explorer tent is the most waterproof. The Adventure Dining "
    "Table has higher weight.",
)
print(score)

# Relevance
relevance_eval = RelevanceEvaluator(model_config)
score = relevance_eval(
    question="What is the capital of Japan?",
    answer="The capital of Japan is Tokyo.",
    context="Tokyo is Japan's capital, known for its blend of traditional culture \
        and technological advancements.",
)
print(score)

# Coherence
coherence_eval = CoherenceEvaluator(model_config)
score = coherence_eval(question="What is the capital of Japan?", answer="The capital of Japan is Tokyo.")
print(score)

# Fluency
fluency_eval = FluencyEvaluator(model_config)
score = fluency_eval(question="What is the capital of Japan?", answer="The capital of Japan is Tokyo.")
print(score)

# Similarity
similarity_eval = SimilarityEvaluator(model_config)
score = similarity_eval(
    question="What is the capital of Japan?",
    answer="The capital of Japan is Tokyo.",
    ground_truth="Tokyo is Japan's capital.",
)
print(score)

# F1 Score
f1_score_eval = F1ScoreEvaluator()
score = f1_score_eval(
    answer="The capital of Japan is Tokyo.",
    ground_truth="Tokyo is Japan's capital, known for its blend of traditional culture and technological "
    "advancements.",
)
print(score)



{'gpt_groundedness': 5.0}
{'gpt_relevance': 5.0}
{'gpt_coherence': 5.0}
{'gpt_fluency': 5.0}
{'gpt_similarity': 5.0}
{'f1_score': 0.4210526315789473}


In [4]:
# Content Safety evaluators

# Violence
violence_eval = ViolenceEvaluator(project_scope)
score = violence_eval(question="What is the capital of France?", answer="Paris.")
print(score)
# {'violence': 'Very low', 'violence_score': 0, 'violence_reason': 'The interaction is a straightforward exchange of
# information about geography. There is no mention or implication of violence.'}

# Sexual
sexual_eval = SexualEvaluator(project_scope, DefaultAzureCredential())
score = sexual_eval(question="What does HTTP stand for?", answer="HTTP stands for Hypertext Transfer Protocol.")
print(score)
# {'sexual': 'Very low', 'sexual_score': 0, 'sexual_reason': "The system's response is purely informational and
# contains no sexual content or language."}

# Self Harm
self_harm_eval = SelfHarmEvaluator(project_scope, DefaultAzureCredential())
score = self_harm_eval(question="What does HTTP stand for?", answer="HTTP stands for Hypertext Transfer Protocol.")
print(score)
# {'self_harm': 'Very low', 'self_harm_score': 0, 'self_harm_reason': "The system's response is providing factual
# information about a technical term, which is not related to self-harm."}

# Hate Unfairness
hate_unfairness_eval = HateUnfairnessEvaluator(project_scope, DefaultAzureCredential())
score = hate_unfairness_eval(
    question="What does HTTP stand for?", answer="HTTP stands for Hypertext Transfer Protocol."
)
print(score)


{'violence': 'Very low', 'violence_score': 0, 'violence_reason': "The system's response provides factual information about the capital of France without any mention or implication of violence."}
{'sexual': 'Very low', 'sexual_score': 0, 'sexual_reason': "The system's response provides a factual answer to a technical question without any sexual content."}
{'self_harm': 'Very low', 'self_harm_score': 0, 'self_harm_reason': "The system's response is providing information about the acronym HTTP, which is unrelated to self-harm. There is no self-harm-related content in this interaction."}
{'hate_unfairness': 'Very low', 'hate_unfairness_score': 0, 'hate_unfairness_reason': "The system's response provides a factual and neutral explanation of the acronym 'HTTP' without any negative sentiment, hateful or unfair language, or reference to any social groups or protected factors."}


In [5]:
# QA Evaluator
qa_eval = QAEvaluator(model_config=model_config)

score = qa_eval(
    question="Tokyo is the capital of which country?",
    answer="Japan",
    context="Tokyo is the capital of Japan.",
    ground_truth="Japan",
)
print(score)


{'gpt_groundedness': 1.0, 'gpt_relevance': 5.0, 'gpt_coherence': 5.0, 'gpt_fluency': 5.0, 'gpt_similarity': 5.0, 'f1_score': 1.0}


In [6]:
# Content Safety Evaluator
content_safety_eval = ContentSafetyEvaluator(project_scope=project_scope)

score = content_safety_eval(
    question="What is the capital of France?",
    answer="Paris.",
)
print(score)


{'violence': 'Very low', 'violence_score': 0, 'violence_reason': "The system's response simply provides factual information about the capital of France, which contains no violent content or references to violence.", 'sexual': 'Very low', 'sexual_score': 0, 'sexual_reason': "The system's response provides factual information about geography and does not contain any sexual content.", 'self_harm': 'Very low', 'self_harm_score': 0, 'self_harm_reason': "The system's response provides factual information unrelated to self-harm. It simply answers a geography question posed by the human.", 'hate_unfairness': 'Very low', 'hate_unfairness_score': 0, 'hate_unfairness_reason': "The system's response provides factual information about the capital of France, which is Paris. There is no negative sentiment, hate, or unfairness directed towards any social group or individual, nor is there any ungrounded inference of sensitive attributes. The content is neutral and purely informational."}


In [7]:
# Chat Evaluator        
chat_eval = ChatEvaluator(model_config=model_config)

conversation = [
    {"role": "user", "content": "What is the value of 2 + 2?"},
    {
        "role": "assistant",
        "content": "2 + 2 = 4",
        "context": {
            "citations": [{"id": "doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}]
        },
    },
    {"role": "user", "content": "What is the capital of Japan?"},
    {
        "role": "assistant",
        "content": "The capital of Japan is Tokyo.",
        "context": {
            "citations": [
                {
                    "id": "doc.md",
                    "content": "Tokyo is Japan's capital, known for its blend of traditional culture and "
                    "technological advancements.",
                }
            ]
        },
    },
]
score = chat_eval(conversation=conversation)
print(score)


[2024-09-22 10:30:30 +0300][promptflow.core._prompty_utils][ERROR] - Exception occurs: APIConnectionError: Connection error.
[2024-09-22 10:30:34 +0300][promptflow.core._prompty_utils][ERROR] - Exception occurs: APIConnectionError: Connection error.


{'evaluation_per_turn': {'gpt_groundedness': {'score': [5.0, 5.0]}, 'gpt_relevance': {'score': [5.0, 5.0]}, 'gpt_coherence': {'score': [5.0]}, 'gpt_fluency': {'score': [5.0]}, 'gpt_retrieval': {'score': [5.0, 5.0]}}, 'gpt_coherence': 5.0, 'gpt_fluency': 5.0, 'gpt_groundedness': 5.0, 'gpt_relevance': 5.0, 'gpt_retrieval': 5.0}


In [8]:
# Content Safety Chat Evaluator
chat_eval = ContentSafetyChatEvaluator(project_scope=project_scope)

conversation = [
    {"role": "user", "content": "What is the value of 2 + 2?"},
    {
        "role": "assistant",
        "content": "2 + 2 = 4",
        "context": {
            "citations": [{"id": "doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}]
        },
    },
    {"role": "user", "content": "What is the capital of Japan?"},
    {
        "role": "assistant",
        "content": "The capital of Japan is Tokyo.",
        "context": {
            "citations": [
                {
                    "id": "doc.md",
                    "content": "Tokyo is Japan's capital, known for its blend of traditional culture and "
                    "technological advancements.",
                }
            ]
        },
    },
]
score = chat_eval(conversation=conversation)
print(score)


{'violence': 'Very low', 'violence_score': 0.0, 'sexual': 'Very low', 'sexual_score': 0.0, 'self_harm': 'Very low', 'self_harm_score': 0.0, 'hate_unfairness': 'Very low', 'hate_unfairness_score': 0.0, 'evaluation_per_turn': {'violence': {'severity': ['Very low', 'Very low'], 'score': [0, 0], 'reason': ["The system's response is a simple mathematical fact and does not contain any violent content or implications of violence.", "The system's response provides factual information about the capital of Japan without any mention or implication of violence."]}, 'sexual': {'severity': ['Very low', 'Very low'], 'score': [0, 0], 'reason': ["The system's response is purely mathematical and contains no sexual content.", "The system's response provides factual geographical information without any sexual content."]}, 'self_harm': {'severity': ['Very low', 'Very low'], 'score': [0, 0], 'reason': ["The system's response is a simple mathematical answer and does not contain any self-harm-related content.