## Basic Groundedness Evaluation

> ⚠️ **Note:** The test data is hard-coded in the notebook cells below rather than loaded from an external dataset.

> ⚠️ **Note:** The evaluation run will not be shown in AI Foundry.

## Setup

In [5]:
import os
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential

# Load the .env file BEFORE building the credential: any AZURE_* settings that
# live only in .env must already be in os.environ when DefaultAzureCredential
# is constructed, otherwise they are not picked up.
# override=True lets values from .env replace variables already set in the process.
dotenv_loaded = load_dotenv(override=True)

# Credential passed to GroundednessProEvaluator further down in the notebook.
credential = DefaultAzureCredential()

# Show the result of load_dotenv as the cell output.
dotenv_loaded

True

## Configure the LLM Judge

In [6]:

# Initialize the Azure AI project and Azure OpenAI connection from environment variables.
# A variable that is not set resolves to None.
azure_ai_project = dict(
    subscription_id=os.getenv("AZURE_SUBSCRIPTION_ID"),
    resource_group_name=os.getenv("AZURE_RESOURCE_GROUP"),
    project_name=os.getenv("AZURE_PROJECT_NAME"),
)

# Azure OpenAI settings handed to the evaluators as their model configuration.
model_config = dict(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)


## Define your Evaluator

In [None]:
# Switch between the two evaluators: True selects GroundednessProEvaluator,
# False selects GroundednessEvaluator.
USE_GROUNDEDNESS_PRO = False

if not USE_GROUNDEDNESS_PRO:
    # Standard evaluator, built from the Azure OpenAI model_config.
    from azure.ai.evaluation import GroundednessEvaluator
    groundedness_eval = GroundednessEvaluator(model_config)
else:
    # Pro evaluator, built from the Azure AI project details and the credential.
    from azure.ai.evaluation import GroundednessProEvaluator
    groundedness_eval = GroundednessProEvaluator(
        azure_ai_project=azure_ai_project,
        credential=credential,
    )

## Evaluate your Query and Response

In [None]:
# Example input: a query, the context to ground against, and the response to score.
# Note that the response names a different tent than the context does.
query_response = {
    "query": "Which tent is the most waterproof?",
    "context": "The Alpine Explorer Tent is the most water-proof of all tents available.",
    "response": "The Mont Blanc Tent is the most waterproof.",
}

# Run the evaluator on the example; the dict entries are passed as keyword arguments.
groundedness_score = groundedness_eval(**query_response)

# Last expression in the cell, so the notebook displays the result.
groundedness_score

{'groundedness': 3.0,
 'gpt_groundedness': 3.0,
 'groundedness_reason': 'The response attempts to answer the query but contradicts the context by providing incorrect information.'}

### Sample with the full Q&A Evaluator

In [9]:
import json
from azure.ai.evaluation import QAEvaluator

# Build the Q&A evaluator from the same Azure OpenAI model configuration.
qa_eval = QAEvaluator(model_config)

# Example input; in addition to query/context/response it carries a ground truth.
query_response = {
    "query": "Which tent is the most waterproof?",
    "context": "The Alpine Explorer Tent is the most water-proof of all tents available.",
    "response": "The Alpine Explorer Tent is the most waterproof.",
    "ground_truth": "The Alpine Explorer Tent.",
}

# The dict entries are passed to the evaluator as keyword arguments.
qa_score = qa_eval(**query_response)

# Pretty-print the returned scores as indented JSON.
print(json.dumps(qa_score, indent=2))

{
  "f1_score": 0.6666666666666666,
  "similarity": 5.0,
  "gpt_similarity": 5.0,
  "groundedness": 5.0,
  "gpt_groundedness": 5.0,
  "groundedness_reason": "The response is fully correct and complete, directly addressing the query with precise information from the context.",
  "coherence": 4.0,
  "gpt_coherence": 4.0,
  "coherence_reason": "The response is coherent and directly answers the question, but it lacks elaboration or supporting details that could enhance its clarity and depth.",
  "relevance": 3.0,
  "gpt_relevance": 3.0,
  "relevance_reason": "The response addresses the query but omits key details necessary for a comprehensive answer, such as why the Alpine Explorer Tent is the most waterproof or how it compares to others.",
  "fluency": 3.0,
  "gpt_fluency": 3.0,
  "fluency_reason": "The response is clear and grammatically correct but lacks sentence complexity and varied vocabulary, which limits its fluency level."
}
