### Initialize our environment

In a real-world scenario, this is where you’d define your LLM and test cases.

In [10]:
import os
from dotenv import load_dotenv
# We use 'assert_test' to treat LLM evaluation like traditional software testing 
from deepeval import assert_test
from deepeval.test_case import LLMTestCase

load_dotenv(override=True)

# Importing updated metric names for modern GenAI systems 
from deepeval.metrics import (
    GEval,  # General purpose metric for complex reasoning
    HallucinationMetric,  # Specifically for detecting fabricated content 
    FaithfulnessMetric,  # Measures truth alignment with trusted sources 
    ContextualPrecisionMetric,  # Critical for Retrieval-Augmented Generation (RAG) 
    AnswerRelevancyMetric,  # Replaces consistency for prompt variation checks 
)


### Faithfulness & Groundedness
Concept: Does the AI's response align with the provided facts? This is the "antidote" to hallucinations.

In [11]:
def demo_faithfulness():
    """Score an answer that contradicts its retrieval context.

    The retrieved passage gives Paris a population of 2.1 million, while the
    answer under test claims 5 million. The faithfulness score and the
    metric's explanation are printed; nothing is returned.
    """
    faithfulness = FaithfulnessMetric(threshold=0.7)

    # retrieval_context acts as the ground truth; actual_output is the
    # model response being checked against it.
    paris_case = LLMTestCase(
        input="Tell me about Paris",
        actual_output="Paris is the capital of France and is home to 5 million people.",
        retrieval_context=[
            "The capital of France is Paris. It has a population of 2.1 million."
        ],
    )

    faithfulness.measure(paris_case)

    # A low score is expected: the population figures disagree.
    print(
        f"Faithfulness Score: {faithfulness.score}",
        f"Reasoning for score: {faithfulness.reason}",
        sep="\n",
    )


In [12]:
demo_faithfulness()

Output()

Faithfulness Score: 0.0
Reasoning for score: The score is 0.00 because the actual output incorrectly claims that Paris has a population of 5 million, while the retrieval context clearly states the correct population is 2.1 million. This direct contradiction results in a completely unfaithful response.


### Hallucination Detection

Concept: Explicitly searching for fabricated or unsupported content by comparing the output against verified references.

In [17]:
def demo_hallucination():
    """Run the hallucination metric on an answer that contradicts its reference.

    The reference context describes the Sun as a G-type main-sequence star;
    the answer under test calls it a blue giant in the Andromeda galaxy.
    Prints the resulting score and reasoning and returns nothing.
    """
    reference_facts = ["The Sun is a G-type main-sequence star."]
    fabricated_answer = "The Sun is a blue giant star located in the Andromeda galaxy."

    detector = HallucinationMetric(threshold=0.5)
    sun_case = LLMTestCase(
        input="What kind of star is the Sun?",
        actual_output=fabricated_answer,
        context=reference_facts,
    )
    detector.measure(sun_case)

    # Lower is better for this metric: 1.0 means the output is 100% hallucinated.
    score, explanation = detector.score, detector.reason
    print(f"Hallucination Score (Lower is better): {score}")
    print(f"Reasoning: {explanation}")

In [18]:
demo_hallucination()

Output()

Hallucination Score (Lower is better): 1.0
Reasoning: The score is 1.00 because the actual output directly contradicts the context by misclassifying the Sun's stellar type, indicating a complete factual error.


### Answer Consistency via Faithfulness

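Concept: A robust model should behave consistently even if you change the wording of the prompt slightly. In this demo we approximate that check by supplying two paraphrases of the expected answer as the retrieval context and verifying that the output stays faithful to both.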

In [None]:
# Updated imports for the latest deepeval version
from deepeval.test_case import LLMTestCase
from deepeval.metrics import FaithfulnessMetric


def demo_consistency():
    """Check that one answer stays faithful to paraphrased reference statements.

    Two rewordings of the same recipe fact are passed as ``retrieval_context``
    and ``FaithfulnessMetric`` scores the answer against them. Prints the score
    and the metric's reasoning; returns nothing.

    NOTE(review): only the reference wording varies here; the input prompt is
    a single fixed question.
    """
    # Paraphrases of the same fact, used as the reference context.
    reference_phrasings = [
        "You need 3 eggs and 2 cups of flour for this recipe.",
        "Take two cups of flour and three eggs to start the recipe.",
    ]
    answer_under_test = "The recipe requires three eggs and two cups of flour."

    consistency_metric = FaithfulnessMetric(threshold=0.7)
    recipe_case = LLMTestCase(
        input="How many eggs do I need?",
        actual_output=answer_under_test,
        retrieval_context=reference_phrasings,
    )
    consistency_metric.measure(recipe_case)

    # Report the score first, then the explanation that accompanies it.
    score = consistency_metric.score
    rationale = consistency_metric.reason
    print(f"Consistency Score: {score}")
    print(f"Reasoning: {rationale}")

In [23]:
demo_consistency()

Output()

Consistency Score: 1.0
Reasoning: The score is 1.00 because there are no contradictions—great job staying true to the retrieval context!


### Evaluating RAG Systems

Concept: Retrieval-Augmented Generation (RAG) requires dual evaluation: did we find the right documents, and did we use them correctly?

In [42]:
def demo_rag_precision():
    """Score a retrieval step that fetched an irrelevant document.

    The question is about lab safety rules, but the only retrieved document is
    a cafeteria menu. ``ContextualPrecisionMetric`` is measured on a test case
    built from the input, the retrieved context, and the expected answer.
    Prints the score and the metric's reasoning; returns nothing.
    """
    # What the retrieval system actually pulled up. The document that should
    # have been found is "Document A: Safety protocols for lab work."; it is
    # mentioned here for the reader only and is not passed to the metric.
    retrieved_documents = ["Document B: Cafeteria menu for Tuesday."]

    # Isolate failures to the retrieval component.
    metric = ContextualPrecisionMetric(threshold=0.7)

    test_case = LLMTestCase(
        input="What are the lab safety rules?",
        actual_output="I don't know, but here is the menu.",
        retrieval_context=retrieved_documents,
        expected_output="Users must wear goggles at all times.",
    )

    metric.measure(test_case)
    # Score will be low because retrieval fetched the wrong context.
    print(f"Retrieval Precision Score: {metric.score}")
    print(f"Reason: {metric.reason}")

In [43]:
demo_rag_precision()

Output()

Retrieval Precision Score: 0
Reason: The score is 0.00 because the only node in the retrieval contexts (rank 1) is irrelevant, as it is about 'Document B: Cafeteria menu for Tuesday.' and does not provide any information about lab safety rules. No relevant nodes were retrieved or ranked above irrelevant ones.


### Chain-of-Thought (Reasoning) Analysis

Concept: Even if the final answer is right, the model might have used "lucky" or flawed logic.

In [49]:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams


def demo_reasoning():
    """Judge the intermediate steps of an arithmetic answer with GEval.

    A custom "Reasoning" metric is configured to look at both the prompt and
    the response, then measured on a worked answer to "(5+5) times 2".
    Prints the score and the metric's reasoning; returns nothing.
    """
    question = "What is (5+5) times 2?"
    worked_answer = "Since 5+5 is 10, and 10 times 2 is 20, the answer is 20."

    # GEval must be told which test-case fields to evaluate: here the prompt
    # and the response.
    judged_fields = [LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
    reasoning_metric = GEval(
        name="Reasoning",
        criteria="Determine if the intermediate logic steps are valid and necessary.",
        evaluation_params=judged_fields,
        threshold=0.8,
    )

    arithmetic_case = LLMTestCase(input=question, actual_output=worked_answer)
    reasoning_metric.measure(arithmetic_case)

    print(f"Reasoning Score: {reasoning_metric.score}")
    print(f"Reason: {reasoning_metric.reason}")

In [50]:
demo_reasoning()

Output()

Reasoning Score: 1.0
Reason: The response clearly identifies the intermediate steps: first calculating 5+5 to get 10, then multiplying 10 by 2 to get 20. Each step logically follows from the previous one and is necessary to reach the final answer. No essential steps are missing, and there are no redundant steps.


### Robustness & Consistency
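A strong model should behave consistently across similar prompts without large output shifts. The demo below scores answer relevancy for a single prompt; to probe robustness, repeat it with paraphrased versions of the same question and compare the scores.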

In [52]:
def demo_robustness():
    """Score how relevant a password-reset answer is to its question.

    Builds one test case (a single prompt and a single response) and measures
    it with ``AnswerRelevancyMetric``. Prints the score and the metric's
    reasoning; returns nothing.

    NOTE(review): only one phrasing of the prompt is evaluated here, so this
    measures relevancy for that prompt rather than stability across variants.
    """
    relevancy = AnswerRelevancyMetric(threshold=0.7)

    question = "How do I reset my password?"
    answer = "Go to settings and click 'Forgot Password'."
    password_case = LLMTestCase(input=question, actual_output=answer)

    relevancy.measure(password_case)

    score, rationale = relevancy.score, relevancy.reason
    print(f"Robustness/Relevancy Score: {score}")
    print(f"Reason: {rationale}")

In [53]:
demo_robustness()

Output()

Robustness/Relevancy Score: 1.0
Reason: The score is 1.00 because the answer was fully relevant and addressed the question directly with no irrelevant information. Great job!
