In [None]:
!pip install evidently

In [None]:
import pandas as pd
from evidently.future.datasets import Dataset
from evidently.future.datasets import DataDefinition
from evidently.future.datasets import Descriptor
from evidently.future.descriptors import *
from evidently.future.report import Report
from evidently.future.presets import TextEvals
from evidently.future.metrics import *
from evidently.future.tests import *

from evidently.features.llm_judge import BinaryClassificationPromptTemplate

To connect to Evidently Cloud:

In [None]:
from evidently.ui.workspace.cloud import CloudWorkspace

Optional imports to create monitoring panels:

In [None]:
from evidently.ui.dashboards import DashboardPanelPlot
from evidently.ui.dashboards import DashboardPanelTestSuite
from evidently.ui.dashboards import DashboardPanelTestSuiteCounter
from evidently.ui.dashboards import TestSuitePanelType
from evidently.ui.dashboards import ReportFilter
from evidently.ui.dashboards import PanelValue
from evidently.ui.dashboards import PlotType
from evidently.ui.dashboards import CounterAgg
from evidently.tests.base_test import TestStatus
from evidently.renderers.html_widgets import WidgetSize

In [None]:
import os

# Do not clobber a real key that is already exported in the environment.
# The placeholder is only written when no (non-empty) key is present, and it must be
# replaced with a real key before the LLM judges are run.
# An `if` statement is used (rather than a bare expression) so the key is never
# echoed as cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = "YOUR KEY"

# Connect to Evidently Cloud

Get token: https://docs.evidentlyai.com/docs/setup/cloud

In [None]:
# ws = CloudWorkspace(token="YOUR_API_TOKEN", url="https://app.evidently.cloud")

Create a Project

In [None]:
# project = ws.create_project("Regression testing example", org_id="YOUR_TEAM_ID")
# project.description = "My project description"
# project.save()

# Reference answers

Prepare the dataset with inputs and approved answers you want to compare against.

In [None]:
# Column layout of every row below.
columns = ["question", "target_response"]

# Approved (reference) answers: one [question, target_response] pair per row.
data = [
    [
        "Why is the sky blue?",
        "The sky is blue because molecules in the air scatter blue light from the sun more than they scatter red light.",
    ],
    [
        "How do airplanes stay in the air?",
        "Airplanes stay in the air because their wings create lift by forcing air to move faster over the top of the wing than underneath, which creates lower pressure on top.",
    ],
    [
        "Why do we have seasons?",
        "We have seasons because the Earth is tilted on its axis, which causes different parts of the Earth to receive more or less sunlight throughout the year.",
    ],
    [
        "How do magnets work?",
        "Magnets work because they have a magnetic field that can attract or repel certain metals, like iron, due to the alignment of their atomic particles.",
    ],
    [
        "Why does the moon change shape?",
        "The moon changes shape, or goes through phases, because we see different portions of its illuminated half as it orbits the Earth.",
    ],
]

ref_data = pd.DataFrame(data=data, columns=columns)

In [None]:
# Lift the column-width limit so long answers are shown in full, then preview the frame.
pd.options.display.max_colwidth = None
ref_data.head()

In [None]:
# Descriptors over the `target_response` column, registered under the aliases
# "Length" and "Sentence".
ref_descriptors = [
    TextLength("target_response", alias="Length"),
    SentenceCount("target_response", alias="Sentence"),
]

# Wrap the reference frame in an Evidently Dataset with a default DataDefinition.
ref_dataset = Dataset.from_pandas(
    pd.DataFrame(ref_data),
    data_definition=DataDefinition(),
    descriptors=ref_descriptors,
)
ref_dataset.as_dataframe()

In [None]:
# Run a Report that contains only the TextEvals preset on the reference dataset.
# The second argument to run() is None, i.e. no second dataset is supplied.
reference_metrics = [TextEvals()]
report = Report(reference_metrics)

my_eval = report.run(ref_dataset, None)
my_eval

# Other ways to export the same result (uncomment to use):
#my_eval.as_dict()
#my_eval.json()

# Generate new answers

Let's imitate this step with hard-coded answers. In practice, you would call your LLM app, collect the new answers, and add them to the dataframe.

In [None]:
# Column layout of every row below: the question, the approved answer, and the new answer.
columns = ["question", "target_response", "response"]

data = [
    [
        "Why is the sky blue?",
        "The sky is blue because molecules in the air scatter blue light from the sun more than they scatter red light.",
        "The sky appears blue because air molecules scatter the sun’s blue light more than they scatter other colors.",
    ],
    [
        "How do airplanes stay in the air?",
        "Airplanes stay in the air because their wings create lift by forcing air to move faster over the top of the wing than underneath, which creates lower pressure on top.",
        "Airplanes stay airborne because the shape of their wings causes air to move faster over the top than the bottom, generating lift.",
    ],
    [
        "Why do we have seasons?",
        "We have seasons because the Earth is tilted on its axis, which causes different parts of the Earth to receive more or less sunlight throughout the year.",
        "Seasons occur because of the tilt of the Earth’s axis, leading to varying amounts of sunlight reaching different areas as the Earth orbits the sun.",
    ],
    [
        "How do magnets work?",
        "Magnets work because they have a magnetic field that can attract or repel certain metals, like iron, due to the alignment of their atomic particles.",
        "Magnets generate a magnetic field, which can attract metals like iron by causing the electrons in those metals to align in a particular way, creating an attractive or repulsive force.",
    ],
    [
        "Why does the moon change shape?",
        "The moon changes shape, or goes through phases, because we see different portions of its illuminated half as it orbits the Earth.",
        "The moon appears to change shape as it orbits Earth, which is because we see different parts of its lit-up half at different times. The sun lights up half of the moon, but as the moon moves around the Earth, we see varying portions of that lit-up side. So, the moon's shape in the sky seems to change gradually, from a thin crescent to a full circle and back to a crescent again.",
    ],
]

eval_data = pd.DataFrame(data=data, columns=columns)

In [None]:
# Preview the evaluation frame: question, approved answer (target_response) and new answer (response).
eval_data.head()

In [None]:
# Wrap the evaluation frame in an Evidently Dataset with a default DataDefinition.
# No descriptors are passed here; they are added in a later cell.
eval_dataset = Dataset.from_pandas(
    pd.DataFrame(eval_data),
    data_definition=DataDefinition(),
)

# Choose criteria

- LLM-judged correctness based on reference: must always be correct.
- LLM-judged style match to reference: must always match.
- Text length: at most 200 symbols.


## Correctness LLM judge

In [None]:
# Criteria text for the correctness judge.
# NOTE(review): {target_response} is presumably substituted per row from the column
# mapped through `additional_columns` where the template is used — confirm.
# The continuation lines keep their original leading whitespace so the prompt is unchanged.
correctness_criteria = """An ANSWER is correct when it is the same as the REFERENCE in all facts and details, even if worded differently.
        The ANSWER is incorrect if it contradicts the REFERENCE, adds additional claims, omits or changes details.
        REFERENCE:
        =====
        {target_response}
        ====="""

# Message placed ahead of the criteria, with the "system" role.
correctness_system_message = (
    "system",
    "You are an expert evaluator. You will be given an ANSWER and REFERENCE",
)

# Binary judge: the target category is "incorrect", the non-target category is "correct",
# and "unknown" is passed as the uncertainty setting. Reasoning is requested as well.
correctness = BinaryClassificationPromptTemplate(
    criteria=correctness_criteria,
    target_category="incorrect",
    non_target_category="correct",
    uncertainty="unknown",
    include_reasoning=True,
    pre_messages=[correctness_system_message],
)

## Style LLM judge

In [None]:
# Criteria text for the style judge.
# NOTE(review): {target_response} is presumably substituted per row from the column
# mapped through `additional_columns` where the template is used — confirm.
style_criteria = """An ANSWER is style-matching when it matches the REFERENCE answer in STYLE, even if the meaning is different.
The ANSWER is style-mismatched when it diverges from the REFERENCE answer in STYLE, even if the meaning is the same.

Consider the following STYLE attributes:
- tone (friendly, formal, casual, sarcastic, etc.)
- sentence structure (simple, compound, complex, etc.)
- verbosity level (relative length of answers)
- and other similar attributes that may reflect difference in STYLE.

You must focus only on STYLE. Ignore any differences in contents.

=====
{target_response}
====="""

# Message placed ahead of the criteria, with the "system" role.
style_system_message = (
    "system",
    "You are an expert evaluator. You will be given an ANSWER and REFERENCE",
)

# Binary judge: the target category is "style-mismatched", the non-target category is
# "style-matching", and "unknown" is passed as the uncertainty setting.
style_match = BinaryClassificationPromptTemplate(
    criteria=style_criteria,
    target_category="style-mismatched",
    non_target_category="style-matching",
    uncertainty="unknown",
    include_reasoning=True,
    pre_messages=[style_system_message],
)

# Score the data

In [None]:
def _reference_judge(template, alias):
    """Build an LLMEval descriptor over the `response` column.

    Both judges share the same provider ("openai"), model ("gpt-4o-mini") and the
    `target_response` column mapping; only the prompt template and alias differ.
    A fresh `additional_columns` dict is created on every call.
    """
    return LLMEval(
        "response",
        template=template,
        provider="openai",
        model="gpt-4o-mini",
        alias=alias,
        additional_columns={"target_response": "target_response"},
    )


# Two LLM judges plus a text-length descriptor, all computed on the new `response` column.
descriptors = [
    _reference_judge(correctness, alias="Correctness"),
    _reference_judge(style_match, alias="Style"),
    TextLength("response", alias="Length"),
]

In [None]:
# Attach the descriptors (two LLM judges and text length) to the evaluation dataset.
# NOTE(review): the return value is unused, so this presumably updates eval_dataset itself;
# re-running this cell may add the descriptors (and call the LLM) again — confirm.
eval_dataset.add_descriptors(descriptors=descriptors)
# Display the dataset as a dataframe.
eval_dataset.as_dataframe()

# Run regression testing

In [None]:
# Test on the maximum of the "Length" column: lte(200).
length_check = MaxValue(column="Length", tests=[lte(200)])

# Test on the count of "incorrect" labels in the "Correctness" column: eq(0).
correctness_check = CategoryCount(column="Correctness", category="incorrect", tests=[eq(0)])

# Test on the count of "style-mismatched" labels in the "Style" column: eq(0),
# declared with is_critical=False.
style_check = CategoryCount(column="Style", category="style-mismatched", tests=[eq(0, is_critical=False)])

# The TextEvals preset comes first, followed by the three tested metrics.
regression_checks = [
    TextEvals(),
    length_check,
    correctness_check,
    style_check,
]
report = Report(regression_checks)

# The second argument to run() is None, i.e. no second dataset is supplied.
my_eval = report.run(eval_dataset, None)

In [None]:
# Optional: uncomment one of the lines below to inspect the run result in the notebook.
# my_eval
# my_eval.json()

In [None]:
# Upload the first run to the Evidently project, passing include_data=True.
# Requires `ws` and `project` to be defined (see the workspace and project cells near the top).
ws.add_run(project.id, my_eval, include_data=True)

# Explore the results

Head to the Evidently Platform UI.

# Next change? Test again.

In [None]:
# Column layout of every row below: the question, the approved answer, and the new answer.
columns = ["question", "target_response", "response"]

data = [
    [
        "Why is the sky blue?",
        "The sky is blue because molecules in the air scatter blue light from the sun more than they scatter red light.",
        "The sky looks blue because air molecules scatter the blue light from the sun more effectively than other colors.",
    ],
    [
        "How do airplanes stay in the air?",
        "Airplanes stay in the air because their wings create lift by forcing air to move faster over the top of the wing than underneath, which creates lower pressure on top.",
        "Airplanes fly by generating lift through the wings, which makes the air move faster above them, lowering the pressure.",
    ],
    [
        "Why do we have seasons?",
        "We have seasons because the Earth is tilted on its axis, which causes different parts of the Earth to receive more or less sunlight throughout the year.",
        # This response contradicts the reference.
        "Seasons change because the distance between the Earth and the sun varies throughout the year.",
    ],
    [
        "How do magnets work?",
        "Magnets work because they have a magnetic field that can attract or repel certain metals, like iron, due to the alignment of their atomic particles.",
        "Magnets operate by creating a magnetic field, which interacts with certain metals like iron due to the specific alignment of atomic particles.",
    ],
    [
        "Why does the moon change shape?",
        "The moon changes shape, or goes through phases, because we see different portions of its illuminated half as it orbits the Earth.",
        "The moon's phases occur because we observe varying portions of its lit half as it moves around the Earth.",
    ],
]

eval_data_2 = pd.DataFrame(data=data, columns=columns)

In [None]:
# Wrap the second batch of answers in an Evidently Dataset with a default DataDefinition.
eval_dataset_2 = Dataset.from_pandas(
    pd.DataFrame(eval_data_2),
    data_definition=DataDefinition(),
)

In [None]:
# Score the second batch with the same `descriptors` list used for the first batch.
# NOTE(review): re-running this cell may add the descriptors (and call the LLM) again — confirm.
eval_dataset_2.add_descriptors(descriptors=descriptors)

In [None]:
# Reuse the existing `report` object on the second dataset; the second argument is None.
my_eval_2 = report.run(eval_dataset_2, None)

In [None]:
# Upload the second run to the same Evidently project, passing include_data=True.
# Requires `ws` and `project` to be defined (see the workspace and project cells near the top).
ws.add_run(project.id, my_eval_2, include_data=True)

# Get a dashboard

Add a counter panel to show the SUCCESS rate of the latest test run. Add a test monitoring panel to show all test results over time.

In [None]:
# Counter panel: statuses=[TestStatus.SUCCESS] with the CounterAgg.LAST aggregation.
latest_run_counter = DashboardPanelTestSuiteCounter(
    title="Latest Test run",
    filter=ReportFilter(metadata_values={}, tag_values=[]),
    size=WidgetSize.FULL,
    statuses=[TestStatus.SUCCESS],
    agg=CounterAgg.LAST,
)

# Test-suite panel using the DETAILED panel type.
test_results_panel = DashboardPanelTestSuite(
    title="Test results",
    filter=ReportFilter(metadata_values={}, tag_values=[]),
    size=WidgetSize.FULL,
    panel_type=TestSuitePanelType.DETAILED,
)

# Add both panels to the "Tests" tab in the original order, then save the project.
# NOTE(review): re-running this cell presumably adds the panels a second time — confirm.
for panel in (latest_run_counter, test_results_panel):
    project.dashboard.add_panel(panel, tab="Tests")
project.save()