In [None]:
!pip install evidently[llm]

In [None]:
import pandas as pd

from evidently import Dataset
from evidently import DataDefinition
from evidently.descriptors import *

from evidently import Report
from evidently.presets import TextEvals
from evidently.metrics import *
from evidently.tests import *

from evidently.ui.workspace import CloudWorkspace

# Tokens

In [None]:
import os

# Placeholder for the OpenAI API key, presumably read by the LLM-based
# descriptors used further down (TODO confirm against the Evidently docs).
# Uncomment and fill in, or export OPENAI_API_KEY before starting the kernel;
# avoid committing a real key with the notebook.
# os.environ["OPENAI_API_KEY"] = "YOUR KEY"

In [None]:
# ws = CloudWorkspace(token="YOUR_API_TOKEN", url="https://app.evidently.cloud")

# Create a Project

In [None]:
# project = ws.create_project("My test project", org_id="YOUR_ORG_ID")
# project.description = "My project description"
# project.save()

# or project = ws.get_project("PROJECT_ID")

# Retrieval - Single context

In [None]:
# Single-context retrieval examples. Every row holds one question, one
# retrieved context string, and the generated response, in that order.
columns = ["Question", "Context", "Response"]

synthetic_data = [
    [
        "Why do flowers bloom in spring?",
        "Plants require extra care during cold months. You should keep them indoors.",
        "because of the rising temperatures",
    ],
    [
        "Why do we yawn when we see someone else yawn?",
        "Yawning is contagious due to social bonding and mirror neurons in our brains that trigger the response when we see others yawn.",
        "because it's a glitch in the matrix",
    ],
    [
        "How far is Saturn from Earth?",
        "The distance between Earth and Saturn varies, but on average, Saturn is about 1.4 billion kilometers (886 million miles) away from Earth.",
        "about 1.4 billion kilometers",
    ],
    [
        "Where do penguins live?",
        "Penguins primarily live in the Southern Hemisphere, with most species found in Antarctica, as well as on islands and coastlines of South America, Africa, Australia, and New Zealand.",
        "mostly in Antarctica and southern regions",
    ],
]

synthetic_df = pd.DataFrame(data=synthetic_data, columns=columns)

In [None]:
# Do not truncate long text cells when DataFrames are rendered.
pd.options.display.max_colwidth = None

## ContextQuality

In [None]:
# Attach the ContextQualityLLMEval descriptor, which evaluates the "Context"
# column against the "Question" column, then display the resulting table.
# NOTE(review): the pd.DataFrame(...) wrapper presumably keeps added descriptor
# columns out of synthetic_df — confirm before removing it.
context_based_evals = Dataset.from_pandas(
    pd.DataFrame(synthetic_df),
    data_definition=DataDefinition(text_columns=["Question", "Context", "Response"]),
    descriptors=[ContextQualityLLMEval("Context", question="Question")],
)
context_based_evals.as_dataframe()

## ContextRelevance

In [None]:
# Relevance of the single "Context" to its "Question", computed with the
# "llm" method and "hit" aggregation; the descriptor is aliased "Hit".
context_based_evals = Dataset.from_pandas(
    pd.DataFrame(synthetic_df),
    data_definition=DataDefinition(text_columns=["Question", "Context", "Response"]),
    descriptors=[
        ContextRelevance(
            "Question",
            "Context",
            method="llm",
            aggregation_method="hit",
            output_scores=True,
            alias="Hit",
        ),
    ],
)
context_based_evals.as_dataframe()

# Retrieval - Multi context

In [None]:
# Multi-context retrieval examples. Here the "Context" cell of each row is a
# list of several retrieved chunks rather than a single string.
columns = ["Question", "Context", "Response"]

synthetic_data = [
    [
        "Why are bananas healthy?",
        [
            "Bananas are rich in potassium and vitamins, making them good for heart health.",
            "Bananas provide quick energy due to natural sugars.",
            "Are bananas actually a vegetable?",
        ],
        "because they are rich in nutrients",
    ],
    [
        "How do you cook potatoes?",
        [
            "Potatoes are easy to grow.",
            "The best way to cook potatoes is to eat them raw.",
            "Can potatoes be cooked in space?",
        ],
        "boil, bake, or fry them",
    ],
]

# Build the frame used by the multi-context relevance cells.
synthetic_df_2 = pd.DataFrame(data=synthetic_data, columns=columns)


## ContextRelevance, Hit

In [None]:
# Multi-context relevance with "hit" aggregation: each row's "Context" is a
# list of chunks checked against the "Question"; the descriptor is aliased "Hit".
context_based_evals = Dataset.from_pandas(
    pd.DataFrame(synthetic_df_2),
    data_definition=DataDefinition(text_columns=["Question", "Context", "Response"]),
    descriptors=[
        ContextRelevance(
            "Question",
            "Context",
            method="llm",
            aggregation_method="hit",
            output_scores=True,
            alias="Hit",
        ),
    ],
)
context_based_evals.as_dataframe()

## ContextRelevance, Mean

In [None]:
# Multi-context relevance with "mean" aggregation over the chunks in each
# row's "Context" list; the descriptor is aliased "Relevance".
context_based_evals = Dataset.from_pandas(
    pd.DataFrame(synthetic_df_2),
    data_definition=DataDefinition(text_columns=["Question", "Context", "Response"]),
    descriptors=[
        ContextRelevance(
            "Question",
            "Context",
            method="llm",
            aggregation_method="mean",
            output_scores=True,
            alias="Relevance",
        ),
    ],
)
context_based_evals.as_dataframe()

# Generation - ground truth

In [None]:
# Generation examples with ground truth: each row pairs a generated "Response"
# with the reference "Target" answer for the same "Question".
synthetic_data = [
    ["Why do we yawn when we see someone else yawn?",
     "because it's a glitch in the matrix.",
     "Due to social bonding and mirror neurons in our brains."],

    ["Why do flowers bloom in spring?",
     # Fixed a duplicated word ("the the") in this response.
     "Because of the rising temperatures.",
     "Because it is getting warmer."],

    ["Why are bananas healthy?",
     "Because they are rich in nutrients.",
     "Because they contain a lot of nutrients."]
]

columns = ["Question", "Response", "Target"]
synthetic_df_4 = pd.DataFrame(synthetic_data, columns=columns)

In [None]:
# Compare each "Response" with its "Target" using three descriptors:
# CorrectnessLLMEval, BERTScore, and SemanticSimilarity.
context_based_evals = Dataset.from_pandas(
    pd.DataFrame(synthetic_df_4),
    data_definition=DataDefinition(text_columns=["Question", "Response", "Target"]),
    descriptors=[
        CorrectnessLLMEval("Response", target_output="Target"),
        BERTScore(
            columns=["Response", "Target"],
            alias="BERTScore",
        ),
        SemanticSimilarity(
            columns=["Response", "Target"],
            alias="Semantic Similarity",
        ),
    ],
)
context_based_evals.as_dataframe()

# Generation - open-ended

In [None]:
# Open-ended generation check: FaithfulnessLLMEval evaluates the "Response"
# column with the "Context" column supplied as its context.
context_based_evals = Dataset.from_pandas(
    pd.DataFrame(synthetic_df),
    data_definition=DataDefinition(text_columns=["Question", "Context", "Response"]),
    descriptors=[FaithfulnessLLMEval("Response", context="Context")],
)
context_based_evals.as_dataframe()

# Report

Combine the ContextQuality and Faithfulness descriptors in a single dataset:

In [None]:
# Build one dataset carrying both the Faithfulness and ContextQuality
# descriptors; the report cells below run against this dataset.
context_based_evals = Dataset.from_pandas(
    pd.DataFrame(synthetic_df),
    data_definition=DataDefinition(text_columns=["Question", "Context", "Response"]),
    descriptors=[
        FaithfulnessLLMEval("Response", context="Context"),
        ContextQualityLLMEval("Context", question="Question"),
    ],
)
# Uncomment to inspect the row-level results:
# context_based_evals.as_dataframe()

In [None]:
# Run a report containing only the TextEvals preset on the evaluated dataset.
# The second argument to run() is passed as None, exactly as before.
report = Report([TextEvals()])
my_eval = report.run(context_based_evals, None)
my_eval

# Add Tests

In [None]:
# TextEvals preset plus two CategoryCount metrics, each with an eq(0) test:
# one on the "UNFAITHFUL" category of the "Faithfulness" column and one on
# the "INVALID" category of the "ContextQuality" column.
report = Report(
    [
        TextEvals(),
        CategoryCount(
            column="Faithfulness",
            category="UNFAITHFUL",
            tests=[eq(0)],
        ),
        CategoryCount(
            column="ContextQuality",
            category="INVALID",
            tests=[eq(0)],
        ),
    ]
)
my_eval = report.run(context_based_evals, None)
my_eval

# Upload to Cloud

In [None]:
# ws.add_run(project.id, my_eval, include_data=True)