# LLM regression testing workflow

In [None]:
#!pip install evidently[llm]

In [1]:
import pandas as pd
import numpy as np
import requests
from datetime import datetime, timedelta
from io import BytesIO

To run open-source evaluations:

In [2]:
from evidently import ColumnMapping
from evidently.report import Report
from evidently.test_suite import TestSuite
from evidently.metric_preset import TextEvals
from evidently.descriptors import *
from evidently.metrics import *
from evidently.tests import *

**Optional**: To work with Evidently Cloud:

In [3]:
from evidently.ui.workspace.cloud import CloudWorkspace

**Optional**: To manage dashboards as code remotely. You can also do this in UI.

In [4]:
from evidently.ui.dashboards import DashboardPanelTestSuite
from evidently.ui.dashboards import PanelValue
from evidently.ui.dashboards import ReportFilter
from evidently.ui.dashboards import TestFilter
from evidently.ui.dashboards import TestSuitePanelType
from evidently.renderers.html_widgets import WidgetSize

# Prepare a dataset

Get an example dataset. You can also download and import the CSV file directly ([Link](https://github.com/evidentlyai/evidently/blob/main/examples/how_to_questions/chat_df.csv)).

In [5]:
# Download the example chat log CSV.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error page
# be handed to pd.read_csv in the next step.
response = requests.get(
    "https://raw.githubusercontent.com/evidentlyai/evidently/main/examples/how_to_questions/chat_df.csv",
    timeout=30,
)
response.raise_for_status()
# Wrap the raw bytes in a file-like object so pandas can read them directly.
csv_content = BytesIO(response.content)

Read the CSV content into a DataFrame. Parse dates and set conversation "start_time" as index.

In [6]:
# Load the downloaded CSV: the first CSV column is used as the initial index and
# both timestamp columns are parsed as datetimes.
assistant_logs = pd.read_csv(
    csv_content,
    index_col=0,
    parse_dates=['start_time', 'end_time'],
)
# Re-index the rows by conversation start time (keeping "start_time" as a regular
# column too) and label the index "index".
assistant_logs = assistant_logs.set_index('start_time', drop=False).rename_axis('index')

Preview:

In [7]:
# Remove the column-width limit so long question/response texts are not truncated in previews.
pd.set_option('display.max_colwidth', None)

In [8]:
# Show the first three rows of the chat logs.
assistant_logs.head(3)

Unnamed: 0_level_0,question,response,start_time,end_time,organization,model_ID,region,environment,feedback
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-04-08 03:55:48.128469,How do I request medical leave through the employee portal?,"Requesting medical leave through the employee portal is a confidential process designed to support your health needs. Follow these steps to submit your request 1. Log in to the employee portal using your credentials. 2. Navigate to the ""Time Off"" or ""Leave Requests"" section. 3. Select the option to request new time off or submit a leave request. 4. Choose the type of leave (e.g., medical leave) and enter the dates you'll be absent (April 10, 2024 - April 20, 2024). 5. Provide any necessary details or reasons for your medical leave request, such as a brief description of the medical condition or treatment plan. 6. Attach any relevant medical documentation or supporting evidence, such as a doctor's note or medical certificate. 7. Review the request summary to ensure accuracy and completeness. 8. Submit your request for approval by clicking on the ""Submit"" button. Once your request is processed, you'll receive a notification confirming the approval of your medical leave. It's important to remember that medical information is highly sensitive and should only be shared with HR or your supervisor on a need-to-know basis. If you have any concerns about requesting medical leave or need assistance, please reach out to HR for confidential support. We're here to ensure that your privacy is respected and that you receive the necessary accommodations to support your health needs.",2024-04-08 03:55:48.128469,2024-04-08 03:58:48.128469,acme,gpt_3.5_turbo,EU-Spain,production,none
2024-04-08 03:59:47.756913,How can I update my direct deposit information on the employee portal?,"To update your direct deposit information on the employee portal, log in to your account and navigate to the payroll or banking section. From there, you should be able to add, edit, or remove bank account details for direct deposit of your paychecks. If you encounter any difficulties, please contact HR for assistance.",2024-04-08 03:59:47.756913,2024-04-08 04:04:47.756913,acme,gpt_3.5_turbo,EU-Germany,production,none
2024-04-08 06:19:47.717513,How do I generate payroll reports in the accounting system?,"Generating payroll reports in the accounting system typically involves these steps 1. Log in to the accounting system and navigate to the payroll module or reporting section. 2. Select the type of payroll report you wish to generate, such as employee earnings, tax withholdings, or benefits contributions. 3. Specify the reporting period and any filters or criteria for the report, such as department or employee type. 4. Customize the report layout and formatting options according to your preferences or requirements. 5. Generate the payroll report and review the data for accuracy before saving or distributing it to relevant stakeholders.",2024-04-08 06:19:47.717513,2024-04-08 06:24:47.717513,acme,gpt_3.5_turbo,US-west,production,none


# Set up your Workspace 

To store the evaluation results, share them with others, and get a live monitoring dashboard, create your free Evidently Cloud account at: https://app.evidently.cloud/signup.

 To connect to the workspace from your Python environment:

In [None]:
# Connect to Evidently Cloud. Replace "YOUR_TOKEN" with your own API token.
ws = CloudWorkspace(
    token="YOUR_TOKEN",
    url="https://app.evidently.cloud",
)

Create your Project:

In [None]:
# Create a project in the connected workspace. Replace the title and
# "YOUR_TEAM_ID" placeholders with your own values.
project = ws.create_project("My project title", team_id="YOUR_TEAM_ID")
project.description = "My project description"
# Call save() after editing the project so the description change is kept.
project.save()

# Run evaluations

Prep: map your input data columns. Optional, but recommended.

In [9]:
# Tell Evidently how to interpret each column of the chat logs.
column_mapping = ColumnMapping(
    # Timestamp column of each conversation.
    datetime='start_time',
    # Additional datetime-typed column.
    datetime_features=['end_time'],
    # Free-text columns that text descriptors are computed on.
    text_features=['question', 'response'],
    # Metadata columns treated as categorical values.
    categorical_features=['organization', 'model_ID', 'region', 'environment', 'feedback'],
)

## Basic example

Run the first evaluation by checking the chatbot response length. You will use the `TextLength()` descriptor. This will return an absolute count for the number of symbols in each text. You can also check `SentenceCount()`, `WordCount()`, etc.

To run the evaluation for the first 100 conversations and get a summary Report:

In [None]:
# Summarize the length of the chatbot responses.
text_evals_report = Report(
    metrics=[
        TextEvals(
            column_name="response",
            descriptors=[TextLength()],
        ),
    ],
)

# Evaluate the first 100 conversations on their own (no reference dataset).
text_evals_report.run(
    reference_data=None,
    current_data=assistant_logs[:100],
    column_mapping=column_mapping,
)
# Leaving the Report as the last expression renders it in the notebook.
text_evals_report

You can also do a side-by-side comparison for two datasets:

In [None]:
# Same response-length evaluation, this time comparing two datasets.
text_evals_report = Report(
    metrics=[
        TextEvals(
            column_name="response",
            descriptors=[TextLength()],
        ),
    ],
)

# Rows 0-49 act as the reference and rows 50-99 as the current data.
text_evals_report.run(
    reference_data=assistant_logs[:50],
    current_data=assistant_logs[50:100],
    column_mapping=column_mapping,
)
# Leaving the Report as the last expression renders it in the notebook.
text_evals_report

Let's look at other evaluation methods one by one. You can later combine multiple descriptors in a single Report.

## Text patterns

You can use regular expressions to check text patterns. For example, check the presence of competitor mentions, topical words, etc.

Let's check for responses that contain words related to compensation. This will automatically account for inflected and variant words. This descriptor returns True/False for **pattern match**.

In [None]:
# Flag responses that mention compensation-related words.
text_evals_report = Report(
    metrics=[
        TextEvals(
            column_name="response",
            descriptors=[
                IncludesWords(
                    words_list=['salary', 'benefits', 'payroll'],
                    display_name="Mention Compensation",
                ),
            ],
        ),
    ],
)

# Evaluate the first 100 conversations on their own (no reference dataset).
text_evals_report.run(
    reference_data=None,
    current_data=assistant_logs[:100],
    column_mapping=column_mapping,
)
# Leaving the Report as the last expression renders it in the notebook.
text_evals_report

Other examples: `Contains(items=[])`, `BeginsWith(prefix="")`, custom `RegEx(reg_exp=r"")`, etc.

## Model-based scoring

You can use pre-trained machine learning models to score your text data.

**Sentiment**. You can use built-in models like `Sentiment()`. This will return a sentiment score from -1 (very negative) to 1 (very positive).

In [None]:
# Score the sentiment of each chatbot response with the built-in model.
text_evals_report = Report(
    metrics=[
        TextEvals(
            column_name="response",
            descriptors=[Sentiment()],
        ),
    ],
)

# Evaluate the first 100 conversations on their own (no reference dataset).
text_evals_report.run(
    reference_data=None,
    current_data=assistant_logs[:100],
    column_mapping=column_mapping,
)
# Leaving the Report as the last expression renders it in the notebook.
text_evals_report

You can also use models from HuggingFace. This will download the models to score your data locally.

**Toxicity**. You can use a pre-selected toxicity model via the `HuggingFaceToxicityModel()` descriptor. It returns a predicted toxicity score between 0 and 1.

**Neutral emotion**. You can call a named custom model from HuggingFace. For example, let's use the `SamLowe/roberta-base-go_emotions` model and get a score from 0 to 1 for "neutral" label to see if responses convey neutral emotion.

In [10]:
# Score responses with two HuggingFace models: the pre-selected toxicity model
# and the "neutral" label of a named emotion classifier.
text_evals_report = Report(
    metrics=[
        TextEvals(
            column_name="response",
            descriptors=[
                HuggingFaceToxicityModel(),
                HuggingFaceModel(
                    model="SamLowe/roberta-base-go_emotions",
                    params={"label": "neutral"},
                    display_name="Response Neutrality",
                ),
            ],
        ),
    ],
)

# Evaluate the first 100 conversations on their own (no reference dataset).
text_evals_report.run(
    reference_data=None,
    current_data=assistant_logs[:100],
    column_mapping=column_mapping,
)
# Leaving the Report as the last expression renders it in the notebook.
text_evals_report

Using default facebook/roberta-hate-speech-dynabench-r4-target checkpoint


In [12]:
# Save the HuggingFace-based evaluation Report as a standalone HTML file.
# The file name is descriptive rather than a personal debugging name.
text_evals_report.save_html("huggingface_evals_report.html")

See docs on using HuggingFace models as descriptors: https://docs.evidentlyai.com/user-guide/customization/huggingface_descriptor

## LLM-as-a-judge

You can use LLMs with custom evaluation prompts to label the texts by specific criteria such as tone or conciseness. Use the `OpenAIPrompting` descriptor for this.

This descriptor requires an OpenAI key. Pass it as an environment variable.

In [None]:
## import os

## os.environ["OPENAI_API_KEY"] = "YOUR KEY"

In [None]:
# Evaluation prompt for the LLM judge. The literal token REPLACE marks where the
# evaluated text goes; it must match the `prompt_replace_string` passed to
# `OpenAIPrompting`. The model is asked to answer with one category only:
# CONCISE, VERBOSE or UNKNOWN.
conciseness_prompt = """
Conciseness refers to the quality of being brief and to the point, while still providing all necessary information.

A concise response should:
- Provide the necessary information without unnecessary details or repetition.
- Be brief yet comprehensive enough to address the query.
- Use simple and direct language to convey the message effectively.

Please evaluate the following chatbot response for conciseness.

response: REPLACE

Use the following categories for conciseness evaluation:
CONCISE if the response is concise and to the point
VERBOSE if the response is overly detailed or contains unnecessary information
UNKNOWN if the information provided is not sufficient to make a clear determination

Return a category only
"""

In [None]:
# Label each response for conciseness using an OpenAI model as the judge.
report = Report(
    metrics=[
        TextEvals(
            column_name="response",
            descriptors=[
                OpenAIPrompting(
                    prompt=conciseness_prompt,
                    prompt_replace_string="REPLACE",
                    model="gpt-3.5-turbo-instruct",
                    feature_type="cat",
                    display_name="Response Conciseness",
                ),
            ],
        ),
    ],
)

# Only the first 10 conversations are evaluated in this example.
report.run(
    reference_data=None,
    current_data=assistant_logs[:10],
    column_mapping=column_mapping,
)
# Leaving the Report as the last expression renders it in the notebook.
report

You can also run evals using two columns: e.g., context and response, request and response. Check the docs: https://docs.evidentlyai.com/user-guide/customization/llm_as_a_judge

## Metadata columns

To summarize metadata fields:

In [None]:
# Summarize the "feedback" metadata column.
data_report = Report(
    metrics=[
        ColumnSummaryMetric(column_name="feedback"),
    ],
)

# Evaluate the first 100 conversations on their own (no reference dataset).
data_report.run(
    reference_data=None,
    current_data=assistant_logs[:100],
    column_mapping=column_mapping,
)
# Leaving the Report as the last expression renders it in the notebook.
data_report

## Semantic similarity

**Semantic similarity** evaluates how close two texts are in meaning using an embedding model. It returns a score from 0 to 1 (0: different, 0.5: unrelated, 1: similar). This is a pairwise descriptor. Let's compare the similarity between questions and responses:

In [None]:
# Summarize the semantic similarity between each response and its question.
# SemanticSimilarity is a pairwise descriptor, so it is bound to two columns.
text_evals_report = Report(
    metrics=[
        ColumnSummaryMetric(
            column_name=SemanticSimilarity(
                display_name="Response-Question Similarity",
            ).on(["response", "question"]),
        ),
    ],
)

# Evaluate the first 100 conversations on their own (no reference dataset).
text_evals_report.run(
    reference_data=None,
    current_data=assistant_logs[:100],
    column_mapping=column_mapping,
)
# Leaving the Report as the last expression renders it in the notebook.
text_evals_report

You can also run evals to compare current responses against "golden" reference examples. Check a tutorial on regression testing: https://www.evidentlyai.com/blog/llm-regression-testing-tutorial

# Export evaluation results

View the dataset with added evaluation results for each row:

In [None]:
# Dataset with the computed evaluation results added for each row.
# NOTE(review): index 1 is presumably the current dataset (index 0 the reference) —
# confirm against the installed Evidently version.
text_evals_report.datasets()[1]

Python dictionary with a summary report:

In [None]:
# Report summary as a Python dictionary.
text_evals_report.as_dict()

JSON with a summary report:

In [None]:
# Report summary as JSON.
text_evals_report.json()

Save as HTML file:

In [None]:
# Write the Report to "report.html".
text_evals_report.save_html("report.html")

# Monitoring results over time

Define a combined Report:

In [None]:
# Combined Report: text descriptors for the responses plus summaries of the
# metadata columns.
text_evals_report = Report(
    metrics=[
        TextEvals(
            column_name="response",
            descriptors=[
                Sentiment(),
                TextLength(),
                IncludesWords(
                    words_list=['salary', 'benefits', 'payroll'],
                    display_name="Mention Compensation",
                ),
            ],
        ),
        ColumnSummaryMetric(column_name="feedback"),
        ColumnSummaryMetric(column_name="region"),
        ColumnSummaryMetric(column_name="organization"),
        ColumnSummaryMetric(column_name="model_ID"),
        ColumnSummaryMetric(column_name="environment"),
    ],
)

Run the Report on the first 50 rows:

In [None]:
# First batch: evaluate rows 0-49 with the combined Report.
text_evals_report.run(
    reference_data=None,
    current_data=assistant_logs[:50],
    column_mapping=column_mapping,
)

In [None]:
# Display the project ID; it is needed when sending results to this project.
project.id

Send evaluation results to Evidently Cloud.

In [None]:
# Upload the Report to the project.
# NOTE(review): include_data=True presumably uploads the evaluated rows along with
# the summary — confirm against the Evidently Cloud docs before sending sensitive data.
ws.add_report(project.id, text_evals_report, include_data=True)

**Cloud UI**. In the Evidently Cloud UI, add a "Descriptors" tab and a "Columns" tab to create a dashboard that plots the metrics.

Let's imitate a few consecutive runs to evaluate batches of data as they come. Run and send several Reports, each time taking 50 rows.

In [None]:
# Second batch: evaluate rows 50-99 and upload the resulting Report.
text_evals_report.run(
    reference_data=None,
    current_data=assistant_logs[50:100],
    column_mapping=column_mapping,
)
ws.add_report(project.id, text_evals_report)

In [None]:
# Third batch: evaluate rows 100-149 and upload the resulting Report.
text_evals_report.run(
    reference_data=None,
    current_data=assistant_logs[100:150],
    column_mapping=column_mapping,
)
ws.add_report(project.id, text_evals_report)

In [None]:
# Fourth batch: evaluate rows 150-199 and upload the resulting Report.
text_evals_report.run(
    reference_data=None,
    current_data=assistant_logs[150:200],
    column_mapping=column_mapping,
)
ws.add_report(project.id, text_evals_report)

In [None]:
# Fifth batch: evaluate rows 200-249 and upload the resulting Report.
text_evals_report.run(
    reference_data=None,
    current_data=assistant_logs[200:250],
    column_mapping=column_mapping,
)
ws.add_report(project.id, text_evals_report)

# Extra: Run conditional tests

You can monitor not only values but whether they comply with the conditions you define. For example:
* Average response sentiment should be non-negative.
* Response length should always be non-zero.
* The maximum response length should not exceed 2000 symbols.
* The mean response length should be above 500 symbols.

Define the test suite:

In [None]:
# Conditions checked on the "response" column in every run.
test_suite = TestSuite(
    tests=[
        # Mean sentiment must be greater than or equal to 0.
        TestColumnValueMean(column_name=Sentiment().on("response"), gte=0),
        # The shortest response must contain at least one symbol.
        TestColumnValueMin(column_name=TextLength().on("response"), gt=0),
        # The longest response must not exceed 2000 symbols.
        TestColumnValueMax(column_name=TextLength().on("response"), lte=2000),
        # Mean response length must be above 500 symbols.
        TestColumnValueMean(column_name=TextLength().on("response"), gt=500),
    ],
)

Imitate sending 5 test suite runs in a row, with timestamps 1 hour apart:

In [None]:
# Number of conversations evaluated in each simulated run.
batch_size = 50
# Capture the base time once so consecutive runs are exactly one hour apart.
# Calling datetime.now() inside the loop would add each run's processing time
# to the spacing between timestamps.
first_run_time = datetime.now()

for i in range(5):
    # Evaluate the i-th batch of rows and stamp the run i hours after the first.
    test_suite.run(
        reference_data=None,
        current_data=assistant_logs.iloc[batch_size * i : batch_size * (i + 1), :],
        column_mapping=column_mapping,
        timestamp=first_run_time + timedelta(hours=i),
    )
    # Upload the result of this run to the project.
    ws.add_test_suite(project.id, test_suite)

**Add a test suite panel**. Copy your Project ID to connect.

In [None]:
#project = ws.get_project("YOUR_PROJECT_ID")

In [None]:
# Add a panel with test results to the "Tests" tab of the project dashboard.
project.dashboard.add_panel(
    DashboardPanelTestSuite(
        title="Test results",
        # No metadata or tag restrictions; include_test_suites=True so that
        # test suite results are selected for this panel.
        filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=True),
        size=WidgetSize.FULL,
        panel_type=TestSuitePanelType.DETAILED,
    ),
    tab="Tests"
)
# Call save() after changing the dashboard so the new panel is kept.
project.save()