# How to evaluate an LLM with text descriptors?

In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
from datetime import time
from datetime import timedelta

from sklearn import datasets, ensemble, model_selection

In [21]:
from evidently.ui.workspace.cloud import CloudWorkspace

from evidently import ColumnMapping
from evidently.report import Report
from evidently.test_suite import TestSuite

from evidently.metrics import ColumnSummaryMetric, ColumnDistributionMetric, ColumnDriftMetric, DataDriftTable, TextDescriptorsDistribution, ColumnCategoryMetric
from evidently.tests import TestColumnValueMin, TestColumnValueMean, TestCategoryShare, TestShareOfOutRangeValues

from evidently.metric_preset import DataDriftPreset, DataQualityPreset, TextOverviewPreset, TextEvals

from evidently.descriptors import HuggingFaceModel, HuggingFaceToxicityModel, OpenAIPrompting 
from evidently.descriptors import RegExp, BeginsWith, EndsWith, Contains, DoesNotContain, IncludesWords, ExcludesWords
from evidently.descriptors import TextLength, OOV, NonLetterCharacterPercentage, SentenceCount, WordCount, Sentiment

In [3]:
import nltk

# Fetch the NLTK resources used in this notebook (presumably needed by the
# OOV and Sentiment descriptors -- TODO confirm).
for _resource in ('words', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)
# Kept as the final expression so the cell still displays its return value.
nltk.download('vader_lexicon')

[nltk_data] Downloading package words to /Users/emelidral/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/emelidral/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/emelidral/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/emelidral/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# Load Data

In [4]:
# Load the chat logs with both timestamp columns parsed as datetimes, then
# index the frame by `start_time` (the column itself is kept) and name the
# index 'index'.
assistant_logs = (
    pd.read_csv('chat_df.csv', index_col=0, parse_dates=['start_time', 'end_time'])
    .set_index('start_time', drop=False)
    .rename_axis('index')
)

In [5]:
# Preview the first rows, restricted to the two text columns.
assistant_logs.head()[["question", "response"]]

Unnamed: 0_level_0,question,response
index,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-04-08 03:55:48.128469,How do I request medical leave through the emp...,Requesting medical leave through the employee ...
2024-04-08 03:59:47.756913,How can I update my direct deposit information...,To update your direct deposit information on t...
2024-04-08 06:19:47.717513,How do I generate payroll reports in the accou...,Generating payroll reports in the accounting s...
2024-04-08 08:22:47.717513,Where can I find information about the company...,Information about the company's financial fore...
2024-04-08 08:32:47.717513,How do I handle fixed asset acquisitions and d...,Handling fixed asset acquisitions and disposal...


In [6]:
# Full text of the question in the seventh log row (position 6).
assistant_logs["question"].iloc[6]

"I'm having trouble accessing the training modules on the employee portal, what should I do?"

In [7]:
# Full text of the response in the seventh log row (position 6).
assistant_logs["response"].iloc[6]

"If you're having trouble accessing the training modules on the employee portal, please ensure that you are logged in with the correct credentials and that your account has the necessary permissions to view training materials. If the issue persists, please contact the HR department or training administrator for assistance."

# One-off reports

In [8]:
# Describe the role of each log column for Evidently.
TEXT_COLUMNS = ['question', 'response']
CATEGORICAL_COLUMNS = ['organization', 'model_ID', 'region', 'environment', 'feedback']

column_mapping = ColumnMapping(
    datetime='start_time',
    datetime_features=['end_time'],
    text_features=TEXT_COLUMNS,
    categorical_features=CATEGORICAL_COLUMNS,
)

### Simple descriptors

In [None]:
# Built-in descriptors that take no parameters besides a display name.
question_descriptors = [
    Sentiment(display_name="Question sentiment"),
    TextLength(display_name="Question length"),
    OOV(display_name="Question out of vocabulary words"),
]
response_descriptors = [
    Sentiment(display_name="Response sentiment"),
    NonLetterCharacterPercentage(display_name="Non letter characters in response"),
    SentenceCount(display_name="Sentence count in response"),
    WordCount(display_name="Word count in response"),
]
report = Report(metrics=[
    TextEvals(column_name="question", descriptors=question_descriptors),
    TextEvals(column_name="response", descriptors=response_descriptors),
])

# Reference window: 2024-04-08 .. 2024-04-09; current window: 2024-04-09 .. 2024-04-10
# (label-based slices on the datetime index).
reference_window = assistant_logs[datetime(2024, 4, 8):datetime(2024, 4, 9)]
current_window = assistant_logs[datetime(2024, 4, 9):datetime(2024, 4, 10)]

report.run(
    reference_data=reference_window,
    current_data=current_window,
    column_mapping=column_mapping,
)
report

In [None]:
# Built-in descriptors that take parameters (prefix/suffix, regular expression, word or item lists).
report = Report(metrics=[
    TextEvals(column_name="question", descriptors=[
        BeginsWith(display_name="'How' question", prefix="How"),
        RegExp(reg_exp=r"^I", display_name="Question begins with 'I'"),
        IncludesWords(words_list=['invoice', 'salary'], display_name="Questions about invoices and salary"),
    ]),
    TextEvals(column_name="response", descriptors=[
        # Display name previously misspelled as "Assisrance"; it is shown to the report reader.
        EndsWith(display_name="Assistance might be needed", suffix="for assistance."),
        ExcludesWords(words_list=['wrong', 'mistake'], display_name="Responses without mention of mistakes"),
        Contains(items=['medical leave'], display_name="contains 'medical leave'"),
        DoesNotContain(items=['employee portal'], display_name="does not contain 'employee portal'"),
    ]),
])

# Reference window: 2024-04-08 .. 2024-04-09; current window: 2024-04-09 .. 2024-04-10.
report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)],
           current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)],
           column_mapping=column_mapping)
report

### Model-based descriptors

In [None]:
# Descriptors scored by Hugging Face models; `params` carries the per-model
# options (label / labels / threshold) passed to each descriptor.
report = Report(metrics=[
    TextEvals(column_name="response", descriptors=[
        HuggingFaceModel(model="DaNLP/da-electra-hatespeech-detection", display_name="Hugging Face Toxicity for response"),
        HuggingFaceModel(model="SamLowe/roberta-base-go_emotions", params={"label": "disappointment"},
                         display_name="Hugging Face Disappointment for response"),
        HuggingFaceModel(model="SamLowe/roberta-base-go_emotions", params={"label": "optimism"},
                         display_name="Hugging Face Optimism for response"),
        HuggingFaceModel(model="MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli", params={"labels": ["HR", "finance"], "threshold":0.5},
                         display_name="Hugging Face Topic"),
        # Display name previously misspelled as "respone"; it is shown to the report reader.
        HuggingFaceModel(model="lakshyakh93/deberta_finetuned_pii", params={"threshold": 0.6},
                         display_name="Hugging Face PII for response"),
    ])
])

# Reference window: 2024-04-08 .. 2024-04-09; current window: 2024-04-09 .. 2024-04-10.
report.run(reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)],
           current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)],
           column_mapping=column_mapping)

report

In [None]:
# Convenience wrapper descriptor for a widely used Hugging Face toxicity model.
toxicity_descriptors = [HuggingFaceToxicityModel(toxic_label="hate")]
report = Report(metrics=[
    TextEvals(column_name="response", descriptors=toxicity_descriptors),
])

# Reference window: 2024-04-08 .. 2024-04-09; current window: 2024-04-09 .. 2024-04-10.
reference_window = assistant_logs[datetime(2024, 4, 8):datetime(2024, 4, 9)]
current_window = assistant_logs[datetime(2024, 4, 9):datetime(2024, 4, 10)]

report.run(
    reference_data=reference_window,
    current_data=current_window,
    column_mapping=column_mapping,
)

report

### LLM-based descriptors

In [None]:
# Prompt templates for the LLM-based descriptors. The literal token REPLACE
# marks where the evaluated text is substituted, so it must appear in each
# prompt.

# PII detection prompt: asks for 1 (PII present) or 0 (no PII / undetermined).
pii_prompt = """
Personally identifiable information (PII) is information that, when used alone or with other relevant data, can identify an individual.

PII may contain direct identifiers (e.g., passport information) that can identify a person uniquely, 
or quasi-identifiers (e.g., race) that can be combined with other quasi-identifiers (e.g., date of birth) to successfully recognize an individual.
PII may contain person's name, person's address, and something I may forget to mention

Please identify whether or not the text below contains PII

text: REPLACE 

Use the following categories for PII identification:
1 if text contains PII
0 if text does not contain PII
0 if the information provided is not sufficient to make a clear determination

Return a category only
"""

# Negativity classification prompt: asks for NEGATIVE, POSITIVE or UNKNOWN.
negativity_prompt = """
Classify text into two groups: negative and positive

text: REPLACE 

Use the following categories for classification:
NEGATIVE if text is negative
POSITIVE if text is NOT negative
UNKNOWN use this category only if the information provided is not sufficient to make a clear determination

Return only category
"""

In [None]:
# Descriptors backed by an external LLM.
# To run the OpenAIPrompting descriptor, make sure an environment variable with the OpenAI token is set.
openai_descriptors = [
    OpenAIPrompting(prompt=pii_prompt, prompt_replace_string="REPLACE", model="gpt-3.5-turbo-instruct", feature_type="num", display_name="PII for response (by gpt3.5)"),
    OpenAIPrompting(prompt=negativity_prompt, prompt_replace_string="REPLACE", model="gpt-3.5-turbo-instruct", feature_type="cat", display_name="Negativity for response (by gpt3.5)"),
]
report = Report(metrics=[
    TextEvals(column_name="response", descriptors=openai_descriptors),
])

# Only the first 20 log rows are evaluated and no reference data is passed.
# Alternative date-window inputs:
#   reference_data=assistant_logs[datetime(2024, 4, 8) : datetime(2024, 4, 9)]
#   current_data=assistant_logs[datetime(2024, 4, 9) : datetime(2024, 4, 10)]
report.run(
    reference_data=None,
    current_data=assistant_logs[:20],
    column_mapping=column_mapping,
)

report

## Get dataset with calculated descriptors

In [None]:
# Reference dataset enriched with the calculated descriptor columns.
# NOTE(review): the most recent `report.run(...)` above passed reference_data=None,
# so this is presumably empty for that report -- confirm.
report.datasets()[0]

In [None]:
# Current dataset enriched with the calculated descriptor columns
# (second element of what `report.datasets()` returns).
report.datasets()[1]

# One-off Test Suites

In [None]:
# One-off checks on the first 20 log rows, run without reference data.
test_suite = TestSuite(tests=[
    # The minimum response sentiment must be greater than 0.
    TestColumnValueMin(column_name = Sentiment().on("response"), gt=0),
    # Less than 10% of the feedback values may be "downvote".
    TestCategoryShare(column_name = "feedback", category="downvote", lt=0.1),
    # Use the boolean False rather than the string "False" as the category, matching how
    # the same IncludesWords descriptor is checked elsewhere in this notebook; a string
    # would presumably never match the descriptor's values -- confirm.
    TestCategoryShare(column_name = IncludesWords(words_list=['salary']).on("response"), category=False, lt=0.1),
])

test_suite.run(reference_data=None, current_data=assistant_logs[:20])
test_suite

In [None]:
# Second element of `test_suite.datasets()`; by analogy with the report cells above this is
# presumably the current dataset with descriptor columns added -- confirm.
test_suite.datasets()[1]

# Monitoring

In [10]:
import os

from evidently.renderers.html_widgets import WidgetSize
from evidently.ui.dashboards import DashboardPanelDistribution, HistBarMode, PanelValue
from evidently.ui.dashboards import DashboardPanelTestSuite, ReportFilter, TestSuitePanelType
from evidently.ui.workspace.cloud import CloudWorkspace

In [15]:
# Connect to the Evidently workspace.
# The API token is read from the EVIDENTLY_API_KEY environment variable instead of being
# hard-coded: a token committed in a notebook is exposed to every reader. A missing variable
# raises KeyError here rather than failing later with an authentication error.
# NOTE(review): the token that was previously embedded in this cell should be revoked.
ws = CloudWorkspace(
    token=os.environ["EVIDENTLY_API_KEY"],
    url="https://app.evidently.dev/",
)

In [16]:
# Create the monitoring project inside the given team and set its description.
PROJECT_NAME = "Virtual assistant: TestCategoryShare"
TEAM_ID = "6018e6ac-6532-491d-97ad-9daf1150a768"

project = ws.create_project(PROJECT_NAME, team_id=TEAM_ID)
project.description = "Project description"

In [17]:
def create_test_suite(i: int, batch_size: int = 20):
    """Build and run the monitoring TestSuite for the ``i``-th batch of logs.

    Args:
        i: Zero-based batch number. Selects rows
            ``[batch_size * i, batch_size * (i + 1))`` of ``assistant_logs`` and
            shifts the snapshot timestamp ``i`` days ahead of the current time.
        batch_size: Number of consecutive log rows per batch. Defaults to 20,
            the value that was previously hard-coded.

    Returns:
        The TestSuite after ``run`` has been called on the selected batch.
    """
    batch_start = batch_size * i
    current_batch = assistant_logs.iloc[batch_start : batch_start + batch_size, :]

    test_suite = TestSuite(
        tests=[
            TestColumnValueMin(column_name=TextLength().on("response"), gt=100),
            TestShareOfOutRangeValues(column_name=TextLength().on("question"), left=30, right=100, lt=0.1),
            TestColumnValueMin(column_name=Sentiment().on("response"), gt=0),
            TestColumnValueMean(column_name=OOV().on("response"), lt=15),
            TestCategoryShare(column_name = "feedback", category="downvote", lt=0.1),
            TestCategoryShare(column_name = IncludesWords(words_list=['salary']).on("response"), category=False, lt=0.1),
        ],
        # Each batch is stamped one day after the previous one, starting from now.
        timestamp=datetime.now() + timedelta(days=i),
    )
    test_suite.run(reference_data=None, current_data=current_batch, column_mapping=column_mapping)
    return test_suite

In [22]:
def create_report(i: int, batch_size: int = 20):
    """Build and run the monitoring Report for the ``i``-th batch of logs.

    Args:
        i: Zero-based batch number. Selects rows
            ``[batch_size * i, batch_size * (i + 1))`` of ``assistant_logs`` and
            shifts the snapshot timestamp ``i`` days ahead of the current time.
        batch_size: Number of consecutive log rows per batch. Defaults to 20,
            the value that was previously hard-coded.

    Returns:
        The Report after ``run`` has been called on the selected batch.
    """
    batch_start = batch_size * i
    current_batch = assistant_logs.iloc[batch_start : batch_start + batch_size, :]

    report = Report(
        metrics=[
            TextEvals(column_name="question", descriptors=[
                Sentiment(display_name="Question sentiment"),
                TextLength(display_name= "Question length"),
                OOV(display_name= "Question out of vocabulary words"),
            ]),
            TextEvals(column_name="response", descriptors=[
                Sentiment(display_name="Response sentiment"),
                NonLetterCharacterPercentage(display_name="Non letter characters in response"),
                SentenceCount(display_name="Sentence count in response"),
                WordCount(display_name="Word count in response"),
            ]),
            ColumnCategoryMetric(column_name=IncludesWords(words_list=['salary']).for_column("response"), category=True),
        ],
        # Each batch is stamped one day after the previous one, starting from now.
        timestamp=datetime.now() + timedelta(days=i),
    )

    report.run(reference_data=None, current_data=current_batch, column_mapping=column_mapping)
    return report

In [19]:
# Build and upload test-suite snapshots for batches 0 through 4.
for batch_number in range(5):
    test_suite = create_test_suite(i=batch_number)
    ws.add_test_suite(project.id, test_suite)

EvidentlyServiceError: Validation failed for POST /api/projects/f58974b1-4d69-40fc-b04d-c1519e5cf61b/snapshots

In [23]:
# Build and upload report snapshots for batches 0 through 4.
for batch_number in range(5):
    report = create_report(i=batch_number)
    ws.add_report(project.id, report)

EvidentlyServiceError: Validation failed for POST /api/projects/f58974b1-4d69-40fc-b04d-c1519e5cf61b/snapshots

In [None]:
# Full-width detailed panel of test results, aggregated per day ("1D").
test_results_panel = DashboardPanelTestSuite(
    title="Test results",
    filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=True),
    size=WidgetSize.FULL,
    panel_type=TestSuitePanelType.DETAILED,
    time_agg="1D",
)
project.dashboard.add_panel(test_results_panel)
project.save()

In [None]:
# Add a full-width distribution panel with grouped bars that plots the
# `counts.current` field of the uploaded snapshots.
# DashboardPanelDistribution, PanelValue and HistBarMode come from evidently.ui.dashboards.
project.dashboard.add_panel(
    DashboardPanelDistribution(
        # NOTE(review): "132" looks like a placeholder title -- consider a descriptive one.
        title="132",
        filter=ReportFilter(
            metadata_values={}, tag_values=[], include_test_suites=True
        ),
        size=WidgetSize.FULL,
        value=PanelValue(
            field_path="counts.current",
            metric_id=None,
            metric_args={},
            legend="",
        ),
        barmode=HistBarMode.GROUP,
    )
)
# Save the project after adding the panel, as the test-suite panel cell does.
project.save()