In [1]:
from evidently.future.datasets import Dataset
from evidently.future.descriptors import (
    TextLength,
    BERTScore,
    BeginsWith,
    Contains,
    ContainsLink,
    DoesNotContain,
    EndsWith,
    ExactMatch,
    ExcludesWords,
    IncludesWords,
    IsValidJSON,
    IsValidPython,
    ItemMatch,
    ItemNoMatch,
    NonLetterCharacterPercentage,
    OOVWordsPercentage,
    OpenAI,
    RegExp,
    SemanticSimilarity,
    SentenceCount,
    Sentiment,
    TriggerWordsPresent,
    WordCount,
    WordMatch,
    WordNoMatch)
import pandas as pd

In [2]:
df = pd.DataFrame({"str1": ["first value",
                            "second value",
                            "third value",
                            "fourth value"],
                   "str2": ["first value",
                            "http://localhost.com",
                            "{\"a\":\"b\"}",
                            "import evidently"],
                   })

In [3]:
pii_prompt = """
Personally identifiable information (PII) is information that, when used alone or with other relevant data, can identify an individual.

PII may contain direct identifiers (e.g., passport information) that can identify a person uniquely, 
or quasi-identifiers (e.g., race) that can be combined with other quasi-identifiers (e.g., date of birth) to successfully recognize an individual.
PII may contain person's name, person's address,and something I may forget to mention

Please identify whether or not the above text contains PII

text: REPLACE 

Use the following categories for PII identification:
1 if text contains PII
0 if text does not contain PII
0 if the information provided is not sufficient to make a clear determination

Retrun a category only
"""

In [4]:
dataset = Dataset.from_pandas(
    df,
    descriptors=[
        BeginsWith("str1", "first"),
        BERTScore(["str1", "str2"]),
        Contains("str1", ["first"]),
        ContainsLink("str2"),
        DoesNotContain("str1", ["second"]),
        EndsWith("str1", "value"),
        ExactMatch(["str1", "str2"]),
        ExcludesWords("str1", ["second"]),
        # hugging_face("str1"),
        # hugging_face_toxicity("str1"),
        IncludesWords("str1", ["second"]),
        IsValidJSON("str2"),
        IsValidPython("str2"),
        ItemMatch(["str1", "str2"]),
        ItemNoMatch(["str1", "str2"]),
        # json_match("str1"),
        # json_schema_match("str1"),
        # llm_judge("str1"),
        NonLetterCharacterPercentage("str2"),
        OOVWordsPercentage("str1"),
        # openai("str1"),
        RegExp("str1", ".*value"),
        SemanticSimilarity(["str1", "str2"]),
        SentenceCount("str1"),
        Sentiment("str1"),
        TextLength("str1"),
        TriggerWordsPresent("str1", ["first"]),
        WordCount("str1"),
        WordMatch(["str1", "str2"], "includes_any", True),
        WordNoMatch(["str1", "str2"], "includes_any", True),
        # words_presence("str1", ["first"], "includes_any"),
        OpenAI("str1", prompt=pii_prompt, prompt_replace_string="REPLACE", model="gpt-3.5-turbo-instruct", feature_type="num"),
    ],
)


OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [10]:
dataset.as_dataframe()

Unnamed: 0,str1,str2,Text Begins with [first] for str1,BERTScore for str1 str2.,Text Contains of any [first] for str1,str2 contains link,Text Does Not Contain of any [second] for str1,Text Ends with [value] for str1,Exact Match for str1 str2.,"Text Excludes excludes_any words [['second']], lemmatize: True] for str1",...,RegExp '.*value' Match for column str1,Semantic Similarity for str1 str2.,Sentence Count for str1,Sentiment for str1,Text Length for str1,"TriggerWordsPresent [words: ['first'], lemmatize: True] for str1",Word Count for str1,Text contains includes defined words,Text does not contain includes defined words,OpenAI for str1
0,first value,first value,True,0.859864,True,False,True,True,True,True,...,1,1.0,1,0.34,11,1,2,False,True,0.0
1,second value,http://localhost.com,False,0.442877,False,True,False,True,False,False,...,1,0.490672,1,0.34,12,0,2,False,True,0.0
2,third value,"{""a"":""b""}",False,0.494391,False,False,True,True,False,True,...,1,0.59031,1,0.34,11,0,2,False,True,0.0
3,fourth value,import evidently,False,0.658956,False,False,True,True,False,True,...,1,0.548347,1,0.34,12,0,2,False,True,0.0
