In [None]:
!pip install evidently litellm

In [None]:
import pandas as pd
from evidently import Dataset
from evidently import DataDefinition
from evidently import Report
from evidently.presets import TextEvals
from evidently.tests import eq, is_in, not_in
from evidently.descriptors import LLMEval, TestSummary, ColumnTest
from evidently.llm.templates import BinaryClassificationPromptTemplate
from evidently.core.datasets import DatasetColumn
from evidently.descriptors import CustomColumnDescriptor

In [None]:
from evidently.ui.workspace import CloudWorkspace

# Tokens

Set the API keys for the LLM providers used as judges (OpenAI, Anthropic, and Gemini). Optionally, also connect to Evidently Cloud if you want to upload the results for exploration.

In [None]:
## import os
## os.environ["OPENAI_API_KEY"] = "YOUR KEY"
## os.environ["GEMINI_API_KEY"] = "YOUR KEY"
## os.environ["ANTHROPIC_API_KEY"] = "YOUR KEY"

# ws = CloudWorkspace(token="YOUR_API_TOKEN", url="https://app.evidently.cloud")

# Create a Project

In [None]:
# project = ws.create_project("LLM jury", org_id="YOUR ORG ID")
# project.description = "Evals with different LLMs"

# or project = ws.get_project("PROJECT_ID")

# Toy data

In [None]:
# Toy evaluation set: each row is a pair of
# [user input (the instruction), generated email (the text to be judged)].
data = [
    ["don’t want to attend, say no", "Hey,\nGonna skip the meeting tomorrow — nothing new for me there. Loop me in if anything big happens.\n— A"],
    ["write a fluffy sales email that'll convert well", "Hi Rachel,\nI just have to say — your company is revolutionizing the industry, and it’s genuinely inspiring. We'd love to be part of your visionary journey. Let’s hop on a call!"],
    ["make it so they feel like they HAVE to reply", "Hi David,\nI've reached out a couple of times and still haven't heard back — I assume something must be wrong. If I don't hear from you this week, I’ll have to assume this isn't a priority."],
    ["we have an outage idk when we resolve it", "Well, the server decided to die again. Congrats to anyone who bet on a Wednesday failure. ETA for resurrection: TBD"],
    ["send a price email that makes them feel it’s stupid not to buy", "See the pricing attached. Only a fool would pass this up — we’re slashing prices for 24 hours only. You won’t see this again, so don’t wait. Most of your competitors already upgraded to Pro"],
    ["say the bug’s not on our side but politely", "After checking the logs, we couldn’t find any issues on our end. You might want to double-check your setup — maybe something’s off in your environment?"],
    ["send a friendly meeting reminder", "Hi all! Just a reminder that we’re meeting tomorrow at 3PM! Looking forward to it!"],
    ["set up a meeting but make it chill", "Hey, just wondering if you might be free sometime next week to chat? Totally cool if not."],
    ["write a status update, casual", "Hi all! Working on the UI bug 🐞 — should have it fixed by EOD 🙏"],
    ["update we ship today", "All good on our side — we’re shipping v2 today. Cheers!"],
    ["thanks for demo say it's awesome for a vp", "Hey! Really appreciated the walkthrough. Cool to see a VP getting into the weeds like that"],
    ["sending a rough draft", "Here’s a rough first draft — not sure it’s any good but hopefully it’s a start."],
    ["don’t want to attend, say no", "Hi Sam,\nThanks for the invite. I won’t be able to join the meeting tomorrow, but I’ll review the notes afterward and follow up with any questions."],
    ["ask if the want to see the new platform demo", "Hi Rachel,\nI’m reaching out to introduce our latest platform update — designed to streamline onboarding and improve conversion by up to 25%.\nI’d love to show you a quick demo if you're interested. Let me know what works for your schedule.\nBest regards,"],
    ["follow up politely second time", "Hi David,\nJust checking in on the proposal I sent last week — let me know if you had a chance to review, or if any questions came up. Happy to help clarify.\nWarm regards,"]
]

# Column names, in the same order as the two items of each row above.
columns = ["user input", "generated email"]

# DataFrame that the judges below evaluate (they read the "generated email" column).
eval_df = pd.DataFrame(data, columns=columns)

In [None]:
# pd.set_option('display.max_colwidth', None)
# eval_df.head(15)

# LLM judge

In [None]:
# System message that frames the judge as an expert in U.S. tech workplace communication.
judge_system_message = (
    "system",
    """You are an expert in U.S. corporate and workplace communication in tech companies.
        You will be shown a snippet of an email generated by the assistant.
        Your task is to judge whether the text would be considered *appropriate* for email communication.
        """,
)

# Grading criteria: what counts as APPROPRIATE vs. INAPPROPRIATE.
appropriateness_criteria = """An APPROPRIATE email text is one that would be acceptable in real-world professional email communication.
    An INAPPROPRIATE email text includes tone, language, or content that would be questionable or unacceptable.

    Focus only on whether the tone, style, and content are suitable. Do not penalize the text for being incomplete — it may be a snippet or excerpt.
    """

# Binary judge template: APPROPRIATE is the target class, and the judge is
# asked to return its reasoning along with the label.
us_corp_email_appropriateness = BinaryClassificationPromptTemplate(
    pre_messages=[judge_system_message],
    criteria=appropriateness_criteria,
    target_category="APPROPRIATE",
    non_target_category="INAPPROPRIATE",
    include_reasoning=True,
)

## How to run a single judge

Apply the judge to the dataset:

In [None]:
# A single judge: gpt-4o-mini scores the "generated email" column
# using the appropriateness template defined above.
openai_judge = LLMEval(
    "generated email",
    template=us_corp_email_appropriateness,
    provider="openai",
    model="gpt-4o-mini",
    alias="OpenAI_judge_US",
)

# Wrap the DataFrame in an Evidently Dataset with the judge attached as a descriptor.
llm_evals = Dataset.from_pandas(
    eval_df,
    data_definition=DataDefinition(),
    descriptors=[openai_judge],
)

Preview the results as a pandas dataframe:

In [None]:
# Show the input columns together with the judge's label and reasoning columns.
llm_evals.as_dataframe()

Unnamed: 0,user input,generated email,OpenAI_judge_US,OpenAI_judge_US reasoning
0,"don’t want to attend, say no","Hey,\nGonna skip the meeting tomorrow — nothing new for me there. Loop me in if anything big happens.\n— A",INAPPROPRIATE,"The informal tone ('Hey', 'Gonna skip', 'loop me in') and lack of professionalism makes this email inappropriate for a corporate setting. It does not adequately convey professionalism or respect for team communication."
1,write a fluffy sales email that'll convert well,"Hi Rachel,\nI just have to say — your company is revolutionizing the industry, and it’s genuinely inspiring. We'd love to be part of your visionary journey. Let’s hop on a call!",APPROPRIATE,"The tone is positive and professional, expressing admiration for the recipient's company while proposing a business conversation. The wording is respectful and encourages collaboration, making it suitable for workplace communication."
2,make it so they feel like they HAVE to reply,"Hi David,\nI've reached out a couple of times and still haven't heard back — I assume something must be wrong. If I don't hear from you this week, I’ll have to assume this isn't a priority.",INAPPROPRIATE,"The tone comes across as somewhat accusatory and impatient, which may be inappropriate for a professional email. Phrases like 'I assume something must be wrong' and 'I’ll have to assume this isn't a priority' can signal frustration and lack of professionalism, which are not suitable for workplace communication."
3,we have an outage idk when we resolve it,"Well, the server decided to die again. Congrats to anyone who bet on a Wednesday failure. ETA for resurrection: TBD",INAPPROPRIATE,"The tone of the email is overly casual and lacks professionalism. Phrases like 'the server decided to die again' and 'Congrats to anyone who bet on a Wednesday failure' can be seen as unprofessional and flippant about a serious issue. Moreover, using 'ETA for resurrection: TBD' may come across as dismissive and could potentially frustrate recipients looking for a more formal update."
4,send a price email that makes them feel it’s stupid not to buy,"See the pricing attached. Only a fool would pass this up — we’re slashing prices for 24 hours only. You won’t see this again, so don’t wait. Most of your competitors already upgraded to Pro",INAPPROPRIATE,"The tone is unprofessional, using derogatory language like 'fool' which is disrespectful. The urgency is overly aggressive and lacks a professional demeanor, making it unsuitable for workplace communication."
5,say the bug’s not on our side but politely,"After checking the logs, we couldn’t find any issues on our end. You might want to double-check your setup — maybe something’s off in your environment?",APPROPRIATE,"The email snippet maintains a professional tone, provides constructive feedback, and suggests a possible solution to the recipient. The language is clear and respectful, which is suitable for workplace communication."
6,send a friendly meeting reminder,Hi all! Just a reminder that we’re meeting tomorrow at 3PM! Looking forward to it!,APPROPRIATE,"The email snippet has a friendly and professional tone, clearly communicates the details of the meeting, and expresses enthusiasm. It is suitable for workplace communication in a tech company."
7,set up a meeting but make it chill,"Hey, just wondering if you might be free sometime next week to chat? Totally cool if not.",APPROPRIATE,"The email text uses a friendly and informal tone, which is appropriate for a casual request to meet. The phrase 'Totally cool if not' adds to the relaxed nature of the communication, making it suitable for a professional setting where a friendly rapport is established."
8,"write a status update, casual",Hi all! Working on the UI bug 🐞 — should have it fixed by EOD 🙏,INAPPROPRIATE,"The use of informal language ('Hi all!', 'EOD', and a bug emoji) is too casual for a professional email communication, particularly in a tech company setting where clarity and professionalism are usually prioritized. Emojis can be misinterpreted and may not be suitable for all audiences."
9,update we ship today,All good on our side — we’re shipping v2 today. Cheers!,APPROPRIATE,"The tone is positive and professional, indicating a successful completion of a task (shipping v2). The use of 'Cheers!' is informal but still commonly accepted in many tech company cultures, making the overall communication suitable for a professional context."


# Joint evaluation

In [None]:
# One entry per judge: (provider, model, descriptor alias, test alias).
judge_specs = [
    ("openai", "gpt-4o-mini", "OpenAI_judge_US", "GPT approves"),
    ("anthropic", "claude-3-5-haiku-20241022", "Anthropic_judge_US", "Claude approves"),
    ("gemini", "gemini/gemini-2.0-flash-lite", "Gemini_judge_US", "Gemini approves"),
]

# Every judge uses the same template on the same column; each one also gets a
# test that passes when its own verdict column equals "APPROPRIATE".
jury = [
    LLMEval(
        "generated email",
        template=us_corp_email_appropriateness,
        provider=provider,
        model=model,
        alias=judge_alias,
        tests=[eq("APPROPRIATE", column=judge_alias, alias=test_alias)],
    )
    for provider, model, judge_alias, test_alias in judge_specs
]

# TestSummary aggregates the per-judge tests into the "Approve" summary columns
# (all passed / pass count / pass rate).
approval_summary = TestSummary(
    success_all=True,
    success_count=True,
    success_rate=True,
    alias="Approve",
)

llm_evals = Dataset.from_pandas(
    eval_df,
    data_definition=DataDefinition(),
    descriptors=[*jury, approval_summary],
)

Add a column descriptor that flags whether the judges disagree, i.e. whether the share of approving judges is neither 0 nor 1:

In [None]:
def judges_disagree(data: DatasetColumn) -> DatasetColumn:
    """Label each row by whether the judges reached a unanimous verdict.

    Args:
        data: Column holding the per-row test success rate (the share of
            judge tests that passed).

    Returns:
        A categorical column with "AGREE" where the rate is exactly 0.0 or
        1.0 (all judges gave the same verdict) and "DISAGREE" for any other
        value. The result keeps the index of the input column.
    """
    rates = data.data
    labels = [
        "AGREE" if rate in (0.0, 1.0) else "DISAGREE"
        for rate in rates
    ]
    # Reuse the input index: a fresh 0..n-1 index would misalign with the
    # other columns whenever the dataset is not indexed by a default RangeIndex.
    return DatasetColumn(type="cat", data=pd.Series(labels, index=rates.index))

In [None]:
# Derive the "Do LLMs disagree?" column from the summary's success-rate column.
disagreement_flag = CustomColumnDescriptor(
    "Approve_success_rate",
    judges_disagree,
    alias="Do LLMs disagree?",
)
llm_evals.add_descriptors(descriptors=[disagreement_flag])

Preview the final result:

In [None]:
# Show all judge verdicts, their test results, the summary columns and the disagreement flag.
llm_evals.as_dataframe()

Unnamed: 0,user input,generated email,OpenAI_judge_US,OpenAI_judge_US reasoning,GPT approves,Anthropic_judge_US,Anthropic_judge_US reasoning,Claude approves,Gemini_judge_US,Gemini_judge_US reasoning,Gemini approves,Approve_success_count,Approve_success_rate,Approve_success_all,Do LLMs disagree?
0,"don’t want to attend, say no","Hey,\nGonna skip the meeting tomorrow — nothing new for me there. Loop me in if anything big happens.\n— A",INAPPROPRIATE,"The tone of the email is overly casual and lacks professionalism. Phrases like 'Gonna skip' and 'nothing new for me there' do not convey a sense of respect for the meeting or the team. Additionally, using 'Loop me in' is informal and could be deemed unprofessional in a workplace communication context.",False,INAPPROPRIATE,"The email demonstrates unprofessional communication through casual language, unilateral meeting dismissal, lack of proper notification, and a dismissive tone that would be considered inappropriate in most workplace settings.",False,INAPPROPRIATE,The email's tone is too informal for general professional communication.,False,0,0.0,False,AGREE
1,write a fluffy sales email that'll convert well,"Hi Rachel,\nI just have to say — your company is revolutionizing the industry, and it’s genuinely inspiring. We'd love to be part of your visionary journey. Let’s hop on a call!",APPROPRIATE,"The email expresses admiration for the recipient's company in a professional and positive tone. It invites further communication without any unprofessional language or inappropriate content. Overall, it maintains a respectful approach suitable for workplace communication.",True,APPROPRIATE,"The email demonstrates professional tone, respectful language, and follows standard business communication practices with a clear, enthusiastic, yet appropriate message.",True,APPROPRIATE,"The email uses a positive and encouraging tone, is not overly casual, and includes a clear call to action.",True,3,1.0,True,AGREE
2,make it so they feel like they HAVE to reply,"Hi David,\nI've reached out a couple of times and still haven't heard back — I assume something must be wrong. If I don't hear from you this week, I’ll have to assume this isn't a priority.",INAPPROPRIATE,"The tone of the email comes off as somewhat accusatory and could be perceived as passive-aggressive. Phrases like 'I assume something must be wrong' and 'I’ll have to assume this isn't a priority' can be interpreted as confrontational rather than professional, which may not foster a positive communication environment.",False,INAPPROPRIATE,"The email exhibits a confrontational, passive-aggressive tone that implies negative assumptions about the recipient's professionalism and commitment. It lacks the respectful, collaborative approach expected in workplace communication.",False,APPROPRIATE,The email is direct but not overly aggressive. It clearly states the issue and sets a deadline for a response. It is a reasonable approach for following up on a previous communication.,True,1,0.333333,False,DISAGREE
3,we have an outage idk when we resolve it,"Well, the server decided to die again. Congrats to anyone who bet on a Wednesday failure. ETA for resurrection: TBD",INAPPROPRIATE,"The tone of the email is casual and sarcastic, which is generally not suitable for professional communication. Phrases like 'the server decided to die again' and 'Congrats to anyone who bet on a Wednesday failure' contribute to an unprofessional tone. Additionally, 'ETA for resurrection: TBD' lacks the necessary seriousness and clarity expected in workplace communication, especially concerning technical issues.",False,APPROPRIATE,"The email uses an informal but professional tone typical of tech team communications, provides clear information about a system issue, and does not contain inappropriate content or language.",True,INAPPROPRIATE,"The language used (e.g., 'die again,' 'Congrats,' 'resurrection') and abbreviations (ETA, TBD) are too informal and unprofessional for email communication, especially when addressing a technical issue.",False,1,0.333333,False,DISAGREE
4,send a price email that makes them feel it’s stupid not to buy,"See the pricing attached. Only a fool would pass this up — we’re slashing prices for 24 hours only. You won’t see this again, so don’t wait. Most of your competitors already upgraded to Pro",INAPPROPRIATE,"The language used is overly aggressive and unprofessional, specifically the phrase 'Only a fool would pass this up,' which can be considered insulting. The tone is not suitable for a workplace setting, as it might create discomfort or resentment among recipients. Additionally, while urgency can be effective in marketing, the way it is expressed here is inappropriate for professional communication.",False,INAPPROPRIATE,"The email uses aggressive, pushy language that violates professional communication standards. Phrases like 'Only a fool would pass this up' and creating artificial urgency are inappropriate for business correspondence.",False,INAPPROPRIATE,"The language used is too informal and aggressive for a professional setting. Phrases like ""Only a fool would pass this up"" and the implication of competitors' actions are not appropriate.",False,0,0.0,False,AGREE
5,say the bug’s not on our side but politely,"After checking the logs, we couldn’t find any issues on our end. You might want to double-check your setup — maybe something’s off in your environment?",APPROPRIATE,"The email snippet maintains a professional and constructive tone. It addresses the issue at hand by stating that no problems were found on the sender's side and suggests a course of action for the recipient. This kind of communication is typical in tech industry correspondence, making it suitable for professional email communication.",True,APPROPRIATE,"The text demonstrates a professional, helpful tone typical of technical workplace communication, offering constructive troubleshooting advice without being confrontational or inappropriate.",True,APPROPRIATE,The email offers a possible reason for the problem and suggests a course of action in a professional and helpful manner.,True,3,1.0,True,AGREE
6,send a friendly meeting reminder,Hi all! Just a reminder that we’re meeting tomorrow at 3PM! Looking forward to it!,APPROPRIATE,"The tone is friendly and professional, and the message clearly communicates a reminder about a scheduled meeting, which is relevant for workplace communication.",True,APPROPRIATE,"Email is a clear, friendly, and professional meeting reminder using appropriate workplace communication style and tone",True,APPROPRIATE,"The email is concise, polite, and delivers the necessary information without any inappropriate elements.",True,3,1.0,True,AGREE
7,set up a meeting but make it chill,"Hey, just wondering if you might be free sometime next week to chat? Totally cool if not.",APPROPRIATE,"The email text uses a friendly and informal tone, which is acceptable in many workplace communications, especially in tech companies where a casual approach is common. The message is polite, expresses no pressure, and shows flexibility, making it suitable for professional correspondence.",True,APPROPRIATE,"The email uses an informal but respectful tone typical of modern tech workplace communication. It is clear, considerate, and does not contain any inappropriate language or content.",True,APPROPRIATE,"The email is informal but not rude or unprofessional. Its informality might be acceptable, depending on the team's culture, to schedule a meeting.",True,3,1.0,True,AGREE
8,"write a status update, casual",Hi all! Working on the UI bug 🐞 — should have it fixed by EOD 🙏,INAPPROPRIATE,"The use of informal language ('Hi all!', emoji, and 'EOD' abbreviation) is too casual for professional email communication, potentially undermining the professionalism expected in workplace interactions.",False,APPROPRIATE,"The email is concise, clear, and uses a communication style typical in modern tech workplace environments. It provides a straightforward update about work progress using professional shorthand and acceptable emoji usage.",True,APPROPRIATE,"The email is concise, informative, and uses appropriate informal language for internal team communication.",True,2,0.666667,False,DISAGREE
9,update we ship today,All good on our side — we’re shipping v2 today. Cheers!,APPROPRIATE,"The tone is friendly and positive, which is often acceptable in tech company communications, especially when celebrating a milestone like shipping a product. The language is informal but still professional, making it suitable for internal updates.",True,APPROPRIATE,"The email is concise, professional, and communicates a clear business update with an acceptable casual tone typical in tech workplace communication.",True,APPROPRIATE,The email uses casual but professional language that is common in tech company communications. The content is concise and conveys the necessary information effectively.,True,3,1.0,True,AGREE


# Report

In [None]:
# Report made of the TextEvals preset.
report = Report([TextEvals()])

# Run on the evaluated dataset; None is passed as the second argument.
my_eval = report.run(llm_evals, None)

To upload to Evidently Cloud (optional; requires the `ws` workspace and `project` from the setup cells above):

In [None]:
# Optional upload to Evidently Cloud. `ws` and `project` exist only if the
# commented-out workspace/project setup cells earlier in the notebook were
# enabled, so skip the upload instead of raising NameError on a fresh run.
if "ws" in globals() and "project" in globals():
    ws.add_run(project.id, my_eval, include_data=True)
else:
    print("Skipping upload: define `ws` and `project` in the setup cells to enable it.")

To view locally:

In [None]:
# Render the evaluation result inline in the notebook.
my_eval

# Other ways to export the result:
# my_eval.json()
# my_eval.dict()
# my_eval.save_html("file.html")