# Evaluating LLMs on a Content Generation Task

* docs: https://docs.evidentlyai.com/introduction
* repo: https://github.com/evidentlyai/evidently/

In [None]:
#! pip install openai evidently tracely

In [None]:
from openai import OpenAI
import pandas as pd
import time

In [None]:
# Topics fed to the generator; each loop below produces one text per entry.
my_topics = [
    "testing in AI engineering is as important as in development",
    "CI/CD is applicable in AI",
    "Collaboration of subject matter experts and AI engineers improves product",
    "Start LLM apps development from test cases generation",
    # NOTE: the last topic contains a typo ("agreat") that the original author
    # flagged as a "tiny mistake"; it is kept as-is (presumably intentional).
    "evidently is agreat for LLM testing, use it",
]

In [None]:
# OpenAI client used by tweet_generation for every generation call.
# NOTE(review): no api_key is passed, so credentials presumably come from the environment - confirm.
OA_client = OpenAI()

## Tracing and Evaluation

In [None]:
from evidently.ui.workspace import CloudWorkspace

from evidently import Dataset, DataDefinition, Report
from evidently.descriptors import *
from evidently.tests import lte, lt, gte, gt, eq, is_in
from evidently.core.datasets import DescriptorTest
from evidently.presets import TextEvals
from evidently.llm.templates import BinaryClassificationPromptTemplate
from evidently.presets.dataset_stats import ValueStatsTests
from evidently.metrics import RowTestSummary

In [None]:
from tracely import get_info, init_tracing, trace_event

In [None]:
# Evidently Cloud workspace handle; used below to create the project,
# load the traced datasets and upload the evaluation runs.
# NOTE(review): no token/url is passed, so they are presumably read from the environment - confirm.
client = CloudWorkspace()

In [None]:
# "ORG ID" is a placeholder: replace it with your own organisation id before running.
project = client.create_project("Content Generation", org_id="ORG ID")

In [None]:
# Start tracing into the project created above under the "content generation: basic" export.
init_tracing(
    project_id=str(project.id),  # Project ID from Evidently Cloud
    export_name="content generation: basic",
    as_global=False,
)

In [None]:
get_info()

In [None]:
@trace_event()
def tweet_generation(topic, model="gpt-3.5-turbo", instructions=""):
    """Generate a paragraph of text about ``topic`` with an OpenAI model.

    The function is decorated with ``trace_event`` so each call is traced;
    later cells read its output as the ``tweet_generation.result`` column,
    which is why the function name must stay stable.

    Args:
        topic: Subject inserted into the "Write a paragraph about ..." prompt.
        model: Name of the OpenAI model to call.
        instructions: Instructions forwarded to the Responses API call.

    Returns:
        The ``output_text`` of the API response.
    """
    prompt = f"Write a paragraph about {topic}"
    completion = OA_client.responses.create(
        instructions=instructions,
        model=model,
        input=prompt,
    )
    return completion.output_text

In [None]:
# Baseline run: one generated text per topic, default model, no extra instructions.
basic_tweets = []
for topic in my_topics:
    generated = tweet_generation(topic, model="gpt-3.5-turbo", instructions="")
    basic_tweets.append(generated)
    time.sleep(3)  # pause between consecutive API calls

In [None]:
basic_tweets

In [None]:
# Look up the export id of the current tracing session and load that export as a dataset.
export_info = get_info()
dataset_id = export_info["export_id"]
dataset = client.load_dataset(dataset_id)

In [None]:
dataset.data_definition

In [None]:
dataset.as_dataframe()

## Reference-free evals

In [None]:
# Descriptors computed on the traced generator output, without any tests attached.
length_descriptor = TextLength("tweet_generation.result", alias="Length")
sentiment_descriptor = Sentiment("tweet_generation.result", alias="Sentiment")
descriptors = [length_descriptor, sentiment_descriptor]

In [None]:
dataset.add_descriptors(descriptors=descriptors)

In [None]:
dataset.as_dataframe()

In [None]:
# Report built from the TextEvals preset with its default settings.
report = Report(metrics=[TextEvals()])

In [None]:
basic_tweets_eval =  report.run(dataset, tags=["gpt-3.5-turbo", "simple evals"])

In [None]:
basic_tweets_eval

In [None]:
client.add_run(project.id, basic_tweets_eval, include_data=True)

## LLM-as-a-judge evals

In [None]:
# Load the traced dataset again before attaching the next set of descriptors.
# NOTE(review): presumably done to start from a copy without the "Length"/"Sentiment"
# columns added earlier, since the next descriptors reuse those aliases - confirm.
dataset = client.load_dataset(dataset_id)

In [None]:
# Prompt template for the LLM judge: a binary ENGAGING / NEUTRAL classification.
# The criteria text is sent as-is, so its wording and whitespace are left untouched.
tweet_quality = BinaryClassificationPromptTemplate(
    pre_messages = [("system","You are evaluating the quality of tweets")],
    criteria="""
        Text is ENGAGING if it meets at least one of the following:
            •        Contains a strong hook (e.g. question, surprise, bold statement)
            •        Uses emotion, humor, or opinion
            •        Encourages interaction (calls to action, second-person voice like “you”)
            •        Demonstrates personality or a distinct tone
            •        Includes vivid language, metaphors, or emojis
            •        Sparks curiosity or gives a new insight

        Text is NEUTRAL if it:
            •        Merely states a fact or observation without emotion or opinion
            •        Lacks clear personality, tone, or call to action
            •        Uses generic language with no rhetorical style
            •        Reads like an internal note, report, or placeholder
        """,
    target_category="ENGAGING",
    non_target_category="NEUTRAL",
    uncertainty="non_target",  # NOTE(review): presumably maps uncertain verdicts to the non-target (NEUTRAL) label - confirm
    include_reasoning=True,  # NOTE(review): presumably makes the judge return its reasoning with the label - confirm
)

In [None]:
# Descriptors on the traced generator output, each with a test attached.
# This list is reused later for the improved and optimized datasets.

# Length must be at most 280.
length_check = TextLength(
    "tweet_generation.result",
    alias="Length",
    tests=[lte(280)],
)
# Sentiment must be greater than 0.8.
sentiment_check = Sentiment(
    "tweet_generation.result",
    alias="Sentiment",
    tests=[gt(0.8)],
)
# The LLM judge label (column "Tweet quality") must equal "ENGAGING".
quality_check = LLMEval(
    "tweet_generation.result",
    template=tweet_quality,
    tests=[eq(column="Tweet quality", expected="ENGAGING")],
    provider="openai",
    model="gpt-4o-mini",
    alias="Tweet quality",
)
descriptors = [length_check, sentiment_check, quality_check]

In [None]:
dataset.add_descriptors(descriptors=descriptors)

In [None]:
dataset.as_dataframe()

In [None]:
# Tests keyed by descriptor alias, passed to the TextEvals preset.
# NOTE(review): read from the argument names - max_tests/min_tests presumably check the
# column max/min, and unique_values_count_tests the count of "ENGAGING" labels - confirm.
column_tests = {
    "Length": ValueStatsTests(max_tests=[lte(280)]),
    "Sentiment": ValueStatsTests(min_tests=[gt(0.5)]),
    "Tweet quality": ValueStatsTests(
        unique_values_count_tests={"ENGAGING": [gte(5)]},
    ),
}
report = Report(metrics=[TextEvals(column_tests=column_tests)])

In [None]:
llm_tweets_eval = report.run(dataset, tags=["gpt-3.5-turbo", "llm evals"])

In [None]:
llm_tweets_eval

In [None]:
client.add_run(project.id, llm_tweets_eval, include_data=True)

## Improved content generation

In [None]:
# Re-initialise tracing under the "content generation: improved" export name for the next run.
init_tracing(
    project_id=str(project.id),  # Project ID from Evidently Cloud
    export_name="content generation: improved",
    as_global=False,
)

In [None]:
# Instructions for the improved generation run (passed as `instructions` to tweet_generation).
# The literal keeps its line breaks and indentation, so that whitespace is part of the text sent.
instruction="""You are a chief editor with 10 years of experience in technical writing. 
        You specialize in creating concise, engaging, and to-the-point content for engineers. 
        Your style is clear, direct, and focused on delivering technical value without fluff."""

In [None]:
# Improved run: one generated text per topic with gpt-4o-mini and the editor instructions.
improved_tweets = []
for topic in my_topics:
    generated = tweet_generation(topic, model="gpt-4o-mini", instructions=instruction)
    improved_tweets.append(generated)
    time.sleep(3)  # pause between consecutive API calls

In [None]:
# Look up the export id of the current tracing session and load that export as a dataset.
export_info = get_info()
dataset_id = export_info["export_id"]
improved_dataset = client.load_dataset(dataset_id)

In [None]:
improved_dataset.add_descriptors(descriptors=descriptors)

In [None]:
improved_tweets_eval = report.run(improved_dataset, tags=["gpt-4o-mini", "llm evals"])

In [None]:
improved_tweets_eval

In [None]:
client.add_run(project.id, improved_tweets_eval, include_data=True)

## Prompt optimization

In [None]:
from evidently.llm.optimization import PromptOptimizer, Params

In [None]:
# Executor handed to PromptOptimizer.arun: calls tweet_generation once per entry of
# my_topics, with the candidate prompt as `instructions`, and returns the texts as a list.
# `context` is not used in the body.
# NOTE(review): the docstring is deliberately left unchanged in case the optimizer
# reads it as a task description - confirm before rewording it.
def run_prompt(generation_prompt: str, context):
    """generate engaging tweets"""
    tweets = [tweet_generation(topic, model="gpt-4o-mini", instructions=generation_prompt) for topic in my_topics]
    return tweets

In [None]:
optimizer = PromptOptimizer("tweet_gen_example", strategy="feedback")
optimizer.set_param(Params.BasePrompt, "You are tweet generator")
await optimizer.arun(run_prompt, 
                     scorer=LLMEval("basic_tweet_generation.result",
                                    template=tweet_quality,
                                    provider="openai", 
                                    model="gpt-4o-mini", 
                                    alias="tweet quality"))

In [None]:
print(optimizer.best_prompt())

In [None]:
# Re-initialise tracing under the "content generation: optimized" export name for the next run.
init_tracing(
    project_id=str(project.id),  # Project ID from Evidently Cloud
    export_name="content generation: optimized",
    as_global=False,
)

In [None]:
# Optimized run: one generated text per topic, using the optimizer's best prompt
# as the instructions.
# best_prompt() is fetched once up front: it does not change inside the loop, so
# there is no need to call it again for every topic.
best_instructions = optimizer.best_prompt()
optimized_tweets = []
for topic in my_topics:
    generated = tweet_generation(topic, model="gpt-4o-mini", instructions=best_instructions)
    optimized_tweets.append(generated)
    time.sleep(3)  # pause between consecutive API calls

In [None]:
# Look up the export id of the current tracing session and load that export as a dataset.
export_info = get_info()
dataset_id = export_info["export_id"]
optimized_dataset = client.load_dataset(dataset_id)

In [None]:
optimized_dataset.add_descriptors(descriptors=descriptors)

In [None]:
optimized_tweets_eval = report.run(optimized_dataset, tags=["gpt-4o-mini", "evi optimizer", "llm evals"])

In [None]:
optimized_tweets_eval

In [None]:
client.add_run(project.id, optimized_tweets_eval, include_data=True)