# Evaluating LLMs on a Content Generation Task

* docs: https://docs.evidentlyai.com/introduction
* repo: https://github.com/evidentlyai/evidently/

In [None]:
#! pip install --upgrade "openai"

In [None]:
import openai

In [None]:
from openai import OpenAI
import pandas as pd

In [None]:
# Talking points for the generated posts. Each string is interpolated
# verbatim into the "Write a short paragraph about ..." prompt.
my_topics = [
    "testing in AI engineering is as important as in development",
    "CI/CD is applicable in AI",
    "Collaboration of subject matter experts and AI engineers improves product",
    "Start LLM apps development from test cases generation",
    # Typo fixed ("agreat" -> "a great tool"): the text is sent to the model as-is.
    "evidently is a great tool for LLM testing, use it",
]

In [None]:
# Shared OpenAI client; the generation calls in this notebook all go through it.
OA_client = OpenAI()

## Basic content generation

In [None]:
# Single request for the first topic. Both names are bound to the full
# response object (not just its text): later cells read `text.output_text`
# and `text.usage` from it.
text = response = OA_client.responses.create(
    input=f"Write a short paragraph about {my_topics[0]}",
    model="gpt-4o-mini",
)

In [None]:
# Display the generated text of the response object stored in `text`.
text.output_text

In [None]:
# Display the token counts reported on the response: (input, output).
text.usage.input_tokens, text.usage.output_tokens

In [None]:
def basic_tweet_generation(topic, model="gpt-3.5-turbo", instructions=""):
    """Ask the model for a short paragraph about ``topic`` and return its text.

    Args:
        topic: Subject interpolated into the request input.
        model: Model name passed to the request.
        instructions: Forwarded unchanged as the request's ``instructions``
            (an empty string by default).

    Returns:
        The ``output_text`` of the response.
    """
    prompt = f"Write a short paragraph about {topic}"
    completion = OA_client.responses.create(
        input=prompt,
        model=model,
        instructions=instructions,
    )
    return completion.output_text

In [None]:
basic_tweet_generation(my_topics[-1])

## Tracing and Evaluation

In [None]:
#! pip install tracely

In [None]:
#! pip install evidently

In [None]:
from evidently.ui.workspace import Workspace #CloudWorkspace #, RemoteWorkspace

from evidently import Dataset, DataDefinition, Report
from evidently.descriptors import *
from evidently.presets import TextEvals
from evidently.llm.templates import BinaryClassificationPromptTemplate

In [None]:
from tracely import init_tracing, get_info
from tracely import trace_event, get_current_span

In [None]:
# Evidently workspace stored under the local "workspace" path.
client = Workspace(path='workspace')
# Alternative (requires importing CloudWorkspace, which is commented out in the imports):
#client = CloudWorkspace()

In [None]:
# Create the project whose ID is used for tracing and for storing report runs.
project = client.create_project(name="Content evals")
# Variant that also passes an org_id (placeholder value must be replaced):
#project = client.create_project("Content Generation: evals & optimization",  
#                                org_id = "ORG ID HERE")

In [None]:
# Set up tracely for the baseline run: traces go to the service at
# localhost:8000 for this project, under the export name given here.
# NOTE(review): presumably requires the Evidently service to be running at
# that address — confirm before executing.
init_tracing(
    address="http://localhost:8000/",
    project_id=project.id, 
    export_name="basic content generation",
    as_global=False,
)

In [None]:
@trace_event()
def tweet_generation(topic, model="gpt-3.5-turbo", instructions=""):
    """Traced generation of a short paragraph about ``topic``.

    Sends the same request as ``basic_tweet_generation`` and additionally
    records the response's token usage on the current tracing span.

    Args:
        topic: Subject interpolated into the request input.
        model: Model name passed to the request.
        instructions: Forwarded unchanged as the request's ``instructions``.

    Returns:
        The ``output_text`` of the response.
    """
    prompt = f"Write a short paragraph about {topic}"
    completion = OA_client.responses.create(
        input=prompt,
        model=model,
        instructions=instructions,
    )

    # Attach input/output token counts to the span opened by @trace_event.
    active_span = get_current_span()
    token_counts = {
        "input": completion.usage.input_tokens,
        "output": completion.usage.output_tokens,
    }
    active_span.update_usage(tokens=token_counts)

    return completion.output_text

In [None]:
# Baseline run: one traced generation per topic, with empty instructions.
basic_tweets = [
    tweet_generation(subject, model="gpt-3.5-turbo", instructions="")
    for subject in my_topics
]

In [None]:
# Display the baseline texts.
basic_tweets

In [None]:
# Display tracely's info; its 'export_id' entry is read in the next cell.
get_info()

In [None]:
# Take the export ID from tracely's info and load it from the workspace as a dataset.
dataset_id = get_info()['export_id']
dataset = client.load_dataset(dataset_id)

In [None]:
# Display the data definition of the loaded dataset.
dataset.data_definition

In [None]:
# Display the loaded dataset as a DataFrame (before descriptors are added).
dataset.as_dataframe()

In [None]:
# Binary LLM-judge template: classifies a text as ENGAGING (target) or
# NEUTRAL (non-target) using the criteria below, and asks for reasoning.
# The criteria text is sent to the judge as written, so keep edits deliberate.
tweet_quality = BinaryClassificationPromptTemplate(
    pre_messages = [("system","You are evaluating the quality of tweets")],
    criteria="""
        Text is ENGAGING if it meets at least one of the following:
            •        Contains a strong hook (e.g. question, surprise, bold statement)
            •        Uses emotion, humor, or opinion
            •        Encourages interaction (calls to action, second-person voice like “you”)
            •        Demonstrates personality or a distinct tone
            •        Includes vivid language, metaphors, or emojis
            •        Sparks curiosity or gives a new insight

        Text is NEUTRAL if it:
            •        Merely states a fact or observation without emotion or opinion
            •        Lacks clear personality, tone, or call to action
            •        Uses generic language with no rhetorical style
            •        Reads like an internal note, report, or placeholder
        """,
    target_category="ENGAGING",
    non_target_category="NEUTRAL",
    # NOTE(review): presumably makes uncertain verdicts count as the
    # non-target category (NEUTRAL) — confirm against the Evidently docs.
    uncertainty="non_target",
    include_reasoning=True,
)

In [None]:
# Descriptors evaluated on the "tweet_generation.result" column: text length,
# sentiment, and the LLM judge defined by `tweet_quality` (run with gpt-4o-mini).
# NOTE(review): the column name presumably comes from tracing the
# `tweet_generation` function's return value — verify against the dataset.
descriptors = [
    TextLength("tweet_generation.result", alias="Length"),
    Sentiment("tweet_generation.result", alias="Sentiment"),
    LLMEval("tweet_generation.result", template=tweet_quality,
           provider="openai", model="gpt-4o-mini",
           alias="Tweet quality")
]

In [None]:
# Add the descriptors to the baseline dataset.
dataset.add_descriptors(descriptors=descriptors)

In [None]:
# Display the baseline dataset again, now after add_descriptors was called.
dataset.as_dataframe()

In [None]:
# Report with the TextEvals preset as its only metric; reused for both runs.
report = Report(metrics=[TextEvals()])

In [None]:
# Run the report on the baseline dataset; the tags record the model and variant.
basic_tweets_eval = report.run(dataset, tags=["gpt-3.5-turbo", "basic generation"])

In [None]:
# Display the baseline evaluation result.
basic_tweets_eval

In [None]:
# Store the baseline run in the project, including its data.
client.add_run(project.id, basic_tweets_eval, include_data=True)

## Prompt Optimization

In [None]:
from evidently.llm.optimization import PromptOptimizer, PromptExecutionLog, Params

In [None]:
def run_prompt(generation_prompt: str, context) -> pd.Series:
    """generate engaging tweets"""
    # NOTE(review): the docstring above is kept verbatim in case the optimizer
    # reads it as the task description — confirm before rewording it.
    #
    # Generates one text per topic, passing `generation_prompt` as the
    # `instructions` of each call, and returns them as a Series in topic order.
    # `context` is not used here; presumably it is part of the callback
    # signature the optimizer expects — confirm.
    #
    # The topic list is local on purpose of isolation from the notebook-level
    # `my_topics`; note its last entry differs from the notebook-level list.
    topics = [
        "testing in AI engineering is as important as in development",
        "CI/CD is applicable in AI",
        "Collaboration of subject matter experts and AI engineers improves product",
        "Start LLM apps development from test cases generation",
        "evidently is a great tool for LLM testing"
    ]
    return pd.Series(
        [
            basic_tweet_generation(subject, model="gpt-3.5-turbo", instructions=generation_prompt)
            for subject in topics
        ]
    )

In [None]:
# LLM judge passed to the optimizer as its scorer; same template, provider,
# model and alias as the "Tweet quality" descriptor.
judge = LLMEval("tweet_generation.result", template=tweet_quality,
                provider="openai", model="gpt-4o-mini", alias="Tweet quality")

In [None]:
# Optimize the generation prompt with the "feedback" strategy: `run_prompt`
# is the function being run, `judge` is the scorer, and the search starts
# from the given base prompt.
# Top-level `await` works in a notebook cell but not in a plain .py script.
optimizer = PromptOptimizer("tweet_gen_example", strategy="feedback", verbose=True)
await optimizer.arun(run_prompt, scorer=judge, base_prompt="You are tweet generator", repetitions=2)

In [None]:
# Print the best prompt reported by the optimizer.
print(optimizer.best_prompt())

In [None]:
# Print the optimizer's statistics for the run.
optimizer.print_stats()

In [None]:
# Re-initialize tracing with a new export name so the optimized run is
# collected separately from "basic content generation".
init_tracing(
    address="http://localhost:8000/",
    project_id=project.id, # ID of the project created earlier with client.create_project
    export_name="optimised content generation",
    as_global=False,
)

In [None]:
# Optimized run: one traced generation per topic, using the optimizer's best
# prompt as instructions. best_prompt() is evaluated once per topic, as before.
better_tweets = [
    tweet_generation(subject, model="gpt-4o-mini", instructions=optimizer.best_prompt())
    for subject in my_topics
]

In [None]:
# Display tracely's info again; its 'export_id' entry is read in the next cell.
get_info()

In [None]:
# Rebind dataset_id to the current export ID and load that export as a dataset.
dataset_id = get_info()['export_id']
optimized_dataset = client.load_dataset(dataset_id)

In [None]:
# Add the same descriptors used for the baseline dataset.
optimized_dataset.add_descriptors(descriptors=descriptors)

In [None]:
# Display the optimized dataset as a DataFrame.
optimized_dataset.as_dataframe()

In [None]:
# Run the same report on the optimized dataset; the tags record the model and variant.
# Note: this run uses gpt-4o-mini while the baseline used gpt-3.5-turbo, so
# the two runs differ in both model and prompt.
optimized_tweets_eval = report.run(optimized_dataset, tags=["gpt-4o-mini", "optimized generation"])

In [None]:
# Display the optimized evaluation result.
optimized_tweets_eval

In [None]:
# Store the optimized run in the same project, including its data.
client.add_run(project.id, optimized_tweets_eval, include_data=True)