In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [None]:
from pydantic import BaseModel, Field

from llamabot import StructuredBot, prompt
from llamabot.experiments import Experiment, metric

In [None]:
# Experiment setup
# System prompt for the job-naming bot.
#
# NOTE: under `@prompt`, the docstring below IS the prompt template (with a
# `{{ ... }}` placeholder), so documentation lives in these `#` comments
# instead of in the docstring.
#
# type_of_manager: persona text substituted into the template, e.g.
#     "data science manager". The template uses the article "a", which matches
#     the persona used in this notebook.
@prompt("system")
def jdbot_sysprompt(type_of_manager):
    """You are a {{ type_of_manager }}."""


# User message asking the bot to name a job that fits a description.
#
# NOTE: under `@prompt`, the docstring below IS the prompt template, so
# documentation lives in these `#` comments instead of in the docstring.
# Editing the template text is expected to change the value shown by
# `jdbot_user_message._prompt_hash` — confirm against llamabot's prompt
# versioning.
#
# job_description: free-text description substituted into the template.
@prompt("user")
def jdbot_user_message(job_description):
    """Give me a name for a job that follows this description: {{ job_description }}."""


# Structured-output schema passed to `StructuredBot` as `pydantic_model` for the
# job-naming bot. Documented with `#` comments rather than a class docstring so
# that nothing is added to the schema pydantic generates from this class.
class JobDescription(BaseModel):
    # `...` (Ellipsis) as the default marks each field as required.
    name: str = Field(default=..., description="A job name.")
    description: str = Field(default=..., description="A job description.")

In [None]:
# TESTING: scratch cell that displays the `_prompt_hash` attribute attached to
# the decorated prompt function. Nothing in the visible cells uses this value.
jdbot_user_message._prompt_hash

In [None]:
@metric  # <-- this decorator validates that the eval function returns a scalar-type thing
def name_length(response):
    """Metric: length of the generated job name.

    :param response: Object exposing a ``name`` attribute that supports ``len``.
    :returns: ``len(response.name)``.
    """
    job_name = response.name
    return len(job_name)

In [None]:
# System prompt for the judge bot used by `llm_judge`. Takes no arguments.
#
# NOTE: under `@prompt`, the docstring below IS the prompt text, so
# documentation lives in these `#` comments instead of in the docstring.
@prompt("system")
def judgebot_sysprompt():
    """You are a judge of how cool a name is."""


# User message asking the judge bot for a 1-10 coolness score.
#
# NOTE: under `@prompt`, the docstring below IS the prompt template, so
# documentation lives in these `#` comments instead of in the docstring.
#
# namebot_response: object whose `name` attribute is substituted into the
#     template (the template reads `namebot_response.name`).
@prompt("user")
def judgebot_userprompt(namebot_response):
    """Return for me your coolness score: 1-10 for this job name: {{ namebot_response.name }}."""


# Structured-output schema for the judge bot's reply; passed as `pydantic_model`
# when `llm_judge` builds its `StructuredBot`. Documented with `#` comments
# rather than a class docstring so that nothing is added to the schema pydantic
# generates from this class.
class JobNameCoolness(BaseModel):
    # Required integer. The 1-10 scale is communicated only through the
    # description text; this declaration sets no numeric bounds.
    score: int = Field(
        default=...,
        description="How cool the job name is. 1 = not cool, 10 = amazeballer.",
    )


@metric
def llm_judge(namebot_response):
    """Metric: coolness score that an LLM judge gives to the generated job name.

    A fresh ``StructuredBot`` judge is constructed on every call.

    :param namebot_response: Response object forwarded to
        ``judgebot_userprompt``, whose template reads its ``name`` attribute.
    :returns: The ``score`` attribute of the judge bot's reply.
    """
    judge = StructuredBot(
        judgebot_sysprompt(),
        model_name="gpt-4o",
        pydantic_model=JobNameCoolness,
    )
    judge_request = judgebot_userprompt(namebot_response)
    verdict = judge(judge_request)
    return verdict.score

In [None]:
# Experiment execution. Each execution of this experiment gets us one new run.
# Everything below runs inside the `Experiment` context manager; `expt` is
# bound but not referenced anywhere in this cell.
with Experiment("experiment_name") as expt:
    # Run your program
    # Build the job-naming bot: the system prompt is rendered for a
    # "data science manager" persona, and `JobDescription` is supplied as the
    # structured-output model.
    bot = StructuredBot(
        jdbot_sysprompt("data science manager"),
        model_name="gpt-4o",
        pydantic_model=JobDescription,
        temperature=1.0,
    )
    # Single invocation of the bot; both evals below consume `response`.
    response = bot(jdbot_user_message("someone who builds full stack AI apps"))

    # Evals
    # The metric functions are called for their effect; return values are
    # discarded here. Presumably `@metric` records each score against the
    # active experiment run — confirm in llamabot.experiments.
    name_length(response)
    llm_judge(response)