In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import nest_asyncio

# Patch asyncio so event loops can be nested inside the notebook's already
# running loop. NOTE(review): presumably required by the async machinery behind
# the LangSmith evaluation calls below -- confirm it is still needed.
nest_asyncio.apply()

## Imports

In [None]:
from datasets import load_dataset
from dotenv import load_dotenv
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langsmith import Client
from pydantic import BaseModel, Field

load_dotenv()

## Creating evaluation dataset

In [4]:
# Pull the AIME validation problems and keep the first 15 as a small
# evaluation sample. Each example pairs the problem text with its reference
# answer, cast to int so it can be compared with the model's integer answer.
ds = load_dataset("AI-MO/aimo-validation-aime")
examples = [
    {
        "inputs": {"question": row["problem"]},
        "outputs": {"answer": int(row["answer"])},
    }
    for row in ds["train"]
][:15]

In [None]:
client = Client()

dataset_name = "AIME Example Dataset (sample)"

# Make the cell idempotent under "Restart & Run All": reuse the dataset when it
# already exists and only create/populate it when it is missing. This replaces
# a blanket `except Exception: print(e)`, which hid real failures (bad API key,
# network errors) and left `dataset` undefined on re-runs.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(dataset_name)
    client.create_examples(dataset_id=dataset.id, examples=examples)

In [6]:
# Structured-output schema for the model's reply: a free-text explanation plus
# the final integer answer. The Field descriptions are part of the schema that
# is passed to `with_structured_output`, so they are runtime text, not comments.
# (Documented with `#` comments on purpose: a class docstring would become part
# of the schema as well.)
class Response(BaseModel):
    # Reasoning that led to the answer.
    explanation: str = Field(description="The explanation of the answer")
    # Final answer; typed as int so it compares directly with the integer
    # reference answers stored in the dataset examples.
    answer: int = Field(
        description="The answer to the question. It should be an integer."
    )


# Chat model shared by the solver (and later by the clarity judge);
# temperature=0 keeps sampling as deterministic as the API allows.
model = ChatOpenAI(model="gpt-4.1-mini", temperature=0)
# Same model, configured to return its reply structured as `Response` via
# function calling.
model_with_structure = model.with_structured_output(Response, method="function_calling")


def get_response(question: str, max_retries: int = 3) -> Response:
    """Ask the structured-output model to solve a math question.

    Args:
        question: The problem statement sent as the human message.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The structured reply produced by ``model_with_structure``.

    Raises:
        ValueError: If every attempt failed. The last underlying exception is
            attached as ``__cause__`` so the root failure is not lost.
    """
    # The prompt does not change between attempts, so build it once.
    messages = [
        SystemMessage(
            "You're a math expert. You will always respond in a JSON format with the following fields: explanation and answer."
        ),
        HumanMessage(question),
    ]
    last_error = None
    for _ in range(max_retries):
        try:
            return model_with_structure.invoke(messages)
        except Exception as e:
            # Best-effort retry: report the failure and try again.
            print(f"Error: {e}")
            last_error = e
    raise ValueError("Failed to get a valid response") from last_error

In [7]:
def ls_wrapper(inputs: dict) -> dict:
    """Adapt ``get_response`` to the target-function shape used for evaluation.

    Args:
        inputs: Example inputs; must contain a ``"question"`` key.

    Returns:
        The model response dumped to a plain dict (``response.model_dump()``).
    """
    # `inputs` is a mapping (it is indexed by key), not a str as previously
    # annotated.
    response = get_response(inputs["question"])
    return response.model_dump()

In [8]:
def accuracy(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """Exact-match evaluator: is the produced answer equal to the reference?

    Args:
        inputs: Example inputs (unused; part of the evaluator signature).
        outputs: Target-function outputs; must contain ``"answer"``.
        reference_outputs: Reference outputs; must contain ``"answer"``.

    Returns:
        Result of comparing the two ``"answer"`` values with ``==``.
    """
    predicted = outputs["answer"]
    expected = reference_outputs["answer"]
    return predicted == expected

In [9]:
# experiment_results = client.evaluate(
#     ls_wrapper, data=dataset_name, evaluators=[accuracy], max_concurrency=15
# )

# Exercise

Create an LLM judge that evaluates whether the answer is accurate and how clear the explanation of the answer is.

In [10]:
# Load the LastLetterConcat puzzles and keep the first 20 as a small evaluation
# sample; here the reference answer is stored unchanged, without an int cast.
# Note: this rebinds `ds` and `examples` from the AIME section above.
ds = load_dataset("ChilleD/LastLetterConcat")
examples = [
    {
        "inputs": {"question": row["question"]},
        "outputs": {"answer": row["answer"]},
    }
    for row in ds["train"]
][:20]

In [None]:
client = Client()

dataset_name = "LastLetterConcat Example Dataset (sample)"

# Make the cell idempotent under "Restart & Run All": reuse the dataset when it
# already exists and only create/populate it when it is missing. This replaces
# a blanket `except Exception: print(e)`, which hid real failures (bad API key,
# network errors) and left `dataset` pointing at the wrong dataset on re-runs.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(dataset_name)
    client.create_examples(dataset_id=dataset.id, examples=examples)

In [12]:
# Structured-output schema for the puzzle task. NOTE: this redefines (shadows)
# the integer-answer `Response` declared earlier in the notebook; cells run
# after this one see this version. Documented with `#` comments on purpose: a
# class docstring would become part of the schema passed to the model wrapper.
class Response(BaseModel):
    # Reasoning that led to the answer.
    explanation: str = Field(description="The explanation of the answer")
    # Final answer; the pattern restricts it to exactly four ASCII letters.
    answer: str = Field(
        description="The answer to the question. It should be a string with 4 characters.",
        pattern=r"^[a-zA-Z]{4}$",
    )


# Re-create the chat model (same settings as in the AIME section).
model = ChatOpenAI(model="gpt-4.1-mini", temperature=0)
# Rebuild the structured wrapper so it targets the `Response` class defined
# just above; the earlier wrapper was built from the integer-answer `Response`.
model_with_structure = model.with_structured_output(Response, method="function_calling")


def get_response(question: str, max_retries: int = 3) -> Response:
    """Ask the structured-output model to solve a puzzle question.

    This redefines the earlier math-oriented ``get_response``; callers that
    look the name up at call time (such as ``ls_wrapper``) use this version
    once the cell has run.

    Args:
        question: The puzzle statement sent as the human message.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The structured reply produced by ``model_with_structure``.

    Raises:
        ValueError: If every attempt failed. The last underlying exception is
            attached as ``__cause__`` so the root failure is not lost.
    """
    # The prompt does not change between attempts, so build it once.
    messages = [
        SystemMessage(
            "You're a puzzle expert. You will always respond in a JSON format with the following fields: explanation and answer."
        ),
        HumanMessage(question),
    ]
    last_error = None
    for _ in range(max_retries):
        try:
            return model_with_structure.invoke(messages)
        except Exception as e:
            # Best-effort retry: report the failure and try again.
            print(f"Error: {e}")
            last_error = e
    raise ValueError("Failed to get a valid response") from last_error

In [13]:
# Structured-output schema for the LLM judge used by `clarity` below.
# Documented with `#` comments on purpose: a class docstring would become part
# of the schema passed to the model wrapper.
class Clarity(BaseModel):
    # Free-text field filled in by the judge.
    # NOTE(review): presumably the judge's rationale for its score -- the
    # description text reads "explanation of the answer"; confirm the intent.
    explanation: str = Field(description="The explanation of the answer")
    # Clarity score, constrained to the inclusive range 1..5.
    clarity: int = Field(description="The clarity of the explanation", ge=1, le=5)


def clarity(inputs: dict, outputs: dict, reference_outputs: dict) -> int:
    """LLM-as-judge evaluator that scores how clear an explanation is.

    Args:
        inputs: Example inputs (unused; part of the evaluator signature).
        outputs: Target-function outputs; must contain ``"explanation"``.
        reference_outputs: Reference outputs (unused).

    Returns:
        The ``clarity`` field of the judge's structured reply.
    """
    judge_instructions = SystemMessage(
        content="You are a helpful assistant that evaluates the clarity of the explanation of the answer. You will always return a number between 1 and 5, where 1 is the lowest clarity and 5 is the highest clarity."
    )
    explanation_to_grade = HumanMessage(
        content=f"Explanation: {outputs['explanation']}"
    )
    # Only the explanation text is shown to the judge, not the question.
    prompt = [judge_instructions, explanation_to_grade]
    judge = model.with_structured_output(Clarity)
    verdict = judge.invoke(prompt)
    return verdict.clarity


def accuracy(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """Exact-match evaluator for the puzzle task.

    Redefines the ``accuracy`` evaluator declared earlier in the notebook with
    the same logic: the produced ``"answer"`` must equal the reference
    ``"answer"`` (plain ``==``, so string comparison is case-sensitive).

    Args:
        inputs: Example inputs (unused; part of the evaluator signature).
        outputs: Target-function outputs; must contain ``"answer"``.
        reference_outputs: Reference outputs; must contain ``"answer"``.

    Returns:
        Result of comparing the two ``"answer"`` values with ``==``.
    """
    got, want = outputs["answer"], reference_outputs["answer"]
    return got == want

In [14]:
# experiment_results = client.evaluate(
#     ls_wrapper, data=dataset_name, evaluators=[accuracy, clarity], max_concurrency=15
# )