In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import nest_asyncio

# NOTE(review): presumably required so that event-loop-based calls made later
# in this notebook can run inside Jupyter's already-running loop — confirm.
nest_asyncio.apply()

## Imports

In [None]:
from dotenv import load_dotenv
from langsmith import Client
from langchain_openai import ChatOpenAI
from datasets import load_dataset

# Load environment variables from a .env file before any client is created.
# NOTE(review): presumably supplies the OpenAI / LangSmith credentials — confirm.
load_dotenv()

## Creating evaluation dataset

In [None]:
# Fetch the dataset and reshape every row of its "train" split into the
# {"inputs": ..., "outputs": ...} layout used for the evaluation examples.
ds = load_dataset("AI-MO/aimo-validation-aime")
examples = list(
    map(
        lambda row: {
            "inputs": {"question": row["problem"]},
            # Reference answers are coerced to int.
            "outputs": {"answer": int(row["answer"])},
        },
        ds["train"],
    )
)[:15]  # keep only the first 15 converted rows

In [None]:
client = Client()

dataset_name = "AIME Example Dataset (sample)"

# Make the cell safe to re-run: create and populate the dataset only when it
# does not exist yet, otherwise fetch the existing one so `dataset` is always
# defined. Unlike a blanket try/except, genuine failures (bad credentials,
# network errors, malformed examples) are raised instead of being printed and
# ignored.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(dataset_name)
    client.create_examples(dataset_id=dataset.id, examples=examples)

In [None]:
from langchain_core.messages import HumanMessage, SystemMessage

from pydantic import BaseModel, Field


# Structured-output schema passed to `with_structured_output`: a free-text
# explanation plus the final integer answer.
# NOTE(review): documented with comments rather than a class docstring because
# a docstring would presumably be forwarded to the model as part of the
# function schema — confirm before adding one.
class Response(BaseModel):
    # Reasoning behind the answer, as free text.
    explanation: str = Field(description="The explanation of the answer")
    # Final answer; declared as an int so it can be compared to the reference.
    answer: int = Field(
        description="The answer to the question. It should be an integer."
    )


# Base chat model, configured with temperature 0.
model = ChatOpenAI(
    model="gpt-4.1-mini",
    temperature=0,
)

# Variant of the model that returns `Response` objects, using the
# "function_calling" structured-output method.
model_with_structure = model.with_structured_output(
    Response,
    method="function_calling",
)


def get_response(question: str, max_retries: int = 3) -> Response:
    """Ask the structured-output model to answer ``question``.

    Args:
        question: The problem statement, sent as the human message.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The ``Response`` produced by the first successful attempt.

    Raises:
        ValueError: If no attempt yields a response. The last underlying
            error, if any, is attached as ``__cause__``.
    """
    # The prompt is identical for every attempt, so build it once.
    messages = [
        SystemMessage(
            "You're a math expert. You will always respond in a JSON format with the following fields: explanation and answer."
        ),
        HumanMessage(question),
    ]

    last_error = None
    for _ in range(max_retries):
        try:
            response = model_with_structure.invoke(messages)
        except Exception as e:
            # Best-effort retry: report the failure and try again.
            print(f"Error: {e}")
            last_error = e
            continue

        # Guard against a missing parse result so callers never receive None
        # where a Response is promised.
        # NOTE(review): presumably happens when the model replies without
        # calling the function — confirm against the LangChain docs.
        if response is None:
            last_error = ValueError("Model returned no structured output")
            print(f"Error: {last_error}")
            continue

        return response

    # Chain the last failure so the root cause is visible in the traceback.
    raise ValueError("Failed to get a valid response") from last_error

In [None]:
def ls_wrapper(inputs: dict) -> dict:
    """Adapt ``get_response`` to a dict-in / dict-out callable.

    Args:
        inputs: Example inputs; must contain the key ``"question"``.

    Returns:
        The model response converted to a plain dict via ``model_dump()``.

    Raises:
        KeyError: If ``inputs`` has no ``"question"`` entry.
    """
    response = get_response(inputs["question"])
    return response.model_dump()

In [None]:
def accuracy(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """Exact-match evaluator on the ``"answer"`` field.

    Args:
        inputs: Example inputs; not used by this evaluator.
        outputs: Produced outputs; must contain ``"answer"``.
        reference_outputs: Expected outputs; must contain ``"answer"``.

    Returns:
        Result of comparing the produced answer with the reference answer
        for equality.
    """
    predicted = outputs["answer"]
    expected = reference_outputs["answer"]
    return predicted == expected

In [None]:
# experiment_results = client.evaluate(
#     ls_wrapper, data=dataset_name, evaluators=[accuracy], max_concurrency=15
# )

# Exercise

Create an LLM judge that evaluates the clarity of the explanation given for each answer.

In [None]:
# Fetch the dataset and reshape every row of its "train" split into the
# {"inputs": ..., "outputs": ...} layout, then keep the first 20 rows.
ds = load_dataset("ChilleD/LastLetterConcat")
examples = list(
    map(
        lambda row: {
            "inputs": {"question": row["question"]},
            "outputs": {"answer": row["answer"]},
        },
        ds["train"],
    )
)[:20]