---
title: "Will they let you say what you mean? Proprietary LLMs and structured outputs"
date: "11/30/2024"
date-modified: last-modified
description-meta: "Reviewing the performance of proprietary LLMs with structured outputs"
toc: true
toc-depth: 3
lightbox: true
fig-cap-location: margin
categories:
  - llm
  - openai
  - pydantic
  - python
author:
  - name: Dylan Castillo
    url: https://dylancastillo.co
    affiliation: Iwana Labs
    affiliation-url: https://iwanalabs.com
citation: true
comments:
  utterances:
    repo: dylanjcastillo/blog_comments
    theme: dark-blue
    issue-term: pathname
---

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# | output: false
# | echo: false

import nest_asyncio

# `generate_outputs` below calls `asyncio.run(...)` from a notebook cell.
# NOTE(review): presumably the kernel already runs an event loop, so asyncio has
# to be patched to allow the nested call — confirm.
nest_asyncio.apply()

In [3]:
# | output: false

import asyncio
import re
from asyncio import Semaphore
from enum import Enum
from textwrap import dedent
from typing import List

import instructor
import numpy as np
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from langsmith import traceable
from langsmith.wrappers import wrap_openai
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletion
from pydantic import BaseModel
from scipy import stats

# Seed NumPy's global random generator.
np.random.seed(42)

# NOTE(review): presumably loads the API credentials from a local .env file — confirm.
load_dotenv()

# GSM8K, "main" configuration; only the "test" split is evaluated.
dataset = load_dataset("gsm8k", "main")
evals = list(dataset["test"])
# Model name sent with every chat-completion request in `call_model`.
MODEL_NAME = "gpt-4o-mini"

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Base async OpenAI client, wrapped with LangSmith's `wrap_openai`. It is used
# as-is for the run without structured output.
client = wrap_openai(AsyncOpenAI())
# Instructor clients built on top of the same wrapped client, one per
# structured-output mode under comparison (TOOLS, JSON, TOOLS_STRICT).
tool_calls_client = instructor.from_openai(client, mode=instructor.Mode.TOOLS)
json_mode_client = instructor.from_openai(client, mode=instructor.Mode.JSON)
strict_tool_calls_client = instructor.from_openai(
    client, mode=instructor.Mode.TOOLS_STRICT
)

In [5]:
# Schema requested from the model when structured output is used (it is passed
# as `response_model` in `call_model`).
# NOTE(review): documented with comments instead of a class docstring on
# purpose — a docstring would presumably end up in the schema sent to the
# model and change the experiment; confirm before adding one.
class Response(BaseModel):
    reasoning: str  # the model's explanation of how it reached the answer
    answer: int  # final integer answer


# The four ways of querying the model that are compared in this notebook.
# The values double as keys of `CLIENT_MAPPING` and as suffixes of the
# `response_*` / `score_*` DataFrame columns.
class PromptType(Enum):
    # Plain chat completion; the answer is parsed from an "ANSWER:" line.
    WITHOUT_STRUCTURED_OUTPUT = "without_so"
    # Instructor client created with `Mode.TOOLS`.
    WITH_TOOL_CALLS = "with_so_tool_calls"
    # Instructor client created with `Mode.JSON`.
    WITH_JSON_MODE = "with_so_json_mode"
    # Instructor client created with `Mode.TOOLS_STRICT`.
    WITH_STRICT_TOOL_CALLS = "with_so_strict_tool_calls"


# Settings for one evaluated configuration (one entry of `CONFIGS`).
class ClientConfig(BaseModel):
    name: str  # a `PromptType` value; key used to look up the client in `CLIENT_MAPPING`
    use_json_format: bool  # whether `create_prompt` builds the JSON-style prompt
    col_name: str  # DataFrame column that stores the parsed model answers
    score_col_name: str  # DataFrame column that stores the 0/1 correctness scores


# Maps each `PromptType` value to the client object that `call_model` uses
# for that configuration.
CLIENT_MAPPING = {
    PromptType.WITHOUT_STRUCTURED_OUTPUT.value: client,
    PromptType.WITH_TOOL_CALLS.value: tool_calls_client,
    PromptType.WITH_JSON_MODE.value: json_mode_client,
    PromptType.WITH_STRICT_TOOL_CALLS.value: strict_tool_calls_client,
}

# One `ClientConfig` per `PromptType`, in the enum's definition order.
# Only the run without structured output uses the free-text prompt; every
# other configuration uses the JSON-style prompt.
CONFIGS = [
    ClientConfig(
        name=prompt_type.value,
        use_json_format=prompt_type is not PromptType.WITHOUT_STRUCTURED_OUTPUT,
        col_name=f"response_{prompt_type.value}",
        score_col_name=f"score_{prompt_type.value}",
    )
    for prompt_type in PromptType
]

In [6]:
# Few-shot examples used by `create_prompt`. The three lists are parallel:
# index i of each list holds the question, the worked explanation and the
# integer answer of the same example.
example_question = [
    "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?",
    "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?",
    "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?",
]

example_explanation = [
    "There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6.",
    "There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5.",
    "Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39.",
]

example_answer = [6, 5, 39]


@traceable(run_type="prompt")
def create_prompt(question, use_json_format=True):
    """Build the few-shot chat messages for a single math word problem.

    Args:
        question: Text of the problem to solve.
        use_json_format: When true, the system prompt and the example replies
            use the ``{"reasoning": ..., "answer": ...}`` JSON layout;
            otherwise they use free text followed by an ``ANSWER:`` line.

    Returns:
        A list of ``{"role", "content"}`` dicts: the system prompt, one
        user/assistant pair per few-shot example, and finally the question.
    """
    structured_instructions = dedent("""
        You are an expert in solving grade school math tasks. You will be presented with a grade-school math word problem and be asked to solve it.
        Before answering you should reason about the problem (using the "reasoning" field in the JSON response described below).
          
        You will always respond with JSON in the format described below:
        
        {"reasoning": <str, reasoning about the answer>, "answer": <int, final answer>}

        In the "reasoning" field, you should explain your reasoning about the sequence of events. In the "answer" field, you should provide an integer that corresponds to the answer to the question. Don't include any other text in the "answer" field.
        """)

    free_text_instructions = dedent("""
        You are an expert in solving grade school math tasks. You will be presented with a grade-school math word problem and be asked to solve it.
        Before answering, you should explain your reasoning step by step. Then, at the end, you should provide your final answer.
        
        You will always respond in the following format:
        
        <str, reasoning about the answer>
        ANSWER: <int, final answer>
        
        In ANSWER, you should provide an integer that corresponds to the answer to the question. Don't include any other text in the ANSWER.
        """)

    if use_json_format:
        system_content = structured_instructions
    else:
        system_content = free_text_instructions

    messages = [{"role": "system", "content": system_content}]

    # Each few-shot example becomes a user turn followed by an assistant turn
    # written in the same layout the model is asked to produce.
    for idx, shot_question in enumerate(example_question):
        shot_explanation = example_explanation[idx]
        shot_answer = example_answer[idx]

        if use_json_format:
            shot_reply = (
                f'{{"reasoning": "{shot_explanation}", "answer": {shot_answer}}}'
            )
        else:
            shot_reply = f"{shot_explanation}\nANSWER: {shot_answer}"

        messages.append({"role": "user", "content": f"Question: {shot_question}"})
        messages.append({"role": "assistant", "content": shot_reply})

    messages.append({"role": "user", "content": f"Question: {question}"})

    return messages

In [7]:
@traceable(run_type="parser")
def parse_response(response: ChatCompletion | Response, prompt_type: str) -> int | None:
    """Extract the integer answer from a model response.

    Args:
        response: Either a validated ``Response`` (structured-output runs) or
            a raw ``ChatCompletion`` (run without structured output).
        prompt_type: The ``PromptType`` value of the configuration that
            produced ``response``.

    Returns:
        The answer as an ``int``, or ``None`` when the text after ``ANSWER:``
        is not an integer.

    Raises:
        ValueError: If the response type does not match ``prompt_type``, or if
            a free-text completion contains no ``ANSWER:`` line.
    """
    if isinstance(response, Response):
        return int(response.answer)

    if not (
        isinstance(response, ChatCompletion)
        and prompt_type == PromptType.WITHOUT_STRUCTURED_OUTPUT.value
    ):
        raise ValueError(f"Invalid response type: {type(response)}")

    # `content` may be None; treat that the same as a reply without an answer.
    content = response.choices[0].message.content or ""
    parts = content.split("\nANSWER:")
    if len(parts) < 2:
        # Raise (instead of returning None) so that the retry loop in
        # `process_question` asks the model again, as it did before when this
        # case surfaced as an IndexError.
        raise ValueError("Response does not contain an 'ANSWER:' line")

    answer_text = parts[1].strip()
    # Accept an optional sign and thousands separators, mirroring what
    # `extract_gt_answer` accepts for the ground truth. `str.isdigit` rejected
    # both and also let through characters that `int()` cannot parse.
    if re.fullmatch(r"[+-]?\d+(?:,\d{3})*", answer_text):
        return int(answer_text.replace(",", ""))
    return None


@traceable(run_type="llm")
async def call_model(
    client_name: str,
    use_json_format: bool,
    question: str,
) -> int | None:
    """Send one question to the selected client and return the parsed answer.

    Args:
        client_name: A ``PromptType`` value; selects the client from
            ``CLIENT_MAPPING``.
        use_json_format: Forwarded to ``create_prompt`` to pick the prompt
            layout.
        question: Text of the problem to solve.

    Returns:
        Whatever ``parse_response`` extracts: the integer answer, or ``None``
        when a free-text reply has no parsable integer.
    """
    params = {
        "messages": create_prompt(question=question, use_json_format=use_json_format),
        "model": MODEL_NAME,
        "timeout": 120,
    }

    # Only the instructor-backed clients take `response_model`; the plain
    # client is called with the standard chat-completion arguments.
    if client_name in (
        PromptType.WITH_TOOL_CALLS.value,
        PromptType.WITH_STRICT_TOOL_CALLS.value,
        PromptType.WITH_JSON_MODE.value,
    ):
        params["response_model"] = Response

    response = await CLIENT_MAPPING[client_name].chat.completions.create(**params)
    return parse_response(response, client_name)


@traceable(run_type="chain")
async def process_question(
    question: str,
    client_name: str,
    use_json_format: bool,
    semaphore: Semaphore,
    max_attempts: int = 3,
) -> int | None:
    """Answer one question, retrying on any error.

    Args:
        question: Text of the problem to solve.
        client_name: A ``PromptType`` value identifying the client to use.
        use_json_format: Forwarded to ``call_model``.
        semaphore: Held for the whole call, including retries, to bound the
            number of questions processed concurrently.
        max_attempts: Maximum number of calls to ``call_model``.

    Returns:
        The value returned by the first ``call_model`` call that succeeds.

    Raises:
        Exception: If every attempt fails; the last error is attached as the
            cause.
    """
    last_error: Exception | None = None
    async with semaphore:
        for _ in range(max_attempts):
            try:
                return await call_model(
                    client_name=client_name,
                    use_json_format=use_json_format,
                    question=question,
                )
            except Exception as e:
                # Deliberately broad: API errors, validation errors and
                # malformed replies are all handled by asking again.
                print(f"{client_name}: Error processing question {question}: {e}")
                last_error = e
        raise Exception(
            f"{client_name}: Failed to process question {question}, after {max_attempts} attempts"
        ) from last_error


@traceable(run_type="chain")
async def process_questions(
    questions: List[dict],
    client_name: str,
    use_json_format: bool,
    concurrency: int = 100,
) -> List[int | None | BaseException]:
    """Answer all questions concurrently with one client configuration.

    Args:
        questions: Records with a ``"question"`` key.
        client_name: A ``PromptType`` value identifying the client to use.
        use_json_format: Forwarded to ``process_question``.
        concurrency: Maximum number of questions in flight at the same time.

    Returns:
        One entry per question, in input order: the parsed answer, ``None``
        when no integer could be parsed, or the exception object when the
        question failed on every attempt (``return_exceptions=True``).
    """
    semaphore = Semaphore(concurrency)
    tasks = [
        process_question(
            question=item["question"],
            client_name=client_name,
            use_json_format=use_json_format,
            semaphore=semaphore,
        )
        for item in questions
    ]
    # Failures are returned in place rather than raised, so one bad question
    # does not discard the results of the whole batch.
    return await asyncio.gather(*tasks, return_exceptions=True)


def extract_gt_answer(question: str) -> int:
    """Return the integer that follows the ``#### `` marker in a GSM8K answer.

    Thousands separators (commas) are removed before the conversion. Despite
    its name, ``question`` is the dataset's answer text.
    """
    final_answer = question.split("#### ")[1]
    return int(final_answer.replace(",", ""))


def generate_outputs(questions: List[dict]):
    """Collect the answers of every configuration in ``CONFIGS``.

    Args:
        questions: Records with ``"question"`` and ``"answer"`` keys.

    Returns:
        A DataFrame with ``id``, ``question`` and ground-truth ``answer``
        columns, plus one response column per configuration.
    """
    base_columns = {
        "id": list(range(len(questions))),
        "question": [item["question"] for item in questions],
        "answer": [extract_gt_answer(item["answer"]) for item in questions],
    }
    df = pd.DataFrame(base_columns)

    # Configurations run one after the other; only the questions within a
    # configuration are processed concurrently.
    for config in CONFIGS:
        batch = process_questions(
            questions=questions,
            client_name=config.name,
            use_json_format=config.use_json_format,
        )
        df[config.col_name] = asyncio.run(batch)
    return df


def evaluate_outputs(df):
    """Return a copy of ``df`` with a 0/1 score column per configuration.

    A response scores 1 when it equals the ground-truth ``answer`` and 0
    otherwise. The input frame is left untouched.
    """
    scored = df.copy()
    for config in CONFIGS:
        is_correct = scored["answer"] == scored[config.col_name]
        # Multiply by 1 to store the booleans as integers.
        scored[config.score_col_name] = is_correct * 1
    return scored

## Evaluation

In [25]:
# | output: false
def calculate_confidence_intervals(df, conf_level: float = 0.95):
    """Print the mean score and its confidence interval per configuration.

    The interval is the normal approximation ``mean ± z * std / sqrt(n)``,
    with ``z`` taken from the standard normal distribution for ``conf_level``.

    Args:
        df: Frame containing the score columns listed in ``CONFIGS``.
        conf_level: Two-sided confidence level.
    """
    for config in CONFIGS:
        column = config.score_col_name
        values = df[column]
        n_obs = len(values)

        if n_obs == 0:
            print(f"No scores available for {column}")
            continue

        center = values.mean()
        std_error = values.std() / np.sqrt(n_obs)
        half_width = stats.norm.ppf((1 + conf_level) / 2) * std_error
        lower, upper = center - half_width, center + half_width

        print(
            f"{column} - Mean: {center * 100:.2f}% CI: {lower * 100:.2f}% - {upper * 100:.2f}%"
        )
    print()


def run_paired_t_test(df):
    """Print paired t-tests of each structured-output run against the baseline.

    The baseline is the run without structured output. A comparison is
    skipped when either of its score columns is not listed in ``CONFIGS``.
    """
    scores = {
        config.score_col_name: df[config.score_col_name] * 1 for config in CONFIGS
    }

    baseline = "score_without_so"
    candidates = (
        "score_with_so_tool_calls",
        "score_with_so_json_mode",
        "score_with_so_strict_tool_calls",
    )
    for candidate in candidates:
        if baseline not in scores or candidate not in scores:
            continue
        t_stat, p_value = stats.ttest_rel(scores[baseline], scores[candidate])
        print(f"{baseline} vs {candidate}")
        print(f"t-statistic: {t_stat}, p-value: {p_value}")

### GSM8K

In [9]:
# Query every configuration on the GSM8K test questions, then score each
# response column against the ground-truth answers.
df = generate_outputs(evals)
df = evaluate_outputs(df)

In [26]:
# Print per-configuration accuracy with confidence intervals, followed by the
# paired t-tests of each structured-output mode against the run without it.
calculate_confidence_intervals(df)
run_paired_t_test(df)

score_without_so - Mean: 93.25% CI: 91.90% - 94.61%
score_with_so_tool_calls - Mean: 92.42% CI: 90.99% - 93.85%
score_with_so_json_mode - Mean: 91.74% CI: 90.25% - 93.22%
score_with_so_strict_tool_calls - Mean: 92.87% CI: 91.48% - 94.26%

score_without_so vs score_with_so_tool_calls
t-statistic: 1.3058087263195448, p-value: 0.19184547875805577
score_without_so vs score_with_so_json_mode
t-statistic: 2.42986097186779, p-value: 0.015237431807874796
score_without_so vs score_with_so_strict_tool_calls
t-statistic: 0.6297966935389578, p-value: 0.5289367690095945


### OpenAI Results