---
title: "Structured outputs: putting the cart before the horse"
date: "11/08/2024"
date-modified: last-modified
description-meta: "A common mistake I've seen people make when using LLMs to generate structured outputs, and how to avoid it."
toc: true
toc-depth: 3
lightbox: true
fig-cap-location: margin
categories:
  - llm
  - openai
  - pydantic
  - python
author:
  - name: Dylan Castillo
    url: https://dylancastillo.co
    affiliation: Iwana Labs
    affiliation-url: https://iwanalabs.com
citation: true
comments:
  utterances:
    repo: dylanjcastillo/blog_comments
    theme: dark-blue
    issue-term: pathname
---

In [12]:
# | output: false
# | echo: false

import nest_asyncio

# Patch asyncio so an event loop can be re-entered: later cells call
# asyncio.run(...), and a notebook kernel presumably already has a loop
# running (confirm against the execution environment).
nest_asyncio.apply()

In [13]:
import asyncio
from asyncio import Semaphore
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langsmith import traceable
from langsmith.wrappers import wrap_openai
from openai import AsyncOpenAI
from scipy import stats

# Load variables from a local .env file into the process environment
# (presumably the OpenAI / LangSmith credentials -- verify against your setup).
load_dotenv()

# Async OpenAI client passed through LangSmith's wrap_openai
# (presumably so each API call is traced -- confirm in the LangSmith docs).
client = wrap_openai(AsyncOpenAI())

In [14]:
# Benchmark files live in ../data/live_bench relative to the working directory.
data_dir = Path().absolute().parent.joinpath("data", "live_bench")
reasoning_dir = data_dir.joinpath("reasoning")
live_bench_json = reasoning_dir.joinpath("question.jsonl")

# Keep only the questions from the 2024-07-26 release.
all_questions = pd.read_json(live_bench_json, lines=True)
release_questions = all_questions.query("livebench_release_date == '2024-07-26'")

# The first turn of each question is the text sent to the model; questions
# whose text mentions "integer" (any case) are flagged as expecting an
# integer answer.
first_turn = release_questions.turns.str[0]
df = release_questions.assign(
    turns_str=first_turn,
    expects_integer=first_turn.str.contains("integer", case=False),
)

In [15]:
from pydantic import BaseModel

# Response schema with the reasoning field declared BEFORE the answer field.
# NOTE: documented with comments rather than a docstring on purpose -- this
# model's model_json_schema() output is interpolated into the system prompt,
# and pydantic copies a class docstring into that schema as "description".
class ResponseFormatA(BaseModel):
    # Free-text reasoning; declared first.
    reasoning: str
    # Final answer; declared after the reasoning.
    answer: str 

# Response schema with the answer field declared BEFORE the reasoning field.
# NOTE: documented with comments rather than a docstring on purpose -- this
# model's model_json_schema() output is interpolated into the system prompt,
# and pydantic copies a class docstring into that schema as "description".
class ResponseFormatB(BaseModel):
    # Final answer; declared first.
    answer: str 
    # Free-text reasoning; declared after the answer.
    reasoning: str

In [16]:
# System-prompt template. It has two str.format placeholders:
#   {specific_instructions} -- how the answer itself must be formatted
#   {response_format}       -- the schema the reply must follow
response_formatting_prompt = "\n".join(
    [
        "You're a helpful assistant. You will help me answer a question.",
        "{specific_instructions}",
        "You will use this JSON schema for your response:",
        "{response_format}",
    ]
)

In [17]:
def create_system_prompt(expects_integer, response_format):
    """Build the system prompt for a single question.

    Args:
        expects_integer: Truthy when the answer must be a single integer;
            otherwise the model is asked for a bolded string.
        response_format: Schema description interpolated into the prompt
            (the caller in this file passes ``model_json_schema()`` output).

    Returns:
        ``response_formatting_prompt`` with both placeholders filled in.
    """
    if expects_integer:
        specific_instructions = "Provide your answer as a single integer."
    else:
        # The instruction previously read "as a a bolded string"; the
        # duplicated article was a typo in the text sent to the model.
        specific_instructions = "Provide your answer as a bolded string (**)."
    return response_formatting_prompt.format(
        specific_instructions=specific_instructions,
        response_format=response_format,
    )


In [19]:
@traceable
async def process_row(
    row: pd.Series,
    response_format: type[ResponseFormatA | ResponseFormatB],
    semaphore: Semaphore,
) -> ResponseFormatA | ResponseFormatB:
    """Ask the model one question and parse its JSON reply.

    Args:
        row: A row of the questions frame; ``expects_integer`` and
            ``turns_str`` are read from it.
        response_format: The response model *class* (not an instance). Its
            JSON schema is embedded in the system prompt and its
            ``model_validate_json`` parses the reply.
        semaphore: Limits how many API calls are in flight at once.

    Returns:
        An instance of ``response_format`` built from the model's reply.

    Raises:
        Whatever ``model_validate_json`` raises when the reply does not match
        the schema (pydantic's ``ValidationError``, presumably), plus any
        error raised by the API call itself.
    """
    system_prompt = create_system_prompt(
        row.expects_integer, response_format.model_json_schema()
    )
    # Only the network call is throttled; the slot is released before parsing.
    async with semaphore:
        response = await client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Question:\n{row.turns_str}"},
            ],
            # JSON mode: the reply is requested as a JSON object, and the
            # schema itself is communicated through the system prompt.
            response_format={"type": "json_object"},
        )
    return response_format.model_validate_json(response.choices[0].message.content)

async def main(response_format, concurrency: int = 30):
    """Run ``process_row`` over every row of ``df`` concurrently.

    Args:
        response_format: Response model class forwarded to ``process_row``.
        concurrency: Size of the semaphore shared by all rows.

    Returns:
        The list of parsed responses, in the same order as the rows of ``df``.
    """
    limiter = Semaphore(concurrency)
    pending = []
    for _, row in df.iterrows():
        pending.append(process_row(row, response_format, limiter))
    return await asyncio.gather(*pending)

def extract_answer(answer):
    """Return ``answer`` as text without surrounding bold markers.

    The value is converted with ``str``; leading/trailing whitespace and
    ``*`` characters are removed, then any whitespace that was inside the
    markers. Asterisks in the middle of the text are kept.
    """
    # Strip whitespace first so markers followed by a stray space or newline
    # (e.g. "**yes** ") are still removed, then trim padding left inside them.
    return str(answer).strip().strip("*").strip()

# Run the benchmark once per response format and store, for each format,
# the raw answer, the cleaned answer, and whether it equals the ground truth.
for label, response_model in (("A", ResponseFormatA), ("B", ResponseFormatB)):
    responses = asyncio.run(main(response_model))

    raw_col = f"raw_answer_{label}"
    cleaned_col = f"response_{label}"

    df[raw_col] = [r.answer for r in responses]
    df[cleaned_col] = df.apply(lambda row: extract_answer(row[raw_col]), axis=1)
    df[f"is_correct_{label}"] = df[cleaned_col] == df["ground_truth"]

In [59]:

# Paired bootstrap of the per-question correctness flags: every resample draws
# question indices with replacement and both formats are scored on the SAME
# draw, as the row-wise df.sample(frac=1, replace=True) did.
n_resamples = 10_000
# Fixed seed so the reported intervals are reproducible for a given df.
rng = np.random.default_rng(42)

correct_A = df["is_correct_A"].to_numpy(dtype=float)
correct_B = df["is_correct_B"].to_numpy(dtype=float)

# One row of indices per resample. This allocates an (n_resamples, len(df))
# integer array, which replaces 10,000 full-DataFrame copies.
resample_idx = rng.integers(0, len(df), size=(n_resamples, len(df)))
bootstrap_accuracies_A = correct_A[resample_idx].mean(axis=1).tolist()
bootstrap_accuracies_B = correct_B[resample_idx].mean(axis=1).tolist()

# Percentile bootstrap: the 2.5th and 97.5th percentiles bound the 95% CI.
ci_A = np.percentile(bootstrap_accuracies_A, [2.5, 97.5])
ci_B = np.percentile(bootstrap_accuracies_B, [2.5, 97.5])

print(f"Response format A - Mean: {df['is_correct_A'].mean():.3f}, 95% CI: [{ci_A[0]:.3f}, {ci_A[1]:.3f}]")
print(f"Response format B - Mean: {df['is_correct_B'].mean():.3f}, 95% CI: [{ci_B[0]:.3f}, {ci_B[1]:.3f}]")

Response format A - Mean: 0.420, 95% CI: [0.280, 0.560]
Response format B - Mean: 0.240, 95% CI: [0.120, 0.360]


In [62]:
# Convert the correctness flags to integers in place (the converted columns
# stay on df for the cells that follow).
for flag_col in ("is_correct_A", "is_correct_B"):
    df[flag_col] = df[flag_col].astype(int)

# Paired, one-sided t-test on the per-question flags; the alternative
# hypothesis is that format A's mean correctness is greater than format B's.
paired_result = stats.ttest_rel(
    df["is_correct_A"],
    df["is_correct_B"],
    alternative="greater",
)
t_stat, p_value = paired_result

print(f"t-statistic: {t_stat}, p-value: {p_value}")

t-statistic: 2.2718381048200493, p-value: 0.013761289074778684


In [9]:
# Write the full results frame to <data_dir>/results/<YYYY-MM-DD>_reasoning_results.xlsx.
results_dir = data_dir / "results"
# to_excel does not create missing folders, so make sure the target exists.
results_dir.mkdir(parents=True, exist_ok=True)
df.to_excel(
    results_dir / f"{datetime.now().strftime('%Y-%m-%d')}_reasoning_results.xlsx"
)