---
title: "Is Pydantic making your model dumber?"
date: "11/10/2024"
date-modified: last-modified
description-meta: ""
toc: true
toc-depth: 3
lightbox: true
fig-cap-location: margin
categories:
  - llm
  - openai
  - pydantic
  - python
author:
  - name: Dylan Castillo
    url: https://dylancastillo.co
    affiliation: Iwana Labs
    affiliation-url: https://iwanalabs.com
citation: true
comments:
  utterances:
    repo: dylanjcastillo/blog_comments
    theme: dark-blue
    issue-term: pathname
---

## Set up the environment

In [1]:
# | output: false
# | echo: false

import nest_asyncio

# Allow nested event loops so later cells can call `asyncio.run()` from inside the notebook.
nest_asyncio.apply()

Start by importing the necessary libraries:

In [2]:
import asyncio
import difflib
import json
import re
import time
from asyncio import Semaphore
from enum import Enum
from itertools import permutations
from pathlib import Path
from typing import List

import instructor
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langsmith import traceable
from langsmith.wrappers import wrap_openai
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletion
from pydantic import BaseModel, Field
from scipy import stats

# Fix NumPy's global random seed.
np.random.seed(42)

# Load environment variables from a .env file (presumably the API credentials — confirm).
load_dotenv()

# Async OpenAI client wrapped by LangSmith; used for the plain-text and JSON-mode prompts.
langsmith_client = wrap_openai(AsyncOpenAI())
# Instructor client in tool-calling mode; used when `Response` is passed as `response_model`.
instructor_client = instructor.from_openai(langsmith_client, mode=instructor.Mode.TOOLS)

  from .autonotebook import tqdm as notebook_tqdm


## Set up the experiment

In [3]:
# The data lives in ../data/live_bench, resolved relative to the current working directory.
data_dir = Path().absolute().parent / "data" / "live_bench"
reasoning_dir = data_dir / "reasoning"
math_dir = data_dir / "math"
language_dir = data_dir / "language"

# One JSON-lines file of questions per category.
df_reasoning = pd.read_json(reasoning_dir / "updated_questions.jsonl", lines=True)
df_language = pd.read_json(language_dir / "updated_questions.jsonl", lines=True)
df_math = pd.read_json(math_dir / "updated_questions.jsonl", lines=True)

In [4]:
# Structured answer requested from the model in the tool-call and JSON-mode variants.
# NOTE: documented with comments rather than a class docstring on purpose — Pydantic
# copies a model's docstring into its JSON schema, which would change the prompt built
# from `Response.model_json_schema()`.
class Response(BaseModel):
    # Free-text explanation, declared before `answer` (same order as REASONING/ANSWER
    # in the plain-text prompt).
    reasoning: str = Field(description="Your reasoning explaining your answer.")
    # The final answer; this is the value `parse_response` returns for scoring.
    answer: str = Field(description="Your answer, don't include any other text.")


class PromptType(Enum):
    """The three ways of asking the model for an answer that are compared here."""

    # Plain chat completion; the REASONING/ANSWER format is described in the system prompt.
    WITHOUT_STRUCTURED_OUTPUT = "without_structured_output"
    # Sent through the Instructor client with `Response` as the `response_model`.
    WITH_TOOL_CALLS = "with_structured_output_tool_calls"
    # Sent with `response_format={"type": "json_object"}`; the schema is in the system prompt.
    WITH_JSON_MODE = "with_structured_output_json_mode"


# System prompt for each prompt type, keyed by the enum member's string value
# (looked up as `SYSTEM_MESSAGE_MAPPING[prompt_type.value]`).
SYSTEM_MESSAGE_MAPPING = {
    # Plain text: the expected layout is spelled out in the prompt; the answer is later
    # extracted from the text that follows "\nANSWER:".
    PromptType.WITHOUT_STRUCTURED_OUTPUT.value: (
        "You're a helpful assistant. You will help me answer a question."
        "\nYou must respond using the following format:"
        "\nREASONING: <your reasoning explaining your answer>"
        "\nANSWER: <your answer, don't include any other text>"
    ),
    # Tool calls: no format instructions here; the `Response` model is passed to the
    # client as `response_model` instead.
    PromptType.WITH_TOOL_CALLS.value: (
        "You're a helpful assistant. You will help me answer a question."
    ),
    # JSON mode: the JSON schema of `Response` is appended directly after the colon,
    # with no separator in between.
    PromptType.WITH_JSON_MODE.value: (
        "You're a helpful assistant. You will help me answer a question."
        + "\nYou must respond using the following JSON schema:"
        + json.dumps(Response.model_json_schema())
    ),
}

In [5]:
def parse_response(
    response: ChatCompletion | Response, response_type: PromptType
) -> str:
    """Extract the final answer text from a model response.

    Args:
        response: Either a validated ``Response`` (tool-call variant) or the raw
            ``ChatCompletion`` (plain-text and JSON-mode variants).
        response_type: The prompt type that produced ``response``; it decides how
            a raw ``ChatCompletion`` is parsed.

    Returns:
        The answer string.

    Raises:
        ValueError: If a plain-text completion has no ``"\\nANSWER:"`` marker, or
            if the response/prompt-type combination is not supported.
        pydantic.ValidationError: If a JSON-mode completion does not match the
            ``Response`` schema.
    """
    if isinstance(response, Response):
        return response.answer

    if isinstance(response, ChatCompletion):
        if response_type == PromptType.WITHOUT_STRUCTURED_OUTPUT:
            content = response.choices[0].message.content
            sections = (content or "").split("\nANSWER:")
            # Fail with an explicit message instead of "list index out of range"
            # when the model ignores the requested REASONING/ANSWER layout.
            if len(sections) < 2:
                raise ValueError(
                    "Response does not contain the expected '\\nANSWER:' marker"
                )
            return sections[1].strip()
        if response_type == PromptType.WITH_JSON_MODE:
            content = response.choices[0].message.content
            return Response.model_validate_json(content).answer

    raise ValueError(
        f"Invalid response type: {type(response)} for prompt type: {response_type}"
    )


@traceable
async def call_model(
    client,
    prompt_type: PromptType,
    user_message: str,
    timeout: int = 120,
) -> str:
    """Send one question to ``gpt-4o`` and return the parsed answer.

    Args:
        client: Object exposing ``chat.completions.create``; the Instructor client
            is expected for ``WITH_TOOL_CALLS`` because only then is
            ``response_model`` passed.
        prompt_type: Selects the system prompt and the structured-output settings.
        user_message: Content of the user message.
        timeout: Passed through as the request's ``timeout`` parameter.

    Returns:
        The answer string extracted by ``parse_response`` (not a ``Response``
        object).
    """
    params = {
        "model": "gpt-4o",
        "messages": [
            {"role": "system", "content": SYSTEM_MESSAGE_MAPPING[prompt_type.value]},
            {"role": "user", "content": user_message},
        ],
        "timeout": timeout,
    }
    # Only the structured-output variants need an extra request parameter.
    if prompt_type == PromptType.WITH_JSON_MODE:
        params["response_format"] = {"type": "json_object"}
    if prompt_type == PromptType.WITH_TOOL_CALLS:
        params["response_model"] = Response
    response = await client.chat.completions.create(**params)
    return parse_response(response, prompt_type)


@traceable
async def process_row(
    row: pd.Series,
    prompt_type: PromptType,
    semaphore: Semaphore,
    max_attempts: int = 3,
) -> str:
    """Answer the question in ``row``, retrying on any failure.

    Args:
        row: A question row; ``row.updated_question`` is sent to the model and
            ``row.name`` is used in log/error messages.
        prompt_type: Prompt variant to use; also selects the client.
        semaphore: Limits how many rows are processed concurrently.
        max_attempts: Maximum number of calls before giving up.

    Returns:
        The parsed answer string.

    Raises:
        RuntimeError: If every attempt failed; the last underlying error is
            attached as ``__cause__``.
    """
    client = (
        instructor_client
        if prompt_type == PromptType.WITH_TOOL_CALLS
        else langsmith_client
    )
    last_error = None
    async with semaphore:
        for _ in range(max_attempts):
            try:
                return await call_model(
                    client=client,
                    prompt_type=prompt_type,
                    user_message=f"Question:\n{row.updated_question}",
                )
            # Deliberately broad: API errors, parse errors and validation errors
            # are all treated as "try again".
            except Exception as e:
                print(f"Error processing row {row.name}: {e}")
                last_error = e
        raise RuntimeError(
            f"Failed to process row {row.name}, after {max_attempts} attempts"
        ) from last_error


@traceable
async def process_df(
    df: pd.DataFrame,
    prompt_type: PromptType,
    concurrency: int = 100,
) -> List[str | BaseException]:
    """Run ``process_row`` over every row of ``df`` concurrently.

    Args:
        df: Questions to process, one per row.
        prompt_type: Prompt variant applied to every row.
        concurrency: Maximum number of rows processed at the same time.

    Returns:
        One entry per row, in row order. Because ``return_exceptions=True`` is
        used, a row whose processing failed contributes the raised exception
        object instead of an answer string.
    """
    semaphore = Semaphore(concurrency)
    tasks = [process_row(row, prompt_type, semaphore) for _, row in df.iterrows()]
    return await asyncio.gather(*tasks, return_exceptions=True)

## Running the experiment


In [6]:
# Adapted from:
# https://github.com/LiveBench/LiveBench/blob/main/livebench/process_results/writing/plot_unscrambling/utils.py
def levenshtein_distance(A, B):
    """Return the edit distance between the sequences ``A`` and ``B``.

    The distance is the minimum number of single-element insertions, deletions
    and replacements needed to turn one sequence into the other. Elements are
    compared with ``==``.
    """
    # Distances from the empty prefix of A to every prefix of B.
    previous_row = list(range(len(B) + 1))

    for a_count, a_item in enumerate(A, start=1):
        # Turning a prefix of A into the empty sequence costs its length.
        current_row = [a_count]
        for b_count, b_item in enumerate(B, start=1):
            if a_item == b_item:
                cost = previous_row[b_count - 1]
            else:
                cost = 1 + min(
                    previous_row[b_count],  # drop a_item
                    current_row[b_count - 1],  # add b_item
                    previous_row[b_count - 1],  # replace a_item with b_item
                )
            current_row.append(cost)
        previous_row = current_row

    return previous_row[len(B)]


def plot_unscrambling_process_results(ground_truth: str, llm_answer: str) -> float:
    """Score how well ``llm_answer`` reproduces the sentence order of ``ground_truth``.

    Both texts are split on ``"."`` into non-empty, stripped sentences. Each
    ground-truth sentence is mapped to the position of its closest sentence in
    the answer, and the resulting position sequence is compared with the ideal
    ``0, 1, 2, ...`` using the edit distance.

    Returns:
        ``1 - distance / number_of_ground_truth_sentences``.
    """

    def _split_sentences(text):
        stripped = (piece.strip() for piece in text.split("."))
        return [piece for piece in stripped if piece]

    reference = _split_sentences(ground_truth)
    candidate = _split_sentences(llm_answer)

    matched_positions = []
    for sentence in reference:
        # cutoff=0.0 accepts the closest sentence regardless of how similar it is.
        closest = difflib.get_close_matches(sentence, candidate, n=1, cutoff=0.0)
        if not closest:
            continue
        matched_positions.append(candidate.index(closest[0]))

    expected_positions = list(range(len(reference)))
    distance = levenshtein_distance(expected_positions, matched_positions)
    return 1 - (distance / len(reference))


def evaluate_language_task(ground_truth: str, task_type: str, response: str):
    """Score a language-category response against its ground truth.

    Args:
        ground_truth: Expected answer.
        task_type: One of ``"connections"``, ``"plot_unscrambling"`` or ``"typos"``.
        response: The model's answer.

    Returns:
        * ``connections``: fraction of ground-truth groups (consecutive runs of
          four comma-separated items, compared as sets) that also appear among
          the response's groups.
        * ``plot_unscrambling``: the score from
          ``plot_unscrambling_process_results``.
        * ``typos``: ``True`` if ``ground_truth`` occurs verbatim in ``response``.

    Raises:
        ValueError: If ``task_type`` is not one of the supported values.
    """
    if task_type == "connections":

        def _groups_of_four(text):
            # Lower-case every item and drop punctuation so formatting such as
            # "**Word**" does not affect the comparison.
            items = [re.sub(r"[^\w\s]", "", o.strip().lower()) for o in text.split(",")]
            return [set(items[i : i + 4]) for i in range(0, len(items), 4)]

        unmatched = _groups_of_four(response)
        gt_groups = _groups_of_four(ground_truth)

        # Pair each ground-truth group with at most one identical response group.
        # Because groups either match exactly or not at all, this greedy pairing
        # yields the best possible count without enumerating every permutation of
        # the response groups (factorial in their number), and it credits a
        # correct group wherever it appears in the response.
        correct_groups = 0
        for gt_group in gt_groups:
            if gt_group in unmatched:
                unmatched.remove(gt_group)
                correct_groups += 1
        return correct_groups / len(gt_groups)
    elif task_type == "plot_unscrambling":
        return plot_unscrambling_process_results(ground_truth, response)
    elif task_type == "typos":
        return ground_truth in response
    else:
        raise ValueError(f"Invalid task type: {task_type}")


def evaluate_reasoning_task(ground_truth: str, task_type: str, response: str):
    """Return whether ``response`` matches ``ground_truth`` for a reasoning task.

    * ``web_of_lies_v2``: both strings are split on commas; items are stripped,
      lower-cased and cleaned of punctuation, then compared as ordered lists.
    * ``spatial`` / ``zebra_puzzle``: trailing periods are removed from the
      response, then both strings are compared case-insensitively after
      stripping surrounding whitespace.

    Raises:
        ValueError: If ``task_type`` is not one of the supported values.
    """

    def _clean_items(text):
        cleaned = []
        for item in text.split(","):
            cleaned.append(re.sub(r"[^\w\s]", "", item.strip().lower()))
        return cleaned

    if task_type == "web_of_lies_v2":
        return _clean_items(response) == _clean_items(ground_truth)

    if task_type in ("spatial", "zebra_puzzle"):
        expected = ground_truth.lower().strip()
        actual = response.rstrip(".").lower().strip()
        return expected == actual

    raise ValueError(f"Invalid task type: {task_type}")


def evaluate_math_task(ground_truth: str, task_type: str, response: str):
    """Return whether ``response`` matches ``ground_truth`` for a math task.

    * ``olympiad``: both strings are split on commas; items are stripped,
      lower-cased and cleaned of punctuation, then compared as ordered lists.
    * ``math_comp``: the two strings must be exactly equal.

    Raises:
        ValueError: If ``task_type`` is not one of the supported values.
    """
    if task_type == "math_comp":
        return ground_truth == response

    if task_type != "olympiad":
        raise ValueError(f"Invalid task type: {task_type}")

    punctuation = r"[^\w\s]"
    answer_items = [
        re.sub(punctuation, "", part.strip().lower()) for part in response.split(",")
    ]
    expected_items = [
        re.sub(punctuation, "", part.strip().lower())
        for part in ground_truth.split(",")
    ]
    return answer_items == expected_items

In [22]:
# | output: false
def generate_outputs(df):
    """Return a copy of ``df`` with one response column per prompt type.

    The questions are sent through ``process_df`` three times, once for each
    prompt type, and only afterwards are the results attached as the columns
    ``response_without_so``, ``response_with_so_tool_calls`` and
    ``response_with_so_json_mode``. The input frame is not modified.
    """
    df_copy = df.copy()
    runs = (
        ("response_without_so", PromptType.WITHOUT_STRUCTURED_OUTPUT),
        ("response_with_so_tool_calls", PromptType.WITH_TOOL_CALLS),
        ("response_with_so_json_mode", PromptType.WITH_JSON_MODE),
    )
    # Finish all three runs before adding any column, so every run sees the
    # same question frame.
    collected = {
        column: asyncio.run(process_df(df_copy, prompt_type))
        for column, prompt_type in runs
    }
    for column, responses in collected.items():
        df_copy[column] = responses
    return df_copy


def evaluate_outputs(df, evaluator):
    """Return a copy of ``df`` with one score column per response column.

    Args:
        df: Frame with ``ground_truth``, ``task`` and the three ``response_*``
            columns.
        evaluator: Callable invoked as ``evaluator(ground_truth, task, response)``
            for every row.

    Returns:
        A copy of ``df`` with ``score_without_so``, ``score_with_so_tool_calls``
        and ``score_with_so_json_mode`` added. Each score is the evaluator's
        result multiplied by 1, which turns booleans into integers.
    """
    scored = df.copy()
    for variant in ("without_so", "with_so_tool_calls", "with_so_json_mode"):
        response_column = f"response_{variant}"

        def _score(row, column=response_column):
            return evaluator(row["ground_truth"], row["task"], row[column]) * 1

        scored[f"score_{variant}"] = scored.apply(_score, axis=1)
    return scored

### Reasoning

In [8]:
# Collect the model's answers to the reasoning questions for all three prompt types.
df_reasoning_results = generate_outputs(df_reasoning)

In [23]:
# Score every response column against the ground truth, then save the scored table as CSV.
df_reasoning_results = evaluate_outputs(df_reasoning_results, evaluate_reasoning_task)
df_reasoning_results.to_csv(data_dir / "reasoning" / "reasoning_results.csv")

In [24]:
# | output: false
# | echo: false

# Per-task question count and mean score for each response format.
df_reasoning_results.groupby("task").agg(
    n_questions=("question_id", "count"),
    score_without_so=("score_without_so", "mean"),
    score_with_so_tool_calls=("score_with_so_tool_calls", "mean"),
    score_with_so_json_mode=("score_with_so_json_mode", "mean"),
)

Unnamed: 0_level_0,n_questions,score_without_so,score_with_so_tool_calls,score_with_so_json_mode,elapsed_time_without_so_p50,elapsed_time_with_so_tool_calls_p50,elapsed_time_with_so_json_mode_p50,elapsed_time_without_so_p99,elapsed_time_with_so_tool_calls_p99,elapsed_time_with_so_json_mode_p99,elapsed_time_without_so_max,elapsed_time_with_so_tool_calls_max,elapsed_time_with_so_json_mode_max
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
spatial,50,0.44,0.48,0.44,6.309775,5.758013,5.291537,21.688894,14.160294,16.911668,26.070317,14.635145,17.563303
web_of_lies_v2,50,0.52,0.4,0.48,23.5561,25.448873,16.081478,33.850377,37.454179,22.355108,34.307903,39.952389,22.828892
zebra_puzzle,50,0.32,0.3,0.4,15.25936,13.891867,10.480265,32.695507,23.824518,26.589262,33.320583,26.188481,29.870442


### Language

In [11]:
# Collect the model's answers to the language questions for all three prompt types.
df_language_results = generate_outputs(df_language)

In [25]:
# Score every response column against the ground truth, then save the scored table as CSV.
df_language_results = evaluate_outputs(df_language_results, evaluate_language_task)
df_language_results.to_csv(data_dir / "language" / "language_results.csv")

In [26]:
# | output: false
# | echo: false

# Per-task question count and mean score for each response format.
df_language_results.groupby("task").agg(
    n_questions=("question_id", "count"),
    score_without_so=("score_without_so", "mean"),
    score_with_so_tool_calls=("score_with_so_tool_calls", "mean"),
    score_with_so_json_mode=("score_with_so_json_mode", "mean"),
)

Unnamed: 0_level_0,n_questions,score_without_so,score_with_so_tool_calls,score_with_so_json_mode,elapsed_time_without_so_p50,elapsed_time_with_so_tool_calls_p50,elapsed_time_with_so_json_mode_p50,elapsed_time_without_so_p99,elapsed_time_with_so_tool_calls_p99,elapsed_time_with_so_json_mode_p99,elapsed_time_without_so_max,elapsed_time_with_so_tool_calls_max,elapsed_time_with_so_json_mode_max
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
connections,50,0.48,0.406667,0.448333,26.431623,27.048912,24.045916,38.849826,40.733236,30.271607,40.925307,41.86882,30.432643
plot_unscrambling,40,0.361738,0.333854,0.347735,25.695105,25.504034,20.316156,46.584709,51.921579,38.502521,48.36081,54.453975,39.024707
typos,50,0.6,0.58,0.58,6.360404,7.692957,9.245303,14.5163,20.595285,50.410145,18.56314,22.796652,50.561014


### Math

In [14]:
# Collect the model's answers to the math questions for all three prompt types.
df_math_results = generate_outputs(df_math)

Error processing row 105: list index out of range
Error processing row 113: 1 validation error for Response
answer
  Field required [type=missing, input_value={'reasoning': 'The soluti...NLOCKED_MODIFIER': True}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing


In [27]:
# Score every response column against the ground truth, then save the scored table as CSV.
df_math_results = evaluate_outputs(df_math_results, evaluate_math_task)
df_math_results.to_csv(data_dir / "math" / "math_results.csv")

In [28]:
# | output: false
# | echo: false

# Per-task question count and mean score for each response format.
df_math_results.groupby("task").agg(
    n_questions=("question_id", "count"),
    score_without_so=("score_without_so", "mean"),
    score_with_so_tool_calls=("score_with_so_tool_calls", "mean"),
    score_with_so_json_mode=("score_with_so_json_mode", "mean"),
)

Unnamed: 0_level_0,n_questions,score_without_so,score_with_so_tool_calls,score_with_so_json_mode,elapsed_time_without_so_p50,elapsed_time_with_so_tool_calls_p50,elapsed_time_with_so_json_mode_p50,elapsed_time_without_so_p99,elapsed_time_with_so_tool_calls_p99,elapsed_time_with_so_json_mode_p99,elapsed_time_without_so_max,elapsed_time_with_so_tool_calls_max,elapsed_time_with_so_json_mode_max
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
math_comp,96,0.34375,0.385417,0.354167,17.434276,18.396737,11.845642,31.977144,33.291323,37.755325,39.439779,38.248845,42.578924
olympiad,36,0.361111,0.333333,0.305556,32.917217,33.718626,24.595792,42.860183,67.166222,66.068712,43.268417,74.60972,76.681866


## Evaluation

In [29]:
# | output: false
def calculate_confidence_intervals(df):
    """Print the mean score and a ``mean ± 1.96 * standard error`` interval.

    One line is printed for each of the columns ``score_without_so``,
    ``score_with_so_tool_calls`` and ``score_with_so_json_mode``. The standard
    error is the column's ``std()`` divided by the square root of the number
    of rows. Nothing is returned.
    """
    sample_size = len(df)
    variants = (
        ("without SO", "score_without_so"),
        ("with SO tool calls", "score_with_so_tool_calls"),
        ("with SO JSON mode", "score_with_so_json_mode"),
    )

    # Compute every interval before printing anything.
    summaries = []
    for label, column in variants:
        mean = df[column].mean()
        margin = 1.96 * (df[column].std() / np.sqrt(sample_size))
        summaries.append((label, mean, mean - margin, mean + margin))

    for label, mean, lower, upper in summaries:
        print(
            f"Response format {label} - Mean: {mean * 100:.2f}% CI: {lower * 100:.2f}% - {upper * 100:.2f}%"
        )


def run_paired_t_test(df):
    """Print paired t-test results for every pair of response formats.

    The three score columns are compared pairwise with ``scipy.stats.ttest_rel``.
    For each comparison a header line and a line with the t-statistic and
    p-value are printed. Nothing is returned.
    """
    # Multiplying by 1 turns boolean scores into integers.
    baseline = df["score_without_so"] * 1
    tool_calls = df["score_with_so_tool_calls"] * 1
    json_mode = df["score_with_so_json_mode"] * 1

    def _report(outcome):
        t_stat, p_value = outcome
        print(f"t-statistic: {t_stat}, p-value: {p_value}")

    outcome = stats.ttest_rel(baseline, tool_calls)
    print("Without SO vs With SO Tool Calls")
    _report(outcome)

    outcome = stats.ttest_rel(baseline, json_mode)
    print("Without SO vs With SO JSON Mode")
    _report(outcome)

    print("With SO Tool Calls vs With SO JSON Mode")
    _report(stats.ttest_rel(tool_calls, json_mode))

In [30]:
# Print the interval estimates and the pairwise paired t-tests for the reasoning scores.
calculate_confidence_intervals(df_reasoning_results)
run_paired_t_test(df_reasoning_results)

Response format without SO - Mean: 42.67% CI: 34.73% - 50.61%
Response format with SO tool calls - Mean: 39.33% CI: 31.49% - 47.18%
Response format with SO JSON mode - Mean: 44.00% CI: 36.03% - 51.97%
Without SO vs With SO Tool Calls
t-statistic: 0.744246831182308, p-value: 0.4578992890783663
Without SO vs With SO JSON Mode
t-statistic: -0.28779122814116004, p-value: 0.7739064538761314
With SO Tool Calls vs With SO JSON Mode
t-statistic: -1.0000000000000002, p-value: 0.3189317446414372


In [31]:
# Print the interval estimates and the pairwise paired t-tests for the language scores.
calculate_confidence_intervals(df_language_results)
run_paired_t_test(df_language_results)

Response format without SO - Mean: 48.91% CI: 42.05% - 55.76%
Response format with SO tool calls - Mean: 44.78% CI: 37.96% - 51.59%
Response format with SO JSON mode - Mean: 46.66% CI: 40.03% - 53.30%
Without SO vs With SO Tool Calls
t-statistic: 1.2362091633273395, p-value: 0.21846571843131643
Without SO vs With SO JSON Mode
t-statistic: 0.7979403511518648, p-value: 0.42626620343526367
With SO Tool Calls vs With SO JSON Mode
t-statistic: -0.5662363791890643, p-value: 0.5721461676427859


In [32]:
# Print the interval estimates and the pairwise paired t-tests for the math scores.
calculate_confidence_intervals(df_math_results)
run_paired_t_test(df_math_results)

Response format without SO - Mean: 34.85% CI: 26.69% - 43.01%
Response format with SO tool calls - Mean: 37.12% CI: 28.85% - 45.39%
Response format with SO JSON mode - Mean: 34.09% CI: 25.97% - 42.21%
Without SO vs With SO Tool Calls
t-statistic: -0.5985396996906245, p-value: 0.5505134016336659
Without SO vs With SO JSON Mode
t-statistic: 0.19174662859148225, p-value: 0.8482375752451983
With SO Tool Calls vs With SO JSON Mode
t-statistic: 0.7833154780631507, p-value: 0.4348557497710035


These are the mean scores with confidence intervals:

| **Category** | **Response Format**              | **Mean (CI %)**         |
|--------------|----------------------------------|-------------------------|
| **Reasoning** | Without SO                      | 42.67% (34.73 - 50.61)  |
|               | With SO Tool Calls               | 39.33% (31.49 - 47.18)  |
|               | With SO JSON Mode                | 44.00% (36.03 - 51.97)  |
| **Language**  | Without SO                      | 48.91% (42.05 - 55.76)  |
|               | With SO Tool Calls               | 44.78% (37.96 - 51.59)  |
|               | With SO JSON Mode                | 46.66% (40.03 - 53.30)  |
| **Math**      | Without SO                      | 34.85% (26.69 - 43.01)  |
|               | With SO Tool Calls               | 37.12% (28.85 - 45.39)  |
|               | With SO JSON Mode                | 34.09% (25.97 - 42.21)  |

And these are the results of the paired t-tests:

### T-test results

| **Category** | **Comparison**                         | **t-Statistic** | **p-Value** |
|--------------|----------------------------------------|-----------------|-------------|
| **Reasoning** | Without SO vs With SO Tool Calls       | 0.7442          | 0.4579      |
|               | Without SO vs With SO JSON Mode        | -0.2878         | 0.7739      |
|               | With SO Tool Calls vs With SO JSON Mode| -1.0000         | 0.3189      |
| **Language**  | Without SO vs With SO Tool Calls       | 1.2362          | 0.2185      |
|               | Without SO vs With SO JSON Mode        | 0.7979          | 0.4263      |
|               | With SO Tool Calls vs With SO JSON Mode| -0.5662         | 0.5721      |
| **Math**      | Without SO vs With SO Tool Calls       | -0.5985         | 0.5505      |
|               | Without SO vs With SO JSON Mode        | 0.1917          | 0.8482      |
|               | With SO Tool Calls vs With SO JSON Mode| 0.7833          | 0.4349      |

## Conclusion
