---
title: "Is Pydantic making your model dumber?"
date: "11/10/2024"
date-modified: last-modified
description-meta: ""
toc: true
toc-depth: 3
lightbox: true
fig-cap-location: margin
categories:
  - llm
  - openai
  - pydantic
  - python
author:
  - name: Dylan Castillo
    url: https://dylancastillo.co
    affiliation: Iwana Labs
    affiliation-url: https://iwanalabs.com
citation: true
comments:
  utterances:
    repo: dylanjcastillo/blog_comments
    theme: dark-blue
    issue-term: pathname
---

## Set up the environment

In [1]:
# | output: false
# | echo: false

import nest_asyncio

# Patch asyncio so the `asyncio.run(...)` calls made later in this notebook
# can execute; presumably needed because the notebook kernel already has an
# event loop running — NOTE(review): confirm this is still required.
nest_asyncio.apply()

First, import the necessary libraries:

In [2]:
import asyncio
import difflib
import json
import re
import time
from asyncio import Semaphore
from enum import Enum
from itertools import permutations
from pathlib import Path
from typing import List

import instructor
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langsmith import traceable
from langsmith.wrappers import wrap_openai
from openai import AsyncOpenAI
from openai.types.chat import ChatCompletion
from pydantic import BaseModel, Field
from scipy import stats

# Fix NumPy's global random seed.
np.random.seed(42)

# Load environment variables from a .env file (presumably the API
# credentials used by the clients below — confirm).
load_dotenv()

# Async OpenAI client wrapped for LangSmith; `process_row` uses it for every
# prompt type except tool calls.
langsmith_client = wrap_openai(AsyncOpenAI())
# The same client patched by instructor in TOOLS mode; `process_row` uses it
# for the tool-call prompt type, where `response_model` is passed.
instructor_client = instructor.from_openai(langsmith_client, mode=instructor.Mode.TOOLS)

  from .autonotebook import tqdm as notebook_tqdm


## Set up

In [3]:
# Dataset root: <parent of the current working directory>/data/live_bench,
# with one sub-folder per question category.
data_dir = Path().absolute().parent / "data" / "live_bench"
reasoning_dir = data_dir / "reasoning"
math_dir = data_dir / "math"
language_dir = data_dir / "language"

# Each category ships its questions as a JSON Lines file (one record per line).
df_reasoning = pd.read_json(reasoning_dir / "updated_questions.jsonl", lines=True)
df_language = pd.read_json(language_dir / "updated_questions.jsonl", lines=True)
df_math = pd.read_json(math_dir / "updated_questions.jsonl", lines=True)

In [4]:
# Structured answer requested from the model: free-form reasoning first, then
# the final answer on its own.
# NOTE: documented with `#` comments rather than a class docstring on
# purpose. `Response.model_json_schema()` is embedded in the JSON-mode system
# prompt, and pydantic puts a class docstring into that schema as its
# description, so adding one would change the prompt sent to the model.
class Response(BaseModel):
    reasoning: str = Field(description="Your reasoning explaining your answer.")
    answer: str = Field(description="Your answer, don't include any other text.")


# Outcome of answering one question, as built by `process_row`.
class ProcessedResponse(BaseModel):
    # Final answer text extracted from the model response.
    value: str
    # Seconds taken to obtain the answer, as measured by `process_row`.
    elapsed_time: float


# The three prompting strategies compared in this notebook. The string values
# are the keys of SYSTEM_MESSAGE_MAPPING.
class PromptType(Enum):
    # Plain-text reply following a "REASONING: ... ANSWER: ..." layout.
    WITHOUT_STRUCTURED_OUTPUT = "without_structured_output"
    # Structured output through instructor (`response_model=Response`).
    WITH_TOOL_CALLS = "with_structured_output_tool_calls"
    # Structured output through `response_format={"type": "json_object"}`.
    WITH_JSON_MODE = "with_structured_output_json_mode"


# System prompt for each prompt type, keyed by the enum member's string value.
SYSTEM_MESSAGE_MAPPING = {
    # Plain text: the requested layout is what `parse_response` later splits
    # on ("\nANSWER:").
    PromptType.WITHOUT_STRUCTURED_OUTPUT.value: (
        "You're a helpful assistant. You will help me answer a question."
        "\nYou must respond using the following format:"
        "\nREASONING: <your reasoning here>"
        "\nANSWER: <your answer here, don't include any other text>"
    ),
    # Tool calls: no format instructions in the prompt; the output structure
    # is requested through `response_model` in `call_model`.
    PromptType.WITH_TOOL_CALLS.value: (
        "You're a helpful assistant. You will help me answer a question."
    ),
    # JSON mode: the JSON schema of `Response` is appended directly after the
    # "schema:" text (no separator in between).
    PromptType.WITH_JSON_MODE.value: (
        "You're a helpful assistant. You will help me answer a question."
        + "\nYou must respond using the following JSON schema:"
        + json.dumps(Response.model_json_schema())
    ),
}

In [5]:
def parse_response(
    response: ChatCompletion | Response, response_type: PromptType
) -> str:
    """Extract the final answer string from a model response.

    Args:
        response: Either an already-parsed ``Response`` or a raw
            ``ChatCompletion``.
        response_type: Prompt type used for the request; selects how a raw
            completion is parsed.

    Returns:
        The answer text.

    Raises:
        ValueError: If the response/prompt-type combination is not handled.
        IndexError: If a plain-text completion has no "\\nANSWER:" marker.
    """
    # Already-parsed structured output: the answer is a plain attribute.
    if isinstance(response, Response):
        return response.answer

    if isinstance(response, ChatCompletion):
        if response_type == PromptType.WITHOUT_STRUCTURED_OUTPUT:
            # Keep only what follows the first "\nANSWER:" marker.
            raw_text = response.choices[0].message.content
            return raw_text.split("\nANSWER:")[1].strip()
        if response_type == PromptType.WITH_JSON_MODE:
            # The message content is a JSON document matching `Response`.
            raw_json = response.choices[0].message.content
            return Response.model_validate_json(raw_json).answer

    raise ValueError(f"Invalid response type: {type(response)}")


@traceable
async def call_model(
    client,
    prompt_type: PromptType,
    user_message: str,
    timeout: int = 120,
) -> str:
    """Send one question to gpt-4o and return the parsed answer.

    Args:
        client: Object exposing ``chat.completions.create`` (the plain client
            or the instructor-patched one).
        prompt_type: Selects the system prompt and the request options.
        user_message: Content of the user message.
        timeout: Value passed as the request's ``timeout`` parameter.

    Returns:
        The answer string extracted by ``parse_response``. The previous
        ``-> Response`` annotation was inaccurate: this function has always
        returned the parsed string, not the model object.
    """
    params = {
        "model": "gpt-4o",
        "messages": [
            {"role": "system", "content": SYSTEM_MESSAGE_MAPPING[prompt_type.value]},
            {"role": "user", "content": user_message},
        ],
        "timeout": timeout,
    }
    # The two structured-output strategies differ only in one request option.
    if prompt_type == PromptType.WITH_JSON_MODE:
        params["response_format"] = {"type": "json_object"}
    elif prompt_type == PromptType.WITH_TOOL_CALLS:
        # `response_model` is only passed for the tool-call prompt type;
        # `process_row` pairs that type with the instructor-patched client.
        params["response_model"] = Response
    response = await client.chat.completions.create(**params)
    return parse_response(response, prompt_type)


@traceable
async def process_row(
    row: pd.Series,
    prompt_type: PromptType,
    semaphore: Semaphore,
) -> ProcessedResponse:
    """Answer one question, retrying up to three times.

    Args:
        row: Dataset row; ``row.updated_question`` is sent to the model and
            ``row.name`` is used in log/error messages.
        prompt_type: Prompting strategy to use.
        semaphore: Limits how many rows are processed concurrently.

    Returns:
        The answer together with the seconds spent obtaining it, including
        any failed attempts for this row.

    Raises:
        RuntimeError: If all three attempts fail; the last underlying error
            is attached as the cause.
    """
    client = (
        instructor_client
        if prompt_type == PromptType.WITH_TOOL_CALLS
        else langsmith_client
    )
    last_error = None
    async with semaphore:
        # Start timing only once the semaphore is held. Starting earlier
        # would add the time spent queued behind other rows to this row's
        # latency and inflate the reported percentiles.
        start_time = time.perf_counter()
        for _ in range(3):
            try:
                answer = await call_model(
                    client=client,
                    prompt_type=prompt_type,
                    user_message=f"Question:\n{row.updated_question}",
                )
                return ProcessedResponse(
                    value=answer,
                    elapsed_time=time.perf_counter() - start_time,
                )
            except Exception as e:
                # Best-effort retry: request and parsing errors are both
                # retried. Keep the error so the final failure can report it.
                print(f"Error processing row {row.name}: {e}")
                last_error = e
                continue
        raise RuntimeError(
            f"Failed to process row {row.name}, after 3 attempts"
        ) from last_error


@traceable
async def process_df(
    df: pd.DataFrame,
    prompt_type: PromptType,
    concurrency: int = 50,
) -> List[ProcessedResponse | BaseException]:
    """Answer every question in ``df`` concurrently.

    Args:
        df: Questions, one per row.
        prompt_type: Prompting strategy applied to all rows.
        concurrency: Maximum number of rows processed at the same time.

    Returns:
        One entry per row, in row order. Because the tasks are gathered with
        ``return_exceptions=True``, the entry for a row that failed is the
        exception object itself rather than a ``ProcessedResponse``; callers
        must check for that.
    """
    semaphore = Semaphore(concurrency)
    tasks = [process_row(row, prompt_type, semaphore) for _, row in df.iterrows()]
    # return_exceptions=True keeps one failing row from cancelling the rest.
    return await asyncio.gather(*tasks, return_exceptions=True)

## Running the experiment


In [6]:
# Adapted from:
# https://github.com/LiveBench/LiveBench/blob/main/livebench/process_results/writing/plot_unscrambling/utils.py
def levenshtein_distance(A, B):
    """Return the edit distance between two sequences.

    Counts the minimum number of single-element insertions, deletions and
    replacements needed to turn ``A`` into ``B``. Both arguments must
    support ``len()`` and integer indexing (strings and lists both work).
    """
    n_rows, n_cols = len(A), len(B)

    # Distances from the empty prefix of A to every prefix of B.
    previous = list(range(n_cols + 1))

    for r in range(1, n_rows + 1):
        # First cell: turning the first r elements of A into nothing.
        current = [r] + [0] * n_cols
        for c in range(1, n_cols + 1):
            if A[r - 1] == B[c - 1]:
                # Matching elements cost nothing extra.
                current[c] = previous[c - 1]
            else:
                current[c] = 1 + min(
                    previous[c],
                    current[c - 1],
                    previous[c - 1],
                )
        previous = current

    return previous[n_cols]


def plot_unscrambling_process_results(ground_truth: str, llm_answer: str) -> float:
    """Score how well the answer reproduces the ground-truth sentence order.

    Both texts are split on "." into non-empty, stripped sentences. Each
    ground-truth sentence is mapped to the position of its closest match in
    the answer, and the resulting position sequence is compared with
    ``0..n-1`` by edit distance.

    Returns:
        ``1 - distance / n`` where ``n`` is the number of ground-truth
        sentences.

    Raises:
        ZeroDivisionError: If the ground truth contains no sentences.
    """

    def split_sentences(text):
        stripped = (piece.strip() for piece in text.split("."))
        return [piece for piece in stripped if piece]

    reference = split_sentences(ground_truth)
    candidates = split_sentences(llm_answer)

    predicted_order = []
    for sentence in reference:
        # cutoff=0.0 accepts the best candidate no matter how dissimilar.
        closest = difflib.get_close_matches(sentence, candidates, n=1, cutoff=0.0)
        if closest:
            predicted_order.append(candidates.index(closest[0]))

    expected_order = list(range(len(reference)))
    distance = levenshtein_distance(expected_order, predicted_order)
    return 1 - (distance / len(reference))


def evaluate_language_task(ground_truth: str, task_type: str, response: str):
    """Score a response for one of the language tasks.

    Args:
        ground_truth: Expected answer.
        task_type: One of "connections", "plot_unscrambling" or "typos".
        response: Answer produced by the model.

    Returns:
        * "connections": fraction of ground-truth groups (runs of four
          comma-separated items, compared case- and punctuation-insensitively
          and regardless of order) that also appear as a group in the
          response.
        * "plot_unscrambling": the score from
          ``plot_unscrambling_process_results``.
        * "typos": ``True`` if the ground truth is a substring of the
          response.

    Raises:
        ValueError: If ``task_type`` is not one of the above.
    """
    if task_type == "connections":

        def to_groups(text):
            # Normalise each item, then chunk into consecutive groups of four.
            items = [
                re.sub(r"[^\w\s]", "", item.strip().lower())
                for item in text.split(",")
            ]
            return [set(items[i : i + 4]) for i in range(0, len(items), 4)]

        gt_groups = to_groups(ground_truth)
        unmatched = to_groups(response)

        # Match each ground-truth group with at most one identical response
        # group. Trying every permutation of the response groups is factorial
        # in their number, and zipping a permutation against the ground truth
        # silently ignored later ground-truth groups whenever the response
        # contained fewer groups.
        correct = 0
        for gt_group in gt_groups:
            if gt_group in unmatched:
                unmatched.remove(gt_group)
                correct += 1
        return correct / len(gt_groups)
    elif task_type == "plot_unscrambling":
        return plot_unscrambling_process_results(ground_truth, response)
    elif task_type == "typos":
        return ground_truth in response
    else:
        raise ValueError(f"Invalid task type: {task_type}")


def evaluate_reasoning_task(ground_truth: str, task_type: str, response: str):
    """Return whether a response to a reasoning task matches the ground truth.

    * "web_of_lies_v2": both texts are split on commas; items are stripped,
      lower-cased and cleared of punctuation, then compared as ordered lists.
    * "spatial" / "zebra_puzzle": trailing periods are removed from the
      response, then both texts are compared lower-cased and stripped.

    Raises:
        ValueError: For any other ``task_type``.
    """
    if task_type in ("spatial", "zebra_puzzle"):
        without_period = response.rstrip(".")
        return ground_truth.lower().strip() == without_period.lower().strip()

    if task_type != "web_of_lies_v2":
        raise ValueError(f"Invalid task type: {task_type}")

    def normalized_items(text):
        return [re.sub(r"[^\w\s]", "", item.strip().lower()) for item in text.split(",")]

    return normalized_items(response) == normalized_items(ground_truth)


def evaluate_math_task(ground_truth: str, task_type: str, response: str):
    """Return whether a response to a math task matches the ground truth.

    * "olympiad": both texts are split on commas; items are stripped,
      lower-cased and cleared of punctuation, then compared as ordered lists.
    * "math_comp": exact string equality.

    Raises:
        ValueError: For any other ``task_type``.
    """
    if task_type == "math_comp":
        return ground_truth == response

    if task_type != "olympiad":
        raise ValueError(f"Invalid task type: {task_type}")

    def normalized_items(text):
        return [re.sub(r"[^\w\s]", "", item.strip().lower()) for item in text.split(",")]

    return normalized_items(response) == normalized_items(ground_truth)

In [7]:
# | output: false
def generate_outputs(df):
    """Query the model with every prompt type and attach the results.

    Runs ``process_df`` once per prompt type over a copy of ``df`` and adds,
    for each of the suffixes ``without_so``, ``with_so_tool_calls`` and
    ``with_so_json_mode``, a ``response_<suffix>`` column (answer text) and
    an ``elapsed_time_<suffix>`` column (seconds).

    ``process_df`` returns the exception object for rows that failed. Those
    rows get ``None`` as the response and NaN as the elapsed time, so one
    failed row no longer aborts the function and discards every answer
    already collected. The evaluators in this notebook call string methods
    on the response, so rows with a missing response must be dropped or
    filled before evaluation.

    Args:
        df: Questions, one per row. Not modified.

    Returns:
        A copy of ``df`` with the six columns described above.
    """
    df_copy = df.copy()
    runs = {
        "without_so": PromptType.WITHOUT_STRUCTURED_OUTPUT,
        "with_so_tool_calls": PromptType.WITH_TOOL_CALLS,
        "with_so_json_mode": PromptType.WITH_JSON_MODE,
    }

    # Collect all results before adding columns, so every run sees the same
    # input frame.
    results = {
        suffix: asyncio.run(process_df(df_copy, prompt_type))
        for suffix, prompt_type in runs.items()
    }

    for suffix, responses in results.items():
        n_failed = sum(isinstance(r, BaseException) for r in responses)
        if n_failed:
            print(f"{n_failed} row(s) failed for {suffix}; recorded as missing")

    for suffix, responses in results.items():
        df_copy[f"response_{suffix}"] = [
            None if isinstance(r, BaseException) else r.value for r in responses
        ]
    for suffix, responses in results.items():
        df_copy[f"elapsed_time_{suffix}"] = [
            float("nan") if isinstance(r, BaseException) else r.elapsed_time
            for r in responses
        ]
    return df_copy


def evaluate_outputs(df, evaluator):
    """Score the responses of every prompt type with ``evaluator``.

    Args:
        df: Frame with ``ground_truth``, ``task`` and the three
            ``response_<suffix>`` columns. Not modified.
        evaluator: Callable taking ``(ground_truth, task, response)`` and
            returning the score for one row.

    Returns:
        A copy of ``df`` with an ``is_correct_<suffix>`` column added for
        each of ``without_so``, ``with_so_tool_calls`` and
        ``with_so_json_mode``.
    """
    scored = df.copy()
    for suffix in ("without_so", "with_so_tool_calls", "with_so_json_mode"):
        response_column = f"response_{suffix}"

        # Bind the column name as a default so each scorer keeps its own.
        def score_row(row, column=response_column):
            return evaluator(row["ground_truth"], row["task"], row[column])

        scored[f"is_correct_{suffix}"] = scored.apply(score_row, axis=1)
    return scored

### Reasoning

In [8]:
# Query the model with all three prompt types for every reasoning question.
df_reasoning_results = generate_outputs(df_reasoning)

In [98]:
# Score the reasoning responses and save the scored frame as CSV.
df_reasoning_results = evaluate_outputs(df_reasoning_results, evaluate_reasoning_task)
df_reasoning_results.to_csv(data_dir / "reasoning" / "reasoning_results.csv")

In [None]:
# | output: false
# | echo: false

# Per-task summary: number of questions, mean score for each prompt type, and
# the 50th and 99th percentiles of elapsed time for each prompt type.
df_reasoning_results.groupby("task").agg(
    n_questions=("question_id", "count"),
    accuracy_without_so=("is_correct_without_so", "mean"),
    accuracy_with_so_tool_calls=("is_correct_with_so_tool_calls", "mean"),
    accuracy_with_so_json_mode=("is_correct_with_so_json_mode", "mean"),
    elapsed_time_without_so_p50=(
        "elapsed_time_without_so",
        lambda x: np.percentile(x, 50),
    ),
    elapsed_time_with_so_tool_calls_p50=(
        "elapsed_time_with_so_tool_calls",
        lambda x: np.percentile(x, 50),
    ),
    elapsed_time_with_so_json_mode_p50=(
        "elapsed_time_with_so_json_mode",
        lambda x: np.percentile(x, 50),
    ),
    elapsed_time_without_so_p99=(
        "elapsed_time_without_so",
        lambda x: np.percentile(x, 99),
    ),
    elapsed_time_with_so_tool_calls_p99=(
        "elapsed_time_with_so_tool_calls",
        lambda x: np.percentile(x, 99),
    ),
    elapsed_time_with_so_json_mode_p99=(
        "elapsed_time_with_so_json_mode",
        lambda x: np.percentile(x, 99),
    ),
)

### Language

In [101]:
# Query the model with all three prompt types for every language question.
df_language_results = generate_outputs(df_language)

In [102]:
# Score the language responses and save the scored frame as CSV.
df_language_results = evaluate_outputs(df_language_results, evaluate_language_task)
df_language_results.to_csv(data_dir / "language" / "language_results.csv")

In [None]:
# | output: false
# | echo: false

# Per-task summary: number of questions, mean score for each prompt type, and
# the 50th and 99th percentiles of elapsed time for each prompt type.
df_language_results.groupby("task").agg(
    n_questions=("question_id", "count"),
    accuracy_without_so=("is_correct_without_so", "mean"),
    accuracy_with_so_tool_calls=("is_correct_with_so_tool_calls", "mean"),
    accuracy_with_so_json_mode=("is_correct_with_so_json_mode", "mean"),
    elapsed_time_without_so_p50=(
        "elapsed_time_without_so",
        lambda x: np.percentile(x, 50),
    ),
    elapsed_time_with_so_tool_calls_p50=(
        "elapsed_time_with_so_tool_calls",
        lambda x: np.percentile(x, 50),
    ),
    elapsed_time_with_so_json_mode_p50=(
        "elapsed_time_with_so_json_mode",
        lambda x: np.percentile(x, 50),
    ),
    elapsed_time_without_so_p99=(
        "elapsed_time_without_so",
        lambda x: np.percentile(x, 99),
    ),
    elapsed_time_with_so_tool_calls_p99=(
        "elapsed_time_with_so_tool_calls",
        lambda x: np.percentile(x, 99),
    ),
    elapsed_time_with_so_json_mode_p99=(
        "elapsed_time_with_so_json_mode",
        lambda x: np.percentile(x, 99),
    ),
)

### Math

In [None]:
# Query the model with all three prompt types for every math question.
df_math_results = generate_outputs(df_math)

In [10]:
# Score the math responses and save the scored frame as CSV.
df_math_results = evaluate_outputs(df_math_results, evaluate_math_task)
df_math_results.to_csv(data_dir / "math" / "math_results.csv")

In [None]:
# | output: false
# | echo: false

# Per-task summary: number of questions, mean score for each prompt type, and
# the 50th and 99th percentiles of elapsed time for each prompt type.
df_math_results.groupby("task").agg(
    n_questions=("question_id", "count"),
    accuracy_without_so=("is_correct_without_so", "mean"),
    accuracy_with_so_tool_calls=("is_correct_with_so_tool_calls", "mean"),
    accuracy_with_so_json_mode=("is_correct_with_so_json_mode", "mean"),
    elapsed_time_without_so_p50=(
        "elapsed_time_without_so",
        lambda x: np.percentile(x, 50),
    ),
    elapsed_time_with_so_tool_calls_p50=(
        "elapsed_time_with_so_tool_calls",
        lambda x: np.percentile(x, 50),
    ),
    elapsed_time_with_so_json_mode_p50=(
        "elapsed_time_with_so_json_mode",
        lambda x: np.percentile(x, 50),
    ),
    elapsed_time_without_so_p99=(
        "elapsed_time_without_so",
        lambda x: np.percentile(x, 99),
    ),
    elapsed_time_with_so_tool_calls_p99=(
        "elapsed_time_with_so_tool_calls",
        lambda x: np.percentile(x, 99),
    ),
    elapsed_time_with_so_json_mode_p99=(
        "elapsed_time_with_so_json_mode",
        lambda x: np.percentile(x, 99),
    ),
)

## Evaluation

In [16]:
# | output: false
def calculate_confidence_intervals(df):
    """Print the mean score and a 1.96-standard-error interval per prompt type.

    For each ``is_correct_*`` column the interval is
    ``mean ± 1.96 * std / sqrt(len(df))``. Nothing is returned; the three
    summary lines are written to stdout.
    """
    n = len(df)
    labelled_columns = (
        ("without SO", "is_correct_without_so"),
        ("with SO tool calls", "is_correct_with_so_tool_calls"),
        ("with SO JSON mode", "is_correct_with_so_json_mode"),
    )

    # Compute everything first so a missing column fails before any output.
    means = {label: df[column].mean() for label, column in labelled_columns}
    std_errors = {
        label: df[column].std() / np.sqrt(n) for label, column in labelled_columns
    }

    summaries = []
    for label, _ in labelled_columns:
        mean, margin = means[label], 1.96 * std_errors[label]
        summaries.append((label, mean, mean - margin, mean + margin))

    for label, mean, lower, upper in summaries:
        print(
            f"Response format {label} - Mean: {mean * 100:.2f}% CI: {lower * 100:.2f}% - {upper * 100:.2f}%"
        )


def run_paired_t_test(df):
    """Print paired t-tests between every pair of prompt types.

    The three ``is_correct_*`` columns are multiplied by 1 (turning booleans
    into numbers) and compared pairwise with ``scipy.stats.ttest_rel``. For
    each pair a header line and a "t-statistic / p-value" line are written
    to stdout; nothing is returned.
    """
    scores = {
        key: df[f"is_correct_{key}"] * 1
        for key in ("without_so", "with_so_tool_calls", "with_so_json_mode")
    }
    comparisons = (
        ("Without SO vs With SO Tool Calls", "without_so", "with_so_tool_calls"),
        ("Without SO vs With SO JSON Mode", "without_so", "with_so_json_mode"),
        (
            "With SO Tool Calls vs With SO JSON Mode",
            "with_so_tool_calls",
            "with_so_json_mode",
        ),
    )
    for header, first, second in comparisons:
        t_stat, p_value = stats.ttest_rel(scores[first], scores[second])
        print(header)
        print(f"t-statistic: {t_stat}, p-value: {p_value}")

In [None]:
# Reasoning: per-prompt-type intervals, then pairwise paired t-tests.
calculate_confidence_intervals(df_reasoning_results)
run_paired_t_test(df_reasoning_results)

In [None]:
# Language: per-prompt-type intervals, then pairwise paired t-tests.
calculate_confidence_intervals(df_language_results)
run_paired_t_test(df_language_results)

In [None]:
# Math: per-prompt-type intervals, then pairwise paired t-tests.
calculate_confidence_intervals(df_math_results)
run_paired_t_test(df_math_results)

## Conclusion
