---
title: "Is Pydantic making your model dumber?"
date: "11/10/2024"
date-modified: last-modified
description-meta: ""
toc: true
toc-depth: 3
lightbox: true
fig-cap-location: margin
categories:
  - llm
  - openai
  - pydantic
  - python
author:
  - name: Dylan Castillo
    url: https://dylancastillo.co
    affiliation: Iwana Labs
    affiliation-url: https://iwanalabs.com
citation: true
comments:
  utterances:
    repo: dylanjcastillo/blog_comments
    theme: dark-blue
    issue-term: pathname
---

## Set up the environment

In [461]:
# | output: false
# | echo: false

import nest_asyncio

# Allow nested event loops so the asyncio.run() calls later in this notebook can
# execute here (presumably the notebook kernel already runs a loop — confirm).
nest_asyncio.apply()

First, import the necessary libraries:

In [462]:
import asyncio
import difflib
import json
import re
from asyncio import Semaphore
from enum import Enum
from itertools import permutations
from pathlib import Path

import instructor
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langsmith import traceable
from langsmith.wrappers import wrap_openai
from openai import AsyncOpenAI
from pydantic import BaseModel, Field
from scipy import stats

# Seed NumPy's global random number generator.
np.random.seed(42)

# Load variables from a .env file before the OpenAI client is constructed
# (presumably this is where the API credentials come from — confirm).
load_dotenv()

# Async OpenAI client wrapped for LangSmith; process_row sends its requests through it.
langsmith_client = wrap_openai(AsyncOpenAI())
# Instructor client in TOOLS mode built on the same wrapped client.
# NOTE(review): no other code visible in this file references instructor_client.
instructor_client = instructor.from_openai(langsmith_client, mode=instructor.Mode.TOOLS)

## Load the data and define the prompts

In [463]:
# Question files are read from ../data/live_bench/<category>/ relative to the
# current working directory.
data_dir = Path.cwd().parent / "data" / "live_bench"
reasoning_dir, math_dir, language_dir = (
    data_dir / category for category in ("reasoning", "math", "language")
)

# Each file is JSON Lines (one record per line); read order matches the original:
# reasoning, language, math.
df_reasoning, df_language, df_math = (
    pd.read_json(category_dir / "updated_questions.jsonl", lines=True)
    for category_dir in (reasoning_dir, language_dir, math_dir)
)

In [464]:
# System prompt for the baseline condition: the model replies in free text using
# REASONING / ANSWER markers. validate_response relies on the "\nANSWER:" marker.
system_prompt_without_structured_output = (
    "You're a helpful assistant. You will help me answer a question."
    "\nYou must respond using the following format:"
    "\nREASONING: <your reasoning here>"
    "\nANSWER: <your answer here, don't include any other text>"
)

# System prompt for the structured-output condition. process_row fills the
# {response_format} placeholder with ResponseFormat.model_json_schema().
system_prompt_with_structured_output = (
    "You're a helpful assistant. You will help me answer a question."
    "\nYou must respond using the following JSON schema:"
    "\n{response_format}"
)


# Schema for the structured-output condition: it is rendered into the system
# prompt by process_row and used by validate_response to parse the JSON reply.
# NOTE(review): documented with comments on purpose — a class docstring would
# presumably appear in model_json_schema() and therefore change the prompt.
class ResponseFormat(BaseModel):
    # Declared before `answer`, mirroring the REASONING-then-ANSWER order of the
    # plain-text prompt.
    reasoning: str = Field(description="Your reasoning explaining your answer.")
    # The only field validate_response returns to the caller.
    answer: str = Field(description="Your answer, don't include any other text.")


In [465]:
def validate_response(response_text: str, type: str):
    """Extract the final answer from a raw model response.

    Args:
        response_text: Message content returned by the model.
        type: ``"without_structured_output"`` parses the plain-text format, where
            the answer follows a newline and the ``ANSWER:`` marker. Any other
            value parses ``response_text`` as JSON matching ``ResponseFormat``.

    Returns:
        The answer string. In the plain-text case this is the text between the
        first and (if any) second ``ANSWER:`` marker, with surrounding
        whitespace removed.

    Raises:
        ValueError: If a plain-text response is missing or lacks the marker.
        Exception: Whatever ``ResponseFormat.model_validate_json`` raises when
            the JSON reply does not match the schema.
    """
    if type == "without_structured_output":
        # The API can return no content at all; fail with a readable message
        # instead of an AttributeError on None.
        if response_text is None:
            raise ValueError("The model returned an empty response.")
        parts = response_text.split("\nANSWER:")
        # Without the marker the split yields a single element; report that
        # explicitly rather than surfacing an opaque IndexError.
        if len(parts) < 2:
            raise ValueError("The response does not contain an 'ANSWER:' line.")
        return parts[1].strip()
    return ResponseFormat.model_validate_json(response_text).answer


@traceable
async def process_row(row: pd.Series, type: str, semaphore: Semaphore) -> str:
    """Ask the model one question and return its validated answer.

    Args:
        row: A question record; ``row.updated_question`` is sent as the user
            message and ``row.name`` is used in failure messages.
        type: ``"without_structured_output"`` for the plain-text prompt, or
            ``"with_structured_output"`` for the JSON-schema prompt with JSON
            mode enabled.
        semaphore: Limits how many requests are in flight at once.

    Returns:
        The answer string extracted by ``validate_response``.

    Raises:
        Exception: If all three attempts fail; the last underlying error is
            attached as the cause.
    """
    if type == "without_structured_output":
        system_prompt = system_prompt_without_structured_output
    else:
        system_prompt = system_prompt_with_structured_output.format(
            response_format=ResponseFormat.model_json_schema()
        )
    last_error = None
    async with semaphore:
        # Up to three attempts: request failures (e.g. timeouts) and replies
        # that fail validation are both retried.
        for _ in range(3):
            try:
                response = await langsmith_client.chat.completions.create(
                    model="gpt-4o",
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {
                            "role": "user",
                            "content": f"Question:\n{row.updated_question}",
                        },
                    ],
                    response_format={"type": "json_object"}
                    if type == "with_structured_output"
                    else None,
                    timeout=120,
                )
                return validate_response(response.choices[0].message.content, type)
            except Exception as e:
                last_error = e
                print(f"Failed to generate a valid response for row {row.name}: {e}")
        # Chain the last failure so the traceback shows why the retries ran out.
        raise Exception(
            "Failed to generate a valid response and ran out of retries."
        ) from last_error


@traceable
async def process_df(df, response_format, concurrency: int = 50):
    """Run ``process_row`` over every row of ``df`` concurrently.

    Args:
        df: Frame of questions, iterated with ``iterrows``.
        response_format: Forwarded to ``process_row`` as its ``type`` argument.
        concurrency: Size of the semaphore shared by all row tasks.

    Returns:
        The list produced by ``asyncio.gather`` for the row coroutines.
    """
    limiter = Semaphore(concurrency)
    return await asyncio.gather(
        *(process_row(record, response_format, limiter) for _, record in df.iterrows())
    )

## Running the experiment


In [466]:
# Adapted from:
# https://github.com/LiveBench/LiveBench/blob/main/livebench/process_results/writing/plot_unscrambling/utils.py
def levenshtein_distance(A, B):
    """Return the edit distance between the sequences ``A`` and ``B``.

    Insertions, deletions and replacements each cost 1. Only two rows of the
    dynamic-programming table are kept at a time.
    """
    # Row for the empty prefix of A: turning "" into B[:j] takes j edits.
    previous = list(range(len(B) + 1))
    for row, item_a in enumerate(A, start=1):
        # First column: turning A[:row] into "" takes `row` edits.
        current = [row]
        for col, item_b in enumerate(B, start=1):
            if item_a == item_b:
                current.append(previous[col - 1])
            else:
                cheapest = min(previous[col], current[col - 1], previous[col - 1])
                current.append(cheapest + 1)
        previous = current
    return previous[-1]


def plot_unscrambling_process_results(ground_truth: str, llm_answer: str) -> float:
    """Score how well ``llm_answer`` reproduces the sentence order of ``ground_truth``.

    Both texts are split on ".". Each ground-truth sentence is mapped to the
    position of its closest sentence in the answer, and the resulting order is
    compared with the ideal order 0..n-1 by edit distance.

    Returns:
        ``1 - distance / n`` where ``n`` is the number of ground-truth sentences.

    Raises:
        ZeroDivisionError: If ``ground_truth`` contains no non-empty sentence.
    """

    def split_sentences(text):
        trimmed = [part.strip() for part in text.split(".")]
        return [part for part in trimmed if part]

    reference = split_sentences(ground_truth)
    candidates = split_sentences(llm_answer)

    positions = []
    for sentence in reference:
        # cutoff=0.0 accepts any similarity, so a match is only missing when the
        # answer has no sentences at all.
        closest = difflib.get_close_matches(sentence, candidates, n=1, cutoff=0.0)
        if not closest:
            continue
        positions.append(candidates.index(closest[0]))

    ideal_order = list(range(len(reference)))
    distance = levenshtein_distance(ideal_order, positions)
    return 1 - (distance / len(reference))


def evaluate_language_task(ground_truth: str, task_type: str, response: str):
    """Score a model answer for one language question.

    Args:
        ground_truth: Reference answer.
        task_type: One of ``"connections"``, ``"plot_unscrambling"`` or ``"typos"``.
        response: Answer extracted from the model output.

    Returns:
        * ``connections``: fraction of ground-truth groups of four that also
          appear in the response (order of groups and of words is ignored).
        * ``plot_unscrambling``: result of ``plot_unscrambling_process_results``.
        * ``typos``: ``True`` when ``ground_truth`` is a substring of ``response``.

    Raises:
        ValueError: If ``task_type`` is not one of the supported values.
    """
    if task_type == "connections":

        def to_groups(text):
            # Lower-case, trim and drop punctuation, then chunk into sets of four.
            items = [
                re.sub(r"[^\w\s]", "", item.strip().lower()) for item in text.split(",")
            ]
            return [set(items[i : i + 4]) for i in range(0, len(items), 4)]

        groups = to_groups(response)
        gt_groups = to_groups(ground_truth)

        # One-to-one matching of equal groups. This replaces a search over every
        # permutation of the response groups, which is factorial in their number
        # (a long, comma-heavy response could hang the evaluation) and could not
        # match later ground-truth groups when the response had fewer groups.
        unmatched = list(gt_groups)
        correct_groups = 0
        for group in groups:
            if group in unmatched:
                unmatched.remove(group)
                correct_groups += 1
        return correct_groups / len(gt_groups)
    elif task_type == "plot_unscrambling":
        return plot_unscrambling_process_results(ground_truth, response)
    elif task_type == "typos":
        return ground_truth in response
    else:
        raise ValueError(f"Invalid task type: {task_type}")

def evaluate_reasoning_task(ground_truth: str, task_type: str, response: str):
    """Check a model answer for one reasoning question.

    Args:
        ground_truth: Reference answer.
        task_type: ``"web_of_lies_v2"``, ``"spatial"`` or ``"zebra_puzzle"``.
        response: Answer extracted from the model output.

    Returns:
        ``True`` when the answer matches the reference. ``web_of_lies_v2``
        compares the comma-separated items in order after lower-casing and
        removing punctuation; the other tasks compare the whole answer
        case-insensitively, ignoring surrounding whitespace and trailing periods.

    Raises:
        ValueError: If ``task_type`` is not one of the supported values.
    """
    if task_type == "web_of_lies_v2":
        response_objects = [
            re.sub(r"[^\w\s]", "", o.strip().lower()) for o in response.split(",")
        ]
        gt_objects = [
            re.sub(r"[^\w\s]", "", o.strip().lower()) for o in ground_truth.split(",")
        ]
        return response_objects == gt_objects
    elif task_type in ("spatial", "zebra_puzzle"):
        # Trim whitespace before removing trailing periods; otherwise an answer
        # such as "square. " keeps its period and is wrongly marked incorrect.
        cleaned = response.strip().rstrip(".").strip()
        return ground_truth.lower().strip() == cleaned.lower()
    else:
        raise ValueError(f"Invalid task type: {task_type}")

def evaluate_math_task(ground_truth: str, task_type: str, response: str):
    """Check a model answer for one math question.

    ``math_comp`` answers must equal the reference exactly. ``olympiad`` answers
    are compared item by item after splitting on commas, trimming, lower-casing
    and removing punctuation.

    Raises:
        ValueError: If ``task_type`` is neither ``"olympiad"`` nor ``"math_comp"``.
    """
    if task_type == "math_comp":
        return ground_truth == response
    if task_type != "olympiad":
        raise ValueError(f"Invalid task type: {task_type}")

    def normalise(text):
        return [re.sub(r"[^\w\s]", "", part.strip().lower()) for part in text.split(",")]

    answer_items = normalise(response)
    reference_items = normalise(ground_truth)
    return answer_items == reference_items

In [467]:
# | output: false
def generate_outputs(df):
    """Return a copy of ``df`` with the model's answers under both response formats.

    Adds ``response_without_so`` and ``response_with_so``, each filled with the
    list returned by ``process_df`` for the corresponding mode.
    """
    frame = df.copy()
    modes = (
        ("response_without_so", "without_structured_output"),
        ("response_with_so", "with_structured_output"),
    )
    # Both runs finish before any column is attached, so each sees the same frame.
    answers = {column: asyncio.run(process_df(frame, mode)) for column, mode in modes}
    return frame.assign(**answers)


def evaluate_outputs(df, evaluator):
    """Return a copy of ``df`` with one score column per response format.

    ``evaluator`` is called as ``evaluator(ground_truth, task, response)`` for
    every row; results go to ``is_correct_without_so`` and ``is_correct_with_so``.
    """
    scored = df.copy()
    column_pairs = (
        ("is_correct_without_so", "response_without_so"),
        ("is_correct_with_so", "response_with_so"),
    )
    for score_column, response_column in column_pairs:
        # apply() runs immediately, so the lambda sees this iteration's column.
        scored[score_column] = scored.apply(
            lambda record: evaluator(
                record["ground_truth"], record["task"], record[response_column]
            ),
            axis=1,
        )
    return scored

### Reasoning

In [386]:
# Collect the model's answers to every reasoning question under both response formats.
df_reasoning_results = generate_outputs(df_reasoning)

Failed to generate a valid response for row 76: Request timed out.
Failed to generate a valid response for row 75: 1 validation error for ResponseFormat
answer
  Field required [type=missing, input_value={'reasoning': "Let's reas...s:', answer='manager'}"}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing


In [400]:
# Score both answer columns, then save the scored frame as CSV.
df_reasoning_results = evaluate_outputs(df_reasoning_results, evaluate_reasoning_task)
df_reasoning_results.to_csv(data_dir / "reasoning" / "reasoning_results.csv")

In [401]:
# Per-task question count and mean score for each response format.
df_reasoning_results.groupby("task").agg(
    n_questions=("question_id", "count"),
    accuracy_without_so=("is_correct_without_so", "mean"),
    accuracy_with_so=("is_correct_with_so", "mean"),
)

Unnamed: 0_level_0,n_questions,accuracy_without_so,accuracy_with_so
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
spatial,50,0.36,0.44
web_of_lies_v2,50,0.38,0.6
zebra_puzzle,50,0.44,0.44


### Language

In [423]:
# Collect the model's answers to every language question under both response formats.
df_language_results = generate_outputs(df_language)

In [428]:
# Score both answer columns, then save the scored frame as CSV.
df_language_results = evaluate_outputs(df_language_results, evaluate_language_task)
df_language_results.to_csv(data_dir / "language" / "language_results.csv")

In [429]:
# Per-task question count and mean score for each response format. The
# connections and plot_unscrambling scores are fractional, so these means are
# average scores rather than strict accuracies.
df_language_results.groupby("task").agg(
    n_questions=("question_id", "count"),
    accuracy_without_so=("is_correct_without_so", "mean"),
    accuracy_with_so=("is_correct_with_so", "mean"),
)

Unnamed: 0_level_0,n_questions,accuracy_without_so,accuracy_with_so
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
connections,50,0.446667,0.43
plot_unscrambling,40,0.337397,0.37747
typos,50,0.66,0.64


### Math

In [468]:
# Collect the model's answers to every math question under both response formats.
df_math_results = generate_outputs(df_math)

Failed to generate a valid response for row 14: 1 validation error for ResponseFormat
answer
  Field required [type=missing, input_value={'reasoning': 'To solve t...= Math-refundable: \\)'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing
Failed to generate a valid response for row 17: 1 validation error for ResponseFormat
answer
  Field required [type=missing, input_value={'reasoning': 'To solve t...n \'answer\': \'025\'}'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing


In [471]:
# Score both answer columns, then save the scored frame as CSV.
df_math_results = evaluate_outputs(df_math_results, evaluate_math_task)
df_math_results.to_csv(data_dir / "math" / "math_results.csv")

In [474]:
# Per-task question count and mean score for each response format.
df_math_results.groupby("task").agg(
    n_questions=("question_id", "count"),
    accuracy_without_so=("is_correct_without_so", "mean"),
    accuracy_with_so=("is_correct_with_so", "mean"),
)

Unnamed: 0_level_0,n_questions,accuracy_without_so,accuracy_with_so
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
math_comp,96,0.375,0.427083
olympiad,36,0.277778,0.305556


## Evaluation

In [475]:
# | output: false
def calculate_confidence_intervals(df):
    """Print the mean score and a mean ± 1.96 standard-error interval per format.

    Reads ``is_correct_without_so`` and ``is_correct_with_so`` from ``df``. The
    standard error is the column's ``std()`` divided by the square root of the
    number of rows. Nothing is returned.
    """
    sample_size = len(df)
    columns = (
        ("without", "is_correct_without_so"),
        ("with", "is_correct_with_so"),
    )

    # Compute everything first so nothing is printed if a column is missing.
    summaries = []
    for label, column in columns:
        mean_accuracy = df[column].mean()
        standard_error = df[column].std() / np.sqrt(sample_size)
        margin = 1.96 * standard_error
        summaries.append((label, mean_accuracy, mean_accuracy - margin, mean_accuracy + margin))

    for label, mean_accuracy, lower, upper in summaries:
        print(
            f"Response format {label} SO - Mean: {mean_accuracy * 100:.2f}% CI: {lower * 100:.2f}% - {upper * 100:.2f}%"
        )

def run_paired_t_test(df):
    """Print the paired t-test comparing the two score columns of ``df``.

    The ``is_correct_without_so`` and ``is_correct_with_so`` columns are
    multiplied by 1 (turning booleans into numbers) and passed to
    ``scipy.stats.ttest_rel``. Nothing is returned.
    """
    score_columns = ("is_correct_without_so", "is_correct_with_so")
    paired_scores = [df[column] * 1 for column in score_columns]
    statistic, probability = stats.ttest_rel(*paired_scores)
    print(f"t-statistic: {statistic}, p-value: {probability}")

In [476]:
# Mean ± 1.96 standard-error intervals and a paired t-test for the reasoning results.
calculate_confidence_intervals(df_reasoning_results)
run_paired_t_test(df_reasoning_results)

Response format without SO - Mean: 39.33% CI: 31.49% - 47.18%
Response format with SO - Mean: 49.33% CI: 41.31% - 57.36%
t-statistic: -2.2667004730406615, p-value: 0.024847877331910594


In [477]:
# Mean ± 1.96 standard-error intervals and a paired t-test for the language results.
calculate_confidence_intervals(df_language_results)
run_paired_t_test(df_language_results)

Response format without SO - Mean: 49.16% CI: 42.39% - 55.94%
Response format with SO - Mean: 49.00% CI: 42.07% - 55.92%
t-statistic: 0.05090289868160946, p-value: 0.9594759715752339


In [478]:
# Mean ± 1.96 standard-error intervals and a paired t-test for the math results.
calculate_confidence_intervals(df_math_results)
run_paired_t_test(df_math_results)

Response format without SO - Mean: 34.85% CI: 26.69% - 43.01%
Response format with SO - Mean: 39.39% CI: 31.03% - 47.76%
t-statistic: -1.227088882859258, p-value: 0.22198988572802872


## Conclusion
