---
title: "Is Pydantic making your LLM dumber?"
date: "09/15/2024"
date-modified: last-modified
description-meta: "Pydantic is a powerful tool for data validation, but can it make your LLM dumber?"
toc: true
toc-depth: 3
lightbox: true
fig-cap-location: margin
categories:
  - llm
  - openai
  - pydantic
  - python
author:
  - name: Dylan Castillo
    url: https://dylancastillo.co
    affiliation: Iwana Labs
    affiliation-url: https://iwanalabs.com
citation: true
comments:
  utterances:
    repo: dylanjcastillo/blog_comments
    theme: dark-blue
    issue-term: pathname
---

In [1]:
# | output: false
# | echo: false

import nest_asyncio

# Patch asyncio so that `asyncio.run(...)` can be called later in this
# notebook even though the notebook kernel already has an event loop running.
nest_asyncio.apply()

In [2]:
from pathlib import Path

import pandas as pd

from dotenv import load_dotenv
from openai import AsyncOpenAI

# Load variables from a local .env file into the environment
# (presumably the OpenAI credentials used by the client further down).
load_dotenv()

exercism_path = Path("../data/exercism")
folders = [f for f in exercism_path.iterdir() if f.is_dir()]

# Collect one record per exercise folder and build the DataFrame once at the
# end. Calling pd.concat inside the loop re-copies the whole frame on every
# iteration and leaves every row with the same index label (0).
records = []
for folder in folders:
    files = [f.name for f in folder.iterdir() if f.is_file()]
    # The solution stub is the only non-test .py file; the tests end in _test.py.
    code_stub = [
        f for f in files if f.endswith(".py") and not f.endswith("_test.py")
    ][0]
    unit_tests = [f for f in files if f.endswith("_test.py")][0]
    instructions_file = folder / ".docs" / "instructions.md"
    records.append(
        {
            "folder": folder.name,
            "instructions": instructions_file.read_text(),
            "code_stub_name": code_stub,
            "code_stub": (folder / code_stub).read_text(),
            "unit_tests_name": unit_tests,
            "unit_tests": (folder / unit_tests).read_text(),
        }
    )

# Explicit columns keep the schema stable even when no folders were found.
df = pd.DataFrame(
    records,
    columns=[
        "folder",
        "instructions",
        "code_stub_name",
        "code_stub",
        "unit_tests_name",
        "unit_tests",
    ],
)

In [19]:
from pathlib import Path

import unittest
import sys
import importlib
import shutil
import os
import subprocess

def run_unit_tests(code_stub_name, code_stub, unit_tests_name, unit_tests):
    tmp_dir = Path('tmp')
    main_file_path = tmp_dir / code_stub_name
    main_file_path.write_text(code_stub)

    test_file_path = tmp_dir / unit_tests_name
    test_file_path.write_text(unit_tests)

    original_dir = os.getcwd()
    os.chdir(tmp_dir)
    sys.path.insert(0, '')

    try:
        result = subprocess.run(['python', '-m', 'unittest', unit_tests_name], 
                                capture_output=True, text=True, timeout=10)
        print(result.stdout)
        print(result.stderr)
        return result.returncode == 0

    except subprocess.SubprocessError as e:
        print(f"Error running subprocess: {e}")
        return False
    finally:
        sys.path.pop(0)
        if unit_tests_name in sys.modules:
            del sys.modules[unit_tests_name]
        os.chdir(original_dir)


# tmp_dir = Path('tmp')
# tmp_dir.mkdir(exist_ok=True)
# try:
#     for _, row in df.iterrows():
#         run_unit_tests(row.code_stub_name, row.code_stub, row.unit_tests_name, row.unit_tests)
# finally:
#     # Clear the tmp folder
#     shutil.rmtree(tmp_dir)

In [4]:
# Prompt template, filled in once per exercise with str.format using the
# fields `instructions`, `code_stub_name` and `code_stub`.
# The braces around the JSON example are doubled so that str.format emits
# them as literal single braces instead of treating them as placeholders.
# The trailing .strip() drops the newline before the closing triple quote.
full_message_coding_completion = """You are an expert Python programmer. You will be given a question (problem specification) and a code stub. You must fill out the code stub to produce a correct Python program that matches the specification. You will NOT return anything except the code you wrote.

You will write the answer using the following JSON format:
{{ "answer": "<answer>" }}

{instructions}

{code_stub_name}
```python
{code_stub}```
""".strip()

In [5]:
def _render_prompt(row):
    """Fill the prompt template with one exercise's instructions and stub."""
    return full_message_coding_completion.format(
        code_stub_name=row["code_stub_name"],
        code_stub=row["code_stub"],
        instructions=row["instructions"],
    )


# One rendered prompt per exercise row.
df["prompt"] = df.apply(_render_prompt, axis=1)

In [22]:
import json
import asyncio
from asyncio import Semaphore

# Single async client shared by every request below. No credentials are
# passed explicitly -- presumably they come from the environment populated
# by load_dotenv() earlier in the notebook (confirm).
client = AsyncOpenAI()

async def process_row(row, semaphore):
    """Request a JSON-mode completion for one row's prompt and parse it.

    The request is made while holding ``semaphore``. The message content of
    the first choice is decoded with ``json.loads`` and returned as-is.
    """
    async with semaphore:
        chat_messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": row.prompt},
        ]
        completion = await client.chat.completions.create(
            model="gpt-4o",
            messages=chat_messages,
            response_format={"type": "json_object"},
        )
        raw_content = completion.choices[0].message.content
        return json.loads(raw_content)

async def main():
    """Run process_row for every row of df, at most 20 requests at a time.

    Returns the parsed responses in the same order as the rows of df.
    """
    limiter = Semaphore(20)

    pending = []
    for _, row in df.iterrows():
        pending.append(process_row(row, limiter))

    return await asyncio.gather(*pending)

# Send every prompt to the model and wait for all parsed replies.
responses = asyncio.run(main())

# Keep only the generated code, stored under the "answer" key of each reply.
df["response"] = list(map(lambda payload: payload["answer"], responses))

In [23]:
# Scratch directory for the generated solutions and their tests.
# run_unit_tests writes into the relative path 'tmp', which resolves to this
# same directory as long as the working directory is unchanged.
tmp_dir = Path.cwd() / 'tmp'
tmp_dir.mkdir(exist_ok=True)

# One boolean per exercise: True when the model's answer passed its tests.
successes = []
try:
    for i, (_, row) in enumerate(df.iterrows(), start=1):
        result = run_unit_tests(
            code_stub_name=row.code_stub_name,
            # The model's answer replaces the original, empty code stub.
            code_stub=row.response,
            unit_tests_name=row.unit_tests_name,
            unit_tests=row.unit_tests
        )
        print(f"Test {i}: {'Passed' if result else 'Failed'}")
        successes.append(result)
except Exception as e:
    print(f"Error running unit tests: {e}")
finally:
    # Always remove the scratch directory, even if a run blew up.
    shutil.rmtree(tmp_dir)

# Guard against dividing by zero when no result was recorded (empty df, or an
# error before the first test finished).
if successes:
    print(f"Success rate: {sum(successes) / len(successes) * 100:.1f}%")
else:
    print("Success rate: n/a (no tests were run)")


......
----------------------------------------------------------------------
Ran 6 tests in 0.000s

OK

Test 1: Passed

.....
----------------------------------------------------------------------
Ran 5 tests in 0.000s

OK

Test 2: Passed

...........................
----------------------------------------------------------------------
Ran 27 tests in 0.000s

OK

Test 3: Passed

FFFFF......
FAIL: test_empty_series_is_invalid (series_test.SeriesTest)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/Users/dcast/Documents/GitHub/blog/posts/tmp/series_test.py", line 66, in test_empty_series_is_invalid
    self.assertEqual(err.exception.args[0], "series cannot be empty")
AssertionError: 'Invalid length' != 'series cannot be empty'
- Invalid length
+ series cannot be empty


FAIL: test_slice_length_cannot_be_negative (series_test.SeriesTest)
----------------------------------------------------------------------
Traceback (m