---
title: "Can you really 'say what you mean'?"
date: "12/01/2024"
date-modified: last-modified
description-meta: "A look at the impact of structured outputs on the performance of proprietary LLMs."
toc: true
toc-depth: 3
lightbox: true
fig-cap-location: margin
categories:
  - llm
  - openai
  - pydantic
  - python
author:
  - name: Dylan Castillo
    url: https://dylancastillo.co
    affiliation: Iwana Labs
    affiliation-url: https://iwanalabs.com
citation: true
comments:
  utterances:
    repo: dylanjcastillo/blog_comments
    theme: dark-blue
    issue-term: pathname
---

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# | output: false
# | echo: false

import nest_asyncio

nest_asyncio.apply()

In [3]:
# | output: false

import json
from enum import Enum
from pathlib import Path
from textwrap import dedent
from typing import Callable, List, Literal

import numpy as np
import outlines
import pandas as pd
import torch
from datasets import load_dataset
from dotenv import load_dotenv
from langsmith import traceable
from outlines.fsm.json_schema import build_regex_from_schema
from outlines.samplers import greedy
from pydantic import BaseModel, Field, constr
from scipy import stats
from transformers import AutoTokenizer

np.random.seed(42)

load_dotenv()

MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"

model = outlines.models.transformers(
    MODEL_NAME,
    device="cuda",
    model_kwargs={"torch_dtype": torch.bfloat16, "trust_remote_code": True},
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm
Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
class PromptType(Enum):
    WITHOUT_STRUCTURED_GENERATION = "without_sg"
    STRUCTURED_GENERATION = "with_sg"


class ClientConfig(BaseModel):
    name: str
    use_response_model: bool
    col_name: str
    score_col_name: str


CONFIGS = [
    ClientConfig(
        name=PromptType.WITHOUT_STRUCTURED_GENERATION.value,
        use_response_model=False,
        col_name=f"response_{PromptType.WITHOUT_STRUCTURED_GENERATION.value}",
        score_col_name=f"score_{PromptType.WITHOUT_STRUCTURED_GENERATION.value}",
    ),
    ClientConfig(
        name=PromptType.STRUCTURED_GENERATION.value,
        use_response_model=True,
        col_name=f"response_{PromptType.STRUCTURED_GENERATION.value}",
        score_col_name=f"score_{PromptType.STRUCTURED_GENERATION.value}",
    ),
]

In [8]:
class LLMEvaluator:
    def __init__(
        self,
        configs: List[ClientConfig],
        create_prompt_fn: Callable,
        parse_response_fn: Callable,
        response_model: BaseModel,
        batch_size: int = 5,
    ):
        self.configs = configs
        self.create_prompt_fn = create_prompt_fn
        self.parse_response_fn = parse_response_fn
        self.schema_regex = build_regex_from_schema(
            json.dumps(response_model.model_json_schema())
        )
        self.unstructured_generator = outlines.generate.text(model, sampler=greedy())
        self.structured_generator = outlines.generate.regex(
            model, self.schema_regex, sampler=greedy()
        )
        self.batch_size = batch_size

    @traceable(run_type="prompt")
    def create_prompt(
        self,
        question: str,
        prompt_type: str,
    ) -> List[dict]:
        return self.create_prompt_fn(
            question=question,
            prompt_type=prompt_type,
        )

    @traceable(run_type="parser")
    def parse_response(
        self,
        response: str,
        prompt_type: str,
    ) -> str | int:
        try:
            return self.parse_response_fn(response, prompt_type)
        except Exception as e:
            print(f"Error parsing response: {e}")
            return None

    @traceable(run_type="llm")
    def call_llm(
        self,
        config: ClientConfig,
        questions: List[str],
    ) -> List[str | int | None]:
        messages = [
            self.create_prompt(question=question, prompt_type=config.name)
            for question in questions
        ]
        if config.name == PromptType.STRUCTURED_GENERATION.value:
            if len(messages) == 1:
                completion = self.structured_generator(messages[0], max_tokens=1000)
            else:
                completion = self.structured_generator(messages, max_tokens=1000)
        else:
            if len(messages) == 1:
                completion = self.unstructured_generator(messages[0], max_tokens=1000)
            else:
                completion = self.unstructured_generator(messages, max_tokens=1000)
        return [completion]

    @traceable(run_type="chain")
    def process_batch(
        self,
        questions: List[str],
        config: ClientConfig,
    ) -> List[str | int | None]:
        answers = self.call_llm(
            config=config,
            questions=questions,
        )
        parsed_answers = [
            self.parse_response(answer, config.name) for answer in answers
        ]
        return parsed_answers

    @traceable(run_type="chain")
    def process_questions(
        self,
        run_name: str,
        questions: List[dict],
        config: ClientConfig,
    ) -> List[str | int | None]:
        results = []
        for i in range(0, len(questions), self.batch_size):
            batch = questions[i:i + self.batch_size]
            batch_results = self.process_batch(
                questions=[q["question"] for q in batch],
                config=config,
            )
            results.extend(batch_results)
        return results

    def generate_outputs(self, questions: List[dict]) -> pd.DataFrame:
        df = pd.DataFrame(
            {
                "id": [i for i in range(len(questions))],
                "question": [question["question"] for question in questions],
                "answer": [question["answer"] for question in questions],
            }
        )
        for config in self.configs:
            responses = self.process_questions(
                run_name=config.name,
                questions=questions,
                config=config,
            )
            df[config.col_name] = responses
        return df

    def evaluate_outputs(self, df: pd.DataFrame) -> pd.DataFrame:
        df_copy = df.copy()
        for config in self.configs:
            df_copy[config.score_col_name] = (
                df_copy["answer"] == df_copy[config.col_name]
            ) * 1
        return df_copy

    def calculate_confidence_intervals(
        self, df: pd.DataFrame, conf_level: float = 0.95
    ) -> None:
        print(
            f"Calculating confidence intervals ({conf_level}) with {len(df)} observations:"
        )
        for config in self.configs:
            score_col = config.score_col_name
            scores = df[score_col]

            if len(scores) == 0:
                print(f"No scores available for {score_col}")
                continue

            mean_score = scores.mean()
            se_score = scores.std() / np.sqrt(len(scores))

            z_score = stats.norm.ppf((1 + conf_level) / 2)
            margin_error = z_score * se_score
            ci = [
                max(0.0, mean_score - margin_error),
                min(1.0, mean_score + margin_error),
            ]
            print(
                f"{score_col} - Mean: {mean_score * 100:.2f}% CI: {ci[0] * 100:.2f}% - {ci[1] * 100:.2f}%"
            )
        print()

    def run_paired_t_test(self, df: pd.DataFrame) -> None:
        scores = {}

        for config in self.configs:
            score_col = config.score_col_name
            scores[score_col] = df[score_col] * 1

        for score_col_1, score_col_2 in [
            ("score_without_so", "score_with_so_tool_calls"),
            ("score_without_so", "score_with_so_strict_tool_calls"),
            ("score_without_so", "score_with_so_json_mode"),
        ]:
            if score_col_1 in scores and score_col_2 in scores:
                t_stat, p_value = stats.ttest_rel(
                    scores[score_col_1], scores[score_col_2]
                )
                print(f"{score_col_1} vs {score_col_2}")
                print(f"t-statistic: {t_stat}, p-value: {p_value}")

## GSM8K

In [11]:
class Response(BaseModel):
    reasoning: constr(max_length=1000)
    answer: int = Field(pattern=r"[1-9][0-9]{0,9}")


def create_prompt_gsm8k(question, prompt_type: str):
    if prompt_type == PromptType.STRUCTURED_GENERATION.value:
        system_prompt = dedent("""
        You are an expert in solving grade school math tasks. You will be presented with a grade-school math word problem and be asked to solve it.

        You will always respond with a JSON object matching the following JSON schema:
        
        {"reasoning": <str, step by step reasoning about the answer>, "answer": <int, final answer>}
        
        First, provide your step by step reasoning in the "reasoning" field. Then, in the "answer" field, provide an integer that corresponds to the correct answer to the question. Don't include any other text in the "answer" field.
        """)
    else:
        system_prompt = dedent("""
        You are an expert in solving grade school math tasks. You will be presented with a grade-school math word problem and be asked to solve it.
        
        You will always respond in the following format:
        
        <str, step by step reasoning about the answer>
        ANSWER: <int, final answer>
        
        First, provide your step by step reasoning. Then, in ANSWER, provide an integer that corresponds to the correct answer to the question. Don't include any other text in ANSWER.
        """)

    examples = [
        (
            "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?",
            "There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6.",
            6,
        ),
        (
            "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?",
            "There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5.",
            5,
        ),
        (
            "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?",
            "Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39.",
            39,
        ),
    ]

    messages = [
        {
            "role": "system",
            "content": system_prompt,
        },
    ]

    for example_q, example_reason, example_ans in examples:
        messages.append(
            {
                "role": "user",
                "content": f"Question: {example_q}",
            }
        )
        if prompt_type == PromptType.STRUCTURED_GENERATION.value:
            response = f'{{"reasoning": "{example_reason}", "answer": {example_ans}}}'
        else:
            response = f"{example_reason}\nANSWER: {example_ans}"
        messages.append(
            {
                "role": "assistant",
                "content": response,
            }
        )
    messages.append(
        {
            "role": "user",
            "content": f"Question: {question}",
        }
    )

    return tokenizer.apply_chat_template(messages, tokenize=False)


def parse_response_gsm8k(response: str, prompt_type: str) -> int:
    if prompt_type == PromptType.STRUCTURED_GENERATION.value:
        return int(json.loads(response)["answer"])
    else:
        cleaned_response = (
            response.split("\nANSWER:")[1].replace(",", "").rstrip(".").strip()
        )
        return int(cleaned_response)


evaluator = LLMEvaluator(
    configs=CONFIGS,
    create_prompt_fn=create_prompt_gsm8k,
    parse_response_fn=parse_response_gsm8k,
    response_model=Response,
)

In [14]:
dataset = load_dataset("gsm8k", "main")
evals = [
    {
        "question": d["question"],
        "answer": int(d["answer"].split("#### ")[1].replace(",", "").strip()),
    }
    for d in dataset["test"]
][:20]

df = evaluator.generate_outputs(evals)
df_results = evaluator.evaluate_outputs(df)



: 

In [13]:
evaluator.calculate_confidence_intervals(df_results)
evaluator.run_paired_t_test(df_results)

Calculating confidence intervals (0.95) with 10 observations:
score_without_sg - Mean: 70.00% CI: 40.06% - 99.94%
score_with_sg - Mean: 70.00% CI: 40.06% - 99.94%



## Last Letter

In [9]:
class Response(BaseModel):
    reasoning: constr(max_length=1000)
    answer: str = Field(pattern=r"[a-z]+")


def create_prompt_last_letter(question, prompt_type: str):
    if prompt_type == PromptType.STRUCTURED_GENERATION.value:
        system_prompt = dedent("""
        You are an expert in string manipulation tasks. You will be given a sequence of words and need to create a new string made from the last letter of each word. Before answering, explain your reasoning about how you'll extract and concatenate the letters.
          
        You will always respond with a JSON object matching the following JSON schema:

        {"reasoning": <str, step by step reasoning about the answer>, "answer": <str, final answer>}
        
        First, provide your step by step reasoning in the "reasoning" field. Then, in the "answer" field, provide only the lowercase concatenated letters without any additional text.
        """)
    else:
        system_prompt = dedent("""
        You are an expert in string manipulation tasks. You will be given a sequence of words and need to create a new string made from the last letter of each word. Before answering, explain your reasoning about how you'll extract and concatenate the letters.
        
        You will always respond in the following format:
        
        <str, step by step reasoning about the answer>
        ANSWER: <str, final answer>
        
        First, provide your step by step reasoning. Then, in ANSWER, provide only the lowercase concatenated letters without any additional text.
        """)

    fewshot_examples = [
        (
            "Ian Peter Bernard Stephen",
            "The last letter of 'Ian' is 'n'. The last letter of 'Peter' is 'r'. The last letter of 'Bernard' is 'd'. The last letter of 'Stephen' is 'N'. Concatenating them is 'nrdn'.",
            "nrdn",
        ),
        (
            "Larry Page",
            "The last letter of 'Larry' is 'y'. The last letter of 'Page' is 'e'. Concatenating them is 'ye'.",
            "ye",
        ),
        (
            "Sergey Brin",
            "The last letter of 'Sergey' is 'y'. The last letter of 'Brin' is 'n'. Concatenating them is 'yn'.",
            "yn",
        ),
        (
            "Bill Gates",
            "The last letter of 'Bill' is 'l'. The last letter of 'Gates' is 's'. Concatenating them is 'ls'.",
            "ls",
        ),
        (
            "Jason Wei",
            "The last letter of 'Jason' is 'n'. The last letter of 'Wei' is 'i'. Concatenating them is 'ni'.",
            "ni",
        ),
        (
            "François Chollet",
            "The last letter of 'François' is 's'. The last letter of 'Chollet' is 't'. Concatenating them is 'st'.",
            "st",
        ),
        (
            "Yann LeCun",
            "The last letter of 'Yann' is 'n'. The last letter of 'LeCun' is 'n'. Concatenating them is 'nn'.",
            "nn",
        ),
        (
            "Eliezer Yudkowsky",
            "The last letter of 'Eliezer' is 'r'. The last letter of 'Yudkowsky' is 'y'. Concatenating them is 'ry'.",
            "ry",
        ),
    ]

    messages = [
        {
            "role": "system",
            "content": system_prompt,
        },
    ]

    for example_q, example_reason, example_ans in fewshot_examples:
        messages.append(
            {
                "role": "user",
                "content": f"Question: Take the last letters of the words in '{example_q}' and concatenate them.",
            }
        )
        if prompt_type == PromptType.STRUCTURED_GENERATION.value:
            response = f'{{"reasoning": "{example_reason}", "answer": "{example_ans}"}}'
        else:
            response = f"{example_reason}\nANSWER: {example_ans}"
        messages.append(
            {
                "role": "assistant",
                "content": response,
            }
        )

    messages.append(
        {
            "role": "user",
            "content": f"Question: {question}",
        }
    )
    return messages


def parse_response_last_letter(response: str, prompt_type: str) -> str | None:
    if prompt_type == PromptType.STRUCTURED_GENERATION.value:
        return json.loads(response)["answer"]
    else:
        return response.split("\nANSWER:")[1].rstrip(".").strip()

In [10]:
dataset = load_dataset("ChilleD/LastLetterConcat")
evals = [
    {"question": d["question"], "answer": d["answer"].lower()} for d in dataset["test"]
]

evaluator = LLMEvaluator(
    configs=CONFIGS,
    create_prompt_fn=create_prompt_last_letter,
    parse_response_fn=parse_response_last_letter,
    response_model=Response,
)

df = evaluator.generate_outputs(evals)
df_results = evaluator.evaluate_outputs(df)

In [None]:
evaluator.calculate_confidence_intervals(df_results)
evaluator.run_paired_t_test(df_results)

## Shuffled objects   

In [16]:
class Response(BaseModel):
    reasoning: constr(max_length=1200)
    answer: str = Field(pattern=r"[A-E]")


def create_prompt_shuffled_objects(question: str, prompt_type: str):
    if prompt_type == PromptType.STRUCTURED_GENERATION.value:
        system_prompt = dedent("""
        You are an expert in performing common sense tasks involving the ordering of a sequence of events.
        Each question will present you with a sequence of events and a question about it. Your task is to determine the correct answer from the options provided.
          
        You will always respond with a JSON object matching the following JSON schema:
                   
        {"reasoning": <str, step by step reasoning about the answer>, "answer": <str, final answer>}

        First, provide your step by step reasoning in the "reasoning" field. Then, in the "answer" field, provide only the single letter representing the correct choice you are presented with. Don't include any other text in the "answer" field.
        """)
    else:
        system_prompt = dedent("""
        You are an expert in performing common sense tasks involving the ordering of a sequence of events.
        Each question will present you with a sequence of events and a question about it. Your task is to determine the correct answer from the options provided.
        
        You will always respond in the following format:
        
        <str, step by step reasoning about the answer>
        ANSWER: <str, final answer>
        
        First, provide your step by step reasoning. Then, in ANSWER, provide only the single letter representing the correct choice you are presented with. Don't include any other text in ANSWER.
        """)

    fewshot_examples = [
        (
            "Alice, Bob, Claire, Dave, and Eve are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Patrick, Bob is dancing with Sam, Claire is dancing with Jamie, Dave is dancing with Lola, and Eve is dancing with Melissa.\nThroughout the song, the dancers often trade partners. First, Dave and Eve switch partners. Then, Dave and Alice switch partners. Then, Eve and Alice switch partners. Then, Claire and Bob switch partners. Finally, Dave and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Patrick\n(B) Sam\n(C) Jamie\n(D) Lola\n(E) Melissa",
            "Dave and Eve switch partners, so Dave's partner is now Melissa and Eve's partner is now Patrick. Then Dave and Alice switch partners so Dave's partner is now Patrick and Alice's partner is now Melissa. Then Eve and Alice switch partners so Eve's partner is now Melissa and Alice's partner is now Lola. Then Claire and Bob switch patners so Claire's partner is now Sam, and Bob's partner is now Jamie. Finally, Dave and Alice switch partners so Dave's new partner is Lola, and Alice's new partner is Patrick. Alice is dance in with Patrick, choice A.",
            "A",
        ),
        (
            "Alice, Bob, Claire, Dave, and Eve are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Ophelia, Bob is dancing with Jamie, Claire is dancing with Melissa, Dave is dancing with Rodrigo, and Eve is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Claire and Bob switch partners. Then, Claire and Eve switch partners. Then, Claire and Bob switch partners. Then, Eve and Dave switch partners. Finally, Claire and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Ophelia\n(B) Jamie\n(C) Melissa\n(D) Rodrigo\n(E) Patrick",
            "Claire and Bob switch partners, so Claire's partner is now Jamie and Bob's partner is now Melissa. Then, Claire and Eve switch partners, so Claire's partner becomes Patrick and Eve's partner becomes Jamie. Next, Claire and Bob switch partners again, making Claire's partner Melissa and Bob's partner Patrick. After that, Eve and Dave switch partners, resulting in Eve's partner being Rodrigo and Dave's partner being Jamie. Finally, Claire and Alice switch partners, so Claire's partner is now Ophelia and Alice's partner becomes Melissa. Alice is dancing with Melissa, which is choice C.",
            "C",
        ),
        (
            "Alice, Bob, Claire, Dave, and Eve are friends and avid readers who occasionally trade books. At the start of the semester, they each buy one new book: Alice gets Catch-22, Bob gets Hound of the Baskervilles, Claire gets Frankenstein, Dave gets The Pearl, and Eve gets The Fellowship of the Ring.\nAs the semester proceeds, they start trading around the new books. First, Eve and Alice swap books. Then, Alice and Claire swap books. Then, Alice and Bob swap books. Then, Dave and Alice swap books. Finally, Dave and Claire swap books. At the end of the semester, Dave has\nOptions:\n(A) Catch-22\n(B) Hound of the Baskervilles\n(C) Frankenstein\n(D) The Pearl\n(E) The Fellowship of the Ring",
            "First, Eve and Alice swap, so Alice gets The Fellowship of the Ring and Eve gets Catch-22. Next, Alice and Claire swap, giving Claire The Fellowship of the Ring and Alice Frankenstein. Then, Alice and Bob swap, resulting in Bob holding Frankenstein and Alice having Hound of the Baskervilles. Dave and Alice then swap, so Dave takes Hound of the Baskervilles and Alice receives The Pearl. Finally, Dave and Claire swap books, which means Dave takes The Fellowship of the Ring from Claire. Therefore, at the end of all the swaps, Dave possesses The Fellowship of the Ring, making option E the correct answer.",
            "E",
        ),
    ]

    messages = [
        {
            "role": "system",
            "content": system_prompt,
        },
    ]
    for example_q, example_reason, example_ans in fewshot_examples:
        messages.append(
            {
                "role": "user",
                "content": f"Question: {example_q}",
            }
        )
        if prompt_type == PromptType.STRUCTURED_GENERATION.value:
            response = f'{{"reasoning": "{example_reason}", "answer": "{example_ans}"}}'
        else:
            response = f"{example_reason}\nANSWER: {example_ans}"
        messages.append(
            {
                "role": "assistant",
                "content": response,
            }
        )

    messages.append(
        {
            "role": "user",
            "content": f"Question: {question}",
        }
    )
    return messages


def parse_response_shuffled_objects(response: str, prompt_type: str) -> str:
    if prompt_type == PromptType.STRUCTURED_GENERATION.value:
        return json.loads(response)["answer"]
    else:
        return response.split("\nANSWER:")[1].rstrip(".").strip()

In [None]:
# | output: false
dataset = load_dataset(
    "openeval/BIG-Bench-Hard", data_files="tracking_shuffled_objects_five_objects.json"
)
evals = [
    {
        "question": d["input"],
        "answer": d["target"].replace("(", "").replace(")", "").strip(),
    }
    for d in dataset["train"]["examples"][0][4:]  # first 3 are few-shot examples
]

evaluator = LLMEvaluator(
    configs=CONFIGS,
    create_prompt_fn=create_prompt_shuffled_objects,
    parse_response_fn=parse_response_shuffled_objects,
    response_model=Response,
)

df = evaluator.generate_outputs(evals)
df_results = evaluator.evaluate_outputs(df)

In [None]:
evaluator.calculate_confidence_intervals(df_results)
evaluator.run_paired_t_test(df_results)