# Evaluate LLM Responses

In [1]:
import os

# Keep a key that is already exported in the environment; only fall back to the
# placeholder when nothing is set. Replace the placeholder with your own key
# (or export OPENAI_API_KEY before starting the kernel) -- never commit a real key.
os.environ.setdefault("OPENAI_API_KEY", "<openai-api-key>")

In [2]:
import random 
import re
import pandas as pd

import openai_utils.common as common
from openai_utils.types import Eval, EvalResult, SamplerBase, SingleEvalResult
from openai_utils.chat_completion_sampler import ChatCompletionSampler
from openai_utils.simpleqa_constants import GRADER_TEMPLATE

## Read LLM responses and use an LLM to evaluate them

The `SimpleQAEval` class below was obtained from https://github.com/openai/simple-evals/blob/main/simpleqa_eval.py, and then slightly modified so that it directly reads the prompt-response pairs from a file.

In [3]:
class SimpleQAEval(Eval):
    """Grade pre-generated question/response pairs with an LLM grader.

    Every example is a dict built from one row of ``dataset``. The evaluation
    reads the ``problem``, ``answer`` and ``response`` keys from it (missing
    keys are treated as empty strings) and asks ``grader_model`` to label the
    response with one letter: "A" (correct), "B" (incorrect) or
    "C" (not attempted).
    """

    def __init__(self, grader_model: SamplerBase, dataset: pd.DataFrame, num_examples: int | None = None, n_repeats: int = 1):
        """Store the grader and the list of examples to evaluate.

        Args:
            grader_model: Sampler that is called with the grading prompt.
            dataset: One row per prompt/response pair.
            num_examples: If truthy, evaluate a random sample of this many rows
                (drawn with a fixed seed, so the sample is reproducible);
                otherwise evaluate every row.
            n_repeats: Number of times the example list is repeated. Must be 1
                when ``num_examples`` is given.

        Raises:
            AssertionError: If ``num_examples`` is given together with
                ``n_repeats != 1``.
            ValueError: Raised by ``random.Random.sample`` when ``num_examples``
                exceeds the number of rows.
        """
        examples = [row.to_dict() for _, row in dataset.iterrows()]
        if num_examples:
            assert n_repeats == 1, "n_repeats only supported when num_examples = None"
            rng = random.Random(0)  # fixed seed -> same sample on every run
            examples = rng.sample(examples, num_examples)
        self.examples = examples * n_repeats
        self.grader_model = grader_model

    def grade_sample(self, question: str, target: str, predicted_answer: str) -> str:
        """Ask the grader model to grade one answer and return the grade letter.

        Args:
            question: The question that was asked.
            target: The reference answer.
            predicted_answer: The response being graded.

        Returns:
            "A", "B" or "C". Falls back to "C" when the grader response contains
            no standalone grade letter.
        """
        grader_prompt = GRADER_TEMPLATE.format(
            question=question,
            target=target,
            predicted_answer=predicted_answer,
        )

        prompt_messages = [
            self.grader_model._pack_message(content=grader_prompt, role="user")
        ]
        grading_response = self.grader_model(prompt_messages)

        # Require the letter to stand alone. Without the word boundaries the
        # first capital A/B/C inside any word would be taken as the grade
        # (e.g. the "B" of "Based on ..., A").
        match = re.search(r"\b(A|B|C)\b", grading_response)
        return match.group(0) if match else "C"  # Default to "NOT_ATTEMPTED" if no match

    def __call__(self) -> EvalResult:
        """Grade every stored example, print aggregate metrics, and return the result.

        Returns:
            The value of ``common.aggregate_results`` applied to the per-example
            results. The aggregate metrics computed here are only printed.
        """
        def fn(row: dict):
            question = row.get("problem", "")
            # Read the reference answer once, with the same default everywhere,
            # so a row without an "answer" key cannot raise KeyError after grading.
            correct_answer = row.get("answer", "")
            response_text = row.get("response", "")

            prompt_messages = [
                self.grader_model._pack_message(content=question, role="user")
            ]
            grade_letter = self.grade_sample(question, correct_answer, response_text)

            # Metrics based on grading response
            is_correct = grade_letter == "A"
            is_incorrect = grade_letter == "B"
            is_not_attempted = grade_letter == "C"

            score = is_correct

            # Create HTML for each sample result
            html = common.jinja_env.from_string(common.HTML_JINJA).render(
                prompt_messages=prompt_messages,
                next_message=dict(content=response_text, role="assistant"),
                score=score,
                correct_answer=correct_answer,
                extracted_answer=response_text,
            )
            convo = prompt_messages + [dict(content=response_text, role="assistant")]
            return SingleEvalResult(html=html, score=score, convo=convo, metrics={
                "is_correct": is_correct,
                "is_incorrect": is_incorrect,
                "is_not_attempted": is_not_attempted
            })

        # Run evaluation and collect results
        results = common.map_with_progress(fn, self.examples)
        n_results = len(results)

        def fraction(metric: str) -> float:
            # With no results there is nothing to average; report 0 instead of
            # raising ZeroDivisionError.
            if n_results == 0:
                return 0
            return sum(result.metrics[metric] for result in results) / n_results

        # Aggregate metrics
        aggregate_metrics = {
            "is_correct": fraction("is_correct"),
            "is_incorrect": fraction("is_incorrect"),
            "is_not_attempted": fraction("is_not_attempted"),
        }
        aggregate_metrics["is_given_attempted"] = aggregate_metrics["is_correct"] + aggregate_metrics["is_incorrect"]
        # Calculate accuracy_given_attempted
        aggregate_metrics["accuracy_given_attempted"] = (
            aggregate_metrics["is_correct"]
            / aggregate_metrics["is_given_attempted"]
            if aggregate_metrics["is_given_attempted"] > 0
            else 0
        )
        print("AGGREGATE METRICS")
        print(aggregate_metrics)
        print("##################")

        # F1 here is the harmonic mean of accuracy_given_attempted and is_correct.
        output_d = {
            "accuracy_given_attempted": aggregate_metrics["accuracy_given_attempted"],
            "f1": (
                2 * aggregate_metrics["accuracy_given_attempted"] * aggregate_metrics["is_correct"]
                / (aggregate_metrics["accuracy_given_attempted"] + aggregate_metrics["is_correct"])
                if (aggregate_metrics["accuracy_given_attempted"] + aggregate_metrics["is_correct"]) > 0
                else 0
            )
        }

        print(f"Accuracy Given Attempted: {output_d['accuracy_given_attempted']:.3f}")
        print(f"F1 Score: {output_d['f1']:.3f}")

        return common.aggregate_results(results)

In the next cell we show the evaluation for the GPT-4o baseline responses. Here we only run the evaluation on a random sample of 10 examples (drawn with a fixed seed, so the sample is reproducible); to run it on all examples, set `num_examples` to `None`.

In [4]:
# Load the saved GPT-4o baseline prompt/response pairs and build the grader.
grading_sampler = ChatCompletionSampler(model="gpt-4o")
gpt_4o_baseline = pd.read_csv("results/gpt-4o-baseline-responses.csv")

# Grade a 10-example sample; the metrics are printed by the evaluation itself.
simple_qa_eval = SimpleQAEval(
    grader_model=grading_sampler,
    dataset=gpt_4o_baseline,
    num_examples=10,
)
res = simple_qa_eval()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 12.43it/s]

AGGREGATE METRICS
{'is_correct': 0.3, 'is_incorrect': 0.7, 'is_not_attempted': 0.0, 'is_given_attempted': 1.0, 'accuracy_given_attempted': 0.3}
##################
Accuracy Given Attempted: 0.300
F1 Score: 0.300





Run the function below (uncomment the call on the last line of the cell) to evaluate all datasets generated in the [get_tlm_response.ipynb](get_tlm_response.ipynb) notebook.

In [5]:
def evaluate_all_datasets(dataset_list: list[str] | None = None, grader_model_name: str = "gpt-4o") -> None:
    """Run ``SimpleQAEval`` on each response file and print its metrics.

    For every path the function prints the path, loads the CSV, runs the
    evaluation on all of its rows (which prints the aggregate metrics), and
    prints a blank separator line. One grader sampler is shared by all files.

    Args:
        dataset_list: Paths of the CSV files to evaluate. Defaults to the six
            GPT-4o baseline/best response files under ``results/``.
        grader_model_name: Model name passed to ``ChatCompletionSampler`` for
            the grader.
    """
    if dataset_list is None:
        dataset_list = [
            "results/gpt-4o-baseline-responses.csv",
            "results/gpt-4o-baseline-25-responses.csv",
            "results/gpt-4o-baseline-80-responses.csv",
            "results/gpt-4o-best-responses.csv",
            "results/gpt-4o-best-25-responses.csv",
            "results/gpt-4o-best-80-responses.csv",
        ]

    grading_sampler = ChatCompletionSampler(model=grader_model_name)

    for dataset in dataset_list:
        print(dataset)
        df = pd.read_csv(dataset)
        simple_qa_eval = SimpleQAEval(grading_sampler, dataset=df)
        # The evaluation prints its own metrics; the returned result is not needed here.
        simple_qa_eval()
        print()
        print()

# evaluate_all_datasets()