In [None]:
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import GEval
from dotenv import load_dotenv
from tqdm import tqdm

import pandas as pd
import time

In [None]:
# Read a local .env file into the process environment; the call's return value
# is assigned to `_` only so the cell renders no output.
# NOTE(review): presumably this supplies the API key used by the "gpt-4o" judge
# configured in the metric cells below — confirm which variables are required.
_ = load_dotenv()

In [None]:
# Model responses for the zero-shot prompt.
shot0_results = pd.read_csv(
    filepath_or_buffer="Llama-3.2-3B-Instruct_MATH_0_shot_test_results.csv"
)
# Model responses for the chain-of-thought prompt.
reasn_results = pd.read_csv(
    filepath_or_buffer="Llama-3.2-3B-Instruct_MATH_cot_reasoning_test_results.csv"
)
# Reference data: the evaluation cells read question text, reference reasoning
# and the dataset/question ids from this frame.
test_data = pd.read_csv(filepath_or_buffer="MATH_test_staging.csv")

In [None]:
# G-Eval judge for answer correctness: the judge model is shown the question,
# the model's response and the reference text, and applies the criteria below.
correctness_metric = GEval(
    name="Correctness",
    model="gpt-4o",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
    criteria="Determine whether the actual output is factually correct based on the expected output.",
)

In [None]:
# G-Eval judge for the reasoning itself: per the criteria text, it looks at
# whether the steps are logically sound and coherent, and explicitly does not
# care whether the final answer matches.
logical_metric = GEval(
    name="Logical Coherence",
    model="gpt-4o",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
    criteria="Determine whether the steps followed in actual output are logically correct and coherent based on the expected output. Whether the final answer matches is not important.",
)

In [None]:
# G-Eval judge for overall chain-of-thought quality, configured with explicit
# evaluation steps in addition to the criteria text.
# NOTE(review): no visible cell in this notebook calls `geval_score`; only
# `correctness_metric` and `logical_metric` are measured — confirm whether this
# metric is still needed.
geval_score = GEval(
    name="Reasoning Quality",
    criteria="Evaluate the chain-of-thought reasoning in the actual output compared to the expected output. Assess whether the reasoning steps are logical, complete, and lead to the correct conclusion.",
    evaluation_steps=[
        "Check if each reasoning step in the actual output is explicitly stated and follows logically from the previous step.",
        "Verify that all critical steps from the expected output are present in the actual output.",
        # Typo fix: this step previously read "expected outout" in the text
        # sent to the judge model.
        "Determine if the final conclusion in the actual output matches the expected output and is supported by the reasoning.",
    ],
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
    model="gpt-4o",
    threshold=0.7,  # NOTE(review): presumably the pass/fail cutoff on the score — confirm
)

In [None]:
# Per-question accumulators filled by the evaluation loop that follows; each
# name is bound to its own, independent empty list.
dataset_id, question_id = [], []
correctness_score, correctness_reason = [], []
logical_score, logical_reason = [], []

In [None]:
# Score the zero-shot responses with the two G-Eval judges (correctness, then
# logical coherence), one question at a time, for index labels 0..99.
# NOTE(review): `test_data` and `shot0_results` are paired purely by index
# label; this assumes both CSVs list the same questions in the same order —
# confirm.
for idx in tqdm(range(100), desc="evaluating"):
    test_row = test_data.loc[idx]
    sample_row = shot0_results.loc[idx]
    testcase = LLMTestCase(
        input=test_row["question_text"],
        actual_output=sample_row["response"],
        expected_output=test_row["reasoning"],
    )

    verdicts = []  # (score, reason) per judge, in the order measured
    judges = (("correctness", correctness_metric), ("logical", logical_metric))
    for position, (label, metric) in enumerate(judges):
        if position:
            time.sleep(2)  # pause between the two judge calls
        # Retry until the judge call succeeds. Only `Exception` is caught: a
        # bare `except:` would also swallow KeyboardInterrupt/SystemExit,
        # making the cell impossible to stop while a call is in flight.
        while True:
            try:
                metric.measure(testcase, _show_indicator=False)
                break
            except Exception as exc:
                # Surface the failure instead of retrying silently, so a
                # permanent error is visible and can be interrupted.
                tqdm.write(f"row {idx}: {label} metric failed ({exc!r}); retrying in 2s")
                time.sleep(2)
        verdicts.append((metric.score, metric.reason))

    # Record the row only after both judges succeeded, all lists at once, so
    # an interrupted iteration cannot leave the lists with different lengths.
    (c_score, c_reason), (l_score, l_reason) = verdicts
    dataset_id.append(test_row["dataset_id"])
    question_id.append(test_row["question_id"])
    correctness_score.append(c_score)
    correctness_reason.append(c_reason)
    logical_score.append(l_score)
    logical_reason.append(l_reason)
    time.sleep(4)  # pause before the next question

In [None]:
# Assemble the collected ids, scores and judge explanations into one table and
# write it to disk without the index column.
pd.DataFrame(
    dict(
        dataset_id=dataset_id,
        question_id=question_id,
        geval_correctness=correctness_score,
        geval_correctness_trace=correctness_reason,
        geval_logical=logical_score,
        geval_logical_trace=logical_reason,
    )
).to_csv("Llama-3.2-3B-Instruct_MATH_0_shot_test_geval.csv", index=False)

In [None]:
# Rebind all six accumulators to fresh, independent empty lists so the next
# evaluation pass starts from a clean slate.
(
    dataset_id,
    question_id,
    correctness_score,
    correctness_reason,
    logical_score,
    logical_reason,
) = ([] for _ in range(6))

In [None]:
# Score the chain-of-thought responses with the two G-Eval judges
# (correctness, then logical coherence), one question at a time, for index
# labels 0..99.
# NOTE(review): `test_data` and `reasn_results` are paired purely by index
# label; this assumes both CSVs list the same questions in the same order —
# confirm.
for idx in tqdm(range(100), desc="evaluating"):
    test_row = test_data.loc[idx]
    sample_row = reasn_results.loc[idx]
    testcase = LLMTestCase(
        input=test_row["question_text"],
        actual_output=sample_row["response"],
        expected_output=test_row["reasoning"],
    )

    verdicts = []  # (score, reason) per judge, in the order measured
    judges = (("correctness", correctness_metric), ("logical", logical_metric))
    for position, (label, metric) in enumerate(judges):
        if position:
            time.sleep(2)  # pause between the two judge calls
        # Retry until the judge call succeeds. Only `Exception` is caught: a
        # bare `except:` would also swallow KeyboardInterrupt/SystemExit,
        # making the cell impossible to stop while a call is in flight.
        while True:
            try:
                metric.measure(testcase, _show_indicator=False)
                break
            except Exception as exc:
                # Surface the failure instead of retrying silently, so a
                # permanent error is visible and can be interrupted.
                tqdm.write(f"row {idx}: {label} metric failed ({exc!r}); retrying in 2s")
                time.sleep(2)
        verdicts.append((metric.score, metric.reason))

    # Record the row only after both judges succeeded, all lists at once.
    # Appending the ids before measuring could leave them one entry longer
    # than the score lists if the iteration is interrupted.
    (c_score, c_reason), (l_score, l_reason) = verdicts
    dataset_id.append(test_row["dataset_id"])
    question_id.append(test_row["question_id"])
    correctness_score.append(c_score)
    correctness_reason.append(c_reason)
    logical_score.append(l_score)
    logical_reason.append(l_reason)
    time.sleep(4)  # pause before the next question

In [None]:
# Assemble the collected ids, scores and judge explanations into one table and
# write it to disk without the index column.
pd.DataFrame(
    dict(
        dataset_id=dataset_id,
        question_id=question_id,
        geval_correctness=correctness_score,
        geval_correctness_trace=correctness_reason,
        geval_logical=logical_score,
        geval_logical_trace=logical_reason,
    )
).to_csv("Llama-3.2-3B-Instruct_MATH_cot_reasoning_test_geval.csv", index=False)