In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

In [None]:
# Load the correctness results, one CSV per strategy (P0-P3), in that order.
cr_df_P0, cr_df_P1, cr_df_P2, cr_df_P3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "eval/results/correctness/P0/correctness_results_P0.csv",
        "eval/results/correctness/P1/correctness_results_P1.csv",
        "eval/results/correctness/P2/correctness_results_P2.csv",
        "eval/results/correctness/P3/correctness_results_P3.csv",
    )
)


In [None]:
def correctness_rates(df: pd.DataFrame) -> dict:
    """
    Summarise a three-stage correctness funnel as counts and percentages.

    Each stage uses its own denominator:

    1) Syntactic not ok:
       rows with syntactic_ok == 0, relative to all rows.

    2) Execution not ok:
       rows with syntactic_ok == 1 AND execution_ok == 0,
       relative to the rows with syntactic_ok == 1.

    3) Assertion not ok:
       rows with syntactic_ok == 1 AND execution_ok == 1 AND assertion_ok == 0,
       relative to the rows with syntactic_ok == 1 AND execution_ok == 1.

    "Fully correct" rows have all three flags equal to 1 and are reported
    relative to all rows.

    Note:
    assertion_ok encodes both assertion presence and passing behavior, so the
    "assertion not ok" figures cover tests with missing assertions as well as
    tests with failing assertions.

    Args:
        df: one row per generated test, with the columns ``syntactic_ok``,
            ``execution_ok`` and ``assertion_ok``.

    Returns:
        A dict with the ``*_count`` / ``*_pct`` pairs for every stage plus the
        stage denominators (``total_tests``, ``syntactic_valid_tests``,
        ``executed_tests``). A percentage whose denominator is 0 is 0.0.

    Raises:
        ValueError: if ``df`` has no rows.
    """
    n_rows = len(df)
    if n_rows == 0:
        raise ValueError("DataFrame is empty")

    # Stage 1: syntax.
    syntax_flag = df["syntactic_ok"]
    parses = syntax_flag == 1
    n_syntax_bad = (syntax_flag == 0).sum()
    n_syntax_good = parses.sum()

    # Stage 2: execution, restricted to tests that parsed.
    exec_flag = df["execution_ok"]
    n_exec_bad = (parses & (exec_flag == 0)).sum()
    if n_syntax_good > 0:
        exec_bad_pct = 100 * n_exec_bad / n_syntax_good
    else:
        exec_bad_pct = 0.0

    # Stage 3: assertions, restricted to tests that parsed and executed.
    ran = parses & (exec_flag == 1)
    ran_rows = df[ran]
    n_ran = len(ran_rows)
    if n_ran > 0:
        n_assert_bad = (ran_rows["assertion_ok"] == 0).sum()
        assert_bad_pct = 100 * n_assert_bad / n_ran
    else:
        n_assert_bad = 0
        assert_bad_pct = 0.0

    # Tests that passed every stage.
    n_all_good = (ran & (df["assertion_ok"] == 1)).sum()

    return {
        "total_tests": n_rows,

        "syntactic_not_ok_count": n_syntax_bad,
        "syntactic_not_ok_pct": 100 * n_syntax_bad / n_rows,

        "syntactic_valid_tests": n_syntax_good,
        "execution_not_ok_count": n_exec_bad,
        "execution_not_ok_pct": exec_bad_pct,

        "executed_tests": n_ran,
        "assertion_not_ok_executed_count": n_assert_bad,
        "assertion_not_ok_executed_pct": assert_bad_pct,

        "fully_correct_count": n_all_good,
        "fully_correct_pct": 100 * n_all_good / n_rows,
    }



In [None]:
# Build the correctness summary: one row of rates per strategy, tagged with its label.
correctness_results = [
    {**correctness_rates(frame), "strategy": label}
    for label, frame in (
        ("P0", cr_df_P0),
        ("P1", cr_df_P1),
        ("P2", cr_df_P2),
        ("P3", cr_df_P3),
    )
]

correctness_summary = pd.DataFrame(correctness_results)

# Paper-ready table (compact): percentage columns only, LaTeX-escaped headers,
# values rounded to two decimals.
paper_correctness = (
    correctness_summary[[
        "strategy",
        "total_tests",
        "syntactic_not_ok_pct",
        "execution_not_ok_pct",
        "assertion_not_ok_executed_pct",
        "fully_correct_pct",
    ]]
    .rename(columns={
        "strategy": "Strategy",
        "total_tests": "N tests",
        "syntactic_not_ok_pct": "Syntactic not ok (\\%)",
        "execution_not_ok_pct": "Execution not ok* (\\%)",
        "assertion_not_ok_executed_pct": "Assertion not ok** (\\%)",
        "fully_correct_pct": "Fully correct (\\%)",
    })
    .round(2)
)

paper_correctness




In [None]:
# Print the table in the LaTeX tabular form required by the template

def df_to_latex_tabular(df: pd.DataFrame) -> str:
    """
    Render a DataFrame as a fully ruled LaTeX ``tabular`` environment.

    The first column is left-aligned and every other column is centred. A
    ``\\hline`` follows the header and every data row. The DataFrame index is
    not written.

    Column labels and cell values are converted with ``str()`` and inserted
    verbatim: nothing is LaTeX-escaped here, so callers must pre-escape special
    characters (for example ``\\%`` in a header).

    Args:
        df: table to render.

    Returns:
        The LaTeX source as a single newline-joined string, without a trailing
        newline.
    """
    # str() so non-string column labels (e.g. integers) do not break the join.
    cols = [str(c) for c in df.columns]

    # "|l|c|...|c|"; built from one list so a one-column frame gives "|l|"
    # instead of the malformed "|l||".
    col_format = "|" + "|".join(["l"] + ["c"] * (len(cols) - 1)) + "|"

    lines = []
    lines.append(f"\\begin{{tabular}}{{{col_format}}}")
    lines.append("\\hline")

    # Header
    lines.append(" & ".join(cols) + " \\\\")
    lines.append("\\hline")

    # Rows: itertuples keeps each column's own dtype. iterrows would coerce a
    # whole row to one dtype, printing integer counts as "1.0" whenever every
    # column is numeric.
    for row in df.itertuples(index=False, name=None):
        lines.append(" & ".join(str(v) for v in row) + " \\\\")
        lines.append("\\hline")

    lines.append("\\end{tabular}")
    return "\n".join(lines)


# Render the percentage table as LaTeX source and print it.
latex_tabular = df_to_latex_tabular(df=paper_correctness)
print(latex_tabular)



In [None]:
import os

def save_latex(path: str, content: str) -> None:
    """
    Write ``content`` to ``path``, creating missing parent directories.

    Args:
        path: destination file; an existing file is overwritten.
        content: text to write (written as-is, no trailing newline added).
    """
    parent_dir = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Explicit UTF-8 keeps the output independent of the platform's locale.
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)

# Save the table text in the repository.
save_latex(path="latex/tables/API/correctness.tex", content=latex_tabular)

In [None]:
# Paper-ready correctness table: absolute counts plus one percentage column.
paper_correctness_counts = (
    correctness_summary[[
        "strategy",
        "total_tests",
        "syntactic_not_ok_count",
        "execution_not_ok_count",
        "assertion_not_ok_executed_count",
        "fully_correct_count",
    ]]
    # Percentage of tests that are NOT fully correct, relative to all tests.
    .assign(
        not_fully_correct_pct=lambda counts: (
            100.0
            * (counts["total_tests"] - counts["fully_correct_count"])
            / counts["total_tests"]
        )
    )
    .rename(columns={
        "strategy": "Strategy",
        "total_tests": "N tests",
        "syntactic_not_ok_count": "Syntactic not ok",
        "execution_not_ok_count": "Execution not ok*",
        "assertion_not_ok_executed_count": "Assertion not ok**",
        "fully_correct_count": "Fully correct",
        "not_fully_correct_pct": "Not fully correct (\\%)",
    })
    # Only the percentage column is rounded; the count columns stay untouched.
    .round({"Not fully correct (\\%)": 2})
)

paper_correctness_counts



In [None]:
import os



# NOTE(review): a helper with this same name is also defined in an earlier
# cell; consider keeping a single definition so the two cannot drift apart.
def save_latex(path: str, content: str) -> None:
    """
    Write ``content`` to ``path``, creating missing parent directories.

    Args:
        path: destination file; an existing file is overwritten.
        content: text to write (written as-is, no trailing newline added).
    """
    parent_dir = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Explicit UTF-8 keeps the output independent of the platform's locale.
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)

# Render the absolute-count table as LaTeX source and print it.
latex_tabular_absolute = df_to_latex_tabular(df=paper_correctness_counts)
print(latex_tabular_absolute)

# Save the table text in the repository.
save_latex(
    path="latex/tables/API/correctness_absolute2.tex",
    content=latex_tabular_absolute,
)

In [None]:
#Graph for correctness percentages

import matplotlib.pyplot as plt

# Grouped bar chart: one group per strategy, one bar per "not ok" rate.
plot_df = correctness_summary.set_index("strategy")[
    ["syntactic_not_ok_pct", "execution_not_ok_pct", "assertion_not_ok_executed_pct"]
]

colors = ['deepskyblue', 'royalblue', 'midnightblue']

fig, ax = plt.subplots(figsize=(7, 4.5))

plot_df.plot(
    kind="bar",
    ax=ax,
    color=colors,
    width=0.7
)

ax.set_ylabel("Not ok rate (%)", fontsize=11, labelpad=10)
ax.set_xlabel("Prompt strategy", fontsize=11, labelpad=10)

ax.set_title(
    "Correctness not ok rates by prompt strategy",
    pad=25
)

# Leave 25% headroom above the tallest bar. When every rate is 0 (or the
# maximum is NaN) fall back to a fixed upper limit, because identical or NaN
# axis limits are not usable.
highest_rate = plot_df.max().max()
ax.set_ylim(0, highest_rate * 1.25 if highest_rate > 0 else 1.0)

# Labels are matched to the bars by position, so their order must follow the
# column order of plot_df.
ax.legend(
    [
        "Syntactic not ok (all tests)",
        "Execution not ok (syntactic-valid tests)",
        "Assertion not ok (executed tests only)"
    ],
    loc="upper center",
    bbox_to_anchor=(0.5, 1.05),
    frameon=False
)

ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)

fig.subplots_adjust(top=0.82)

# Save the image in the repository. savefig does not create missing
# directories, so make sure the target folder exists first.
figure_path = Path("latex/figures/API/correctness_bar.png")
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(
    figure_path,
    dpi=300,
    bbox_inches="tight"
)

plt.show()

In [None]:
# Show which kinds of execution errors the LLM struggled with most (assertion errors not included)

def categorize_error(msg) -> str:
    """
    Map an error message to a coarse error category.

    Matching is a case-insensitive substring search and the first rule that
    matches wins, so a message mentioning several error types is assigned to
    the earliest rule in the list below.

    Args:
        msg: the error message; NaN/None is treated as an empty message and
            any other non-string value is converted with ``str()``.

    Returns:
        One of "ValueError", "AttributeError", "TypeError",
        "AssertionFailure" or "Other" (when no rule matches).
    """
    text = ("" if pd.isna(msg) else str(msg)).lower()

    # (substring to look for, category), in priority order.
    ordered_rules = (
        ("valueerror", "ValueError"),
        ("attributeerror", "AttributeError"),
        ("typeerror", "TypeError"),
        ("assertionerror", "AssertionFailure"),
        ("assert", "AssertionFailure"),
    )

    return next(
        (category for needle, category in ordered_rules if needle in text),
        "Other",
    )


# One record per (strategy, error category) with the number of tests that
# parsed (syntactic_ok == 1) but failed to execute (execution_ok == 0).
error_summary = []

for strategy, df in [("P0", cr_df_P0), ("P1", cr_df_P1), ("P2", cr_df_P2), ("P3", cr_df_P3)]:
    exec_fail_df = df[(df["syntactic_ok"] == 1) & (df["execution_ok"] == 0)]

    # categorize_error treats NaN/None as an empty message, so missing error
    # messages need no separate fillna step.
    counts = exec_fail_df["error_message"].apply(categorize_error).value_counts()

    error_summary.extend(
        {"strategy": strategy, "error_category": cat, "count": count}
        for cat, count in counts.items()
    )

error_summary_df = pd.DataFrame(error_summary)

if error_summary_df.empty:
    print("No execution not ok tests found (after filtering syntactic_ok==1 & execution_ok==0).")
else:
    # Rows: error category, columns: strategy. A category missing for a
    # strategy becomes NaN in the pivot; it is filled with 0 and the counts
    # are cast back to int (the cast result must be kept, it is not in-place).
    pivot = (
        error_summary_df
        .pivot(index="error_category", columns="strategy", values="count")
        .fillna(0)
        .astype(int)
    )

    fig, ax = plt.subplots()
    pivot.plot(kind="bar", ax=ax, rot=0)

    ax.set_ylabel("Number of execution not ok tests")
    ax.set_xlabel("Error category")
    ax.set_title("Execution error categories by prompt strategy")

    # Save the image in the repository. savefig does not create missing
    # directories, so make sure the target folder exists first.
    figure_path = Path("latex/figures/API/correctness_error_types.png")
    figure_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(
        figure_path,
        dpi=300,
        bbox_inches="tight"
    )

    plt.show()
