In [2]:
import os
import pickle
import time
import json
from evalplus.gen.util import trusted_exec
from evalplus.data.utils import CACHE_DIR
from evalplus.data import get_human_eval_plus, get_human_eval_plus_hash

from evalplus.eval._special_oracle import (
    MBPP_OUTPUT_NOT_NONE_TASKS,
    MBPP_OUTPUT_SET_EQ_TASKS,
    _poly,
)

def get_groundtruth(problems, hashcode, tasks_only_output_not_none):
    """Return the expected outputs of every problem's canonical solution.

    Results are cached on disk at ``<CACHE_DIR>/<hashcode>.pkl``. If that file
    exists it is unpickled and returned as-is. Otherwise each problem's
    ``prompt + canonical_solution`` is run through ``trusted_exec`` on both its
    base and plus inputs, and the result is written to the cache before being
    returned.

    Args:
        problems: Mapping of task id to a problem dict providing the keys
            ``"prompt"``, ``"canonical_solution"``, ``"entry_point"``,
            ``"base_input"`` and ``"plus_input"``.
        hashcode: String used as the stem of the cache file name.
        tasks_only_output_not_none: Container of entry-point names. Membership
            of a problem's entry point is forwarded to ``trusted_exec`` as
            ``output_not_none``.

    Returns:
        Dict mapping each task id to a dict with the keys ``"base"``,
        ``"base_time"``, ``"plus"`` and ``"plus_time"``, holding the pairs
        returned by ``trusted_exec(..., record_time=True)``.
    """
    cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl")
    if os.path.exists(cache_file):
        # NOTE(security): unpickling can execute arbitrary code; this is only
        # safe as long as CACHE_DIR is written by trusted processes.
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    os.makedirs(CACHE_DIR, exist_ok=True)
    expected_output = {}
    for task_id, problem in problems.items():
        solution = problem["prompt"] + problem["canonical_solution"]
        entry_point = problem["entry_point"]
        output_not_none = entry_point in tasks_only_output_not_none

        oracle = {}
        # Same execution for both input sets; only the inputs differ.
        for split in ("base", "plus"):
            oracle[split], oracle[f"{split}_time"] = trusted_exec(
                solution,
                problem[f"{split}_input"],
                entry_point,
                record_time=True,
                output_not_none=output_not_none,
            )
        expected_output[task_id] = oracle

    # Write to a temporary file and rename it into place so that a failed or
    # interrupted dump never leaves a truncated cache file behind (which every
    # later call would otherwise try, and fail, to unpickle).
    tmp_file = f"{cache_file}.{os.getpid()}.tmp"
    try:
        with open(tmp_file, "wb") as f:
            pickle.dump(expected_output, f)
        os.replace(tmp_file, cache_file)
    finally:
        # Only still present when the dump or the rename failed.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

    return expected_output

# Ground truth for HumanEval+: outputs of the canonical solutions on the base
# and plus inputs (loaded from the on-disk cache when it already exists).
dataset = get_human_eval_plus()
dataset_hash = get_human_eval_plus_hash()
expected_output = get_groundtruth(
    dataset,
    dataset_hash,
    # The dataset here is HumanEval+, so the MBPP-specific
    # MBPP_OUTPUT_NOT_NONE_TASKS list does not belong in this call.
    # NOTE(review): upstream EvalPlus appears to pass an empty list for
    # HumanEval - confirm against the installed evalplus version.
    [],
)


In [9]:
# Load the trial inputs from a JSON Lines file: one JSON object per line, each
# providing a "task_id" and its "trial_input" entry.
trial_inputs = {}
# JSON text is UTF-8; be explicit instead of relying on the platform default.
with open("../other_data/trial_inputs.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of crashing inside json.loads.
            continue
        data = json.loads(line)
        trial_inputs[data["task_id"]] = data["trial_input"]


In [24]:
# Count how many trial inputs also occur verbatim among the dataset's
# "base_input" and "plus_input" entries of the same task.
counts = 0
based_matched_counts = 0
plus_matched_counts = 0
for task_id, task_trials in trial_inputs.items():
    for trial_input in task_trials:
        counts += 1
        problem = dataset[task_id]
        # `in` yields a bool, so adding it bumps the counter by 0 or 1.
        based_matched_counts += int(trial_input in problem["base_input"])
        plus_matched_counts += int(trial_input in problem["plus_input"])

# First line: matches against base inputs; second line: against plus inputs.
print(f"Matched {based_matched_counts} out of {counts}")
print(f"Matched {plus_matched_counts} out of {counts}")



Matched 344 out of 460
Matched 90 out of 460


In [26]:
# Fraction of trial inputs found among the base inputs, computed from the
# counters rather than from numbers copied out of the previous cell's output.
based_matched_counts / counts if counts else float("nan")

0.7478260869565218

In [27]:
# Fraction of trial inputs found among the plus inputs, computed from the
# counters rather than from numbers copied out of an earlier cell's output.
plus_matched_counts / counts if counts else float("nan")

0.1956521739130435