In [2]:
import argparse
import json
import multiprocessing
import os
import pickle
import threading
import time
from collections import Counter, defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed
from datetime import datetime
from typing import Any, Dict, List, Tuple
from warnings import warn

from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm

import numpy as np

from evalplus.data.utils import CACHE_DIR

from evalplus.data import (
    get_human_eval_plus,
    get_human_eval_plus_hash,
    get_mbpp_plus,
    get_mbpp_plus_hash,
    load_solutions,
)

from evalplus.eval._special_oracle import (
    MBPP_OUTPUT_NOT_NONE_TASKS,
    MBPP_OUTPUT_SET_EQ_TASKS,
    _poly,
)

from evalplus.gen.util import trusted_exec

def is_floats(x) -> bool:
    # check if it is float; List[float]; Tuple[float]
    if isinstance(x, float):
        return True
    if isinstance(x, (list, tuple)):
        return all(isinstance(i, float) for i in x)
    if isinstance(x, np.ndarray):
        return x.dtype == np.float64 or x.dtype == np.float32
    return False

def ut_exact_match(
    hyp_ut, 
    ref_ut, 
    entry_point, 
    dataset, 
    inp=None, 
    atol=0 # need to change this later
    ):
    exact_match = hyp_ut == ref_ut

    # ================================================ #
    # ============== special oracles ================= #
    if dataset == "mbpp":
        if "are_equivalent" == entry_point:  # Mbpp/164 special oracle
            exact_match = exact_match or True
        elif "sum_div" == entry_point:  # Mbpp/295 special oracle
            exact_match = exact_match or hyp_ut == 0 or ref_ut == 0
        elif entry_point in MBPP_OUTPUT_SET_EQ_TASKS:
            exact_match = set(hyp_ut) == set(ref_ut)
        elif entry_point in MBPP_OUTPUT_NOT_NONE_TASKS:
            # exp is True  if not None
            #        False if None
            if isinstance(hyp_ut, bool):
                hyp_ut = hyp_ut is not None
            if isinstance(ref_ut, bool):
                ref_ut = ref_ut is not None
            exact_match = hyp_ut == ref_ut

    if dataset == "humaneval":
        if "find_zero" == entry_point:
            hyp_ut = _poly(*inp, hyp_ut) <= atol
            ref_ut = _poly(*inp, ref_ut) <= atol
            exact_match = hyp_ut == ref_ut
    # ============== special oracles ================= #
    # ================================================ #

    if atol == 0 and (is_floats(ref_ut) or is_floats(hyp_ut)):
        atol = 1e-6  # enforce atol for float comparison
    if not exact_match and atol != 0:
        # explicitly set rtol=1e-07
        # to match `np.testing.assert_allclose`'s default values
        exact_match =  np.allclose(hyp_ut, ref_ut, rtol=1e-07, atol=atol)
    
    return int(exact_match)

def get_groundtruth(problems, hashcode, tasks_only_output_not_none):
    cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl")
    if os.path.exists(cache_file):
        #print(f"Load from ground-truth from {cache_file}")
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    os.makedirs(CACHE_DIR, exist_ok=True)
    #print("Computing expected output...")
    tbegin = time.time()
    expected_output = {}
    for task_id, problem in problems.items():
        oracle = {}
        oracle["base"], oracle["base_time"] = trusted_exec(
            problem["prompt"] + problem["canonical_solution"],
            problem["base_input"],
            problem["entry_point"],
            record_time=True,
            output_not_none=problem["entry_point"] in tasks_only_output_not_none,
        )

        oracle["plus"], oracle["plus_time"] = trusted_exec(
            problem["prompt"] + problem["canonical_solution"],
            problem["plus_input"],
            problem["entry_point"],
            record_time=True,
            output_not_none=problem["entry_point"] in tasks_only_output_not_none,
        )
        expected_output[task_id] = oracle
    #print(f"Expected outputs computed in {time.time() - tbegin:.2f}s")

    with open(cache_file, "wb") as f:
        pickle.dump(expected_output, f)

    return expected_output

In [3]:
def mbr_exec(hyp_uts, ref_uts, entry_point, dataset, n_uts, inps=None, granular=False):
    n_matches = 0
    for i in range(n_uts):
        # skip if either hyp_ut or ref_ut is not in the list
        if i not in hyp_uts or i not in ref_uts:
            continue
        # if there's an error, we return 0
        if type(hyp_uts[i]) == str and hyp_uts[i].startswith("failed:"):
            return 0 
        if type(ref_uts[i]) == str and ref_uts[i].startswith("failed:"):
            return 0
        # we start counting the number of matches
        try:
            n_matches += ut_exact_match(
                hyp_uts[i], 
                ref_uts[i], 
                entry_point, 
                dataset, 
                inp=inps[i] if inps else None
                )
        except:
            n_matches += 0
        
    if granular:
        try:
            return n_matches/ n_uts
        except:
            return 0
    else:
        return int(n_matches == n_uts)


In [4]:
work_dir = "/mnt/scratch-artemis/haausing/code_reranking/evalplus_outputs"
dataset = "humaneval"
#gen_dir = "deepseek-coder-33b-instruct_temp_0.8"
#gen_dir = "deepseek-coder-7b-instruct-v1.5_temp_1.2"
gen_dir = "deepseek-coder-6.7b-instruct_temp_1.2"
#gen_dir = "code-llama-13b-instruct_temp_1.6"
#gen_dir = "code-llama-7b-instruct_temp_1.6"
#debug_gen_dir = gen_dir + "_debug1_not_change_positive"
debug_gen_dir = gen_dir + "_debug1_sd-ut"
debug_3times_gen_dir = gen_dir + "_debug3_sd-ut"
#_debug1_not_change_positive
# load exec_outputs

# load problems
if dataset == "mbpp":
    problems = get_mbpp_plus()
    dataset_hash = get_mbpp_plus_hash()
    expected_output = get_groundtruth(
        problems,
        dataset_hash,
        MBPP_OUTPUT_NOT_NONE_TASKS,
    )
elif dataset == "humaneval":
    problems = get_human_eval_plus()
    dataset_hash = get_human_eval_plus_hash()
    expected_output = get_groundtruth(
        problems,
        dataset_hash,
        []
    )
else:
    raise ValueError("Invalid dataset")

with open(f"{work_dir}/{dataset}/{gen_dir}/exec_outputs_v2.pkl", "rb") as f:
    exec_outputs = pickle.load(f)
print("exec_outputs loaded")

# load exec_outputs
with open(f"{work_dir}/{dataset}/{debug_gen_dir}/exec_outputs_v2.pkl", "rb") as f:
    exec_outputs_debug = pickle.load(f)
print("exec_outputs_debug loaded")

# load eval_results
with open(f"{work_dir}/{dataset}/{gen_dir}/eval_results.json", "r") as f:
    eval_results = json.load(f)
for task_id in eval_results["eval"]:
    eval_results["eval"][task_id] = sorted(eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))
    
#pop out ["Mbpp/6", "Mbpp/7", "Mbpp/8", "Mbpp/9"]
if dataset == "mbpp":
    for task_id in ["Mbpp/6", "Mbpp/7", "Mbpp/8", "Mbpp/9"]:
        eval_results["eval"].pop(task_id)
print("eval_results loaded")

# load eval_results
with open(f"{work_dir}/{dataset}/{debug_gen_dir}/eval_results.json", "r") as f:
    ape_eval_results = json.load(f)
for task_id in ape_eval_results["eval"]:
    ape_eval_results["eval"][task_id] = sorted(ape_eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))
print("ape_eval_results loaded")


# load eval_results
with open(f"{work_dir}/{dataset}/{debug_3times_gen_dir}/eval_results.json", "r") as f:
    ape_3times_eval_results = json.load(f)
for task_id in ape_3times_eval_results["eval"]:
    ape_3times_eval_results["eval"][task_id] = sorted(ape_3times_eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))
print("ape_3times_eval_results loaded")



exec_outputs loaded
exec_outputs_debug loaded
eval_results loaded
ape_eval_results loaded
ape_3times_eval_results loaded


In [5]:
if dataset == "humaneval":
    with open(f"{work_dir}/{dataset}/{gen_dir}/errors.pkl", "rb") as f:
        errors = pickle.load(f)


In [340]:
from concurrent.futures import ThreadPoolExecutor

def process_task(task_id, eval_results_dict, exec_outputs_dict, max_hyps = 200, start_id = 0, num_base_test_cases = 3,num_plus_test_cases=3, granular=False, filter = True):
    test_all_plus_cases = True
    test_all_base_cases = True
    assert start_id >= 0 and start_id + max_hyps <= 200
    n_expected_outputs_base = len(expected_output[task_id]["base"])
    n_expected_outputs_plus = len(expected_output[task_id]["plus"])
    if num_base_test_cases < n_expected_outputs_base:
        test_all_base_cases = False
    if num_plus_test_cases < n_expected_outputs_plus:
        test_all_plus_cases = False
    task_utility_base = []
    task_utility_plus = []
    
    for hyp_id, hyp in enumerate(eval_results_dict["eval"][task_id]):
        if hyp_id >= max_hyps + start_id or hyp_id < start_id:
            continue
        hyp_base_outputs = exec_outputs_dict[task_id][hyp_id]["base"]
        hyp_plus_outputs = exec_outputs_dict[task_id][hyp_id]["plus"]
        if not test_all_base_cases:
            hyp_base_outputs = {i: hyp_base_outputs[i] for i in range(num_base_test_cases) if i < min(num_base_test_cases, len(hyp_base_outputs))}
        if not test_all_plus_cases:
            hyp_plus_outputs = {i: hyp_plus_outputs[i] for i in range(num_plus_test_cases) if i < min(num_plus_test_cases, len(hyp_plus_outputs))}
        hyp_utility_base = []
        hyp_utility_plus = []
        
        for ref_id, ref in enumerate(eval_results_dict["eval"][task_id]):
            if ref_id >= max_hyps + start_id or ref_id < start_id:
                continue
            if ref["base_status"] == ref["plus_status"] == hyp["base_status"] == hyp["plus_status"] == "pass":
                hyp_utility_base.append(1)
                hyp_utility_plus.append(1)
                continue
            if filter:
                ### add the filtering baseline
                if dataset == "humaneval":
                    if errors[task_id][hyp_id]["base"]["status"] != "pass":
                        hyp_utility_base.append(0)
                        hyp_utility_plus.append(0)
                        continue
                elif dataset == "mbpp":
                    if len(hyp["base_details"]) == 0 or hyp["base_details"][0] == 0:
                        hyp_utility_base.append(0)
                        hyp_utility_plus.append(0)
                        continue
                ### end of filtering baseline
            ref_base_outputs = exec_outputs_dict[task_id][ref_id]["base"]
            ref_plus_outputs = exec_outputs_dict[task_id][ref_id]["plus"]
            if not test_all_base_cases:
                ref_base_outputs = {i: ref_base_outputs[i] for i in range(num_base_test_cases) if i < min(num_base_test_cases, len(ref_base_outputs))}
            if not test_all_plus_cases:
                ref_plus_outputs = {i: ref_plus_outputs[i] for i in range(num_plus_test_cases) if i < min(num_plus_test_cases, len(ref_plus_outputs))}
            
            util_score_base = mbr_exec(hyp_base_outputs, ref_base_outputs, problems[task_id]["entry_point"], "mbpp", n_expected_outputs_base, granular=granular)
            util_score_plus = mbr_exec(hyp_plus_outputs, ref_plus_outputs, problems[task_id]["entry_point"], "mbpp", n_expected_outputs_plus, granular=granular)
            hyp_utility_base.append(util_score_base)
            if granular:
                hyp_utility_plus.append((util_score_plus*n_expected_outputs_plus+util_score_base*n_expected_outputs_base)/(n_expected_outputs_plus+n_expected_outputs_base))
            else:
                if num_plus_test_cases == 0:
                    hyp_utility_plus.append(util_score_base)
                else:
                    hyp_utility_plus.append(int(util_score_plus==util_score_base==1))
            #hyp_utility_plus.append(mbr_exec(hyp_plus_outputs, ref_plus_outputs, problems[task_id]["entry_point"], "mbpp", n_expected_outputs_plus))
        #hyp_utility_plus.extend(hyp_utility_base)
        task_utility_base.append(np.mean(hyp_utility_base))
        task_utility_plus.append(np.mean(hyp_utility_plus))
    
    # get argmax
    argmax_base = np.argmax(task_utility_base) + start_id
    argmax_plus = np.argmax(task_utility_plus) + start_id
    if not test_all_base_cases:
        argmax_plus = argmax_base
    assert argmax_base == int(eval_results_dict["eval"][task_id][argmax_base]["solution_id"])
    assert argmax_plus == int(eval_results_dict["eval"][task_id][argmax_plus]["solution_id"])
    base_status = eval_results_dict["eval"][task_id][argmax_base]["base_status"]
    plus_status = eval_results_dict["eval"][task_id][argmax_plus]["plus_status"]
    return (int(base_status == "pass"), int(base_status == plus_status == "pass"), argmax_base, argmax_plus)


In [341]:
def get_results(eval_results, exec_outputs, max_hyps=200, start_id=0, num_base_test_cases=10, num_plus_test_cases=300, granular=False, filter=True, workers=20):
    for task_id in eval_results["eval"]:
        eval_results["eval"][task_id] = sorted(eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))
        
    base_results = {}
    plus_results = {}
    argmax_bases = {}
    argmax_pluss = {}
    
    def process_single_task(task_id):
        base_result, plus_result, argmax_base, argmax_plus = process_task(task_id, 
                                                                          eval_results, 
                                                                          exec_outputs, 
                                                                          max_hyps=max_hyps, 
                                                                          start_id=start_id, 
                                                                          num_base_test_cases=num_base_test_cases,
                                                                          num_plus_test_cases=num_plus_test_cases, 
                                                                          granular=granular, 
                                                                          filter=filter)
        base_results[task_id] = base_result
        plus_results[task_id] = plus_result
        argmax_bases[task_id] = argmax_base
        argmax_pluss[task_id] = argmax_plus

    with ThreadPoolExecutor(max_workers=workers) as executor:
        list(tqdm(executor.map(process_single_task, eval_results["eval"]), total=len(eval_results["eval"])))
        
    for task_id in ape_3times_eval_results["eval"]:
        ape_3times_eval_results["eval"][task_id] = sorted(ape_3times_eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))

    ape_base_results = {}
    ape_plus_results = {}
    for task_id in ape_3times_eval_results["eval"]:
        argmax_base_solution = ape_3times_eval_results["eval"][task_id][argmax_bases[task_id]]
        argmax_plus_solution = ape_3times_eval_results["eval"][task_id][argmax_pluss[task_id]]
        argmax_base_status = argmax_base_solution["base_status"]
        argmax_plus_status = argmax_plus_solution["plus_status"]
        assert int(argmax_base_solution["solution_id"]) == argmax_bases[task_id]
        assert int(argmax_plus_solution["solution_id"]) == argmax_pluss[task_id]
        ape_base_results[task_id] = int(argmax_base_status == "pass")
        ape_plus_results[task_id] = int(argmax_plus_status == argmax_base_status == "pass")
    
    return base_results, plus_results, ape_base_results, ape_plus_results

In [342]:
def get_debugged_results(ape_eval_results, exec_outputs, max_hyps=200, start_id=0, num_base_test_cases=10, num_plus_test_cases=300, granular=False, filter=True, workers=20):
    for task_id in ape_eval_results["eval"]:
        ape_eval_results["eval"][task_id] = sorted(ape_eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))
    
    debugged_base_results = {}
    debugged_plus_results = {}
    debugged_argmax_bases = {}
    debugged_argmax_pluss = {}

    def process_single_task(task_id):
        debugged_base_result, debugged_plus_result, debugged_argmax_base, debugged_argmax_plus = process_task(task_id, 
                                                                                                              ape_eval_results, 
                                                                                                              exec_outputs,
                                                                                                              max_hyps=max_hyps, 
                                                                                                              start_id=start_id, 
                                                                                                              num_base_test_cases=num_base_test_cases,
                                                                                                              num_plus_test_cases=num_plus_test_cases,
                                                                                                              granular=granular,
                                                                                                              filter=filter)
        debugged_base_results[task_id] = debugged_base_result
        debugged_plus_results[task_id] = debugged_plus_result
        debugged_argmax_bases[task_id] = debugged_argmax_base
        debugged_argmax_pluss[task_id] = debugged_argmax_plus

    with ThreadPoolExecutor(max_workers=workers) as executor:
        list(tqdm(executor.map(process_single_task, ape_eval_results["eval"]), total=len(ape_eval_results["eval"])))

    return debugged_base_results, debugged_plus_results, debugged_argmax_bases, debugged_argmax_pluss

In [359]:
num_base_test_cases = 300
num_plus_test_cases = 10
max_hyps = 50
granular = False
filter = True
workers = 1
base_score = []
plus_score = []
ape_base_score = []
ape_plus_score = []
debugged_base_score = []
debugged_plus_score = []
for start_id in range(0, 200, max_hyps):
    base_results, plus_results, ape_base_results, ape_plus_results = get_results(eval_results,
                                                                                 exec_outputs, 
                                                                                 max_hyps=max_hyps, 
                                                                                 start_id=start_id, 
                                                                                 num_base_test_cases=num_base_test_cases,
                                                                                 num_plus_test_cases=num_plus_test_cases,
                                                                                 granular=granular,
                                                                                 filter=filter,
                                                                                 workers=workers)
    #debugged_base_results, debugged_plus_results, debugged_argmax_bases, debugged_argmax_pluss = get_debugged_results(ape_eval_results, 
    #                                                                                                                      exec_outputs_debug, 
    #                                                                                                                      max_hyps=max_hyps, 
    #                                                                                                                      start_id=start_id, 
    #                                                                                                                      num_base_test_cases=num_base_test_cases,
    #                                                                                                                      num_plus_test_cases=num_plus_test_cases,
    #                                                                                                                      granular=granular,
    #                                                                                                                      filter=filter,
    #                                                                                                                      workers=workers)
    
    base_score.append(sum(base_results.values())/len(base_results))
    plus_score.append(sum(plus_results.values())/len(plus_results))
    ape_base_score.append(sum(ape_base_results.values())/len(ape_base_results))
    ape_plus_score.append(sum(ape_plus_results.values())/len(ape_plus_results))
    #debugged_base_score.append(sum(debugged_base_results.values())/len(debugged_base_results))
    #debugged_plus_score.append(sum(debugged_plus_results.values())/len(debugged_plus_results))

 71%|███████   | 116/164 [00:02<00:00, 50.88it/s]

100%|██████████| 164/164 [00:03<00:00, 51.53it/s]
100%|██████████| 164/164 [00:03<00:00, 41.61it/s]
100%|██████████| 164/164 [00:03<00:00, 43.94it/s]
100%|██████████| 164/164 [00:03<00:00, 51.11it/s]


In [360]:
round_digits = 10
num_base_tests = 10
print("model: {}".format(gen_dir))
print("dataset: {}".format(dataset))
print("filter: {}".format(filter))
print("number of Test Cases: {}".format(num_plus_test_cases+num_base_tests))
print("number of hypotheses: {}".format(max_hyps))
print("MBR base     ", np.round(sum(base_score)/len(base_score) * 100, round_digits), "%")
print("MBR->APE base", np.round(sum(ape_base_score)/len(ape_base_score) * 100, round_digits), "%")
#print("APE->MBR base", np.round(sum(debugged_base_score)/len(debugged_base_score) * 100, round_digits), "%")
print("-"*100)
print("MBR plus     ", np.round(sum(plus_score)/len(plus_score) * 100, round_digits), "%")
print("MBR->APE plus", np.round(sum(ape_plus_score)/len(ape_plus_score) * 100, round_digits), "%")
#print("APE->MBR plus", np.round(sum(debugged_plus_score)/len(debugged_plus_score) * 100, round_digits), "%")

model: code-llama-13b-instruct_temp_1.6
dataset: humaneval
filter: True
number of Test Cases: 20
number of hypotheses: 50
MBR base      80.487804878 %
MBR->APE base 80.9451219512 %
----------------------------------------------------------------------------------------------------
MBR plus      76.0670731707 %
MBR->APE plus 76.5243902439 %


In [51]:
print(base_score)
print(plus_score)
print(ape_base_score)
print(ape_plus_score)
print(debugged_base_score)
print(debugged_plus_score)



[0.8405063291139241, 0.8481012658227848, 0.8430379746835444, 0.830379746835443]
[0.7518987341772152, 0.7518987341772152, 0.7518987341772152, 0.7341772151898734]
[0.8430379746835444, 0.8481012658227848, 0.8405063291139241, 0.8253164556962025]
[0.7493670886075949, 0.7468354430379747, 0.7468354430379747, 0.7265822784810126]
[0.8405063291139241, 0.850632911392405, 0.8582278481012658, 0.8481012658227848]
[0.7569620253164557, 0.7645569620253164, 0.7670886075949367, 0.759493670886076]


In [None]:
# model: code-llama-7b-instruct_temp_1.6
# dataset: mbpp
# filter: True
# number of Test Cases: 503
# number of hypotheses: 50
# MBR base      78.9873417722 %
# MBR->APE base 79.3037974684 %
# APE->MBR base 81.2658227848 %
# ----------------------------------------------------------------------------------------------------
# MBR plus      69.3037974684 %
# MBR->APE plus 69.1772151899 %
# APE->MBR plus 72.0886075949 %

# model: code-llama-7b-instruct_temp_1.6
# dataset: mbpp
# filter: False
# number of Test Cases: 503
# number of hypotheses: 50
# MBR base      63.9873417722 %
# MBR->APE base 65.7594936709 %
# APE->MBR base 66.3291139241 %
# ----------------------------------------------------------------------------------------------------
# MBR plus      57.8481012658 %
# MBR->APE plus 59.0506329114 %
# APE->MBR plus 60.3797468354 %

# model: code-llama-7b-instruct_temp_1.6
# dataset: humaneval
# filter: True
# number of Test Cases: 503
# number of hypotheses: 50
# MBR base      74.5426829268 %
# MBR->APE base 74.3902439024 %
# APE->MBR base 76.5243902439 %
# ----------------------------------------------------------------------------------------------------
# MBR plus      70.1219512195 %
# MBR->APE plus 70.1219512195 %
# APE->MBR plus 72.256097561 %

# model: code-llama-7b-instruct_temp_1.6
# dataset: humaneval
# filter: False
# number of Test Cases: 503
# number of hypotheses: 50
# MBR base      52.1341463415 %
# MBR->APE base 51.9817073171 %
# APE->MBR base 52.4390243902 %
# ----------------------------------------------------------------------------------------------------
# MBR plus      49.5426829268 %
# MBR->APE plus 49.5426829268 %
# APE->MBR plus 50.0 %

# model: deepseek-coder-6.7b-instruct_temp_1.2
# dataset: mbpp
# filter: True
# number of Test Cases: 503
# number of hypotheses: 50
# MBR base      89.1139240506 %
# MBR->APE base 88.9873417722 %
# APE->MBR base 89.8734177215 %
# ----------------------------------------------------------------------------------------------------
# MBR plus      82.0886075949 %
# MBR->APE plus 81.7088607595 %
# APE->MBR plus 82.8481012658 %

# model: deepseek-coder-6.7b-instruct_temp_1.2
# dataset: mbpp
# filter: False
# number of Test Cases: 503
# number of hypotheses: 50
# MBR base      83.9873417722 %
# MBR->APE base 84.5569620253 %
# APE->MBR base 84.8734177215 %
# ----------------------------------------------------------------------------------------------------
# MBR plus      78.6075949367 %
# MBR->APE plus 78.7974683544 %
# APE->MBR plus 79.3037974684 %

# model: deepseek-coder-6.7b-instruct_temp_1.2
# dataset: humaneval
# filter: True
# number of Test Cases: 503
# number of hypotheses: 50
# MBR base      91.6158536585 %
# MBR->APE base 91.4634146341 %
# APE->MBR base 91.6158536585 %
# ----------------------------------------------------------------------------------------------------
# MBR plus      90.7012195122 %
# MBR->APE plus 90.3963414634 %
# APE->MBR plus 90.3963414634 %

# model: deepseek-coder-6.7b-instruct_temp_1.2
# dataset: humaneval
# filter: False
# number of Test Cases: 503
# number of hypotheses: 50
# MBR base      86.4329268293 %
# MBR->APE base 87.0426829268 %
# APE->MBR base 87.6524390244 %
# ----------------------------------------------------------------------------------------------------
# MBR plus      85.5182926829 %
# MBR->APE plus 85.9756097561 %
# APE->MBR plus 87.0426829268 %

# model: code-llama-13b-instruct_temp_1.6
# dataset: mbpp
# filter: True
# number of Test Cases: 503
# number of hypotheses: 50
# MBR base      84.0506329114 %
# MBR->APE base 83.9240506329 %
# APE->MBR base 84.9367088608 %
# ----------------------------------------------------------------------------------------------------
# MBR plus      74.746835443 %
# MBR->APE plus 74.2405063291 %
# APE->MBR plus 76.2025316456 %

# model: code-llama-13b-instruct_temp_1.6
# dataset: mbpp
# filter: False
# number of Test Cases: 503
# number of hypotheses: 50
# MBR base      73.9240506329 %
# MBR->APE base 74.3670886076 %
# APE->MBR base 74.4303797468 %
# ----------------------------------------------------------------------------------------------------
# MBR plus      67.2151898734 %
# MBR->APE plus 66.9620253165 %
# APE->MBR plus 68.3544303797 %

# model: code-llama-13b-instruct_temp_1.6
# dataset: humaneval
# filter: True
# number of Test Cases: 503
# number of hypotheses: 50
# MBR base      80.487804878 %
# MBR->APE base 80.9451219512 %
# APE->MBR base 81.5548780488 %
# ----------------------------------------------------------------------------------------------------
# MBR plus      76.0670731707 %
# MBR->APE plus 76.5243902439 %
# APE->MBR plus 77.5914634146 %

# model: code-llama-13b-instruct_temp_1.6
# dataset: humaneval
# filter: False
# number of Test Cases: 503
# number of hypotheses: 50
# MBR base      61.2804878049 %
# MBR->APE base 61.737804878 %
# APE->MBR base 62.6524390244 %
# ----------------------------------------------------------------------------------------------------
# MBR plus      59.6036585366 %
# MBR->APE plus 60.0609756098 %
# APE->MBR plus 61.4329268293 %

In [345]:
new_majority_votes = {}
for task in exec_outputs:
    if dataset == "mbpp":
        if task in ["Mbpp/6", "Mbpp/7", "Mbpp/8", "Mbpp/9"]:
            continue
    new_majority_votes[task] = []
    for i in range(len(exec_outputs[task])):
        try:
            new_majority_votes[task].append(exec_outputs[task][i]["base"][0])
        except:
            new_majority_votes[task].append("failed: couldn't collect")

In [346]:
all_ids = {}
for task_id in new_majority_votes:
    all_ids[task_id] = []
    candidate_set = []
    candidates = {i: e for i, e in enumerate(new_majority_votes[task_id]) if not str(e).startswith("failed:")}
    if len(candidates) == 0:
        all_ids[task_id] = [[]]
        continue
    for id in candidates:
        if candidates[id] not in candidate_set:
            candidate_set.append(candidates[id])
            all_ids[task_id].append([id])
        else:
            index = candidate_set.index(candidates[id])
            all_ids[task_id][index].append(id)



In [347]:
final_lists_dict = {}
for task_id in all_ids:
    lengths = [len(e) for e in all_ids[task_id]]
    max_length = max(lengths)
    final_list = []
    for e in all_ids[task_id]:
        if len(e) == max_length:
            final_list = e
    assert type(final_list) == list
    final_lists_dict[task_id] = final_list


In [348]:
base_correct = {}
plus_correct = {}
for task_id in eval_results["eval"]:
    base_correct[task_id] = {}
    plus_correct[task_id] = {}
    if task_id in ["Mbpp/6", "Mbpp/7", "Mbpp/8", "Mbpp/9"]:
        continue
    for i in final_lists_dict[task_id]:
        assert i == int(eval_results["eval"][task_id][i]["solution_id"])
        base_correct[task_id][i] = eval_results["eval"][task_id][i]["base_status"] == "pass"
        plus_correct[task_id][i] = eval_results["eval"][task_id][i]["base_status"] == eval_results["eval"][task_id][i]["plus_status"] == "pass"




In [349]:
with open(f"{work_dir}/{dataset}/{gen_dir}/errors.pkl", "rb") as f:
    errors = pickle.load(f)

In [350]:
base_scores = []
plus_scores = []
for min_, max_ in [(0,50), (50, 100), (100, 150), (150,200)]:
    for task_id in base_correct:
        base_correct_task = []
        plus_correct_task = []
        for i in range(min_, max_):
            if i in base_correct[task_id]:
                # if you want to calculate one with only one unit test without filtering, just comment the next lin
                #if errors[task_id][i]["base"]["status"] == "pass":
                base_correct_task.append(base_correct[task_id][i])
            if i in plus_correct[task_id]:
                #if errors[task_id][i]["base"]["status"] == "pass":
                plus_correct_task.append(plus_correct[task_id][i])
        try:
            base_scores.append(sum(base_correct_task)/len(base_correct_task))
        except:
            base_scores.append(0)
        try:
            plus_scores.append(sum(plus_correct_task)/len(plus_correct_task))
        except:
            plus_scores.append(0)

print(np.mean(base_scores))
print(np.mean(plus_scores))



0.5494685796998906
0.47226502174549423


In [None]:
def get_top_vote(new_majority_votes, task_id):
    counter = list(sorted(Counter(new_majority_votes[task_id]), key=lambda x: x[1], reverse=True))

    counter = [e[0] for e in counter]
    counter = [e for e in counter if not str(e).startswith("failed:")]
    if len(counter) != 0:
        top_vote = counter[0]
    else:
        top_vote = "failed: couldn't collect"
    collect_ids = []
    for i, e in enumerate(new_majority_votes[task_id]):
        if e == top_vote:
            collect_ids.append(i)
    return collect_ids

total_base_correct = {}
total_plus_correct = {}
for task_id in eval_results["eval"]:
    collect_ids = get_top_vote(new_majority_votes, task_id)
    base_correct = {}
    plus_correct = {}
    for i in collect_ids:
        assert i == int(eval_results["eval"][task_id][i]["solution_id"])
        base_correct[i] = eval_results["eval"][task_id][i]["base_status"] == "pass"
        plus_correct[i] = eval_results["eval"][task_id][i]["base_status"] == eval_results["eval"][task_id][i]["plus_status"] == "pass"
    total_base_correct[task_id] = base_correct
    total_plus_correct[task_id] = plus_correct


In [147]:
import numpy as np
# calculate the correlation using two lists
print(np.corrcoef(mbr_base_scores, base_oracle)[1, 0])
print(np.corrcoef(mbr_plus_scores, plus_oracle)[1, 0])
print(np.corrcoef(filtering_mbr_base_scores, base_oracle)[1, 0])
print(np.corrcoef(filtering_mbr_plus_scores, plus_oracle)[1, 0])



0.9743108450645585
0.9971335062771679
0.9982035514862486
0.9992159249137759


In [49]:
# read cbs_f1 and cbs_f3 from pickle files
import json
import pickle
work_dir = "/mnt/scratch-artemis/haausing/code_reranking/evalplus_outputs"
dataset = "mbpp"
#gen_dir = "deepseek-coder-33b-instruct_temp_0.8"
#gen_dir = "deepseek-coder-7b-instruct-v1.5_temp_0.8"
#gen_dir = "deepseek-coder-6.7b-instruct_temp_1.2"
gen_dir = "code-llama-13b-instruct_temp_1.6"
#gen_dir = "code-llama-7b-instruct_temp_1.6"
#debug_gen_dir = gen_dir + "_debug1_not_change_positive"
debug_gen_dir = gen_dir + "_debug1_sd-ut"
#_debug1_not_change_positive
# load exec_outputs§

# load eval_results
with open(f"{work_dir}/{dataset}/{gen_dir}/eval_results.json", "r") as f:
    eval_results = json.load(f)
for task_id in eval_results["eval"]:
    eval_results["eval"][task_id] = sorted(eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))

with open(f"{work_dir}/{dataset}/{gen_dir}/cbs_f1.pkl", "rb") as f:
    cbs_f1 = pickle.load(f)
with open(f"{work_dir}/{dataset}/{gen_dir}/cbs_f3.pkl", "rb") as f:
    cbs_f3 = pickle.load(f)
    
#pop out ["Mbpp/6", "Mbpp/7", "Mbpp/8", "Mbpp/9"]
if dataset == "mbpp":
    for task_id in ["Mbpp/6", "Mbpp/7", "Mbpp/8", "Mbpp/9"]:
        eval_results["eval"].pop(task_id)
        cbs_f1.pop(task_id)
        cbs_f3.pop(task_id)
    
# load ape_eval_results
with open(f"{work_dir}/{dataset}/{debug_gen_dir}/eval_results.json", "r") as f:
    ape_eval_results = json.load(f)
for task_id in ape_eval_results["eval"]:
    ape_eval_results["eval"][task_id] = sorted(ape_eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))

#with open(f"{work_dir}/{dataset}/{debug_gen_dir}/cbs_f1.pkl", "rb") as f:
#    ape_cbs_f1 = pickle.load(f)
#with open(f"{work_dir}/{dataset}/{debug_gen_dir}/cbs_f3.pkl", "rb") as f:
#    ape_cbs_f3 = pickle.load(f)

#pop out ["Mbpp/6", "Mbpp/7", "Mbpp/8", "Mbpp/9"]
#for task_id in ["Mbpp/6", "Mbpp/7", "Mbpp/8", "Mbpp/9"]:
#    ape_cbs_f1.pop(task_id)
#    ape_cbs_f3.pop(task_id)



In [50]:
if dataset == "humaneval":
    with open(f"{work_dir}/{dataset}/{gen_dir}/errors.pkl", "rb") as f:
        errors = pickle.load(f)

In [51]:
from concurrent.futures import ThreadPoolExecutor

def process_task_mbr(task_id, eval_results_dict, utility_f1, utility_f3, max_hyps = 200, start_id = 0, filter = True):
    assert start_id >= 0 and start_id + max_hyps <= len(eval_results_dict["eval"][task_id])
    task_utility_f1 = []
    task_utility_f3 = []
    
    for hyp_id, hyp in enumerate(eval_results_dict["eval"][task_id]):
        if hyp_id >= max_hyps + start_id or hyp_id < start_id:
            continue
        hyp_utility_f1 = []
        hyp_utility_f3 = []
        
        for ref_id, ref in enumerate(eval_results_dict["eval"][task_id]):
            if ref_id >= max_hyps + start_id or ref_id < start_id:
                continue
            if filter:
                if dataset == "mbpp":
                    ### add the filtering baseline
                    if len(hyp["base_details"]) == 0 or hyp["base_details"][0] == 0:
                        hyp_utility_f1.append(0)
                        hyp_utility_f3.append(0)
                        continue
                    ### end of filtering baseline
                elif dataset == "humaneval":
                    ### add the filtering baseline
                    if errors[task_id][hyp_id]["base"]["status"] != "pass":
                        hyp_utility_f1.append(0)
                        hyp_utility_f3.append(0)
                        continue
                    ### end of filtering baseline
            
            util_score_f1 = utility_f1[task_id][hyp_id][ref_id]
            util_score_f3 = utility_f3[task_id][hyp_id][ref_id]
            hyp_utility_f1.append(util_score_f1)
            hyp_utility_f3.append(util_score_f3)
        task_utility_f1.append(np.mean(hyp_utility_f1))
        task_utility_f3.append(np.mean(hyp_utility_f3))
        
    
    # get argmax
    argmax_f1 = np.argmax(task_utility_f1) + start_id
    argmax_f3 = np.argmax(task_utility_f3) + start_id
    assert argmax_f1 == int(eval_results_dict["eval"][task_id][argmax_f1]["solution_id"])
    assert argmax_f3 == int(eval_results_dict["eval"][task_id][argmax_f3]["solution_id"])
    f1_base_status = eval_results_dict["eval"][task_id][argmax_f1]["base_status"]
    f3_base_status = eval_results_dict["eval"][task_id][argmax_f3]["base_status"]
    f1_plus_status = eval_results_dict["eval"][task_id][argmax_f1]["plus_status"]
    f3_plus_status = eval_results_dict["eval"][task_id][argmax_f3]["plus_status"]

    return (int(f1_base_status == "pass"), 
            int(f1_base_status == f1_plus_status == "pass"), 
            int(f3_base_status == "pass"), 
            int(f3_base_status == f3_plus_status == "pass"), 
            argmax_f1, argmax_f3)

def get_mbr_results(eval_results, utility_f1, utility_f3, max_hyps=200, start_id=0, filter=True, workers=20):
    for task_id in eval_results["eval"]:
        eval_results["eval"][task_id] = sorted(eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))
        
    f1_base_results = {}
    f1_plus_results = {}
    f3_base_results = {}
    f3_plus_results = {}
    argmax_f1_results = {}
    argmax_f3_results = {}
    
    def process_single_task(task_id):
        f1_base_pass, f1_plus_pass, f3_base_pass, f3_plus_pass, argmax_f1, argmax_f3 = process_task_mbr(task_id, 
                                                                                                eval_results, 
                                                                                                utility_f1, 
                                                                                                utility_f3, 
                                                                                                max_hyps=max_hyps, 
                                                                                                start_id=start_id, 
                                                                                                filter=filter)
        f1_base_results[task_id] = f1_base_pass
        f1_plus_results[task_id] = f1_plus_pass
        f3_base_results[task_id] = f3_base_pass
        f3_plus_results[task_id] = f3_plus_pass
        argmax_f1_results[task_id] = argmax_f1
        argmax_f3_results[task_id] = argmax_f3

    with ThreadPoolExecutor(max_workers=workers) as executor:
        list(tqdm(executor.map(process_single_task, eval_results["eval"]), total=len(eval_results["eval"])))
        
    for task_id in ape_eval_results["eval"]:
        ape_eval_results["eval"][task_id] = sorted(ape_eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))

    f1_ape_base_results = {}
    f1_ape_plus_results = {}
    f3_ape_base_results = {}
    f3_ape_plus_results = {}
    for task_id in ape_eval_results["eval"]:
        argmax_f1_solution = ape_eval_results["eval"][task_id][argmax_f1_results[task_id]]
        argmax_f3_solution = ape_eval_results["eval"][task_id][argmax_f3_results[task_id]]
        argmax_f1_base_status = argmax_f1_solution["base_status"]
        argmax_f1_plus_status = argmax_f1_solution["plus_status"]
        argmax_f3_base_status = argmax_f3_solution["base_status"]
        argmax_f3_plus_status = argmax_f3_solution["plus_status"]
        assert int(argmax_f1_solution["solution_id"]) == argmax_f1_results[task_id]
        assert int(argmax_f3_solution["solution_id"]) == argmax_f3_results[task_id]
        f1_ape_base_results[task_id] = int(argmax_f1_base_status == "pass")
        f1_ape_plus_results[task_id] = int(argmax_f1_base_status == argmax_f1_plus_status == "pass")
        f3_ape_base_results[task_id] = int(argmax_f3_base_status == "pass")
        f3_ape_plus_results[task_id] = int(argmax_f3_base_status == argmax_f3_plus_status == "pass")
    
    return (f1_base_results, 
            f1_plus_results, 
            f3_base_results, 
            f3_plus_results, 
            f1_ape_base_results, 
            f1_ape_plus_results, 
            f3_ape_base_results, 
            f3_ape_plus_results,
            argmax_f1_results,
            argmax_f3_results)

def get_debugged_mbr_results(ape_eval_results, utility_f1, utility_f3, max_hyps=200, start_id=0, filter=True, workers=20):

    f1_debugged_base_results = {}
    f1_debugged_plus_results = {}
    f3_debugged_base_results = {}
    f3_debugged_plus_results = {}
    argmax_f1_debugged_results = {}
    argmax_f3_debugged_results = {}

    def process_single_task(task_id):
        (f1_debugged_base_result, 
         f1_debugged_plus_result, 
         f3_debugged_base_result, 
         f3_debugged_plus_result, 
         argmax_f1_debugged_result, 
         argmax_f3_debugged_result) = process_task_mbr(task_id, 
                                                    ape_eval_results, 
                                                    utility_f1,
                                                    utility_f3,
                                                    max_hyps=max_hyps, 
                                                    start_id=start_id, 
                                                    filter=filter)
        f1_debugged_base_results[task_id] = f1_debugged_base_result
        f1_debugged_plus_results[task_id] = f1_debugged_plus_result
        f3_debugged_base_results[task_id] = f3_debugged_base_result
        f3_debugged_plus_results[task_id] = f3_debugged_plus_result
        argmax_f1_debugged_results[task_id] = argmax_f1_debugged_result
        argmax_f3_debugged_results[task_id] = argmax_f3_debugged_result

    with ThreadPoolExecutor(max_workers=workers) as executor:
        list(tqdm(executor.map(process_single_task, ape_eval_results["eval"]), total=len(ape_eval_results["eval"])))

    for task_id in ape_eval_results["eval"]:
        ape_eval_results["eval"][task_id] = sorted(ape_eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))

    return (f1_debugged_base_results, 
            f1_debugged_plus_results, 
            f3_debugged_base_results, 
            f3_debugged_plus_results, 
            argmax_f1_debugged_results, 
            argmax_f3_debugged_results)



In [54]:
from tqdm import tqdm
import numpy as np
num_plus_test_cases = 300
max_hyps = 50
filter = False
workers = 40
f1_base_score = []
f1_plus_score = []
f3_base_score = []
f3_plus_score = []
f1_ape_base_score = []
f1_ape_plus_score = []
f3_ape_base_score = []
f3_ape_plus_score = []
f1_debugged_base_score = []
f1_debugged_plus_score = []
f3_debugged_base_score = []
f3_debugged_plus_score = []
for start_id in range(0, 200, max_hyps):
    # if modulo of start_id is 0, skip
    if (start_id + 1/2 * max_hyps) % 50 == 0:
        print(f"skipping start_id: {start_id}")
        continue
    (f1_base_results, 
     f1_plus_results, 
     f3_base_results, 
     f3_plus_results, 
     f1_ape_base_results, 
     f1_ape_plus_results, 
     f3_ape_base_results, 
     f3_ape_plus_results,
     argmax_f1_results,
     argmax_f3_results) = get_mbr_results(eval_results, 
                                        cbs_f1,
                                        cbs_f3,
                                        max_hyps=max_hyps, 
                                        start_id=start_id, 
                                        filter=filter,
                                        workers=workers)
    #(f1_debugged_base_results, 
    # f1_debugged_plus_results, 
    # f3_debugged_base_results, 
    # f3_debugged_plus_results, 
    # argmax_f1_debugged_results, 
    # argmax_f3_debugged_results) = get_debugged_mbr_results(ape_eval_results, 
    #                                                        ape_cbs_f1,
    #                                                        ape_cbs_f3,
    #                                                        max_hyps=max_hyps, 
    #                                                        start_id=start_id, 
    #                                                        filter=filter,
    #                                                        workers=workers)
    
    f1_base_score.append(sum(f1_base_results.values())/len(f1_base_results))
    f1_plus_score.append(sum(f1_plus_results.values())/len(f1_plus_results))
    f3_base_score.append(sum(f3_base_results.values())/len(f3_base_results))
    f3_plus_score.append(sum(f3_plus_results.values())/len(f3_plus_results))
    f1_ape_base_score.append(sum(f1_ape_base_results.values())/len(f1_ape_base_results))
    f1_ape_plus_score.append(sum(f1_ape_plus_results.values())/len(f1_ape_plus_results))
    f3_ape_base_score.append(sum(f3_ape_base_results.values())/len(f3_ape_base_results))
    f3_ape_plus_score.append(sum(f3_ape_plus_results.values())/len(f3_ape_plus_results))
    #f1_debugged_base_score.append(sum(f1_debugged_base_results.values())/len(f1_debugged_base_results))
    #f1_debugged_plus_score.append(sum(f1_debugged_plus_results.values())/len(f1_debugged_plus_results))
    #f3_debugged_base_score.append(sum(f3_debugged_base_results.values())/len(f3_debugged_base_results))
    #f3_debugged_plus_score.append(sum(f3_debugged_plus_results.values())/len(f3_debugged_plus_results))


100%|██████████| 395/395 [00:00<00:00, 153844.38it/s]
100%|██████████| 395/395 [00:00<00:00, 135743.55it/s]
100%|██████████| 395/395 [00:00<00:00, 155257.25it/s]
100%|██████████| 395/395 [00:00<00:00, 354992.52it/s]


In [55]:
round_digits = 5
print("model: {}".format(gen_dir))
print("filter: {}".format(filter))
print("number of hypotheses: {}".format(max_hyps))
print("F1 MBR base     ", np.round(sum(f1_base_score)/len(f1_base_score) * 100, round_digits), "%")
print("F1 MBR->APE base", np.round(sum(f1_ape_base_score)/len(f1_ape_base_score) * 100, round_digits), "%")
#print("F1 APE->MBR base", np.round(sum(f1_debugged_base_score)/len(f1_debugged_base_score) * 100, round_digits), "%")
print("-"*100)
print("F1 MBR plus     ", np.round(sum(f1_plus_score)/len(f1_plus_score) * 100, round_digits), "%")
print("F1 MBR->APE plus", np.round(sum(f1_ape_plus_score)/len(f1_ape_plus_score) * 100, round_digits), "%")
#print("F1 APE->MBR plus", np.round(sum(f1_debugged_plus_score)/len(f1_debugged_plus_score) * 100, round_digits), "%")
print("-"*100)
print("F3 MBR base     ", np.round(sum(f3_base_score)/len(f3_base_score) * 100, round_digits), "%")
print("F3 MBR->APE base", np.round(sum(f3_ape_base_score)/len(f3_ape_base_score) * 100, round_digits), "%")
#print("F3 APE->MBR base", np.round(sum(f3_debugged_base_score)/len(f3_debugged_base_score) * 100, round_digits), "%")
print("-"*100)
print("F3 MBR plus     ", np.round(sum(f3_plus_score)/len(f3_plus_score) * 100, round_digits), "%")
print("F3 MBR->APE plus", np.round(sum(f3_ape_plus_score)/len(f3_ape_plus_score) * 100, round_digits), "%")
#print("F3 APE->MBR plus", np.round(sum(f3_debugged_plus_score)/len(f3_debugged_plus_score) * 100, round_digits), "%")

model: code-llama-13b-instruct_temp_1.6
filter: False
number of hypotheses: 50
F1 MBR base      64.43038 %
F1 MBR->APE base 68.79747 %
----------------------------------------------------------------------------------------------------
F1 MBR plus      53.5443 %
F1 MBR->APE plus 56.89873 %
----------------------------------------------------------------------------------------------------
F3 MBR base      60.25316 %
F3 MBR->APE base 67.21519 %
----------------------------------------------------------------------------------------------------
F3 MBR plus      50.12658 %
F3 MBR->APE plus 56.01266 %


In [44]:
with open(f"{work_dir}/{dataset}/{gen_dir}/code_score.pkl", "rb") as f:
    code_score = pickle.load(f)

In [47]:
from tqdm import tqdm
import numpy as np
num_plus_test_cases = 300
max_hyps = 50
filter = True
workers = 40
f1_base_score = []
f1_plus_score = []
f3_base_score = []
f3_plus_score = []
f1_ape_base_score = []
f1_ape_plus_score = []
f3_ape_base_score = []
f3_ape_plus_score = []
f1_debugged_base_score = []
f1_debugged_plus_score = []
f3_debugged_base_score = []
f3_debugged_plus_score = []
for start_id in range(0, 200, max_hyps):
    # if modulo of start_id is 0, skip
    if (start_id + 1/2 * max_hyps) % 50 == 0:
        print(f"skipping start_id: {start_id}")
        continue
    (f1_base_results, 
     f1_plus_results, 
     f3_base_results, 
     f3_plus_results, 
     f1_ape_base_results, 
     f1_ape_plus_results, 
     f3_ape_base_results, 
     f3_ape_plus_results,
     argmax_f1_results,
     argmax_f3_results) = get_mbr_results(eval_results, 
                                        code_score,
                                        code_score,
                                        max_hyps=max_hyps, 
                                        start_id=start_id, 
                                        filter=filter,
                                        workers=workers)
    #(f1_debugged_base_results, 
    # f1_debugged_plus_results, 
    # f3_debugged_base_results, 
    # f3_debugged_plus_results, 
    # argmax_f1_debugged_results, 
    # argmax_f3_debugged_results) = get_debugged_mbr_results(ape_eval_results, 
    #                                                        ape_cbs_f1,
    #                                                        ape_cbs_f3,
    #                                                        max_hyps=max_hyps, 
    #                                                        start_id=start_id, 
    #                                                        filter=filter,
    #                                                        workers=workers)
    
    f1_base_score.append(sum(f1_base_results.values())/len(f1_base_results))
    f1_plus_score.append(sum(f1_plus_results.values())/len(f1_plus_results))
    f3_base_score.append(sum(f3_base_results.values())/len(f3_base_results))
    f3_plus_score.append(sum(f3_plus_results.values())/len(f3_plus_results))
    f1_ape_base_score.append(sum(f1_ape_base_results.values())/len(f1_ape_base_results))
    f1_ape_plus_score.append(sum(f1_ape_plus_results.values())/len(f1_ape_plus_results))
    f3_ape_base_score.append(sum(f3_ape_base_results.values())/len(f3_ape_base_results))
    f3_ape_plus_score.append(sum(f3_ape_plus_results.values())/len(f3_ape_plus_results))
    #f1_debugged_base_score.append(sum(f1_debugged_base_results.values())/len(f1_debugged_base_results))
    #f1_debugged_plus_score.append(sum(f1_debugged_plus_results.values())/len(f1_debugged_plus_results))
    #f3_debugged_base_score.append(sum(f3_debugged_base_results.values())/len(f3_debugged_base_results))
    #f3_debugged_plus_score.append(sum(f3_debugged_plus_results.values())/len(f3_debugged_plus_results))


100%|██████████| 164/164 [00:00<00:00, 129688.13it/s]
100%|██████████| 164/164 [00:00<00:00, 125706.48it/s]
100%|██████████| 164/164 [00:00<00:00, 282027.82it/s]
100%|██████████| 164/164 [00:00<00:00, 157082.86it/s]


In [48]:
round_digits = 5
print("model: {}".format(gen_dir))
print("filter: {}".format(filter))
print("number of hypotheses: {}".format(max_hyps))
print("CS MBR base     ", np.round(sum(f1_base_score)/len(f1_base_score) * 100, round_digits), "%")
print("CS MBR->APE base", np.round(sum(f1_ape_base_score)/len(f1_ape_base_score) * 100, round_digits), "%")
#print("F1 APE->MBR base", np.round(sum(f1_debugged_base_score)/len(f1_debugged_base_score) * 100, round_digits), "%")
print("-"*100)
print("CS MBR plus     ", np.round(sum(f1_plus_score)/len(f1_plus_score) * 100, round_digits), "%")
print("CS MBR->APE plus", np.round(sum(f1_ape_plus_score)/len(f1_ape_plus_score) * 100, round_digits), "%")
#print("F1 APE->MBR plus", np.round(sum(f1_debugged_plus_score)/len(f1_debugged_plus_score) * 100, round_digits), "%")
print("-"*100)
print("CS MBR base     ", np.round(sum(f3_base_score)/len(f3_base_score) * 100, round_digits), "%")
print("CS MBR->APE base", np.round(sum(f3_ape_base_score)/len(f3_ape_base_score) * 100, round_digits), "%")
#print("F3 APE->MBR base", np.round(sum(f3_debugged_base_score)/len(f3_debugged_base_score) * 100, round_digits), "%")
print("-"*100)
print("CS MBR plus     ", np.round(sum(f3_plus_score)/len(f3_plus_score) * 100, round_digits), "%")
print("CS MBR->APE plus", np.round(sum(f3_ape_plus_score)/len(f3_ape_plus_score) * 100, round_digits), "%")
#print("F3 APE->MBR plus", np.round(sum(f3_debugged_plus_score)/len(f3_debugged_plus_score) * 100, round_digits), "%")

model: code-llama-13b-instruct_temp_1.6
filter: True
number of hypotheses: 50
CS MBR base      76.98171 %
CS MBR->APE base 77.13415 %
----------------------------------------------------------------------------------------------------
CS MBR plus      65.70122 %
CS MBR->APE plus 65.85366 %
----------------------------------------------------------------------------------------------------
CS MBR base      76.98171 %
CS MBR->APE base 77.13415 %
----------------------------------------------------------------------------------------------------
CS MBR plus      65.70122 %
CS MBR->APE plus 65.85366 %
