In [1]:
import argparse
import json
import multiprocessing
import os
import pickle
import threading
import time
from collections import Counter, defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed
from datetime import datetime
from typing import Any, Dict, List, Tuple
from warnings import warn

from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm

import numpy as np

from evalplus.data.utils import CACHE_DIR

from evalplus.data import (
    get_human_eval_plus,
    get_human_eval_plus_hash,
    get_mbpp_plus,
    get_mbpp_plus_hash,
    load_solutions,
)

from evalplus.eval._special_oracle import (
    MBPP_OUTPUT_NOT_NONE_TASKS,
    MBPP_OUTPUT_SET_EQ_TASKS,
    _poly,
)

from evalplus.gen.util import trusted_exec

def is_floats(x) -> bool:
    """Tell whether ``x`` should be treated as floating-point data.

    Returns True for a ``float``, for a list/tuple whose elements are all
    ``float`` (an empty list/tuple therefore qualifies), and for a NumPy
    array whose dtype is float64 or float32. Anything else yields False.
    """
    if isinstance(x, (list, tuple)):
        # A single non-float element disqualifies the whole sequence.
        for item in x:
            if not isinstance(item, float):
                return False
        return True
    if isinstance(x, np.ndarray):
        return x.dtype in (np.float64, np.float32)
    return isinstance(x, float)

def ut_exact_match(
    hyp_ut,
    ref_ut,
    entry_point,
    dataset,
    inp=None,
    atol=0,  # need to change this later
):
    """Compare two unit-test outputs and return 1 on a match, 0 otherwise.

    The baseline is plain ``==``. Task-specific oracles then override it:

    * mbpp / ``are_equivalent`` (Mbpp/164): always counted as a match.
    * mbpp / ``sum_div`` (Mbpp/295): also a match when either output is 0.
    * mbpp tasks in ``MBPP_OUTPUT_SET_EQ_TASKS``: outputs compared as sets.
    * mbpp tasks in ``MBPP_OUTPUT_NOT_NONE_TASKS``: a bool output is first
      replaced by ``output is not None`` before the comparison.
    * humaneval / ``find_zero``: each output is replaced by
      ``_poly(*inp, output) <= atol`` and those booleans are compared.

    If ``atol`` is 0 and either (possibly rewritten) output is float data,
    ``atol`` becomes 1e-6. When the outputs still do not match and ``atol``
    is non-zero, ``np.allclose`` decides.
    """
    matched = hyp_ut == ref_ut

    # ---- dataset-specific oracles ----
    if dataset == "mbpp":
        if entry_point == "are_equivalent":  # Mbpp/164 special oracle
            matched = matched or True
        elif entry_point == "sum_div":  # Mbpp/295 special oracle
            matched = matched or hyp_ut == 0 or ref_ut == 0
        elif entry_point in MBPP_OUTPUT_SET_EQ_TASKS:
            matched = set(hyp_ut) == set(ref_ut)
        elif entry_point in MBPP_OUTPUT_NOT_NONE_TASKS:
            # exp is True  if not None
            #        False if None
            if isinstance(hyp_ut, bool):
                hyp_ut = hyp_ut is not None
            if isinstance(ref_ut, bool):
                ref_ut = ref_ut is not None
            matched = hyp_ut == ref_ut

    if dataset == "humaneval" and entry_point == "find_zero":
        # Both outputs are rebound to booleans here, so the float handling
        # below sees the booleans rather than the raw roots.
        hyp_ut = _poly(*inp, hyp_ut) <= atol
        ref_ut = _poly(*inp, ref_ut) <= atol
        matched = hyp_ut == ref_ut

    # ---- tolerance-based fallback ----
    needs_float_tolerance = atol == 0 and (is_floats(ref_ut) or is_floats(hyp_ut))
    if needs_float_tolerance:
        atol = 1e-6  # enforce atol for float comparison
    if not matched and atol != 0:
        # rtol=1e-07 matches the default of `np.testing.assert_allclose`
        matched = np.allclose(hyp_ut, ref_ut, rtol=1e-07, atol=atol)

    return int(matched)

def get_groundtruth(problems, hashcode, tasks_only_output_not_none):
    """Return the expected outputs for every problem, cached on disk.

    The cache lives at ``CACHE_DIR/<hashcode>.pkl``; if that file exists it is
    unpickled and returned as-is. Otherwise each problem's canonical solution
    (``prompt + canonical_solution``) is run through ``trusted_exec`` on its
    ``base_input`` and ``plus_input``, and the result is pickled to the cache.

    Returns a dict mapping task id to a dict with the keys ``base``,
    ``base_time``, ``plus`` and ``plus_time``.
    """
    cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl")
    if os.path.exists(cache_file):
        with open(cache_file, "rb") as cached:
            return pickle.load(cached)

    os.makedirs(CACHE_DIR, exist_ok=True)
    expected_output = {}
    for task_id, problem in problems.items():
        oracle = {}
        # Same call for both input sets; only the input key and result keys differ.
        for split in ("base", "plus"):
            oracle[split], oracle[f"{split}_time"] = trusted_exec(
                problem["prompt"] + problem["canonical_solution"],
                problem[f"{split}_input"],
                problem["entry_point"],
                record_time=True,
                output_not_none=problem["entry_point"] in tasks_only_output_not_none,
            )
        expected_output[task_id] = oracle

    with open(cache_file, "wb") as cache_out:
        pickle.dump(expected_output, cache_out)

    return expected_output


In [2]:
def mbr_exec(hyp_uts, ref_uts, entry_point, dataset, n_uts, inps=None, granular=False):
    """Score how well a hypothesis' unit-test outputs agree with a reference's.

    Args:
        hyp_uts, ref_uts: outputs looked up by test index ``i``. Membership is
            checked with ``i in ...``, so these are presumably dicts keyed by
            test index -- confirm against the caller.
        entry_point, dataset: forwarded to ``ut_exact_match``.
        n_uts: number of test indices to walk (``range(n_uts)``).
        inps: optional inputs indexed by ``i``, forwarded as ``inp``.
        granular: if True return the fraction of matching tests, otherwise
            1 when every one of the ``n_uts`` tests matched and 0 if not.

    Returns:
        0 as soon as either side holds a ``"failed:"`` string for a compared
        index. Otherwise ``n_matches / n_uts`` (0 when ``n_uts`` is 0) in
        granular mode, or ``int(n_matches == n_uts)``. Indices missing from
        either side are skipped and never count as matches.
    """
    n_matches = 0
    for i in range(n_uts):
        # skip if either hyp_ut or ref_ut is not in the list
        if i not in hyp_uts or i not in ref_uts:
            continue
        hyp_out, ref_out = hyp_uts[i], ref_uts[i]
        # an execution error on either side makes the whole comparison score 0
        if any(
            isinstance(out, str) and out.startswith("failed:")
            for out in (hyp_out, ref_out)
        ):
            return 0
        try:
            inp = inps[i] if inps else None
            n_matches += ut_exact_match(hyp_out, ref_out, entry_point, dataset, inp=inp)
        except Exception:
            # Best effort: an output pair that cannot be compared counts as a
            # mismatch. KeyboardInterrupt/SystemExit are deliberately not
            # caught, so a long-running loop can still be interrupted.
            continue

    if granular:
        return n_matches / n_uts if n_uts else 0
    return int(n_matches == n_uts)

In [97]:
# ---- Run selection ----
# Every artefact of a run is read from {work_dir}/{dataset}/{gen_dir}/.
work_dir = "/mnt/scratch-artemis/haausing/code_reranking/evalplus_outputs"
dataset = "mbpp"
# Generation run to analyse: keep exactly one `gen_dir` line active.
#gen_dir = "deepseek-coder-33b-instruct_temp_0.8"
#gen_dir = "deepseek-coder-7b-instruct-v1.5_temp_1.2"
#gen_dir = "deepseek-coder-6.7b-instruct_temp_1.2"
gen_dir = "code-llama-13b-instruct_temp_1.6"
#gen_dir = "code-llama-7b-instruct_temp_1.6"
#debug_gen_dir = gen_dir + "_debug1_not_change_positive"
debug_gen_dir = gen_dir + "_debug1_sd-ut"
debug_3times_gen_dir = gen_dir + "_debug3_sd-ut"
#_debug1_not_change_positive

# ---- Problems and ground-truth outputs for the chosen dataset ----
if dataset == "mbpp":
    problems = get_mbpp_plus()
    dataset_hash = get_mbpp_plus_hash()
    expected_output = get_groundtruth(
        problems,
        dataset_hash,
        MBPP_OUTPUT_NOT_NONE_TASKS,
    )
elif dataset == "humaneval":
    problems = get_human_eval_plus()
    dataset_hash = get_human_eval_plus_hash()
    expected_output = get_groundtruth(
        problems,
        dataset_hash,
        []
    )
else:
    raise ValueError("Invalid dataset")

# ---- Execution outputs of the generated solutions ----
# NOTE: pickle.load executes arbitrary code; only load files you produced.
with open(f"{work_dir}/{dataset}/{gen_dir}/exec_outputs_v2.pkl", "rb") as f:
    exec_outputs = pickle.load(f)
print("exec_outputs loaded")

# load exec_outputs
#with open(f"{work_dir}/{dataset}/{debug_gen_dir}/exec_outputs_v2.pkl", "rb") as f:
#    exec_outputs_debug = pickle.load(f)
#print("exec_outputs_debug loaded")

# ---- Evaluation results ----
with open(f"{work_dir}/{dataset}/{gen_dir}/eval_results.json", "r") as f:
    eval_results = json.load(f)
# Order each task's solutions by numeric solution_id so that list position
# and solution_id coincide (later cells assert this).
for task_id in eval_results["eval"]:
    eval_results["eval"][task_id] = sorted(eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))

# Drop the tasks excluded from the MBPP analysis. A default is passed to
# pop() so a results file that already lacks one of them does not raise.
if dataset == "mbpp":
    for task_id in ["Mbpp/6", "Mbpp/7", "Mbpp/8", "Mbpp/9"]:
        eval_results["eval"].pop(task_id, None)
print("eval_results loaded")

# load eval_results
#with open(f"{work_dir}/{dataset}/{debug_gen_dir}/eval_results.json", "r") as f:
#    ape_eval_results = json.load(f)
#for task_id in ape_eval_results["eval"]:
#    ape_eval_results["eval"][task_id] = sorted(ape_eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))
#print("ape_eval_results loaded")

# load eval_results
#with open(f"{work_dir}/{dataset}/{debug_3times_gen_dir}/eval_results.json", "r") as f:
#    ape_3times_eval_results = json.load(f)
#for task_id in ape_3times_eval_results["eval"]:
#    ape_3times_eval_results["eval"][task_id] = sorted(ape_3times_eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))

exec_outputs loaded
eval_results loaded


In [98]:
# Log-probabilities for the selected run. Later cells index both objects as
# obj[p_name][solution_index], where p_name is the task id with "/" -> "_".
# NOTE: pickle.load executes arbitrary code; only load files you produced.
run_dir = f"{work_dir}/{dataset}/{gen_dir}"

with open(f"{run_dir}/logprobs.pkl", "rb") as logprobs_file:
    logprobs = pickle.load(logprobs_file)
print("logprobs loaded")

with open(f"{run_dir}/reviewer_logprobs.pkl", "rb") as reviewer_file:
    reviewer_logprobs = pickle.load(reviewer_file)
print("reviewer_logprobs loaded")



logprobs loaded
reviewer_logprobs loaded


In [99]:
#with open(f"{work_dir}/{dataset}/{gen_dir}/llm_score_yn.pkl", "rb") as f:
#    llm_score_yn = pickle.load(f)
#print("llm_score_yn loaded")


In [100]:
# Per-solution execution status; later cells read
# errors[task_id][solution_index]["base"]["status"] and compare it to "pass".
errors_path = f"{work_dir}/{dataset}/{gen_dir}/errors.pkl"
with open(errors_path, "rb") as errors_file:
    errors = pickle.load(errors_file)

In [68]:
"""
for task_id in eval_results["eval"]:
    eval_results["eval"][task_id] = sorted(eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))
for task_id in errors:
    if task_id in ["Mbpp/6", "Mbpp/7", "Mbpp/8", "Mbpp/9"]:
        continue
    for i in range(len(errors[task_id])):
        if errors[task_id][i]["base"]["status"] == "pass":
            assert len(eval_results["eval"][task_id][i]["base_details"])> 0
            assert eval_results["eval"][task_id][i]["base_details"][0] == 1
        else:
            if len(eval_results["eval"][task_id][i]["base_details"]) > 0:
                assert eval_results["eval"][task_id][i]["base_details"][0] == 0, (task_id, i)
"""



'\nfor task_id in eval_results["eval"]:\n    eval_results["eval"][task_id] = sorted(eval_results["eval"][task_id], key=lambda x: int(x["solution_id"]))\nfor task_id in errors:\n    if task_id in ["Mbpp/6", "Mbpp/7", "Mbpp/8", "Mbpp/9"]:\n        continue\n    for i in range(len(errors[task_id])):\n        if errors[task_id][i]["base"]["status"] == "pass":\n            assert len(eval_results["eval"][task_id][i]["base_details"])> 0\n            assert eval_results["eval"][task_id][i]["base_details"][0] == 1\n        else:\n            if len(eval_results["eval"][task_id][i]["base_details"]) > 0:\n                assert eval_results["eval"][task_id][i]["base_details"][0] == 0, (task_id, i)\n'

In [69]:
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    The file is decoded as UTF-8 regardless of the platform locale, and
    blank or whitespace-only lines are skipped rather than being passed to
    ``json.loads``.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                records.append(json.loads(line))
    return records
# One record per line; later cells read the "task_id",
# "generated_code_solution_ids" and "predict_score" fields of each record.
code_score_path = f"{work_dir}/{dataset}/{gen_dir}/code_score.jsonl"
code_scores = load_jsonl(code_score_path)

In [77]:
#for e in eval_results["eval"]["HumanEval/156"]:
#    print(e["solution_id"])
#    print(e["base_status"])
#    print(e["plus_status"])

# Rerank by CodeScore: per task, pick the solution with the highest score
# among solution ids in [min_id, max_id) and report how often the pick passes
# the base tests, and how often it passes both base and plus tests.
include_filtering = True  # when True, only solutions whose errors status is "pass" get a score
max_id = 50
min_id = 0

# Every solution starts at 0.0; solutions never scored below keep that value.
code_score_dict = {}
for task_id in eval_results["eval"]:
    code_score_dict[task_id] = [0.0] * len(eval_results["eval"][task_id])

for elem in code_scores:
    scored_task = elem["task_id"]
    # Tasks absent from eval_results (e.g. ids removed from its "eval" dict)
    # have no slot to fill, so skip them instead of raising KeyError.
    if scored_task not in code_score_dict:
        continue
    p_name = scored_task.replace("/", "_")
    # `solution_id` rather than `id`, so the builtin id() is not shadowed
    # for the remainder of the notebook.
    for solution_id in elem["generated_code_solution_ids"]:
        if solution_id < min_id or solution_id >= max_id:
            continue
        if include_filtering and errors[scored_task][solution_id]["base"]["status"] != "pass":
            continue
        # Alternative scores (keep exactly one assignment active):
        #code_score_dict[scored_task][solution_id] = np.exp(np.mean(logprobs[p_name][solution_id]))
        code_score_dict[scored_task][solution_id] = np.exp(np.log(elem["predict_score"]))
        #code_score_dict[scored_task][solution_id] = np.exp(np.log(elem["predict_score"]) + np.mean(logprobs[p_name][solution_id]))

# Index of the best-scoring solution per task (index 0 if every score is 0.0).
argmax_code_scores = {}
for task_id in code_score_dict:
    argmax_code_scores[task_id] = np.argmax(code_score_dict[task_id])

base_correct = []
plus_correct = []
for task_id in eval_results["eval"]:
    argmax_code_score = argmax_code_scores[task_id]
    # List position must equal solution_id, i.e. eval_results is sorted.
    assert argmax_code_score == int(eval_results["eval"][task_id][argmax_code_score]["solution_id"])
    base_status = eval_results["eval"][task_id][argmax_code_score]["base_status"]
    plus_status = eval_results["eval"][task_id][argmax_code_score]["plus_status"]
    base_correct.append(base_status == "pass")
    plus_correct.append(plus_status == base_status == "pass")
print(np.mean(base_correct))
print(np.mean(plus_correct))

0.774390243902439
0.6646341463414634


In [78]:
# Hand-recorded results of the CodeScore reranking cell: for each setting, the
# first list holds the base accuracies and the second the plus accuracies
# (presumably one value per sampled run -- confirm). Each mean is printed.
codescore_runs = [
    (
        "code-llama-7b-humaneval-codescore-logprob",
        [0.35365853658536583, 0.3353658536585366, 0.35365853658536583, 0.2804878048780488],
        [0.31097560975609756, 0.3170731707317073, 0.31097560975609756, 0.2682926829268293],
    ),
    (
        "code-llama-7b-filtering-humaneval-codescore-logprob",
        [0.7012195121951219, 0.725609756097561, 0.7195121951219512, 0.7012195121951219],
        [0.5975609756097561, 0.6097560975609756, 0.6097560975609756, 0.6219512195121951],
    ),
    (
        "code-llama-13b-humaneval-codescore-logprob",
        [0.34146341463414637, 0.32926829268292684, 0.34146341463414637, 0.42073170731707316],
        [0.31097560975609756, 0.2865853658536585, 0.29878048780487804, 0.3719512195121951],
    ),
    (
        "code-llama-13b-filtering-humaneval-codescore-logprob",
        [0.774390243902439, 0.75, 0.774390243902439, 0.7682926829268293],
        [0.6646341463414634, 0.6402439024390244, 0.6585365853658537, 0.6646341463414634],
    ),
    (
        "deepseek-humaneval-codescore-logprob",
        [0.6707317073170732, 0.7012195121951219, 0.7439024390243902, 0.6890243902439024],
        [0.6341463414634146, 0.6341463414634146, 0.6585365853658537, 0.6158536585365854],
    ),
    (
        "deepseek-filtering-humaneval-codescore-logprob",
        [0.8841463414634146, 0.8597560975609756, 0.8841463414634146, 0.8780487804878049],
        [0.8048780487804879, 0.774390243902439, 0.7865853658536586, 0.7682926829268293],
    ),
]

for run_label, base_scores, plus_scores in codescore_runs:
    print(run_label)
    print(np.mean(base_scores))
    print(np.mean(plus_scores))


code-llama-7b-humaneval-codescore-logprob
0.33079268292682923
0.3018292682926829
code-llama-7b-filtering-humaneval-codescore-logprob
0.711890243902439
0.6097560975609756
code-llama-13b-humaneval-codescore-logprob
0.35823170731707316
0.31707317073170727
code-llama-13b-filtering-humaneval-codescore-logprob
0.7667682926829269
0.6570121951219512
deepseek-humaneval-codescore-logprob
0.701219512195122
0.635670731707317
deepseek-filtering-humaneval-codescore-logprob
0.8765243902439024
0.7835365853658538


In [54]:
max_id = 50
min_id = 0

# Baseline for filtering: among the solutions of a task that survive the
# filter (errors[...]["base"]["status"] == "pass"), which fraction also passes
# the base tests, and which fraction passes both base and plus tests?
# (Alternative filter signal tried earlier: eval_results "base_details"[0] == 1.)
filtering_dict = {}            # 1 where the solution survives the filter
base_true_positive_rate = {}   # 1 where it survives and base_status is "pass"
plus_true_positive_rate = {}   # 1 where, in addition, plus_status is "pass"

for task_id, solutions in eval_results["eval"].items():
    n_solutions = len(solutions)
    filtering_dict[task_id] = [0.0] * n_solutions
    base_true_positive_rate[task_id] = [0.0] * n_solutions
    plus_true_positive_rate[task_id] = [0.0] * n_solutions

    # Only solution indices inside [min_id, max_id) are considered. The loop
    # variable is not called `id`, so the builtin id() stays usable.
    for solution_idx in range(max(min_id, 0), min(max_id, n_solutions)):
        if errors[task_id][solution_idx]["base"]["status"] != "pass":
            continue
        filtering_dict[task_id][solution_idx] = 1
        if solutions[solution_idx]["base_status"] == "pass":
            base_true_positive_rate[task_id][solution_idx] = 1
            if solutions[solution_idx]["plus_status"] == "pass":
                plus_true_positive_rate[task_id][solution_idx] = 1

base_final_scores = []
plus_final_scores = []
for task_id in base_true_positive_rate:
    n_kept = sum(filtering_dict[task_id])
    # A task where nothing survives the filter scores 0; checking the
    # denominator replaces the former bare `except:` around the division.
    base_final_scores.append(sum(base_true_positive_rate[task_id]) / n_kept if n_kept else 0)
    plus_final_scores.append(sum(plus_true_positive_rate[task_id]) / n_kept if n_kept else 0)

print(np.mean(base_final_scores))
print(np.mean(plus_final_scores))


0.7336785268806619
0.6258826617542953


In [55]:
#humaneval filtering baseline
# Hand-recorded outputs of the filtering-baseline cell; per model the first
# list holds base scores and the second plus scores. Each mean is printed.
humaneval_filtering_runs = [
    (
        "code-llama-7b-humaneval",
        [0.693160186459372, 0.6997735981934243, 0.72117062444169, 0.6862477594452113],
        [0.5908040389033249, 0.6011116883717889, 0.6137140602360607, 0.5822434593521757],
    ),
    (
        "code-llama-13b-humaneval",
        [0.7662873506194061, 0.7514377984534123, 0.7629200531121093, 0.7336785268806619],
        [0.643287595977476, 0.631051116450025, 0.6365402095034088, 0.6258826617542953],
    ),
    (
        "deepseek-humaneval",
        [0.8820033571614998, 0.8883587387079996, 0.881986977881884, 0.882445855555487],
        [0.7827359898849421, 0.7918541518655567, 0.7854197999920142, 0.7892883253685077],
    ),
]

for run_label, base_scores, plus_scores in humaneval_filtering_runs:
    print(run_label)
    print(np.mean(base_scores))
    print(np.mean(plus_scores))


code-llama-7b-humaneval
0.7000880421349245
0.5969683117158375
code-llama-13b-humaneval
0.7535809322663974
0.6341903959213012
deepseek-humaneval
0.8836987323267176
0.7873245667777552


In [19]:
#mbpp filtering baseline
# Hand-recorded outputs of the filtering-baseline cell; per model the first
# list holds base scores and the second plus scores. Each mean is printed.
mbpp_filtering_runs = [
    (
        "code-llama-7b-mbpp",
        [0.7805309287120359, 0.7724022207392057, 0.7731006556120122, 0.7706827437842202],
        [0.6104730168236094, 0.6064812230767918, 0.6092237984535773, 0.6064054074964162],
    ),
    (
        "code-llama-13b-mbpp",
        [0.8138750771906944, 0.8217007921648252, 0.8200892856053096, 0.820794467955619],
        [0.6479763222924249, 0.6527857093397472, 0.6456394161083959, 0.6412623775075433],
    ),
    (
        "deepseek-mbpp",
        [0.8758127616516805, 0.8774858296693084, 0.8713415329802104, 0.8699898622614464],
        [0.7300904709039442, 0.7236647749261638, 0.7207821623236315, 0.7297056595650726],
    ),
]

for run_label, base_scores, plus_scores in mbpp_filtering_runs:
    print(run_label)
    print(np.mean(base_scores))
    print(np.mean(plus_scores))

code-llama-7b-mbpp
0.7741791372118685
0.6081458614625987
code-llama-13b-mbpp
0.819114905729112
0.6469159563120278
deepseek-mbpp
0.8736574966406614
0.726060766929703


In [87]:
# Put each task's solutions in ascending numeric solution_id order, so that a
# solution's list position equals its id (other cells assert this).
for task_id, unsorted_solutions in eval_results["eval"].items():
    eval_results["eval"][task_id] = sorted(
        unsorted_solutions, key=lambda entry: int(entry["solution_id"])
    )

In [120]:
import numpy as np

# Rerank by log-probability: per task, score the solutions whose index lies in
# [min_id, max_id), pick the best one, and report how often that pick passes
# the base tests and how often it passes both base and plus tests.
include_filtering = True  # score 0 for solutions whose errors status is not "pass"
min_id = 0
max_id = min_id + 50
base_correct = []
plus_correct = []
with open(f"{work_dir}/{dataset}/{gen_dir}/errors.pkl", "rb") as f:
    errors = pickle.load(f)

for task_id, solutions in eval_results["eval"].items():
    p_name = task_id.replace("/", "_")
    mean_logprobs = []
    for i, solution in enumerate(solutions):
        if not (min_id <= i < max_id):
            continue
        assert i == int(solution["solution_id"])
        if include_filtering:
            passes = errors[task_id][i]["base"]["status"] == "pass"
            if not passes:
                mean_logprobs.append(0)
                continue
        # Alternative scores (keep exactly one append active):
        #mean_logprobs.append(np.exp(np.mean(reviewer_logprobs[p_name][i])))
        #mean_logprobs.append(np.exp(np.mean(logprobs[p_name][i])))
        mean_logprobs.append(
            np.exp(np.mean(reviewer_logprobs[p_name][i])) + np.exp(np.mean(logprobs[p_name][i]))
        )

    # mean_logprobs only covers the window, so shift back to a solution index.
    argmax_logprobs = np.argmax(mean_logprobs) + min_id
    chosen = solutions[argmax_logprobs]
    # the chosen list position must coincide with the solution_id
    assert argmax_logprobs == int(chosen["solution_id"])
    base_status = chosen["base_status"]
    plus_status = chosen["plus_status"]
    base_correct.append(base_status == "pass")
    plus_correct.append(plus_status == base_status == "pass")

print(np.mean(base_correct))
print(np.mean(plus_correct))



0.830379746835443
0.6658227848101266


In [121]:
### cl13b-mbpp
logprob_base = [0.5670886075949367, 0.589873417721519, 0.5924050632911393, 0.5670886075949367]
logprob_plus = [0.4810126582278481, 0.46582278481012657, 0.4835443037974684, 0.45569620253164556]
coder_reviewer_logprob_base = [0.6050632911392405, 0.6151898734177215, 0.6, 0.5949367088607594]
coder_reviewer_logprob_plus = [0.5063291139240507, 0.5063291139240507, 0.49873417721518987, 0.46835443037974683]
filtering_logprob_base = [0.830379746835443, 0.8253164556962025, 0.8278481012658228, 0.8278481012658228]
filtering_logprob_plus = [0.6506329113924051, 0.640506329113924, 0.6455696202531646, 0.6253164556962025]
filtering_base = [0.8138750771906944, 0.8217007921648252, 0.8200892856053096, 0.820794467955619]
filtering_plus = [0.6479763222924249, 0.6527857093397472, 0.6456394161083959, 0.6412623775075433]
filtering_coder_reviewer_logprob_base = [0.830379746835443, 0.830379746835443, 0.8227848101265823, 0.8253164556962025]
filtering_coder_reviewer_logprob_plus = [0.6658227848101266, 0.6632911392405063, 0.6481012658227848, 0.6227848101265823]

# One label -> scores mapping, so names and values cannot drift out of step.
results = {
    "logprob_base": logprob_base,
    "logprob_plus": logprob_plus,
    "coder_reviewer_logprob_base": coder_reviewer_logprob_base,
    "coder_reviewer_logprob_plus": coder_reviewer_logprob_plus,
    "filtering_logprob_base": filtering_logprob_base,
    "filtering_logprob_plus": filtering_logprob_plus,
    "filtering_base": filtering_base,
    "filtering_plus": filtering_plus,
    "filtering_coder_reviewer_logprob_base": filtering_coder_reviewer_logprob_base,
    "filtering_coder_reviewer_logprob_plus": filtering_coder_reviewer_logprob_plus,
}
list_names = list(results)
lists = list(results.values())

# Print each label followed by its mean as a percentage.
for name, lst in results.items():
    print(name)
    print(np.mean(lst) * 100)

logprob_base
57.91139240506329
logprob_plus
47.151898734177216
coder_reviewer_logprob_base
60.379746835443036
coder_reviewer_logprob_plus
49.493670886075954
filtering_logprob_base
82.78481012658227
filtering_logprob_plus
64.05063291139241
filtering_base
81.9114905729112
filtering_plus
64.69159563120279
filtering_coder_reviewer_logprob_base
82.72151898734177
filtering_coder_reviewer_logprob_plus
64.99999999999999


In [96]:
### cl13b-humaneval
logprob_base = [0.5548780487804879, 0.5304878048780488, 0.5426829268292683, 0.5304878048780488]
logprob_plus = [0.4817073170731707, 0.4451219512195122, 0.4695121951219512, 0.4268292682926829]
coder_reviewer_logprob_base = [0.5182926829268293, 0.5060975609756098, 0.49390243902439024, 0.524390243902439]
coder_reviewer_logprob_plus = [0.4634146341463415, 0.4268292682926829, 0.4329268292682927, 0.4268292682926829]
filtering_logprob_base = [0.8048780487804879, 0.75, 0.7926829268292683, 0.75]
filtering_logprob_plus = [0.6829268292682927, 0.6280487804878049, 0.6829268292682927, 0.6341463414634146]
filtering_base = [0.7662873506194061, 0.7514377984534123, 0.7629200531121093, 0.7336785268806619]
filtering_plus = [0.643287595977476, 0.631051116450025, 0.6365402095034088, 0.6258826617542953]
filtering_coder_reviewer_logprob_base = [0.774390243902439, 0.7378048780487805, 0.7987804878048781, 0.7621951219512195]
filtering_coder_reviewer_logprob_plus = [0.6646341463414634, 0.6280487804878049, 0.6890243902439024, 0.6463414634146342]

# One label -> scores mapping, so names and values cannot drift out of step.
results = {
    "logprob_base": logprob_base,
    "logprob_plus": logprob_plus,
    "coder_reviewer_logprob_base": coder_reviewer_logprob_base,
    "coder_reviewer_logprob_plus": coder_reviewer_logprob_plus,
    "filtering_logprob_base": filtering_logprob_base,
    "filtering_logprob_plus": filtering_logprob_plus,
    "filtering_base": filtering_base,
    "filtering_plus": filtering_plus,
    "filtering_coder_reviewer_logprob_base": filtering_coder_reviewer_logprob_base,
    "filtering_coder_reviewer_logprob_plus": filtering_coder_reviewer_logprob_plus,
}
list_names = list(results)
lists = list(results.values())

# Print each label followed by its mean as a percentage.
for name, lst in results.items():
    print(name)
    print(np.mean(lst) * 100)

logprob_base
53.963414634146346
logprob_plus
45.579268292682926
coder_reviewer_logprob_base
51.06707317073172
coder_reviewer_logprob_plus
43.75
filtering_logprob_base
77.4390243902439
filtering_logprob_plus
65.70121951219512
filtering_base
75.35809322663974
filtering_plus
63.41903959213012
filtering_coder_reviewer_logprob_base
76.82926829268293
filtering_coder_reviewer_logprob_plus
65.70121951219512


In [60]:
### deepseek-humaneval
logprob_base = [0.8170731707317073, 0.7987804878048781, 0.8414634146341463, 0.7987804878048781]
logprob_plus = [0.7134146341463414, 0.7134146341463414, 0.75, 0.6951219512195121]
coder_reviewer_logprob_base = [0.7865853658536586, 0.7804878048780488, 0.8109756097560976, 0.7682926829268293]
coder_reviewer_logprob_plus = [0.7012195121951219, 0.7012195121951219, 0.7317073170731707, 0.676829268292683]
filtering_logprob_base = [0.9146341463414634, 0.8963414634146342, 0.9085365853658537, 0.9024390243902439]
filtering_logprob_plus = [0.7926829268292683, 0.7865853658536586, 0.7987804878048781, 0.7865853658536586]
filtering_base = [0.8820033571614998, 0.8883587387079996, 0.881986977881884, 0.882445855555487]
filtering_plus = [0.7827359898849421, 0.7918541518655567, 0.7854197999920142, 0.7892883253685077]
filtering_coder_reviewer_logprob_base = [0.8963414634146342, 0.8963414634146342, 0.9024390243902439, 0.8841463414634146]
filtering_coder_reviewer_logprob_plus = [0.7926829268292683, 0.8109756097560976, 0.7987804878048781, 0.7804878048780488]

# One label -> scores mapping, so names and values cannot drift out of step.
results = {
    "logprob_base": logprob_base,
    "logprob_plus": logprob_plus,
    "coder_reviewer_logprob_base": coder_reviewer_logprob_base,
    "coder_reviewer_logprob_plus": coder_reviewer_logprob_plus,
    "filtering_logprob_base": filtering_logprob_base,
    "filtering_logprob_plus": filtering_logprob_plus,
    "filtering_base": filtering_base,
    "filtering_plus": filtering_plus,
    "filtering_coder_reviewer_logprob_base": filtering_coder_reviewer_logprob_base,
    "filtering_coder_reviewer_logprob_plus": filtering_coder_reviewer_logprob_plus,
}
list_names = list(results)
lists = list(results.values())

# Print each label followed by its mean as a percentage.
for name, lst in results.items():
    print(name)
    print(np.mean(lst) * 100)

logprob_base
81.40243902439023
logprob_plus
71.79878048780488
coder_reviewer_logprob_base
78.65853658536585
coder_reviewer_logprob_plus
70.27439024390243
filtering_logprob_base
90.54878048780488
filtering_logprob_plus
79.11585365853658
filtering_base
88.36987323267176
filtering_plus
78.73245667777552
filtering_coder_reviewer_logprob_base
89.48170731707317
filtering_coder_reviewer_logprob_plus
79.57317073170731


In [62]:
### deepseek-mbpp
logprob_base = [0.6708860759493671, 0.6886075949367089, 0.7189873417721518, 0.7240506329113924]
logprob_plus = [0.5772151898734177, 0.5873417721518988, 0.5949367088607594, 0.5974683544303797]
coder_reviewer_logprob_base = [0.7493670886075949, 0.769620253164557, 0.7645569620253164, 0.7620253164556962]
coder_reviewer_logprob_plus = [0.6329113924050633, 0.6531645569620254, 0.6227848101265823, 0.6506329113924051]
filtering_logprob_base = [0.8759493670886076, 0.8936708860759494, 0.8911392405063291, 0.8810126582278481]
filtering_logprob_plus = [0.7316455696202532, 0.7240506329113924, 0.7240506329113924, 0.7341772151898734]
filtering_base = [0.8758127616516805, 0.8774858296693084, 0.8713415329802104, 0.8699898622614464]
filtering_plus = [0.7300904709039442, 0.7236647749261638, 0.7207821623236315, 0.7297056595650726]
filtering_coder_reviewer_logprob_base = [0.8835443037974684, 0.8936708860759494, 0.8784810126582279, 0.8734177215189873]
filtering_coder_reviewer_logprob_plus = [0.739240506329114, 0.7291139240506329, 0.7113924050632912, 0.7316455696202532]

# One label -> scores mapping, so names and values cannot drift out of step.
results = {
    "logprob_base": logprob_base,
    "logprob_plus": logprob_plus,
    "coder_reviewer_logprob_base": coder_reviewer_logprob_base,
    "coder_reviewer_logprob_plus": coder_reviewer_logprob_plus,
    "filtering_logprob_base": filtering_logprob_base,
    "filtering_logprob_plus": filtering_logprob_plus,
    "filtering_base": filtering_base,
    "filtering_plus": filtering_plus,
    "filtering_coder_reviewer_logprob_base": filtering_coder_reviewer_logprob_base,
    "filtering_coder_reviewer_logprob_plus": filtering_coder_reviewer_logprob_plus,
}
list_names = list(results)
lists = list(results.values())

# Print each label followed by its mean as a percentage.
for name, lst in results.items():
    print(name)
    print(np.mean(lst) * 100)

logprob_base
70.0632911392405
logprob_plus
58.924050632911396
coder_reviewer_logprob_base
76.13924050632912
coder_reviewer_logprob_plus
63.98734177215191
filtering_logprob_base
88.54430379746836
filtering_logprob_plus
72.84810126582279
filtering_base
87.36574966406614
filtering_plus
72.6060766929703
filtering_coder_reviewer_logprob_base
88.22784810126582
filtering_coder_reviewer_logprob_plus
72.78481012658227


In [18]:
### code-llama-7b-humaneval
logprob_base = [0.43902439024390244, 0.4146341463414634, 0.4573170731707317, 0.49390243902439024]
logprob_plus = [0.3780487804878049, 0.36585365853658536, 0.38414634146341464, 0.42073170731707316]
coder_reviewer_logprob_base = [0.4573170731707317, 0.4451219512195122, 0.4634146341463415, 0.47560975609756095]
coder_reviewer_logprob_plus = [0.40853658536585363, 0.4024390243902439, 0.4024390243902439, 0.40853658536585363]
filtering_logprob_base = [0.6951219512195121, 0.7073170731707317, 0.725609756097561, 0.7134146341463414]
filtering_logprob_plus = [0.5975609756097561, 0.6097560975609756, 0.6036585365853658, 0.6036585365853658]
filtering_base = [0.693160186459372, 0.6997735981934243, 0.72117062444169, 0.6862477594452113]
filtering_plus = [0.5908040389033249, 0.6011116883717889, 0.6137140602360607, 0.5822434593521757]
filtering_coder_reviewer_logprob_base = [0.7012195121951219, 0.6951219512195121, 0.7439024390243902, 0.7134146341463414]
filtering_coder_reviewer_logprob_plus = [0.6097560975609756, 0.5975609756097561, 0.6402439024390244, 0.6036585365853658]

# One label -> scores mapping, so names and values cannot drift out of step.
results = {
    "logprob_base": logprob_base,
    "logprob_plus": logprob_plus,
    "coder_reviewer_logprob_base": coder_reviewer_logprob_base,
    "coder_reviewer_logprob_plus": coder_reviewer_logprob_plus,
    "filtering_logprob_base": filtering_logprob_base,
    "filtering_logprob_plus": filtering_logprob_plus,
    "filtering_base": filtering_base,
    "filtering_plus": filtering_plus,
    "filtering_coder_reviewer_logprob_base": filtering_coder_reviewer_logprob_base,
    "filtering_coder_reviewer_logprob_plus": filtering_coder_reviewer_logprob_plus,
}
list_names = list(results)
lists = list(results.values())

# Print each label followed by its mean as a percentage.
for name, lst in results.items():
    print(name)
    print(np.mean(lst) * 100)

logprob_base
45.1219512195122
logprob_plus
38.71951219512194
coder_reviewer_logprob_base
46.03658536585366
coder_reviewer_logprob_plus
40.548780487804876
filtering_logprob_base
71.03658536585365
filtering_logprob_plus
60.36585365853659
filtering_base
70.00880421349245
filtering_plus
59.69683117158375
filtering_coder_reviewer_logprob_base
71.34146341463415
filtering_coder_reviewer_logprob_plus
61.28048780487805


In [44]:
### code-llama-7b-mbpp
logprob_base = [0.5164556962025316, 0.5164556962025316, 0.5594936708860759, 0.5468354430379747]
logprob_plus = [0.4151898734177215, 0.4177215189873418, 0.44050632911392407, 0.4253164556962025]
coder_reviewer_logprob_base = [0.5139240506329114, 0.5164556962025316, 0.5392405063291139, 0.5544303797468354]
coder_reviewer_logprob_plus = [0.4177215189873418, 0.4151898734177215, 0.4430379746835443, 0.4481012658227848]
filtering_logprob_base = [0.7949367088607595, 0.7822784810126582, 0.7746835443037975, 0.7721518987341772]
filtering_logprob_plus = [0.6050632911392405, 0.5848101265822785, 0.6, 0.5974683544303797]
filtering_base = [0.7805309287120359, 0.7724022207392057, 0.7731006556120122, 0.7706827437842202]
filtering_plus = [0.6104730168236094, 0.6064812230767918, 0.6092237984535773, 0.6064054074964162]
filtering_coder_reviewer_logprob_base = [0.7873417721518987, 0.779746835443038, 0.789873417721519, 0.7822784810126582]
filtering_coder_reviewer_logprob_plus = [0.6075949367088608, 0.589873417721519, 0.610126582278481, 0.6]

# One label -> scores mapping, so names and values cannot drift out of step.
results = {
    "logprob_base": logprob_base,
    "logprob_plus": logprob_plus,
    "coder_reviewer_logprob_base": coder_reviewer_logprob_base,
    "coder_reviewer_logprob_plus": coder_reviewer_logprob_plus,
    "filtering_logprob_base": filtering_logprob_base,
    "filtering_logprob_plus": filtering_logprob_plus,
    "filtering_base": filtering_base,
    "filtering_plus": filtering_plus,
    "filtering_coder_reviewer_logprob_base": filtering_coder_reviewer_logprob_base,
    "filtering_coder_reviewer_logprob_plus": filtering_coder_reviewer_logprob_plus,
}
list_names = list(results)
lists = list(results.values())

# Print each label followed by its mean as a percentage.
for name, lst in results.items():
    print(name)
    print(np.mean(lst) * 100)

logprob_base
53.48101265822785
logprob_plus
42.46835443037975
coder_reviewer_logprob_base
53.101265822784804
coder_reviewer_logprob_plus
43.101265822784804
filtering_logprob_base
78.10126582278481
filtering_logprob_plus
59.683544303797476
filtering_base
77.41791372118685
filtering_plus
60.81458614625988
filtering_coder_reviewer_logprob_base
78.48101265822784
filtering_coder_reviewer_logprob_plus
60.18987341772153


In [None]:
# Inspect one task: for every solution print the sigmoid of (pos - neg) from
# the LLM yes/no scores, followed by that solution's base-test status.
# NOTE: `llm_score_yn` must already exist in the kernel; the code that loads
# it is commented out earlier in this notebook.
task_id = "Mbpp/102"
p_name = task_id.replace("/", "_")
for solution in eval_results["eval"][task_id]:
    solution_id = int(solution["solution_id"])
    pos_score, neg_score = llm_score_yn[p_name][solution_id]["pos"], llm_score_yn[p_name][solution_id]["neg"]
    # 1 / (1 + exp(neg - pos)) == sigmoid(pos - neg)
    print(1/(np.exp(neg_score-pos_score)+1))
    # Status of the solution being scored (previously always solution 0).
    print(solution["base_status"])


In [112]:
import numpy as np

# Rerank by the LLM yes/no score: per task, pick the solution with the highest
# -log(1 + exp(neg - pos)) and record whether that pick passes the base tests
# and whether it passes both base and plus tests.
base_correct = []
plus_correct = []
for task_id, solutions in eval_results["eval"].items():
    p_name = task_id.replace("/", "_")
    mean_logprobs = []  # collected alongside, but not used for the selection below
    llm_scores = []
    for i, solution in enumerate(solutions):
        # list position must equal solution_id
        assert i == int(solution["solution_id"])
        yn_scores = llm_score_yn[p_name][i]
        pos_score, neg_score = yn_scores["pos"], yn_scores["neg"]
        llm_score = - np.log( np.exp( neg_score - pos_score ) + 1)
        llm_scores.append(llm_score)
        mean_logprobs.append(np.mean(logprobs[p_name][i]))

    # index of the best LLM score
    argmax_llm = np.argmax(np.array(llm_scores))
    chosen = solutions[argmax_llm]
    base_status = chosen["base_status"]
    plus_status = chosen["plus_status"]
    base_correct.append(base_status == "pass")
    plus_correct.append(plus_status == base_status == "pass")

print(np.mean(base_correct))
print(np.mean(plus_correct))


0.46835443037974683
0.41012658227848103


In [94]:
# Scratch peek at one raw "pos" score. NOTE(review): `p_name` and `i` are not
# set in this cell; they are whatever an earlier loop left in the kernel, so
# the value shown depends on execution order.
llm_score_yn[p_name][i]["pos"]

-2.629206418991089