In [4]:
import math
from collections import Counter, defaultdict, OrderedDict, ChainMap
import re
import numpy as np
from decimal import Decimal
import math



def transform_class_string(text):
    """
    Transforms strings that represent class types in the format '<class 'type'>' into 'type'.
    For example, "<class 'int'>" becomes "int".

    Only names made of word characters are rewritten; a dotted name such as
    "<class 'numpy.int64'>" is left untouched.
    """
    return re.sub(r"<class '(\w+)'>", r'\1', text)


def _normalise_repr(text):
    """Rewrite a printed result dict so that it becomes an evaluable Python expression."""
    # <class 'int'> -> int
    text = transform_class_string(text)
    # Bare array( -> np.array(. The look-behind leaves an existing "np.array(" (or any
    # other name merely ending in "array") alone instead of producing "np.np.array(".
    text = re.sub(r"(?<![\w.])array\(", "np.array(", text)
    if "np.array" in text:
        # dtype=int64 -> dtype=np.int64; quoted dtypes and already-prefixed ones are kept.
        text = re.sub(r"dtype=(?!np\.|['\"])", "dtype=np.", text)
        text = text.replace("dtype=np.object)", ")")
    # Object reprs such as <foo object at 0x...> are not valid syntax; turn them into
    # string literals unless they already sit directly after a quote.
    text = re.sub(r'(?<![\'"])(<.*?object at.*?>)', r'"\1"', text)
    text = re.sub(r'(?<![\'"])(<.*?built-in function.*?>)', r'"\1"', text)
    return text


def get_info_and_exec(sample: str) -> dict:
    """
    Parse one record of the form "<task_id> <solution_id> <base_plus> {...}".

    The header is split on single spaces and the "{...}" payload is evaluated back
    into a Python object.

    Names that show up unquoted in printed results (nan, inf, sqrt, I) are resolved
    through the evaluation namespace rather than by editing the text, so string
    values that merely contain those letters (e.g. 'banana') are left intact and
    inf/-inf are restored at any nesting depth.

    Returns:
        dict with keys "task_id" (str), "solution_id" (int), "base_plus" (str) and
        "execution" (the evaluated payload, or None when it cannot be evaluated).

    Raises:
        ValueError: if the header has fewer than three fields or solution_id is not
            an integer.
    """
    try:
        # replace the dictionary object that cannot be printed out with a failure
        sample = re.sub(r'(<.*?Proxy object.*?>)', r'{"failed": "\1"}', sample)
        info = sample[:sample.index("{")].strip()
        task_id, solution_id, base_plus = info.split(" ")
    except ValueError:
        # No "{" in the record, or the header is not exactly three fields: keep the
        # first three tokens and treat the payload as empty.
        task_id, solution_id, base_plus = sample.strip().split(" ")[:3]
        sample = "{}\n"

    namespace = {
        'np': np, 'Decimal': Decimal, 'OrderedDict': OrderedDict, 'ChainMap': ChainMap, 'math': math,
        'Counter': Counter, 'defaultdict': defaultdict,
        'nan': math.nan, 'inf': math.inf, 'sqrt': math.sqrt, 'I': 1j,
    }
    try:
        source = _normalise_repr(sample[sample.index("{"):].strip())
        # SECURITY NOTE: eval() on file contents is not a sandbox, even with builtins
        # removed. Only run this on execution logs you produced yourself.
        try:
            execution = eval(source, {"__builtins__": {}}, dict(namespace))
        except Exception:
            # Retry with builtins available: payloads may mention int, set, range, ...
            execution = eval(source, dict(namespace))
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C still interrupts a long run.
        execution = None

    return {"task_id": task_id, "solution_id": int(solution_id), "base_plus": base_plus, "execution": execution}



In [5]:
exec_outs_dir = "/mnt/scratch-artemis/haausing/code_reranking/evalplus_outputs/mbpp/deepseek-coder-33b-instruct_temp_0.8/exec_outs_base_plus.txt"
# Read the raw dump, dropping lines that consist of a newline only.
with open(exec_outs_dir, "r") as f:
    exec_outs = [line for line in f if line != "\n"]
# Records are delimited by the "Mbpp/" prefix; whatever precedes the first one is discarded.
exec_outs = "".join(exec_outs).split("Mbpp/")[1:]
# Only the first line of the final record is kept.
last_record = exec_outs[-1]
exec_outs[-1] = last_record.split("\n")[0]

In [6]:
from tqdm import tqdm
# Parse every raw record. Records whose payload evaluates to a non-empty object go to
# new_exec_outs; records with a missing/empty payload are listed in failed_execs.
new_exec_outs = []
failed_execs = []
# The loop variable is `raw_record` rather than `exec`, which would shadow the builtin.
for raw_record in tqdm(exec_outs):
    try:
        elem_exec = get_info_and_exec(raw_record)
        if elem_exec["execution"] is not None and elem_exec["execution"] != {}:
            new_exec_outs.append(elem_exec)
        else:
            failed_execs.append({"task_id": elem_exec["task_id"], "solution_id": elem_exec["solution_id"], "base_plus": elem_exec["base_plus"]})
    except Exception:
        # `except Exception` instead of a bare except so that Ctrl-C still stops the loop.
        # Records that cannot be parsed at all are shown for manual inspection.
        print(raw_record)

# sort by task_id, solution_id, base_plus
new_exec_outs = sorted(new_exec_outs, key = lambda x: (int(x["task_id"]), x["solution_id"], x["base_plus"]))



  8%|▊         | 3309/39900 [00:01<00:14, 2485.55it/s]


758 19 plus {0: {}, 1: {(): 1}, 2: {(1, 2): 3, (2, 3): 2, (1, 2, 3): 1}, 3: {('green', 'orange'): 2, (1, 2): 1, (1, 2, 3): 1}, 4: {('cat', 'dog'): 3, ('cat',): 1, ('cat', 'dog', 'rabbit'): 1, ('rabbit',): 1}, 5: {('a', 'b', 'c', 'd', 'e', 'f', 'g'): 1, ('a', 'b'): 3, ('g', 'f', 'e', 'd', 'c', 'b', 'a'): 3, ('h', 'i', 'j', 'k'): 1}, 6: {('apple', 'banana', 'cherry'): 4, ('apple', 'pear'): 3, ('apple', 'banana'): 1}, 7: {(1, 2): 2, (2, 1, 3): 1, (2, 3): 1, (1, 2, 3): 1}, 8: {(1, 2): 2, (3, 4, 5): 1, (6, 7): 2, (8, 9, 10): 1}, 9: {('a', 'b'): 2, ('c',): 2, ('d',): 1, ('e',): 1}, 10: {(1, 2, 3, 4): 3, (5, 6, 7): 2, (8, 9, 10): 1}, 11: {('x', 'y'): 3, ('z', 'w'): 2}, 12: {(): 2}, 13: {('apple', 'banana', 'cherry'): 3, ('apple', 'pear'): 3, ('apple', 'banana'): 1, ('apae', 'bganana', 'cherry'): 3}, 14: {('x', 'y'): 3, ('z', 'w'): 3}, 15: {('a', 'b', 'c', 'd', 'e', 'f', 'g'): 1, ('a', 'b'): 3, ('g', 'f', 'e', 'd', 'c', 'b', 'a'): 3, ('h', 'i', 'j', 'k'): 2}, 16: {(1, 2, 4): 1, (5, 6, 7): 2, (

NotImplementedError: 

In [12]:
# Manual patch: append one hand-copied record (task 392, solution 46, "plus").
# The payload contains `np.array([], shape=(0, 1), ...)`; the "shape=(0, 1), " text is
# stripped before eval (presumably because np.array() takes no `shape` argument), so the
# empty array loses its recorded shape.
# Note that task_id is an int here, whereas parsed records carry it as a string.
sample = "{0: 200, 1: 3300, 2: 55358, 3: 940774, 4: 1404, 5: 16326651, 6: 192, 7: 1391, 8: 1391, 9: 200, 10: np.array([], shape=(0, 1), dtype=np.int64), 11: 55188, 12: 3300, 13: 1404, 14: np.array([[0, 1]]), 15: 3300, 16: 55358, 17: 1404, 18: 3264, 19: 940774, 20: 16322563, 21: 940774, 22: 1391, 23: 55188, 24: 16326651, 25: 940774, 26: 3300, 27: 1391, 28: 16326651, 29: 940774, 30: 3300, 31: 16322563, 32: 16322563, 33: 939948, 34: 939948, 35: 200, 36: 16326651, 37: 3300, 38: 55358, 39: 940774, 40: 3264, 41: 16322563, 42: 1404, 43: 114, 44: 55358, 45: 200, 46: 1391, 47: 114, 48: 114, 49: 3300, 50: 940774, 51: 192, 52: 3264, 53: 200, 54: 1404, 55: 3264, 56: 1404, 57: 16326651, 58: 16322563, 59: 1391, 60: 16326651, 61: 114, 62: 3300, 63: 3300, 64: 1391, 65: 192, 66: 192, 67: 3264, 68: 16326651, 69: 16326651, 70: 939948, 71: 57, 72: 3264, 73: 1404, 74: 55358, 75: 55358, 76: 2, 77: 939948, 78: 16322563, 79: 57, 80: 164, 81: 1391, 82: 3264, 83: 3, 84: 4, 85: 55358, 86: 1391, 87: 114, 88: 5, 89: 133, 90: 24, 91: 940774, 92: 57, 93: 7, 94: 1404, 95: 3300, 96: 3300, 97: 1391, 98: 1391, 99: 1391, 100: 24, 101: 114, 102: 38, 103: 21, 104: 1391, 105: 16322563}"
new_exec_outs.append({"task_id": 392, 
                      "solution_id": 46, 
                      "base_plus": "plus", 
                      "execution": eval(sample.replace("shape=(0, 1), ", ""))})

In [4]:
# Restore the (task_id, solution_id, base_plus) ordering; task_id is compared numerically.
new_exec_outs = sorted(
    new_exec_outs,
    key=lambda record: (int(record["task_id"]), record["solution_id"], record["base_plus"]),
)

In [5]:
# Make task_id uniformly a string: records whose task_id is a plain int are converted.
for record in new_exec_outs:
    if type(record["task_id"]) is int:
        record["task_id"] = str(record["task_id"])

In [6]:
import pickle

# Persist the parsed records as a pickle.
# NOTE(review): this writes into the deepseek-coder-7b-instruct-v1.5_temp_0.8 directory,
# while the raw outputs loaded earlier in this notebook come from
# deepseek-coder-33b-instruct_temp_0.8 — confirm the target directory is the intended one.
with open('/mnt/scratch-artemis/haausing/code_reranking/evalplus_outputs/mbpp/deepseek-coder-7b-instruct-v1.5_temp_0.8/new_exec_outs.pkl', 'wb') as f:
    pickle.dump(new_exec_outs, f)

In [1]:
import pickle

with open('/mnt/scratch-artemis/haausing/code_reranking/evalplus_outputs/mbpp/code-llama-13b_temp_0.8/new_exec_outs.pkl', 'rb') as f:
    new_exec_outs = pickle.load(f)

# Re-index the flat records as exec_outs["Mbpp/<task_id>"][solution_id][base_plus] -> execution.
exec_outs = {}
for record in new_exec_outs:
    task_key = "Mbpp/" + record["task_id"]
    per_solution = exec_outs.setdefault(task_key, {})
    per_solution.setdefault(record["solution_id"], {})[record["base_plus"]] = record["execution"]



In [2]:
# Spot-check: show the stored outputs (keyed by 'base' / 'plus') for task Mbpp/4, solution 2.
exec_outs["Mbpp/4"][2]

{'base': {0: [65, 85, 75], 1: [75, 85], 2: [35, 58, 65, 85, 75]},
 'plus': {0: [7, 9, 8],
  1: [600, 700, 900, 800, 1000],
  2: [13, 17, 15, 19, 23, 21, 25],
  3: [80, 90, 100],
  4: [700, 800, 900, 1000],
  5: [0, 30, 80, 50, 60, 100],
  6: [60, 80, 100],
  7: [8, 9],
  8: [15, 17, 21, 19, 23, 25, 25],
  9: [700, 800, 900, 1000],
  10: [200, 300, 500, 400, 800, 900, 600, 1000, 700],
  11: [600, 700, 900, 1000, 800],
  12: [17, 19, 21, 25, 23, 40, 25],
  13: [15, 18, 21, 19, 23, 25, 25],
  14: [70, 80, 100],
  15: [2, 3, 4, 6, 7, 8, 5, 9, 6],
  16: [700, 800, 900, 1000],
  17: [15, 18, 21, 19, 23, 25, 25],
  18: [15, 18, 21, 19, 23, 25, 25],
  19: [7, 7, 8, 9],
  20: [300, 400, 500, 700, 800, 900, 600, 1000, 800],
  21: [15, 19, 17, 21, 25, 40, 23, 25],
  22: [18, 19, 21, 23, 25, 25],
  23: [13, 15, 18, 21, 19, 25, 23, 25],
  24: [200, 300, 400, 600, 700, 900, 500, 1000, 800],
  25: [15, 17, 19, 23, 21, 25, 40, 25],
  26: [11, 15, 13, 21, 19, 17, 23, 25],
  27: [18, 19, 21, 23, 25, 25]

In [3]:
# load json file
import json
with open("/mnt/scratch-artemis/haausing/code_reranking/evalplus_outputs/mbpp/code-llama-13b_temp_0.8/eval_results_base_plus.json", "r") as f:
    eval_result = json.load(f)["eval"]
# Order each task's solutions by numeric solution_id so that list position == solution_id
# whenever the ids are contiguous from 0.
for task_id, solutions in eval_result.items():
    eval_result[task_id] = sorted(solutions, key=lambda solution: int(solution["solution_id"]))

In [4]:
# Load the pickled log-probabilities for the code-llama-13b samples.
# Later cells index it as log_probs["Mbpp_<n>"][solution_id] and sum / len the entry,
# so each entry is presumably a sequence of per-token log-probs — TODO confirm.
import pandas as pd
log_probs = pd.read_pickle("/mnt/scratch-artemis/haausing/code_reranking/evalplus_outputs/mbpp/code-llama-13b_temp_0.8/logprobs.pkl")

In [5]:
# Spot-check: show the evaluation record at position 3 of task Mbpp/4.
eval_result["Mbpp/4"][3]

{'task_id': 'Mbpp/4',
 'solution_id': '3',
 'solution': '"""\nWrite a function to find the n largest integers from a given list of numbers, returned in descending order.\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65]\n"""\nimport heapq\n\n\ndef heap_queue_largest(nums: list, n: int) -> list:\n    return heapq.nlargest(n, nums)\n',
 'base_status': 'pass',
 'plus_status': 'pass',
 'base_details': [1, 1, 1],
 'plus_details': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'base_fa

In [6]:
# One row per (task, sample): src_idx, sample_idx, log_prob, norm_log_prob,
# result_unit_test_1..5 (outputs of the first five base tests, None when absent) and
# ground_truth_test_1..5 (True when the matching base_details entry equals 1).
final_outputs = []
for task_id in exec_outs:
    log_prob_task_id = task_id.replace("/", "_")
    # 200 is presumably the number of samples generated per task — TODO confirm.
    for solution_id in range(200):
        try:
            solution_eval = eval_result[task_id][solution_id]
            # Explicit check (an assert would vanish under `python -O`).
            if solution_id != int(solution_eval["solution_id"]):
                raise ValueError(f"position {solution_id} holds solution_id {solution_eval['solution_id']}")
            token_log_probs = log_probs[log_prob_task_id][solution_id]
            base_outputs = exec_outs[task_id][solution_id]["base"]
            base_details = solution_eval["base_details"]

            row = {"src_idx": int(task_id.split("/")[-1]),
                   "sample_idx": solution_id,
                   "log_prob": sum(token_log_probs),
                   "norm_log_prob": sum(token_log_probs) / len(token_log_probs)}
            for test_idx in range(5):
                row[f"result_unit_test_{test_idx + 1}"] = base_outputs[test_idx] if test_idx in base_outputs else None
            for test_idx in range(5):
                row[f"ground_truth_test_{test_idx + 1}"] = base_details[test_idx] == 1 if len(base_details) > test_idx else False
            final_outputs.append(row)
        except Exception:
            print(task_id, solution_id)
            # .get(): a missing solution_id may be exactly what failed above; indexing it
            # again here would raise KeyError inside the handler and abort the whole loop.
            print(exec_outs[task_id].get(solution_id))
        


In [7]:
#transform final_outputs to pandas dataframe
import pandas as pd

# Convert the list of per-sample row dictionaries into a pandas DataFrame
# (one column per dictionary key).
df_final_outputs = pd.DataFrame(final_outputs)


In [8]:
# Save df_final_outputs as a pickle so it can be reloaded without re-parsing.
df_final_outputs.to_pickle("/mnt/scratch-artemis/haausing/code_reranking/evalplus_outputs/mbpp/codellama_13b_final_outputs.pkl")

In [9]:
# Print all rows with src_idx == 4 as plain text.
print(df_final_outputs[(df_final_outputs["src_idx"] == 4)].to_string())

     src_idx  sample_idx    log_prob  norm_log_prob                                                                                                 result_unit_test_1                                                                                                 result_unit_test_2                                                                                                 result_unit_test_3 result_unit_test_4 result_unit_test_5  ground_truth_test_1  ground_truth_test_2  ground_truth_test_3  ground_truth_test_4  ground_truth_test_5
400        4           0  -82.112555      -0.188764                                                                                                       [85, 75, 65]                                                                                                           [85, 75]                                                                                               [85, 75, 65, 58, 35]               None               None                 True     

In [33]:
# Per src_idx, count the rows whose result_unit_test_4 is not null.
test_4_by_src = df_final_outputs.groupby('src_idx')['result_unit_test_4']
result_unit_test_4_count = test_4_by_src.apply(lambda column: column.notnull().sum())
# print the src_idx where result_unit_test_4_count > 0
#print(result_unit_test_4_count[result_unit_test_4_count > 0])

In [66]:
from evalplus.data import get_mbpp_plus
problems = get_mbpp_plus()
MBPP_OUTPUT_NOT_NONE_TASKS = ["check_str", "text_match_three", "text_starta_endb"]
# List the task ids whose entry point is one of the names above.
for task_id, problem in problems.items():
    entry_point = problem["entry_point"]
    if entry_point in MBPP_OUTPUT_NOT_NONE_TASKS:
        print(task_id, entry_point)

Mbpp/737 check_str
Mbpp/787 text_match_three
Mbpp/794 text_starta_endb


In [None]:
import numpy as np
def is_floats(x) -> bool:
    """Return True for a float, a list/tuple containing only floats, or a float32/float64 ndarray.

    An empty list or tuple counts as "all floats".
    """
    # check if it is float; List[float]; Tuple[float]
    if isinstance(x, float):
        return True
    if isinstance(x, (list, tuple)):
        return all(isinstance(i, float) for i in x)
    if isinstance(x, np.ndarray):
        return x.dtype == np.float64 or x.dtype == np.float32
    return False


def _exactly_equal(left, right) -> bool:
    """Collapse `left == right` to a single bool, including for array-valued comparisons."""
    try:
        outcome = left == right
        if isinstance(outcome, np.ndarray):
            # Element-wise result: require identical shapes (no broadcasting) and all True.
            return np.shape(left) == np.shape(right) and bool(outcome.all())
        return bool(outcome)
    except (TypeError, ValueError):
        # e.g. ambiguous truth value when comparing containers that hold arrays
        return False


def _numeric_array(value):
    """Return value as an ndarray if it is purely numeric (bool/int/uint/float), else None."""
    try:
        arr = np.asarray(value)
    except (TypeError, ValueError):
        return None
    return arr if arr.dtype.kind in "biuf" else None


def utility(output_hyp, output_ref, src_idx, sample_idx):
    """
    Decide whether a hypothesis output matches a reference output.

    Args:
        output_hyp: output produced by the hypothesis.
        output_ref: output it is compared against.
        src_idx, sample_idx: accepted for the caller's convenience; not used here.

    Returns:
        bool: False if either output is None or the hypothesis is a string starting with
        "failed"; True on exact equality; otherwise, when either side is float-valued
        (see is_floats), True only if both sides are numeric, have the same shape and
        are element-wise close with atol=1e-6.
    """
    if output_hyp is None or output_ref is None:
        return False
    # Outputs are frequently lists or numbers, so only strings are probed for the marker.
    if isinstance(output_hyp, str) and output_hyp.startswith("failed"):
        return False
    if _exactly_equal(output_hyp, output_ref):
        return True
    if not (is_floats(output_hyp) or is_floats(output_ref)):
        return False
    hyp_values = _numeric_array(output_hyp)
    ref_values = _numeric_array(output_ref)
    if hyp_values is None or ref_values is None or hyp_values.shape != ref_values.shape:
        return False
    # Reduce to one bool so the result can be used directly in an `if`.
    return bool(np.isclose(hyp_values, ref_values, atol=1e-6).all())

Now I just added one more thing for the 

In [1]:
# Location and common file-name stem of the JSONL files that are read into lists of
# dictionaries: the files are "<prefix><llm>_train.jsonl", "_dev.jsonl" and "_test.jsonl".
prefix = "/mnt/scratch-artemis/haausing/code_reranking/code/lever/data/mbpp/"
llm = "mbpp_codex_verification"
import json
def read_jsonl(file_path: str):
    """
    Read a JSON Lines file into a list, one parsed object per non-blank line.

    Blank or whitespace-only lines (for example a trailing empty line) are skipped
    instead of raising json.JSONDecodeError.

    Args:
        file_path: path of the .jsonl file.

    Returns:
        list of the decoded JSON values, in file order.
    """
    with open(file_path, "r") as f:
        return [json.loads(line) for line in f if line.strip()]

# Load the three splits in train / dev / test order.
train_data, dev_data, test_data = [
    read_jsonl(prefix + file_name)
    for file_name in (f"{llm}_train.jsonl", f"{llm}_dev.jsonl", f"{llm}_test.jsonl")
]

In [2]:
# Single list with the train, dev and test records, in that order.
all_data = [*train_data, *dev_data, *test_data]

In [None]:
# One row per (task, sample): src_idx, sample_idx, log_prob, norm_log_prob,
# result_unit_test_1..5 and ground_truth_test_1..5.
# Here the per-test outputs are read straight from exec_outs[task_id][solution_id][test_idx]
# (no "base"/"plus" level) — NOTE(review): confirm exec_outs has that layout when this runs.
final_outputs = []
for task_id in exec_outs:
    log_prob_task_id = task_id.replace("/", "_")
    # 200 is presumably the number of samples generated per task — TODO confirm.
    for solution_id in range(200):
        try:
            solution_eval = eval_result[task_id][solution_id]
            # Explicit check (an assert would vanish under `python -O`).
            if solution_id != int(solution_eval["solution_id"]):
                raise ValueError(f"position {solution_id} holds solution_id {solution_eval['solution_id']}")
            token_log_probs = log_probs[log_prob_task_id][solution_id]
            test_outputs = exec_outs[task_id][solution_id]
            base_details = solution_eval["base_details"]

            row = {"src_idx": int(task_id.split("/")[-1]),
                   "sample_idx": solution_id,
                   "log_prob": sum(token_log_probs),
                   "norm_log_prob": sum(token_log_probs) / len(token_log_probs)}
            for test_idx in range(5):
                row[f"result_unit_test_{test_idx + 1}"] = test_outputs[test_idx] if test_idx in test_outputs else None
            for test_idx in range(5):
                row[f"ground_truth_test_{test_idx + 1}"] = base_details[test_idx] == 1 if len(base_details) > test_idx else False
            final_outputs.append(row)
        except Exception:
            print(task_id, solution_id)
            # .get(): a missing solution_id may be exactly what failed above; indexing it
            # again here would raise KeyError inside the handler and abort the whole loop.
            print(exec_outs[task_id].get(solution_id))
        