In [1]:
import json
import pandas

In [2]:
def get_result_for_label(label, start_idx=0, end_idx=10):
    """Collect per-entry repair outcomes from a series of JSON result files.

    Reads ``f'{label}_{run_idx}.json'`` for every ``run_idx`` in
    ``range(start_idx, end_idx)``. Each file is expected to contain a JSON
    list of dicts. Entries without a ``'trace'`` key are skipped; every other
    entry must provide ``'task_id'`` and ``'passed'``.

    Only the part of the trace after the last ``'## Analysis'`` marker is
    inspected to derive the ``debugging_done`` and ``succ_hypothesis`` flags.

    Args:
        label: Path prefix of the result files (without ``_{run_idx}.json``).
        start_idx: First run index to load (inclusive).
        end_idx: Last run index to load (exclusive).

    Returns:
        A ``pandas.DataFrame`` with one row per kept entry and the columns
        ``run_idx``, ``humaneval_name``, ``arhe_idx``, ``autosd_repair_succ``,
        ``debugging_done`` and ``succ_hypothesis``. When no entry is kept the
        frame is empty but still carries these columns, with the three flag
        columns typed as bool so boolean masking keeps working.

    Raises:
        OSError: If one of the result files cannot be opened.
        KeyError: If an entry with a trace lacks ``'task_id'`` or ``'passed'``.
    """
    columns = [
        'run_idx',
        'humaneval_name',
        'arhe_idx',
        'autosd_repair_succ',
        'debugging_done',
        'succ_hypothesis',
    ]
    flag_columns = ['autosd_repair_succ', 'debugging_done', 'succ_hypothesis']

    full_info_list = []
    for run_idx in range(start_idx, end_idx):
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(f'{label}_{run_idx}.json', encoding='utf-8') as f:
            autosd_results = json.load(f)
        for arhe_idx, autosd_apr_res in enumerate(autosd_results):
            # Skip trace-less entries before touching any other key, so an
            # entry that is going to be ignored can never raise KeyError.
            if 'trace' not in autosd_apr_res:
                continue
            bug_name = autosd_apr_res['task_id']
            autosd_repair_succ = autosd_apr_res['passed']
            # Only the text after the last '## Analysis' header is inspected.
            synth_process = autosd_apr_res['trace'].split('## Analysis')[-1]
            debugging_done = '<DEBUGGING DONE>' in synth_process
            succ_hypothesis = (
                'The hypothesis is supported' in synth_process
                or 'The hypothesis was supported' in synth_process
            )
            full_info_list.append({
                'run_idx': run_idx,
                'humaneval_name': bug_name,
                'arhe_idx': arhe_idx,
                'autosd_repair_succ': autosd_repair_succ,
                'debugging_done': debugging_done,
                'succ_hypothesis': succ_hypothesis,
            })

    if not full_info_list:
        # DataFrame([]) would have no columns at all, which makes attribute
        # access such as ``df.autosd_repair_succ`` fail downstream. Return the
        # full schema instead, with bool flags so ``df[df.flag]`` is valid.
        empty_df = pandas.DataFrame(columns=columns)
        return empty_df.astype({col: bool for col in flag_columns})
    return pandas.DataFrame(full_info_list)

In [3]:
# headline results
label = './data/chatgpt002_ZSDloopV4_s3_out'
full_info_df = get_result_for_label(label)

# Number of distinct result-list positions (arhe_idx) with at least one passing row.
print('AutoSD Fixed:', full_info_df.loc[full_info_df['autosd_repair_succ'], 'arhe_idx'].nunique())

# Fraction of passing rows among the rows flagged as debugging_done ...
print(
    'Acc of Done:',
    full_info_df.loc[full_info_df['debugging_done'], 'autosd_repair_succ'].sum()
    / full_info_df['debugging_done'].sum(),
)
# ... and among the rows that are not flagged as debugging_done.
print(
    'Acc of Not Done:',
    full_info_df.loc[~full_info_df['debugging_done'], 'autosd_repair_succ'].sum()
    / (~full_info_df['debugging_done']).sum(),
)

AutoSD Fixed: 189
Acc of Done: 0.8188277087033747
Acc of Not Done: 0.6941508104298802


In [4]:
# LLM-Base results
label = './data/chatgpt002_ZSnoSDloopV4_s0_out'
full_info_df = get_result_for_label(label)
# Count of distinct result-list positions (arhe_idx) with at least one passing row.
full_info_df.loc[full_info_df['autosd_repair_succ'], 'arhe_idx'].nunique()

179

In [5]:
# Codex-AutoSD
label = './data/codex_ZSD_loopV4_s3_results'
full_info_df = get_result_for_label(label)
# Count of distinct result-list positions (arhe_idx) with at least one passing row.
full_info_df.loc[full_info_df['autosd_repair_succ'], 'arhe_idx'].nunique()

120

In [6]:
# Codex LLM-Base
label = './data/codex_ZSnoSD_loopV4_s0_T0'
full_info_df = get_result_for_label(label)
# Count of distinct result-list positions (arhe_idx) with at least one passing row.
full_info_df.loc[full_info_df['autosd_repair_succ'], 'arhe_idx'].nunique()

180

In [7]:
# CodeGen-Big (6B) AutoSD
label = './data/BigCG_ZSDloopV4_s3_results'
full_info_df = get_result_for_label(label)
# Count of distinct result-list positions (arhe_idx) with at least one passing row.
full_info_df.loc[full_info_df['autosd_repair_succ'], 'arhe_idx'].nunique()

0

In [8]:
# CodeGen-Big (6B) LLM-Base
# NOTE(review): only run index 3 is loaded here (start_idx=3, end_idx=4) and the
# path lives under ./data/autosd/bigcg_results/, unlike the other result cells,
# which use the default run range 0-9 directly under ./data/ - confirm intended.
label = './data/autosd/bigcg_results/BigCG_ZSnoSD_s0_results'
full_info_df = get_result_for_label(label, start_idx=3, end_idx=4)
# Count of distinct result-list positions (arhe_idx) with at least one passing row.
full_info_df.loc[full_info_df['autosd_repair_succ'], 'arhe_idx'].nunique()

0

In [9]:
# Fewshot CodeGen-Big (6B) LLM-Base
label = './data/BigCG_FSnoSDnoDef_s0_results'
full_info_df = get_result_for_label(label)
# Count of distinct result-list positions (arhe_idx) with at least one passing row.
full_info_df.loc[full_info_df['autosd_repair_succ'], 'arhe_idx'].nunique()

44

In [10]:
# reverse mutator baseline (implementation in reverse_mutator.py)
import numpy as np
# Hard-coded baseline scores; summarised below by their mean and
# (population, numpy default ddof=0) standard deviation.
base_perf = [83, 75, 85, 90, 82, 84, 82, 88, 79, 84, 86, 86, 93, 81, 85, 87, 81, 85, 92, 84, 95, 91, 87, 81, 93, 89, 84, 80, 89, 84, 86, 84, 85, 87, 83, 87, 90, 85, 82, 75, 84, 85, 88, 89, 91, 84, 93, 82, 82, 91, 83, 88, 77, 89, 82, 85, 88, 92, 91, 92, 84, 89, 87, 89, 93, 88, 89, 88, 85, 85, 78, 86, 85, 90, 86, 82, 84, 80, 87, 89, 91, 82, 89, 77, 88, 89, 88, 82, 91, 84, 86, 88, 81, 80, 79, 81, 89, 85, 88, 90]
# One print call, one value per line: mean first, then standard deviation.
print(np.mean(base_perf), np.std(base_perf), sep='\n')

85.77
4.199654747714388
