In [1]:
import json
import pandas

In [21]:
# Load the per-bug correctness table; missing cells become False so the
# Baseline / AutoSD columns can be used directly as boolean masks.
corr_fixed_info = pandas.read_csv('./data/d4j_corrects.csv').fillna(False)
bug_name_col = corr_fixed_info['BugName']

# BugName is split on '_' into a project part and a numeric id part. Values
# that are not strings (e.g. the False written by fillna) pass through as-is.
proj_name = bug_name_col.map(lambda name: name if type(name) is not str else name.split('_')[0])
id_num = bug_name_col.map(lambda name: name if type(name) is not str else int(name.split('_')[1]))

# A row is counted as "D4J v1.2" when its project is one of the six listed
# here and its id is at most 133; every other row is reported as "D4J v2.0".
d4jv1_proj = proj_name.isin(['Chart', 'Closure', 'Lang', 'Math', 'Mockito', 'Time'])
d4jv1_idx = d4jv1_proj & (id_num <= 133)

# Bug names flagged in the AutoSD column.
corr_fixed_bugs = bug_name_col[corr_fixed_info.AutoSD].unique()

# Number of distinct bug names per (technique, benchmark split).
for label, row_mask in (
    ('Baseline Corrects, D4J v1.2:', corr_fixed_info.Baseline & d4jv1_idx),
    ('Baseline Corrects, D4J v2.0:', corr_fixed_info.Baseline & (~d4jv1_idx)),
    ('AutoSD Corrects, D4J v1.2:', corr_fixed_info.AutoSD & d4jv1_idx),
    ('AutoSD Corrects, D4J v2.0:', corr_fixed_info.AutoSD & (~d4jv1_idx)),
):
    print(label, corr_fixed_info[row_mask]['BugName'].nunique())

Baseline Corrects, D4J v1.2: 87
Baseline Corrects, D4J v2.0: 110
AutoSD Corrects, D4J v1.2: 76
AutoSD Corrects, D4J v2.0: 113


In [24]:
full_info_list = []

# Test names whose failures are ignored when deciding whether a patch passed
# (see `_is_plausible`), grouped by project.
# NOTE(review): presumably flaky / environment-dependent tests -- confirm.
lang_weirdos = {
    'org.apache.commons.lang.LocaleUtilsTest::testCountriesByLanguage',
}
closure_weirdos = {
    'com.google.javascript.jscomp.CrossModuleMethodMotionTest::testTwoMethods',
    'com.google.javascript.jscomp.CrossModuleMethodMotionTest::testClosureVariableReads3',
}
mockito_weirdos = {
    'org.mockitousage.basicapi.MocksSerializationTest::shouldSerializeRealPartialMock',
    'org.mockitousage.basicapi.MocksSerializationTest::shouldSerializeObjectMock',
}
time_weirdos = {
    'org.joda.time.TestDateTimeZone::testGetName_berlin',
    'org.joda.time.TestDateTimeZone::testGetShortName',
    'org.joda.time.TestDateTimeZone::testGetName',
    'org.joda.time.TestDateTimeZone::testGetShortName_berlin',
    'org.joda.time.format.TestDateTimeFormat::testFormat_zoneText',
    'org.joda.time.format.TestDateTimeFormat::testFormat_zoneLongText',
    'org.joda.time.format.TestDateTimeFormatStyle::testForStyle_fullDateTime',
    'org.joda.time.format.TestDateTimeFormatStyle::testForStyle_mediumLongDateTime',
    'org.joda.time.format.TestDateTimeFormatStyle::testForStyle_fullTime',
    'org.joda.time.format.TestDateTimeFormatStyle::testForStyle_shortLongDateTime',
    'org.joda.time.format.TestDateTimeFormatStyle::testForStyle_longTime',
    'org.joda.time.format.TestDateTimeFormatStyle::testForStyle_mediumFullDateTime',
    'org.joda.time.format.TestDateTimeFormatStyle::testForStyle_shortFullDateTime',
    'org.joda.time.format.TestDateTimeFormatStyle::testForStyle_longDateTime',
    'org.joda.time.format.TestDateTimeFormatter::testZoneShortNameNearTransition',
    'org.joda.time.format.TestDateTimeFormatter::testZoneNameNearTransition',
    'org.joda.time.format.TestDateTimeFormatterBuilder::test_printParseLongName',
    'org.joda.time.format.TestDateTimeFormatterBuilder::test_printParseLongNameWithLookup',
    'org.joda.time.format.TestDateTimeFormatterBuilder::test_printParseShortNameWithLookup',
    'org.joda.time.format.TestDateTimeFormatterBuilder::test_printParseShortName',
}
math_weirdos = {
    'org.apache.commons.math3.util.FastMathTest::checkMissingFastMathClasses',
    'org.apache.commons.math.util.FastMathTest::checkMissingFastMathClasses',
}
all_weirdos = time_weirdos | closure_weirdos | mockito_weirdos | math_weirdos | lang_weirdos


def _is_plausible(apr_res):
    """Return True when a test-result record counts as a successful repair.

    The record must have a non-negative 'fail_test_num' and every entry of its
    'failed_tests' must be in `all_weirdos`. 'failed_tests' is only read when
    'fail_test_num' is non-negative, so a placeholder record such as
    {'fail_test_num': -1} is accepted and yields False.
    """
    return apr_res['fail_test_num'] >= 0 and len(set(apr_res['failed_tests']) - all_weirdos) == 0


# One pass per run index: load that run's three result files and emit one
# summary row per baseline result.
for run_idx in range(10):
    with open(f'./data/zsBaseline1024_s0_chatgpt_testResults_{run_idx}_T0.7.jsonl') as f:
        base_results = json.load(f)
        base_id2res = {e['task_id']: e for e in base_results}
    with open(f'./data/zsV3real_s3_chatgpt_{run_idx}_T0.7.json') as f:
        synth_processes = json.load(f)
        autosd_id2proc = {e['task_id']: e for e in synth_processes}
    with open(f'./data/zsV3real_s3_chatgpt_testResults_{run_idx}_T0.7.jsonl') as f:
        autosd_results = json.load(f)
        autosd_id2res = {e['task_id']: e for e in autosd_results}

    assert len(synth_processes) == len(autosd_results)
    for base_apr_res in base_results:
        bug_name = base_apr_res['task_id']
        # The correctness table is matched on the part after the last '/'.
        true_bug_name = bug_name.split('/')[-1]

        if bug_name in autosd_id2res:
            autosd_apr_res = autosd_id2res[bug_name]
            # The assert above only guarantees equal lengths, not identical
            # task ids, so the trace lookup must not assume the key exists.
            synth_res = autosd_id2proc.get(bug_name, {'trace': ''})
        else:
            # No AutoSD test result for this bug: count it as a failed repair
            # with an empty trace.
            autosd_apr_res = {'fail_test_num': -1}
            synth_res = {'trace': ''}

        base_repair_succ = _is_plausible(base_apr_res)
        autosd_repair_succ = _is_plausible(autosd_apr_res)

        # Only the text after the last '## Analysis' marker is inspected.
        synth_process = synth_res['trace'].split('## Analysis')[-1]
        debugging_done = '<DEBUGGING DONE>' in synth_process
        succ_hypothesis = 'The hypothesis is supported' in synth_process
        autosd_corr_fixed = true_bug_name in corr_fixed_bugs
        full_info_list.append({
            'bug_name': bug_name,
            'base_repair_succ': base_repair_succ,
            'autosd_repair_succ': autosd_repair_succ,
            'debugging_done': debugging_done,
            'succ_hypothesis': succ_hypothesis,
            'autosd_corr_fixed': autosd_corr_fixed,
            'run_idx': run_idx,
        })
full_info_df = pandas.DataFrame(full_info_list)

In [25]:
# Preview the first five rows (one row per bug per run) of the assembled table.
full_info_df.head(n=5)

Unnamed: 0,bug_name,base_repair_succ,autosd_repair_succ,debugging_done,succ_hypothesis,autosd_corr_fixed,run_idx
0,Defects4J-APR/Chart_1,True,True,True,True,True,0
1,Defects4J-APR/Chart_2,False,False,False,False,False,0
2,Defects4J-APR/Chart_3,False,False,False,False,False,0
3,Defects4J-APR/Chart_4,True,True,False,False,True,0
4,Defects4J-APR/Chart_5,False,True,False,False,False,0


In [26]:
# Column views reused below; each row of full_info_df is one (bug, run) pair.
name_col = full_info_df['bug_name']
base_succ_mask = full_info_df['base_repair_succ']
autosd_succ_mask = full_info_df['autosd_repair_succ']
done_mask = full_info_df['debugging_done']

# Distinct-bug counts: a bug counts for a technique if any run succeeded.
total_count = name_col.nunique()
print('Total:', total_count)
print('LLM-Base APR Plausibles:', name_col[base_succ_mask].nunique())
print('AutoSD APR Plausibles:', name_col[autosd_succ_mask].nunique())

# Bugs with a successful AutoSD run but no successful baseline run.
fixed_by_base = set(name_col[base_succ_mask].unique())
fixed_by_autosd = set(name_col[autosd_succ_mask].unique())
print('Raw-AutoSD Uniques:', len(fixed_by_autosd - fixed_by_base))

# Row-level (not distinct-bug) success rate of AutoSD, split by whether the
# trace contained the done marker.
print('Acc of Done:', (autosd_succ_mask[done_mask].sum() /
                      done_mask.sum()))
print('Acc of Not Done:', (autosd_succ_mask[~done_mask].sum() /
                           (~done_mask).sum()))

Total: 777
LLM-Base APR Plausibles: 253
AutoSD APR Plausibles: 231
Raw-AutoSD Uniques: 28
Acc of Done: 0.3742603550295858
Acc of Not Done: 0.11164364251480124


In [27]:
# Rows where the AutoSD repair succeeded, split by the debugging_done flag.
done_plausible_idx = full_info_df.debugging_done & (full_info_df.autosd_repair_succ)
notdone_plausible_idx = (~full_info_df.debugging_done) & (full_info_df.autosd_repair_succ)

# For each split: distinct bugs that are also flagged autosd_corr_fixed,
# divided by the distinct bugs in the split.
for label, plausible_rows in (
    ('Corr% of Done when plausible:', done_plausible_idx),
    ('Corr% of Not Done when plausible:', notdone_plausible_idx),
):
    correct_bug_count = full_info_df[plausible_rows & full_info_df.autosd_corr_fixed]['bug_name'].nunique()
    plausible_bug_count = full_info_df[plausible_rows]['bug_name'].nunique()
    print(label, (correct_bug_count / plausible_bug_count))

Corr% of Done when plausible: 0.8921568627450981
Corr% of Not Done when plausible: 0.8246445497630331
