## Reproduction of Table 3

This notebook reproduces the values in Table 3 (different prompt engineering scenarios), except for the Two Examples (n=10) rows. Those values are presented in `Reproduce_Figure3.ipynb`.

In [3]:
import sys
sys.path.append("../../scripts/")
import json
import pandas

In [4]:
def _read_bug_ids(path):
    """Return the stripped lines of ``path`` with every '-' replaced by '_'."""
    with open(path) as handle:
        return [line.strip().replace('-', '_') for line in handle]


# Bug ids from the crash list; `evaluate` keeps only these when only_crash=True.
jcrashpack_bugs = _read_bug_ids('../../data/Defects4J/all_d4j_crashes.txt')

# Bug ids from the invalid-bug-report list; `evaluate` always skips these.
invalid_bugs = _read_bug_ids('../../data/Defects4J/invalid_bug_reports.txt')

In [5]:
def evaluate(raw_result, only_crash=False):
    """Flatten raw test-execution results into one row per generated test.

    Parameters
    ----------
    raw_result : dict
        Mapping of bug id -> mapping of test file name -> execution result.
        A result is either a ``str`` (flagged as ``javalang_parse_error``) or a
        dict with ``'buggy'`` and ``'fixed'`` entries, each providing the
        ``'compile_error'``, ``'runtime_error'`` and ``'autogen_failed'`` flags.
    only_crash : bool, default False
        When true, bugs not listed in the module-level ``jcrashpack_bugs`` are
        skipped. Bugs listed in ``invalid_bugs`` are always skipped.

    Returns
    -------
    pandas.DataFrame
        One row per (bug, test). The columns listed below are always present,
        even when no row survives the filters, so callers can group or select
        by column name without special-casing an empty result.
    """
    columns = [
        'project',
        'bug_id',
        'test_no',
        'javalang_parse_error',
        'is_compile_error',
        'is_runtime_error',
        'buggy_version_failing',
        'fixed_version_failing',
        'fname',
        'success',
    ]
    rows = []

    for bug_id, test_exec_results in raw_result.items():
        if bug_id in invalid_bugs:
            continue
        if only_crash and bug_id not in jcrashpack_bugs:
            continue

        for test_no, (fname, res) in enumerate(test_exec_results.items(), start=1):
            javalang_parse_error = False
            is_compile_error = False
            is_runtime_error = False
            buggy_version_failing = False
            fixed_version_failing = False

            # The categories are mutually exclusive and checked in this order:
            # parse error, compile error, runtime error, then actual test outcome.
            if isinstance(res, str):
                javalang_parse_error = True
            elif res['buggy']['compile_error'] or res['fixed']['compile_error']:
                is_compile_error = True
            elif res['buggy']['runtime_error'] or res['fixed']['runtime_error']:
                is_runtime_error = True
            else:
                buggy_version_failing = bool(res['buggy']['autogen_failed'])
                fixed_version_failing = bool(res['fixed']['autogen_failed'])

            # A test counts as a success only if it fails on the buggy version
            # and does not fail on the fixed version.
            success = buggy_version_failing and not fixed_version_failing

            rows.append({
                'project': bug_id.split('_')[0],
                'bug_id': bug_id,
                'test_no': test_no,
                'javalang_parse_error': javalang_parse_error,
                'is_compile_error': is_compile_error,
                'is_runtime_error': is_runtime_error,
                'buggy_version_failing': buggy_version_failing,
                'fixed_version_failing': fixed_version_failing,
                'fname': fname,
                'success': success,
            })

    # Passing `columns` keeps the schema intact when `rows` is empty; a bare
    # DataFrame([]) would have no columns and break groupby('bug_id') downstream.
    return pandas.DataFrame(rows, columns=columns)

In [6]:
def show_tab_results(fname, only_crash=False):
    """Count reproduced and fail-in-buggy (FIB) bugs for one result file.

    Parameters
    ----------
    fname : str
        Path to a JSON file whose content is passed to ``evaluate``.
    only_crash : bool, default False
        Forwarded to ``evaluate``.

    Returns
    -------
    tuple of (int, int)
        ``(succeeded_bugs, fib_bugs)``: the number of bugs with at least one
        test marked ``success``, and the number of bugs with at least one test
        marked ``buggy_version_failing``.
    """
    with open(fname) as f:
        reproduction_df = evaluate(json.load(f), only_crash)

    # Nothing survived filtering: there are no bugs to count, and grouping a
    # column-less empty frame by 'bug_id' would raise KeyError.
    if reproduction_df.empty:
        return 0, 0

    # Sum only the two flag columns that are needed; summing the whole frame
    # would also aggregate the string columns ('project', 'fname').
    per_bug = reproduction_df.groupby('bug_id')[['success', 'buggy_version_failing']].sum()

    succeeded_bugs = int((per_bug['success'] > 0).sum())
    fib_bugs = int((per_bug['buggy_version_failing'] > 0).sum())
    return succeeded_bugs, fib_bugs

In [7]:
# Result file path -> row label for the runs evaluated over all bugs.
# Each path must appear once: a repeated key in a dict literal is silently
# overwritten by the later entry.
RESULT_FILE_NAMES = {
    '../../results/example0_n10.json': 'No Example (n=10)',
    '../../results/example1_n10.json': 'One Example (n=10)',
    '../../results/wp_example1_n10.json': 'One Example from Source Project (n=10)',
    '../../results/constructor_example1_n10.json': 'One Example with Constructor Info (n=10)',
    '../../results/example2_n50.json': 'Two Examples (n=50)',
    '../../results/gpt3.5.json': 'ChatGPT Two Examples (n=10)',
}

# Result file path -> row label for the runs evaluated with only_crash=True.
CRASH_RESULT_FILE_NAMES = {
    '../../results/example1_n10.json': 'One Example, Crash Bugs (n=10)',
    '../../results/stack_example1_n10.json': 'One Example with Stack, Crash Bugs (n=10)',
}

In [8]:
# Upper table: run every entry of RESULT_FILE_NAMES through show_tab_results
# with the default only_crash=False.
upper_counts = [
    (label, show_tab_results(fname))
    for fname, label in RESULT_FILE_NAMES.items()
]
upper_results = [
    {'Label': label, 'reproduced': reproduced, 'FIB': fibed}
    for label, (reproduced, fibed) in upper_counts
]
upper_df = pandas.DataFrame(upper_results)
upper_df

Unnamed: 0,Label,reproduced,FIB
0,No Example (n=10),124,440
1,One Example (n=10),166,417
2,One Example from Source Project (n=10),152,455
3,One Example with Constructor Info (n=10),167,430
4,Two Examples (n=50),251,570
5,ChatGPT Two Examples (n=10),161,382


In [7]:
# Lower table: run every entry of CRASH_RESULT_FILE_NAMES through
# show_tab_results with only_crash=True.
lower_counts = [
    (label, show_tab_results(fname, only_crash=True))
    for fname, label in CRASH_RESULT_FILE_NAMES.items()
]
lower_results = [
    {'Label': label, 'reproduced': reproduced, 'FIB': fibed}
    for label, (reproduced, fibed) in lower_counts
]
lower_df = pandas.DataFrame(lower_results)
lower_df

Unnamed: 0,Label,reproduced,FIB
0,"One Example, Crash Bugs (n=10)",69,153
1,"One Example with Stack, Crash Bugs (n=10)",84,155
