In [1]:
import json
import os
import pandas
from scipy.stats import spearmanr

In [2]:
# test score
# Per (bug, explanation) share of successful generated tests.
testgen_df = pandas.read_csv("resources/TestGen_results.csv")

# run_idx is an underscore-separated string; its last field is parsed as the
# integer explanation index.
testgen_df['expl_idx'] = testgen_df.run_idx.map(
    lambda run_id: int(run_id.rsplit('_', 1)[-1])
)

# Sum of `success` per (bug_name, expl_idx), scaled by 10
# (presumably the number of generated tests per explanation -- TODO confirm).
successes_per_explanation = testgen_df.groupby(['bug_name', 'expl_idx'])['success'].sum()
high_quality_test_mapping = (successes_per_explanation / 10).to_frame()

testgen_score_df = (
    high_quality_test_mapping
    .reset_index()
    .rename(columns={'success': 'test_score'})
)
testgen_score_df.head()

Unnamed: 0,bug_name,expl_idx,test_score
0,Chart_1,1,0.1
1,Chart_1,2,0.5
2,Chart_1,3,0.7
3,Chart_1,4,0.3
4,Chart_1,5,0.1


In [3]:
# apr score
# Per (bug, explanation) share of positive patches.
apr_df = pandas.read_csv("resources/APR_results.csv")

# expl_idx is patch_idx shifted up by one
# (presumably converting a 0-based index to the 1-based one used by the other tables -- TODO confirm).
apr_df['expl_idx'] = apr_df.patch_idx.add(1)

# Sum of `is_positive_patch` per (bug_name, expl_idx), scaled by 5
# (presumably the number of patches per explanation -- TODO confirm).
positive_patches_per_explanation = (
    apr_df.groupby(['bug_name', 'expl_idx'])['is_positive_patch'].sum()
)
high_quality_patch_mapping = (positive_patches_per_explanation / 5).to_frame()

apr_score_df = (
    high_quality_patch_mapping
    .reset_index()
    .rename(columns={'is_positive_patch': 'apr_score'})
)
apr_score_df.head()

Unnamed: 0,bug_name,expl_idx,apr_score
0,Chart_1,1,0.4
1,Chart_1,2,0.4
2,Chart_1,3,0.4
3,Chart_1,4,0.4
4,Chart_1,5,0.4


In [4]:
# self-rating data
# Loaded as-is; it is later joined with the other tables on (bug_name, expl_idx).
SELF_RATING_CSV = 'resources/SelfRating_results.csv'
self_rating_df = pandas.read_csv(SELF_RATING_CSV)

In [5]:
# AutoFL run data
# Build one record per result file found in the directories of runs 1..5.
# The run number is stored as the record's explanation index (expl_idx).
result_list = []
DATA_DIR = '../results/d4j_autofl_%d/gpt-3.5-turbo-0613/'
for i in range(1, 6):
    run_dir = DATA_DIR % i  # format the directory once per run
    for fname in sorted(os.listdir(run_dir)):
        fpath = os.path.join(run_dir, fname)
        if os.path.isdir(fpath):
            continue  # only plain files are result files
        with open(fpath) as f:
            run_info = json.load(f)
        # File names look like "XFL-<bug_name>.json".
        bug_name = fname.removeprefix('XFL-').removesuffix('.json')
        grade_results = run_info['buggy_methods']
        # A string value (instead of a per-method mapping) is skipped;
        # presumably it is an error/status message -- TODO confirm.
        if isinstance(grade_results, str):
            continue
        # The run counts as a success if any listed method was found.
        autofl_success = any(
            grade['is_found'] for grade in grade_results.values()
        )
        # Length in characters of the third-from-last message
        # (presumably the generated explanation -- TODO confirm).
        expl_len = len(run_info['messages'][-3]['content'])
        result_list.append({
            'bug_name': bug_name,
            'expl_idx': i,
            'expl_len': expl_len,
            'autofl_success': autofl_success,
        })
result_df = pandas.DataFrame(result_list)

In [6]:
# manual rating data
# Loaded as-is; it is later joined with the other tables on (bug_name, expl_idx).
EXPLANATION_QUALITY_CSV = 'resources/ExplanationQuality.csv'
expl_eval_df = pandas.read_csv(EXPLANATION_QUALITY_CSV)

In [7]:
# combine info
# Join all per-explanation tables on (bug_name, expl_idx). Outer joins keep
# every pair that appears in any of the tables.
MERGE_KEYS = ['bug_name', 'expl_idx']

combine_df = pandas.merge(result_df, testgen_score_df, how='outer', on=MERGE_KEYS)
# Pairs with no row in the test-score table get a score of 0 instead of NaN.
combine_df['test_score'] = combine_df.test_score.fillna(0)
combine_df = pandas.merge(combine_df, apr_score_df, how='outer', on=MERGE_KEYS)
# Same treatment for pairs with no row in the APR-score table.
combine_df['apr_score'] = combine_df.apr_score.fillna(0)
combine_df = pandas.merge(combine_df, self_rating_df, how='outer', on=MERGE_KEYS)
expl_char_df = pandas.merge(combine_df, expl_eval_df, how='outer', on=MERGE_KEYS)

# Keep only rows whose `Exists` flag is True (rows where it is NaN after the
# outer join are dropped as well). `Exists` presumably comes from the
# manual-rating table -- TODO confirm.
# .copy() makes the result an independent frame, so adding columns to it
# afterwards does not raise SettingWithCopyWarning.
expl_char_df = expl_char_df[expl_char_df.Exists == True].copy()

In [8]:
# add additional labels
# Both derived labels apply only to explanations that are not Accurate;
# they differ in whether the explanation is also Misleading.
not_accurate = expl_char_df.Accurate.eq(False)
expl_char_df['Wrong'] = not_accurate & expl_char_df.Misleading.eq(True)
expl_char_df['Bland'] = not_accurate & expl_char_df.Misleading.eq(False)

In [9]:
# variables of interest
# Row labels of the correlation tables.
table_quality_measures = [
    'Accurate',        # presumably a manual rating -- TODO confirm
    'Wrong',           # derived label: not Accurate and Misleading
    'Useful',          # presumably a manual rating -- TODO confirm
    'Bland',           # derived label: not Accurate and not Misleading
    'autofl_success',  # any buggy method marked as found in the AutoFL run
]
# Column labels of the correlation tables.
table_predict_measures = [
    'test_score',  # per-explanation test success score
    'apr_score',   # per-explanation positive-patch score
    'GPT_useful',  # presumably from the self-rating table -- TODO confirm
    'expl_len',    # character length of the explanation message
]

In [10]:
print('Spearman Correlation Table')
# One row per quality label and one column per predictor. Each entry is the
# Spearman rank correlation computed over all rows of expl_char_df.
table_data = [
    {
        'name': quality,
        **{
            predictor: spearmanr(
                expl_char_df[quality],
                expl_char_df[predictor],
            )[0]  # element 0 of the result is the correlation coefficient
            for predictor in table_predict_measures
        },
    }
    for quality in table_quality_measures
]
table_df = pandas.DataFrame(table_data)
table_df.head()

Spearman Correlation Table


Unnamed: 0,name,test_score,apr_score,GPT_useful,expl_len
0,Accurate,0.234285,0.193781,0.37462,0.299074
1,Wrong,0.03874,-0.065549,0.325042,0.325356
2,Useful,0.263012,0.193813,0.236371,0.15729
3,Bland,-0.233645,-0.108937,-0.6009,-0.536625
4,autofl_success,0.271921,0.491753,0.140578,0.149679


In [11]:
print('Spearman Correlation Table (Bug-Aggregated)')
# Correlate at bug level: each bug contributes one point, pairing
# "did any explanation of this bug have the quality label?" with the
# bug's mean predictor value.
per_bug = expl_char_df.groupby('bug_name')

# The per-bug predictor means do not depend on the quality label, so they are
# computed once here rather than once per (quality, predictor) pair.
bug_average_by_pkey = {
    pkey: per_bug[pkey].mean() for pkey in table_predict_measures
}

table_data = []
for qkey in table_quality_measures:
    # Depends only on the quality label, so it is computed once per label.
    bug_any_qkey = per_bug[qkey].any()
    qkey_corrs = {'name': qkey}
    for pkey in table_predict_measures:
        # Both series come from the same groupby, so their bug order matches.
        corr = spearmanr(
            bug_any_qkey,
            bug_average_by_pkey[pkey],
        )
        qkey_corrs[pkey] = corr[0]  # element 0 is the correlation coefficient
    table_data.append(qkey_corrs)
table_df = pandas.DataFrame(table_data)
table_df.head()

Spearman Correlation Table (Bug-Aggregated)


Unnamed: 0,name,test_score,apr_score,GPT_useful,expl_len
0,Accurate,0.263591,0.378972,0.473487,0.238883
1,Wrong,-0.032382,-0.150237,0.105759,0.018545
2,Useful,0.244773,0.335074,0.29221,0.015928
3,Bland,-0.133027,-0.078075,-0.230572,-0.151471
4,autofl_success,0.279275,0.56347,0.342569,0.136965


In [12]:
print('Spearman Correlation Table (Bug-Controlled)')
# Control for the bug: subtract each bug's mean predictor value from every
# explanation's predictor value, then correlate the residual with the
# explanation's quality label.
per_bug = expl_char_df.groupby('bug_name')

# transform('mean') broadcasts each bug's mean back onto its rows, so the
# subtraction is row-aligned without an explicit Python loop. The residuals do
# not depend on the quality label, so they are computed once per predictor.
# No per-row `Exists` check is needed: expl_char_df was already restricted to
# rows with Exists == True when it was built.
adjusted_by_pkey = {
    pkey: expl_char_df[pkey] - per_bug[pkey].transform('mean')
    for pkey in table_predict_measures
}

table_data = []
for qkey in table_quality_measures:
    qkey_corrs = {'name': qkey}
    for pkey in table_predict_measures:
        corr = spearmanr(
            expl_char_df[qkey],
            adjusted_by_pkey[pkey],
        )
        qkey_corrs[pkey] = corr[0]  # element 0 is the correlation coefficient
    table_data.append(qkey_corrs)
table_df = pandas.DataFrame(table_data)
table_df.head()

Spearman Correlation Table (Bug-Controlled)


Unnamed: 0,name,test_score,apr_score,GPT_useful,expl_len
0,Accurate,0.071889,0.003333,0.282414,0.221791
1,Wrong,0.083535,-0.00102,0.3331,0.304242
2,Useful,0.095811,-0.108497,0.186267,0.133583
3,Bland,-0.133593,-0.001966,-0.52908,-0.452352
4,autofl_success,-0.045742,0.046294,0.029343,0.138353
