# 3.4 Comparation with InduceBenchmark

In [1]:
import json
import glob
import re
import pandas as pd
import os

In [2]:
root="/home/jovyan/work"
analysis_results_path = root + "/analysis/results/"

## BICs from the InduceBenchmark (only Defects4J)

The BICs of Chart projects are not included as it is a project running on SVN.

In [3]:
real_bics = pd.read_csv('InduceBenchmark_D4J.csv').to_dict('records')
len(real_bics)

82

In [4]:
abs_results = []
percentage_results = []
inner_join = []

# FOR EACH BIC
for bic in real_bics:

    bic_hash = bic['Inducing Change']
    regression_id = "{project}_Bug_{id}".format(project=bic['Project'],id=bic['Bug ID'])
    
    abs_result = {
        'id': regression_id,
        'real_bic': bic_hash,
        'regression': False
    }
    
    percentage_result = {
        'id': regression_id,
        'real_bic': bic_hash,
        'regression': False
    }
    
    # OUR RESULTS (RegTestExecutor)
    our_result = False
    result_path = "{path}/{project}/Bug_{id}/bug_result.json".format(path=analysis_results_path, project=bic['Project'], id=bic['Bug ID'])
    if os.path.isfile(result_path):
        with open(result_path) as f:
            rs_candidates = list(map(lambda c: c[1],json.load(f)['BIC_candidates']))
            
            if len(rs_candidates) == 1: 
                bic_info = bic.copy()
                bic_info['SameBIC'] = bic['Inducing Change'] == rs_candidates[0]
                inner_join.append(bic_info)
            if len(rs_candidates) > 0: 
                abs_result['regression'] = True
                percentage_result['regression'] = True
            abs_result['RS'] = bic_hash in rs_candidates
            percentage_result['RS'] = 1/len(rs_candidates) if bic_hash in rs_candidates else 0
                        
    # SZZ UNLEASHED
    szz_unleashed_result_path = root+"/results/szz/SZZUnleashed/{id}_SZZUnleashed/results/fix_and_introducers_pairs.json".format(id=regression_id)
    if os.path.isfile(szz_unleashed_result_path):
        with open(szz_unleashed_result_path) as f:
            szz_unleashed_candiates = list(set(map(lambda c: c[1], json.load(f))))
            #print(regression_id, bic_hash in szz_unleashed_candiates, szz_unleashed_candiates)
            abs_result['SZZ_UNLEASHED'] = bic_hash in szz_unleashed_candiates
            percentage_result['SZZ_UNLEASHED'] = 1/len(szz_unleashed_candiates) if bic_hash in szz_unleashed_candiates else 0

    # OPEN SZZ
    openszz_result_path = root+"/results/szz/OpenSZZ/{id}_OpenSZZ/suspects.json".format(id=regression_id)
    if os.path.isfile(openszz_result_path):
        with open(openszz_result_path) as f:
            openszz_candiates = list(set(json.load(f)))
            abs_result['OPEN_SZZ'] = bic_hash in openszz_candiates
            percentage_result['OPEN_SZZ'] = 1/len(openszz_candiates) if bic_hash in openszz_candiates else 0
    
    # PYSZZ
    for alg in ['ag', 'l', 'r', 'ma', 'ra']:
        pyszz_result_path = root+"/results/szz/PySZZ_{alg}/{id}_PySZZ_{alg}/result.json".format(id=regression_id,alg=alg)
        if os.path.isfile(pyszz_result_path):
            with open(pyszz_result_path) as f:
                pyszz_candiates = list(set(json.load(f)[0]['inducing_commit_hash']))
                abs_result['PYSZZ_'+alg] = bic_hash in pyszz_candiates
                percentage_result['PYSZZ_'+alg] = 1/len(pyszz_candiates) if bic_hash in pyszz_candiates else 0
                
    # RS + PYSZZ_{alg}
    for alg in ['ag', 'ma']:
        pyszz_result_path = root+"/results/szz/PySZZ_{alg}/{id}_PySZZ_{alg}/result.json".format(id=regression_id,alg=alg)
        if os.path.isfile(result_path) and os.path.isfile(pyszz_result_path):
            with open(pyszz_result_path) as f:
                pyszz_candiates = list(set(json.load(f)[0]['inducing_commit_hash']))
            with open(result_path) as f:
                rs_candidates = list(map(lambda c: c[1],json.load(f)['BIC_candidates']))

            if len(pyszz_candiates) == 0:
                abs_result['RS-PYSZZ_'+alg] = bic_hash in rs_candidates
                percentage_result['RS-PYSZZ_'+alg] = 1/len(rs_candidates) if bic_hash in rs_candidates else 0
            elif len(pyszz_candiates) == 1:
                abs_result['RS-PYSZZ_'+alg] = bic_hash in pyszz_candiates
                percentage_result['RS-PYSZZ_'+alg] = 1 if bic_hash in pyszz_candiates else 0
            elif len(pyszz_candiates) > 1:
                intersection_candidates = set.intersection(set(pyszz_candiates), set(rs_candidates))
                if len(intersection_candidates) == 0:
                    all_candidates = pyszz_candiates + rs_candidates
                    abs_result['RS-PYSZZ_'+alg] = bic_hash in all_candidates
                    percentage_result['RS-PYSZZ_'+alg] = 1/len(all_candidates) if bic_hash in all_candidates else 0
                else:
                    abs_result['RS-PYSZZ_'+alg] = bic_hash in intersection_candidates
                    percentage_result['RS-PYSZZ_'+alg] = 1/len(intersection_candidates) if bic_hash in intersection_candidates else 0
                
    # ADD RESULTS
    abs_results.append(abs_result)
    percentage_results.append(percentage_result)

abs_results_df = pd.DataFrame(abs_results)
percentage_results_df = pd.DataFrame(percentage_results)

## Bugs contained in both datasets

The column `SameBIC` indicates if both datasets contain the same BIC.

In [5]:
pd.DataFrame(inner_join)

Unnamed: 0,Project,Bug ID,Inducing Change,SameBIC
0,Closure,8,80ee905775eded2256972f3e762862fcf66f0095,True
1,Closure,12,df223efd38c514d584f00d076488ab9a02011492,False
2,Closure,21,dbf6ea95477810188582b9e9ac6c9645717cbe95,True
3,Closure,31,a4c526dae1537f027f429145656ffb849699c1fc,True
4,Closure,61,e9a1cc9c431416354dec3dcb393affd7f41c6596,True
5,Closure,62,22a1fcd79bd2bfbfdc9e80d5162bdf19065f9e5e,True
6,Closure,82,8a76da206034045c555297e5069eb273d05fc6fb,False
7,Closure,90,4c6e1039b80859f17de5f3cbcfeba61ed8ea0485,False
8,Closure,91,82a9956c6337d2f5d4a94ebe624d64faa54d9182,True
9,Closure,92,5a3b4cb6b7e816aacd0ffd4c71e23da949fe360d,True


In [6]:
def getSummary(absolute, percentage):
    results_summary = []
    for alg in ['RS', 'RS-PYSZZ_ag', 'RS-PYSZZ_ma', 'SZZ_UNLEASHED', 'OPEN_SZZ', 'PYSZZ_ag', 'PYSZZ_l', 'PYSZZ_r', 'PYSZZ_ma', 'PYSZZ_ra' ]:
        results_summary.append({
            'Algorithm': alg,
            'Absolute': "{:.2f}".format((absolute[alg] == True).sum()/len(absolute.index)),
            'Relative': "{:.2f}".format((percentage[alg] == True).sum()/len(percentage.index))
        })
    results_summary_df = pd.DataFrame(results_summary).set_index('Algorithm')
    return results_summary_df

In [7]:
getSummary(abs_results_df, percentage_results_df)

Unnamed: 0_level_0,Absolute,Relative
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1
RS,0.32,0.3
RS-PYSZZ_ag,0.26,0.23
RS-PYSZZ_ma,0.29,0.26
SZZ_UNLEASHED,0.02,0.0
OPEN_SZZ,0.07,0.04
PYSZZ_ag,0.21,0.17
PYSZZ_l,0.04,0.04
PYSZZ_r,0.05,0.05
PYSZZ_ma,0.23,0.12
PYSZZ_ra,0.2,0.12


In [8]:
only_regressions_abs = abs_results_df[abs_results_df['regression']==True]
only_regressions_relative = percentage_results_df[percentage_results_df['regression']==True]
getSummary(only_regressions_abs, only_regressions_relative)

Unnamed: 0_level_0,Absolute,Relative
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1
RS,0.84,0.81
RS-PYSZZ_ag,0.68,0.61
RS-PYSZZ_ma,0.77,0.68
SZZ_UNLEASHED,0.06,0.0
OPEN_SZZ,0.19,0.1
PYSZZ_ag,0.55,0.45
PYSZZ_l,0.1,0.1
PYSZZ_r,0.13,0.13
PYSZZ_ma,0.61,0.32
PYSZZ_ra,0.52,0.32
