# 3.3 Evaluation of SZZ derivatives

In [1]:
import json
import glob
import re
import pandas as pd
import os
import statistics
# Lift pandas' display limits so DataFrames shown in this notebook are rendered
# with every row and every column instead of being truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
# Absolute workspace root; the result paths used by the cells below are built from it.
root = "/home/jovyan/work"
# Directory holding the per-bug analysis results (keeps its trailing slash).
analysis_results_path = "{}/analysis/results/".format(root)

In [3]:
# Column names of the evaluated SZZ implementations: the two standalone tools
# followed by one entry per PySZZ variant suffix.
algs = ['OPENSZZ', 'SZZ_UNLEASHED'] + ['PYSZZ_' + variant for variant in ('ag', 'l', 'r', 'ma', 'ra')]

In [4]:
projects = [
    "JacksonXml", "Time", "Collections", "Compress", "Csv", "JacksonCore", "JacksonDatabind", "Gson", "Jsoup",
    "Lang", "Math", "Closure", "Mockito", "Cli", "Codec", "JxPath"
]

# Bugs reported with "Multiple candidates" for which one candidate is hard-coded:
# (project, bug id) -> index into result['BIC_candidates'] of the candidate to use.
# Any other multi-candidate bug is left out of `regressions`.
_multi_candidate_choice = {
    ("JacksonDatabind", "14"): 0,
    ("Math", "28"): 1,
    ("Gson", "7"): 0,
}

# Collect one record per bug whose bug-introducing commit (BIC) is known.
regressions = []
for project in projects:
    # Bug ids are discovered from the directories under results/<project>/ ...
    bug_dirs = glob.glob("{root}/results/{project}/Bug_*/".format(root=root, project=project))
    for bug_dir in bug_dirs:
        bug_id = re.search(r"Bug_(\d+)", bug_dir).group(1)
        # ... while the record itself is read from analysis/results/,
        # e.g. analysis/results/JacksonXml/Bug_1/bug_result.json
        result_path = root+"/analysis/results/{project}/Bug_{bug_id}/bug_result.json".format(project=project, bug_id=bug_id)
        with open(result_path) as result_file:
            result = json.load(result_file)

        if result['category'] == "A regression is detected" and result['sub_category'] == "Unique candidates":
            # A single candidate: it is the BIC.
            candidate_index = 0
        elif result['sub_category'] == "Multiple candidates" and (project, bug_id) in _multi_candidate_choice:
            # NOTE(review): 'BFC' is only written on this path; records with a
            # unique candidate keep whatever bug_result.json provided.
            result['BFC'] = result['fix_commit']
            candidate_index = _multi_candidate_choice[(project, bug_id)]
        else:
            continue

        # Each candidate entry is indexed at [1] to obtain the value stored as the BIC.
        result['BIC'] = result['BIC_candidates'][candidate_index][1]
        regressions.append(result)

In [5]:
# Number of bug records collected into `regressions` by the previous cell.
len(regressions)

98

In [6]:
# Annotate every regression record (in place) with the outcome of each SZZ tool.
# For a tool T two keys are written: 'T_LEN_CANDIDATES' (number of commits the
# tool reported) and 'T' (True when the known BIC is among them). When a tool's
# output file does not exist, neither key is written for that record.
for regression in regressions:
    regression_id = regression['id']

    # SZZ Unleashed: the second element of every entry is taken as a candidate.
    unleashed_path = root+"/results/szz/SZZUnleashed/{id}_SZZUnleashed/results/fix_and_introducers_pairs.json".format(id=regression_id)
    if os.path.isfile(unleashed_path):
        with open(unleashed_path) as unleashed_file:
            unleashed_pairs = json.load(unleashed_file)
        unleashed_candidates = [pair[1] for pair in unleashed_pairs]
        regression['SZZ_UNLEASHED_LEN_CANDIDATES'] = len(unleashed_candidates)
        regression['SZZ_UNLEASHED'] = regression['BIC'] in unleashed_candidates

    # OpenSZZ: the loaded JSON value is used directly as the candidate collection.
    openszz_path = root+"/results/szz/OpenSZZ/{id}_OpenSZZ/suspects.json".format(id=regression_id)
    if os.path.isfile(openszz_path):
        with open(openszz_path) as openszz_file:
            openszz_candidates = json.load(openszz_file)
        regression['OPENSZZ_LEN_CANDIDATES'] = len(openszz_candidates)
        regression['OPENSZZ'] = regression['BIC'] in openszz_candidates

    # PySZZ: one run per variant; candidates are read from the first entry's
    # 'inducing_commit_hash' field.
    for variant in ['ag', 'l', 'r', 'ma', 'ra']:
        pyszz_path = root+"/results/szz/PySZZ_{alg}/{id}_PySZZ_{alg}/result.json".format(id=regression_id,alg=variant)
        if not os.path.isfile(pyszz_path):
            continue
        with open(pyszz_path) as pyszz_file:
            pyszz_candidates = json.load(pyszz_file)[0]['inducing_commit_hash']
        column = 'PYSZZ_{alg}'.format(alg=variant)
        regression[column + '_LEN_CANDIDATES'] = len(pyszz_candidates)
        regression[column] = regression['BIC'] in pyszz_candidates

## “How precise are SZZ derivatives in detecting the change that introduced a bug?”

In [7]:
# One row per regression, with the per-tool columns written by the loading cell.
szz_results = pd.DataFrame(regressions)
n_regressions = len(szz_results.index)

# Summarise each algorithm: how many BICs it located, that count as a percentage
# of all regressions, and the number of reported candidates per regression.
szz_results_table = []
for name in algs:
    hits = szz_results[name].sum()
    reported = szz_results[name + '_LEN_CANDIDATES'].sum()
    row = {'Algorithm': name, 'Total': hits}
    row['Hit rate'] = hits * 100 / n_regressions
    row['Avg. # of candidates'] = reported / n_regressions
    szz_results_table.append(row)

szz_results_table_df = pd.DataFrame(szz_results_table)
# Display a rounded copy; the unrounded frame stays in szz_results_table_df.
szz_results_table_df.round(decimals=2)

Unnamed: 0,Algorithm,Total,Hit rate,Avg. # of candidates
0,OPENSZZ,17,17.35,1.05
1,SZZ_UNLEASHED,6,6.12,17.37
2,PYSZZ_ag,39,39.8,1.21
3,PYSZZ_l,15,15.31,0.68
4,PYSZZ_r,22,22.45,0.68
5,PYSZZ_ma,52,53.06,2.45
6,PYSZZ_ra,39,39.8,1.44


In [8]:
# Find the regressions whose BIC none of the SZZ implementations located.
szz_results_only_bool = szz_results[['id'] + algs]
# Row-wise "any algorithm hit". `axis` has to be passed by keyword: positional
# arguments to DataFrame.any were deprecated in pandas 1.5 and raise a TypeError
# from pandas 2.0 on.
# NOTE(review): select_dtypes([bool]) keeps only columns of bool dtype; an
# algorithm column with missing values (result file absent for some regression)
# would presumably not be bool and would be ignored here -- confirm.
located_mask = szz_results_only_bool.select_dtypes([bool]).any(axis=1)
at_least_one_located_regression = szz_results_only_bool[located_mask]
non_located_regressions = szz_results_only_bool.drop(at_least_one_located_regression.index)
print(len(non_located_regressions))
non_located_regressions

40


Unnamed: 0,id,OPENSZZ,SZZ_UNLEASHED,PYSZZ_ag,PYSZZ_l,PYSZZ_r,PYSZZ_ma,PYSZZ_ra
5,Compress_Bug_11,False,False,False,False,False,False,False
6,Compress_Bug_45,False,False,False,False,False,False,False
7,JacksonCore_Bug_11,False,False,False,False,False,False,False
8,JacksonCore_Bug_10,False,False,False,False,False,False,False
10,JacksonDatabind_Bug_14,False,False,False,False,False,False,False
11,JacksonDatabind_Bug_38,False,False,False,False,False,False,False
12,JacksonDatabind_Bug_76,False,False,False,False,False,False,False
13,JacksonDatabind_Bug_37,False,False,False,False,False,False,False
15,JacksonDatabind_Bug_84,False,False,False,False,False,False,False
18,JacksonDatabind_Bug_112,False,False,False,False,False,False,False
