# Evaluation of SZZ derivatives

In [1]:
import json
import glob
import re
import pandas as pd
import os
import statistics

In [2]:
root="/home/jovyan/work"
analysis_results_path = root + "/analysis/results/"

In [3]:
algs = ['OPENSZZ', 'SZZ_UNLEASHED', 'PYSZZ_ag', 'PYSZZ_l', 'PYSZZ_r', 'PYSZZ_ma', 'PYSZZ_ra']

In [4]:
projects = [
    "JacksonXml", "Time", "Collections", "Compress", "Csv", "JacksonCore", "JacksonDatabind", "Gson", "Jsoup",
    "Lang", "Math", "Closure", "Mockito"
]
regressions = []
for project in projects:
    # FOR EACH BUG
    for bug_path in glob.glob("{root}/results/{project}/Bug_*/".format(root=root, project=project)):
        bug_id = re.search(r"Bug_(\d+)", bug_path).group(1)
        result_path = root+"/analysis/results/{project}/Bug_{bug_id}/bug_result.json".format(project=project, bug_id=bug_id)
        # analysis/results/JacksonXml/Bug_1/bug_result.json
        with open(result_path) as f:
            result = json.load(f)
            if result['category'] == "A regression is detected" and result['sub_category'] == "Unique candidates": 
                result['BIC'] = result['BIC_candidates'][0][1]
                regressions.append(result)

In [5]:
for regression in regressions:
    # print(regression['project'], regression['bug'])
    
    # SZZ UNLEASHED
    szz_unleashed_result_path = root+"/results/szz/SZZUnleashed/{id}_SZZUnleashed/results/fix_and_introducers_pairs.json".format(id=regression['id'])
    if os.path.isfile(szz_unleashed_result_path):
        with open(szz_unleashed_result_path) as f:
            szz_unleashed_result = json.load(f)
            szz_unleashed_result = list(map(lambda c: c[1],szz_unleashed_result))
            regression['SZZ_UNLEASHED_LEN_CANDIDATES'] = len(szz_unleashed_result) 
            regression['SZZ_UNLEASHED'] = regression['BIC'] in szz_unleashed_result 
    
    # OPEN SZZ
    
    openszz_result_path = root+"/results/szz/OpenSZZ/{id}_OpenSZZ/suspects.json".format(id=regression['id'])
    if os.path.isfile(openszz_result_path):
        with open(openszz_result_path) as f:
            openszz_result = json.load(f)
            regression['OPENSZZ_LEN_CANDIDATES'] = len(openszz_result) 
            regression['OPENSZZ'] = regression['BIC'] in openszz_result 
            
    
    # PYSZZ
    for alg in ['ag', 'l', 'r', 'ma', 'ra']:
        pyszz_result_path = root+"/results/szz/PySZZ_{alg}/{id}_PySZZ_{alg}/result.json".format(id=regression['id'],alg=alg)
        if os.path.isfile(pyszz_result_path):
            with open(pyszz_result_path) as f:
                pyszz_result = json.load(f)[0]['inducing_commit_hash']
                regression['PYSZZ_{alg}_LEN_CANDIDATES'.format(alg=alg)] = len(pyszz_result) 
                regression['PYSZZ_{alg}'.format(alg=alg)] = regression['BIC'] in pyszz_result

## RQ2: “How precise are SZZ derivatives in detecting the change that introduced a bug?”

In [9]:
szz_results = pd.DataFrame(regressions)
n_regressions = len(szz_results.index)
szz_results_table = []
for alg in algs: 
    total = szz_results[alg].sum()
    candidates = szz_results[alg+'_LEN_CANDIDATES'].sum()
    szz_results_table.append({
        'Algorithm': alg,
        'Total': total,
        'Hit rate':  (total * 100 / n_regressions),
        'Avg. # of candidates': candidates / n_regressions
    })
szz_results_table_df = pd.DataFrame(szz_results_table)
# szz_results_table.append({
#     'Algorithm':'Average',
#     'Total': statistics.fmean(szz_results_table_df['Total']),
#     'Hit rate': statistics.fmean(szz_results_table_df['Hit rate']),
#     'Avg. # of candidates': statistics.fmean(szz_results_table_df['Avg. # of candidates'])
# })
pd.DataFrame(szz_results_table).round(decimals=2)

Unnamed: 0,Algorithm,Total,Hit rate,Avg. # of candidates
0,OPENSZZ,12.0,24.0,0.92
1,SZZ_UNLEASHED,3.0,6.0,12.48
2,PYSZZ_ag,26.0,52.0,1.3
3,PYSZZ_l,9.0,18.0,0.64
4,PYSZZ_r,14.0,28.0,0.64
5,PYSZZ_ma,32.0,64.0,1.96
6,PYSZZ_ra,25.0,50.0,1.38
7,Average,17.29,34.57,2.76


In [10]:
szz_results_table_df['Avg. # of candidates']

0     0.92
1    12.48
2     1.30
3     0.64
4     0.64
5     1.96
6     1.38
Name: Avg. # of candidates, dtype: float64

In [11]:
szz_results_only_bool = szz_results[['id'] + algs]
at_least_one_located_regression = szz_results_only_bool[szz_results_only_bool.select_dtypes([bool]).any(1)]
non_located_regressions = szz_results_only_bool.drop(at_least_one_located_regression.index)
non_located_regressions

Unnamed: 0,id,OPENSZZ,SZZ_UNLEASHED,PYSZZ_ag,PYSZZ_l,PYSZZ_r,PYSZZ_ma,PYSZZ_ra
0,Time_Bug_1,False,False,True,False,False,True,True
1,Compress_Bug_28,True,False,True,True,True,True,True
2,Compress_Bug_45,False,False,False,False,False,False,False
3,JacksonCore_Bug_11,False,False,False,False,False,False,False
4,JacksonCore_Bug_10,False,False,False,False,False,False,False
5,JacksonCore_Bug_21,True,False,False,True,True,True,True
6,JacksonDatabind_Bug_24,False,False,False,False,True,True,True
7,JacksonDatabind_Bug_35,False,False,False,False,False,False,False
8,JacksonDatabind_Bug_41,False,False,True,False,True,True,True
9,JacksonDatabind_Bug_87,True,False,False,False,True,True,False
