# Configuration

In [None]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import seaborn as sns

# Render all figure text through LaTeX (matplotlib needs a working TeX toolchain for this).
plt.rcParams['text.usetex'] = True

# Directory in which every figure produced by this notebook is saved.
# os.makedirs(..., exist_ok=True) creates it (and any missing parent
# directories) in one step, avoiding the check-then-create race of
# os.path.exists() followed by os.mkdir().
FIGURE_DIR = './figures'
os.makedirs(FIGURE_DIR, exist_ok=True)

# Input files: the pickled BIC experiment results and the per-fault numstat CSV.
path_to_result = "./resources/workspace/bic_expr.pkl"
path_to_fault_numstat = "./resources/workspace/resources/fault_numstat.csv"

# Loading Data

In [None]:
# Load the BIC experiment results and attach the per-fault numstat metadata,
# matching rows on the (pid, vid) pair.
df = pd.read_pickle(path_to_result)
df = df.set_index(['pid', 'vid'])
numstat = pd.read_csv(path_to_fault_numstat)
numstat['vid'] = numstat['vid'].map(str)  # cast vid to str before using it as a join key
df = df.join(numstat.set_index(['pid', 'vid']))

# Column notes:
#   is_wen_in_dyn:     True if the BIC identified by Wen et al. is contained
#                      in the reduced BIC search space, otherwise False
#   is_omission_fault: True if no lines are deleted in the commit, otherwise False
df = (
    df.astype({'has_szz_results': bool, 'has_wen_bic': bool, 'is_wen_in_dyn': bool})
      .assign(is_omission_fault=lambda frame: frame['deletions'].eq(0))
      .reset_index()
)
df

In [None]:
# Number of rows with a vid, followed by the number of rows that have SZZ
# results; both are obtained by summing the per-project figures.
by_project = df.groupby('pid')
print(by_project['vid'].count().sum())
print(by_project['has_szz_results'].sum().sum())

In [None]:
# Average number of commits for each project, rounded to one decimal place.
df.groupby('pid').agg(mean=('num_total_commits', 'mean')).round(1)

In [None]:
# Show the row(s) for project 'Lang', version '51'.
df.loc[df['pid'].eq('Lang') & df['vid'].eq('51')]

# BIC search space reduction

In [None]:
# Distribution, per project, of the ratio of the number of BIC candidates
# (num_dyn) to the total number of commits (num_total_commits).
df['total_reduction'] = df['num_dyn'] / df['num_total_commits']

# Per-project max and median, with projects kept in order of first appearance.
# seaborn lays out string categories in that same order by default, whereas a
# plain (sorted) groupby would not necessarily match it; sharing one explicit
# order guarantees each median label lands on its own project's box.
reduction_stats = df.groupby('pid', sort=False)['total_reduction'].agg(['max', 'median'])
project_order = reduction_stats.index.tolist()

fig, ax = plt.subplots(figsize=(7, 3.8))
sns.boxplot(data=df, x='total_reduction', y='pid', order=project_order, ax=ax)
ax.set_ylabel("Project")
ax.set_xlabel("Ratio of BIC candidates ($|C_{BIC}|/|C|$)")
ax.set_xlim((-0.05, 1.05))
ax.grid(axis='x')

# Print the median value of each project just to the right of its largest
# observation; row i of the categorical axis corresponds to project_order[i].
for row, (_, stats) in enumerate(reduction_stats.iterrows()):
    ax.text(stats['max'] + 0.02, row,
            "{:.3f}".format(stats['median']),
            verticalalignment='center')

fig.savefig(os.path.join(FIGURE_DIR, 'd4j_space_reduction.pdf'), bbox_inches='tight')
plt.show()

print(f"{round((1 - df.total_reduction.mean()) * 100, 1)}% of total commits are filtered out by using the failure coverage")

# Soundness Verification

In [None]:
# Keep only the bugs flagged has_wen_bic, and add two display columns:
#   wen_short: first 7 characters of the 'wen' commit hash
#   bid:       bug identifier of the form '<pid>-<vid>b'
tdf = df.loc[df['has_wen_bic']].copy()
tdf['wen_short'] = tdf['wen'].map(lambda commit: str(commit)[:7])
tdf['bid'] = tdf['pid'].add('-').add(tdf['vid']).add('b')


# Draw the nested pie chart: the outer ring splits the bugs by whether the
# known BIC is in C_BIC, the inner ring further splits each part by whether
# the bug is an omission fault.
# NOTE(review): the counts in `vals` are hard-coded rather than derived from
# tdf -- confirm they still match the loaded data. Layout is
# [[in C_BIC: not omission, omission], [not in C_BIC: not omission, omission]].
fig, ax = plt.subplots(figsize=(3, 3))
size = 0.4  # radial width of each ring
vals = np.array([[49., 18.], [8, 6]])
total = float(vals.sum())  # derived from vals so the wedge labels cannot drift out of sync


def _as_count(pct):
    """Convert a wedge's percentage back into the number of bugs it represents."""
    return '{:d}'.format(int(round(pct * total / 100, 0)))


cmap = sns.color_palette('hls', 8)
outer_colors = [cmap[5], 'lightgray']
inner_colors = ['w', cmap[1], 'w', cmap[1],]
ax.pie(vals.sum(axis=1), radius=1, colors=outer_colors,
       wedgeprops=dict(width=size, edgecolor='black'),
       startangle=120,
       counterclock=True,
       labels=['In $C_{BIC}$', 'Not in $C_{BIC}$'],
       autopct=_as_count,
       labeldistance=1.1,
       pctdistance=0.8
)

pie = ax.pie(vals.flatten(), radius=1-size, colors=inner_colors,
       startangle=120,
       counterclock=True,
       wedgeprops=dict(width=size, edgecolor='black'),
       autopct=_as_count,
       pctdistance=0.7,
)
# Only the first two inner wedges are needed for the legend: they carry the
# two colours used for non-omission ("No") and omission ("Yes") faults.
plt.legend(pie[0][:2], ["No", "Yes"], title="Omission Fault", bbox_to_anchor=(0.5, 1.2), loc='center')
plt.savefig(os.path.join(FIGURE_DIR, "verification.pdf"), bbox_inches='tight')
plt.show()

In [None]:
# List the identified BICs that are NOT contained in the BIC candidate set
# obtained using failure coverage.
tdf.loc[~tdf['is_wen_in_dyn'], ['bid', 'wen_short', 'is_omission_fault']]

# Improving SZZ precision

In [None]:
# For every bug that has SZZ results with at least one SZZ-identified commit,
# compute num_only_szz / num_szz and plot its per-project distribution.
szz_df = df.loc[df['has_szz_results'].eq(True)]
szz_df_non_zero = szz_df.loc[szz_df['num_szz'].ne(0)].copy()
szz_df_non_zero['szz_filtered_out'] = szz_df_non_zero['num_only_szz'].div(szz_df_non_zero['num_szz'])

fig, ax = plt.subplots(figsize=(7, 3.5))
sns.boxplot(data=szz_df_non_zero.reset_index(), x='szz_filtered_out', y='pid', ax=ax)
ax.set_ylabel("Project")
ax.set_xlabel("Ratio of SZZ-identified BIC not contained in $C_{BIC}$")
ax.set_xlim((-0.05, 1.05))
ax.grid(axis='x')
fig.savefig(os.path.join(FIGURE_DIR, 'szz_filtered_out.pdf'), bbox_inches='tight')
plt.show()
print(f"{round(szz_df_non_zero.szz_filtered_out.mean()*100, 1)}% of commits found by SZZ can be filtered out by the dynamic analysis")


In [None]:
# NOTE(review): 87.6 is hard-coded; presumably it is the percentage printed by
# the SZZ cell above, making this its complement -- confirm against that output.
100-87.6