# Data Analysis

In [1]:
import pandas as pd 
import glob
import re
import sys
import concurrent.futures
from Analysis import Analysis

# Let pandas render up to 100 rows before truncating a displayed frame.
pd.options.display.max_rows = 100

### Calculate the result for each bug

In [2]:
# Root of the working tree; per-bug result folders are looked up under
# {root}/results/<project>/Bug_*/ below.
root = "/home/jovyan/work"
analyzer = Analysis(root)

projects = [
    "JacksonXml", "Time", "Collections", "Compress", "Csv", "JacksonCore", "JacksonDatabind", "Gson", "Jsoup",
    "Lang", "Math", "Closure", "Mockito"
]

# Submit one analyzeBug task per bug folder. submitted_paths[i] is the folder
# behind future_results[i], so a failure can be reported against its bug.
future_results = []
submitted_paths = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    for project in projects:
        bug_folder_pattern = "{root}/results/{project}/Bug_*/".format(root=root, project=project)
        # FOR EACH BUG
        for bug_path in glob.glob(bug_folder_pattern):
            bug_match = re.search(r"Bug_(\d+)", bug_path)
            if bug_match is None:
                # "Bug_*" also matches folders whose suffix is not numeric.
                print(bug_path)
                print("no numeric bug id in folder name; skipped")
                continue
            # The third positional argument is passed as False; its meaning is
            # defined by Analysis.analyzeBug (not visible in this notebook).
            future_results.append(
                executor.submit(analyzer.analyzeBug, project, bug_match.group(1), False)
            )
            submitted_paths.append(bug_path)

# An exception raised inside analyzeBug is re-raised by Future.result(), not by
# submit(), so failures are caught here: the bug folder and the error are
# printed and the remaining results are still collected.
bug_results = []
for bug_path, future in zip(submitted_paths, future_results):
    try:
        bug_results.append(future.result())
    except Exception as e:
        print(bug_path)
        print(e)

### Example of a bug result when a BIC is detected

In [3]:
# Pick the example by its id rather than by position: the order of bug_results
# follows glob.glob, whose ordering is arbitrary, so a fixed index is not
# reproducible. Evaluates to None if the bug is not among the results.
next((result for result in bug_results if result['id'] == 'Time_Bug_1'), None)

{'id': 'Time_Bug_1',
 'bug': 'Bug_1',
 'project': 'Time',
 'fix_pass': True,
 'prev_fails': True,
 'category': 'A regression is detected',
 'sub_category': 'Unique candidates',
 'test_name': 'org.joda.time.TestPartial_Constructors#testConstructorEx7_TypeArray_intArray',
 'bug_report': 'https://github.com/JodaOrg/joda-time/issues/93',
 'fix_commit': '9a62b06be5d0df8e833ff8583398cca386608cac',
 'BIC_candidates': [(1, '8612f9e5b88c1bea933ef9ab1e431f5db3006b48')],
 'executionsOnPast': 151,
 'buildFail': 0,
 'buildTestFail': 1493,
 'numCommits': 1717}

## RQ0: “Can regression tests be transplanted to the past?”

In [32]:
# Per-bug counters needed for RQ0, one row per analysed bug.
resume_df = pd.DataFrame(bug_results)[
    ['id', 'project', 'executionsOnPast', 'numCommits', 'buildFail', 'buildTestFail', 'BIC_candidates']
]

# Keep only bugs with numCommits > 1 (original note: "remove bugs with
# non-succeed BFC"). .copy() detaches the filtered rows from the frame above so
# the column assignments below do not raise SettingWithCopyWarning.
resume_df = resume_df[resume_df['numCommits'] > 1].copy()

# Flags: exactly one BIC candidate / more than one BIC candidate.
candidate_counts = resume_df['BIC_candidates'].map(len)
resume_df['one_bic_found'] = (candidate_counts == 1).astype(int)
resume_df['n_bic_found'] = (candidate_counts > 1).astype(int)

# Percentages relative to numCommits (never zero here because of the filter above).
num_commits = resume_df['numCommits']
resume_df['executionsOnPast_rate'] = resume_df['executionsOnPast'] * 100 / num_commits
resume_df['compilability'] = (num_commits - resume_df['buildFail']) * 100 / num_commits
resume_df['testCompilability'] = (
    (num_commits - resume_df['buildTestFail'] - resume_df['buildFail']) * 100 / num_commits
)


# Per-project summary (bug count plus mean/median of each percentage),
# rounded to two decimals. The commented-out entries are optional columns.
resume_df.groupby('project').agg(
    bugs=('project','size'), 
    #s_bics=('one_bic_found','sum'),
    #m_bics=('n_bic_found','sum'),
    compilability_mean=('compilability','mean'), 
    compilability_median=('compilability','median'),
    test_compilability_mean=('testCompilability','mean'), 
    test_compilability_median=('testCompilability','median'),
    ratio_mean=('executionsOnPast_rate','mean'), 
    ratio_median=('executionsOnPast_rate','median'),
    #commits_mean=('numCommits','mean'),
    #commits_median=('numCommits','median')
).round(2)

Unnamed: 0_level_0,bugs,compilability_mean,compilability_median,test_compilability_mean,test_compilability_median,ratio_mean,ratio_median
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Closure,147,61.1,53.37,30.7,15.4,30.7,15.4
Collections,2,95.94,95.94,3.99,3.99,3.77,3.77
Compress,40,26.11,21.21,19.43,14.63,16.55,8.16
Csv,15,18.95,16.56,5.09,3.44,5.03,3.44
Gson,18,71.0,99.75,70.29,98.07,70.29,98.07
JacksonCore,23,40.48,38.45,36.72,30.5,36.72,30.5
JacksonDatabind,97,90.16,92.8,15.78,16.01,15.78,16.01
JacksonXml,3,85.02,84.39,37.66,36.21,37.66,36.21
Jsoup,91,20.83,12.43,17.83,10.0,17.83,10.0
Lang,58,71.77,64.52,12.83,9.67,12.83,9.67


In [5]:
# Mean of the per-bug executionsOnPast_rate over all rows of resume_df (all projects pooled).
resume_df.loc[:, 'executionsOnPast_rate'].mean()

21.075428838804818

In [6]:
# Median of the per-bug executionsOnPast_rate over all rows of resume_df (all projects pooled).
resume_df.loc[:, 'executionsOnPast_rate'].median()

12.244897959183673

## RQ1: “Can the BIC for a given bug be found using its regression test?”

### Summary of results

In [7]:
# Classification of every analysed bug: one row per bug with its category and sub-category.
df = pd.DataFrame(bug_results)[['id', 'project', 'category', 'sub_category']]

# Keep only bugs that were assigned a category. notna() is used because an
# elementwise comparison against None does not detect missing values.
# 'id' stays a regular column and the default integer index is kept, since the
# following cells read df['id'] and display rows by that index.
df = df[df['category'].notna()]

print(len(df))
df['category'].value_counts()

730


No regression is detected       570
Test fails in the fix commit     85
A regression is detected         75
Name: category, dtype: int64

### Results per sub-category

In [8]:
# Number of bugs (non-null ids) in each (category, sub_category) pair.
df.groupby(['category', 'sub_category'])['id'].count()

category                      sub_category             
A regression is detected      Multiple candidates           26
                              Unique candidates             49
No regression is detected     -                            570
Test fails in the fix commit  Failure in source build       60
                              Failure in test build          3
                              Test execution fails           6
                              The test was not executed     16
Name: id, dtype: int64

### Test fails in the fix commit - Test execution fails

In [9]:
# Rows whose sub_category is exactly 'Test execution fails'.
is_test_execution_failure = df['sub_category'] == 'Test execution fails'
df.loc[is_test_execution_failure]

Unnamed: 0,id,project,category,sub_category
12,Time_Bug_9,Time,Test fails in the fix commit,Test execution fails
16,Time_Bug_8,Time,Test fails in the fix commit,Test execution fails
23,Time_Bug_7,Time,Test fails in the fix commit,Test execution fails
26,Time_Bug_16,Time,Test fails in the fix commit,Test execution fails
257,Jsoup_Bug_67,Jsoup,Test fails in the fix commit,Test execution fails
334,Jsoup_Bug_78,Jsoup,Test fails in the fix commit,Test execution fails
