# Data Analysis

In [1]:
import pandas as pd 
import glob
import re
import sys
import concurrent.futures
from Analysis import Analysis

# Let pandas render up to 100 rows before truncating a displayed frame.
pd.options.display.max_rows = 100

### Calculate the result for each bug

In [2]:
# Schedule one analysis task per bug directory found under <root>/results/<project>/.
# The resulting futures are kept in `future_results`, in submission order.
root = "/home/jovyan/work"
analyzer = Analysis(root)

projects = [
    "Time", "JacksonXml", "Collections", "Compress", "Csv", "JacksonCore", "JacksonDatabind", 
    "Jsoup", "Lang", "Math","Gson", "Closure", "Mockito"
]
future_results = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    for project in projects:
        # FOR EACH BUG
        for bug_path in glob.glob("{root}/results/{project}/Bug_*/".format(root=root, project=project)):
            # The glob accepts any "Bug_*" directory, but only a numeric id is usable:
            # skip (and report) anything else instead of crashing on `None.group(1)`.
            bug_match = re.search(r"Bug_(\d+)", bug_path)
            if bug_match is None:
                print("Skipping directory without a numeric bug id: " + bug_path)
                continue
            bug = bug_match.group(1)
            try:
                # NOTE(review): the meaning of the third positional argument (False)
                # is defined by Analysis.analyzeBug, which is not visible here.
                future = executor.submit(analyzer.analyzeBug, project, bug, False)
                future_results.append(future)
            except Exception as e:
                # Only failures to *schedule* the task land here. An exception raised
                # inside analyzeBug is stored on the future and re-raised later by
                # future.result().
                print(bug_path)
                print(e)

In [3]:
# Gather the outcome of every submitted analysis, preserving submission order.
# future.result() re-raises any exception raised by the task, so a failed
# analysis stops this cell instead of being silently dropped.
bug_results = [future.result() for future in future_results]

### Example of bug result when a BIC is detected

In [4]:
# Show the first result classified as a detected regression. Selecting by category
# keeps the example valid regardless of the order in which bug directories were
# discovered (glob order is arbitrary). Displays nothing if no such result exists.
next(
    (result for result in bug_results if result.get('category') == 'A regression is detected'),
    None,
)

{'id': 'Time_Bug_1',
 'bug': 'Bug_1',
 'project': 'Time',
 'fix_pass': True,
 'prev_fails': True,
 'category': 'A regression is detected',
 'sub_category': 'Unique candidates',
 'test_name': 'org.joda.time.TestPartial_Constructors#testConstructorEx7_TypeArray_intArray',
 'bug_report': 'https://github.com/JodaOrg/joda-time/issues/93',
 'fix_commit': '9a62b06be5d0df8e833ff8583398cca386608cac',
 'BIC_candidates': [[1, '8612f9e5b88c1bea933ef9ab1e431f5db3006b48']],
 'executionsOnPast': 151,
 'buildFail': 0,
 'buildTestFail': 1493,
 'numCommits': 1717}

## RQ0: “Can regression tests be transplanted to the past?”

In [5]:
# Per-bug build/execution counters taken from the analysis results.
resume_df = pd.DataFrame(bug_results)[
    ['id', 'project', 'executionsOnPast', 'numCommits', 'buildFail', 'buildTestFail', 'BIC_candidates']
]

# Remove bugs with non-succeed BFC (only rows with more than one commit are kept).
# .copy() detaches the filtered frame so the column assignments below operate on
# an independent object and do not raise SettingWithCopyWarning.
resume_df = resume_df[resume_df['numCommits'] > 1].copy()

# Number of BIC candidates per bug, computed once and reused for both flags.
candidate_counts = resume_df['BIC_candidates'].map(len)
commits = resume_df['numCommits']

# 1/0 flags: exactly one candidate vs. more than one candidate.
resume_df['one_bic_found'] = (candidate_counts == 1).astype(int)
resume_df['n_bic_found'] = (candidate_counts > 1).astype(int)

# Percentages relative to the number of commits of each bug (vectorised, so they
# also work when the filtered frame is empty).
resume_df['executionsOnPast_rate'] = resume_df['executionsOnPast'] * 100 / commits
resume_df['compilability'] = (commits - resume_df['buildFail']) * 100 / commits
resume_df['testCompilability'] = (
    (commits - resume_df['buildTestFail'] - resume_df['buildFail']) * 100 / commits
)

# Per-project summary: number of bugs plus mean/median of each percentage.
resume_df.groupby('project').agg(
    bugs=('project', 'size'),
    #s_bics=('one_bic_found','sum'),
    #m_bics=('n_bic_found','sum'),
    compilability_mean=('compilability', 'mean'),
    compilability_median=('compilability', 'median'),
    test_compilability_mean=('testCompilability', 'mean'),
    test_compilability_median=('testCompilability', 'median'),
    ratio_mean=('executionsOnPast_rate', 'mean'),
    ratio_median=('executionsOnPast_rate', 'median'),
    #commits_mean=('numCommits','mean'),
    #commits_median=('numCommits','median')
).round(2)

Unnamed: 0_level_0,bugs,compilability_mean,compilability_median,test_compilability_mean,test_compilability_median,ratio_mean,ratio_median
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Closure,147,61.1,53.37,30.7,15.4,30.7,15.4
Collections,2,95.94,95.94,3.99,3.99,3.77,3.77
Compress,47,31.15,23.6,22.02,17.02,19.56,10.22
Csv,15,18.95,16.56,5.09,3.44,5.03,3.44
Gson,18,42.0,35.98,40.73,34.68,40.73,34.68
JacksonCore,23,40.48,38.45,36.72,30.5,36.72,30.5
JacksonDatabind,97,90.16,92.8,15.78,16.01,15.78,16.01
JacksonXml,3,85.02,84.39,37.66,36.21,37.66,36.21
Jsoup,91,20.83,12.43,17.83,10.0,17.83,10.0
Lang,63,73.35,64.69,11.28,8.3,11.28,8.3


In [6]:
# Mean percentage of executions on past commits, over all retained bugs.
past_execution_rate = resume_df['executionsOnPast_rate']
past_execution_rate.mean()

19.758169399433616

In [7]:
# Median percentage of executions on past commits, over all retained bugs.
past_execution_rate = resume_df['executionsOnPast_rate']
past_execution_rate.median()

11.406737377238176

## RQ1: “Can the BIC for a given bug be found using its regression test?”

### Summary of results

In [8]:
# One row per analysed bug with its classification; 'id' stays a regular column
# because later cells select it after grouping.
df = pd.DataFrame(bug_results)[['id', 'project', 'category', 'sub_category']]

# Total number of analysed bugs, followed by the number of bugs per category.
print(len(df))
df['category'].value_counts()

731


No regression is detected       595
A regression is detected         77
Test fails in the fix commit     59
Name: category, dtype: int64

### Results per sub-category

In [9]:
# Number of bugs for each (category, sub_category) pair.
df.groupby(['category', 'sub_category'])['id'].count()

category                      sub_category           
A regression is detected      Multiple candidates         27
                              Unique candidates           50
No regression is detected     -                          595
Test fails in the fix commit  Failure in source build     53
                              Failure in test build        4
                              Test execution fails         2
Name: id, dtype: int64

### Test execution fails (FIX COMMIT)

In [10]:
# Bugs whose test execution fails in the fix commit.
is_test_execution_failure = df['sub_category'] == 'Test execution fails'
df[is_test_execution_failure]

Unnamed: 0,id,project,category,sub_category
240,Jsoup_Bug_67,Jsoup,Test fails in the fix commit,Test execution fails
317,Jsoup_Bug_78,Jsoup,Test fails in the fix commit,Test execution fails


- Jsoup 67 -> Test depends on execution time (flaky)
- Jsoup 78 -> Test has a timeout

### Failure in test build (FIX COMMIT)

In [12]:
# Bugs whose test sources fail to build in the fix commit.
is_test_build_failure = df['sub_category'] == 'Failure in test build'
df[is_test_build_failure]

Unnamed: 0,id,project,category,sub_category
33,Collections_Bug_26,Collections,Test fails in the fix commit,Failure in test build
34,Collections_Bug_26,Collections,Test fails in the fix commit,Failure in test build
36,Collections_Bug_27,Collections,Test fails in the fix commit,Failure in test build
93,Csv_Bug_13,Csv,Test fails in the fix commit,Failure in test build


- Collections 26 -> Other tests don't compile
- Collections 27 -> Other tests don't compile
- Csv 13 -> Library "lang" doesn't exist

### Failure in source build (FIX COMMIT)

In [13]:
# Bugs whose sources fail to build in the fix commit.
is_source_build_failure = df['sub_category'] == 'Failure in source build'
df[is_source_build_failure]

Unnamed: 0,id,project,category,sub_category
13,Time_Bug_26,Time,Test fails in the fix commit,Failure in source build
22,Time_Bug_25,Time,Test fails in the fix commit,Failure in source build
26,JacksonXml_Bug_1,JacksonXml,Test fails in the fix commit,Failure in source build
27,JacksonXml_Bug_6,JacksonXml,Test fails in the fix commit,Failure in source build
31,JacksonXml_Bug_5,JacksonXml,Test fails in the fix commit,Failure in source build
102,JacksonCore_Bug_23,JacksonCore,Test fails in the fix commit,Failure in source build
105,JacksonCore_Bug_24,JacksonCore,Test fails in the fix commit,Failure in source build
113,JacksonCore_Bug_26,JacksonCore,Test fails in the fix commit,Failure in source build
142,JacksonDatabind_Bug_76,JacksonDatabind,Test fails in the fix commit,Failure in source build
156,JacksonDatabind_Bug_104,JacksonDatabind,Test fails in the fix commit,Failure in source build
