In [16]:
import json
import pandas as pd
import os
import sys
from datetime import datetime
import csv
import concurrent.futures
sys.path.insert(1, '../../py/')
from framework.utils.GitUtils import GitManager, cloneRepository

Dataset of work "Evaluating SZZ Implementations Through a Developer-Informed Oracle" 
- SOURCE: https://github.com/grosa1/icse2021-szz-replication-package/

In [17]:
with open("language-filtered.json") as f:
    language_filtered_dataset = json.load(f)
len(language_filtered_dataset)

1115

In [18]:
languages = dict()
for bug in language_filtered_dataset:
    language = bug['fix']['files'][0]['lang']
    if language not in languages:
        languages[language] = 0
    languages[language] += 1
df_languages = pd.DataFrame.from_dict(languages, orient='index', columns=['Count'])
df_languages.sort_values(by='Count', ascending=False).head(10)

Unnamed: 0,Count
c,306
py,275
cpp,138
js,134
java,78
php,62
h,44
rb,35
cs,27
cxx,7


## Calculate distance between BFC an BIC

In [19]:
def calculateDistance(bug_info):
    
    project_name = bug_info['repository'].split("/")[1]
    
    result = {
        'bug_id': bug_info['id'],
        'project': bug_info['repository'],
        'bfc_lang': bug_info['fix']['files'][0]['lang'].lower(),
        'bic_lang': bug_info['bugs'][0]['files'][0]['lang'].lower(),
        'distance_in_commits': 0,
        'distance_in_days': 0
    }
    
    # CLONE REPO
    if not os.path.isdir("projects/"+project_name):
        cloneRepository("https://github.com/"+bug_info['repository'], "projects/"+project_name)
    
    # GET COMMITS
    gm = GitManager("projects/"+project_name, bug_info['fix']['commit']['hash'])
    report_path = "reports/%s_bug_%d_commit_list.csv"%(project_name, bug_info['id'])
    if not os.path.isfile(report_path):
        commits = gm.generateCommitList(report_path)
    else:
        # Use cache
        with open(report_path, "r") as f:
            reader = csv.DictReader(f)
            commits = list(reader)
    
    # ITERATE COMMITS
    bfc_pos = 0
    bfc_date = datetime.strptime(commits[0]['date'], '%Y-%m-%d %H:%M:%S %z')

    for commit in commits:
        if commit['hash'] == bug_info['bugs'][0]['commit']['hash']:
            bic_pos = int(commit['id'])
            bic_date = datetime.strptime(commit['date'], '%Y-%m-%d %H:%M:%S %z')
            result['distance_in_commits'] = bic_pos
            result['distance_in_days'] = (bfc_date-bic_date).days
            return result

In [20]:
future_results = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    for bug in language_filtered_dataset:
        future = executor.submit(calculateDistance, bug)
        future_results.append(future)

In [21]:
bug_results = []
for future in future_results:
    try:
        r = future.result() 
        if r is not None:
            bug_results.append(r)
    except Exception as e:
        print("Can't find project")

Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find project
Can't find p

In [22]:
len(bug_results)

1040

In [23]:
df = pd.DataFrame.from_dict(bug_results)
df.to_csv('distance_results.csv', index=False)
df

Unnamed: 0,bug_id,project,bfc_lang,bic_lang,distance_in_commits,distance_in_days
0,3,DemocracyClub/yournextrepresentative,py,py,43,4
1,4,ahobson/ruby-pcap,rb,rb,2,1
2,10,SimpleServer/SimpleServer,java,classpath,190,49
3,16,kiva/backbone.siren,js,js,51,44
4,21,nickvandewiele/RMG-Java,java,java,10,13
...,...,...,...,...,...,...
1035,3556,baob/submit-this,js,js,5,0
1036,3566,gpac/gpac,c,c,1,0
1037,3575,arobert01/RTK,h,h,2,0
1038,3591,awslabs/amazon-kinesis-video-streams-webrtc-sdk-c,c,c,5,3


Manual edits:
* For project _OpenChannelSSD/linux (bug 2636)_, the original BFC date was Mon Sep 17 00:00:00 2001 -0700, but in Github was 07/22/2015 (we use this second one, more accurate)
* For project _Subsurface-divelog/subsurface (bug 2951)_, the original BFC date was Wed Jan 3 11:46:21 2018 +010, but in Github was 01/06/2018 (we use this second one, more accurate)

In [24]:
df.sort_values(by='distance_in_days', ascending=False).head(10)

Unnamed: 0,bug_id,project,bfc_lang,bic_lang,distance_in_commits,distance_in_days
902,3041,ocaml/ocaml,c,depend,14532,6778
736,2426,mono/libgdiplus,c,,338,3772
1028,3525,herbstluftwm/herbstluftwm,cpp,c,2376,3241
949,3196,dscho/busybox-w32,c,c,3480,2659
505,1645,shogun-toolbox/shogun,h,h,7785,2294
988,3380,jiangxincode/Emma,java,java,10,2156
306,1029,BenzoRoms/packages_providers_TelephonyProvider,java,java,727,2124
901,3039,reactos/reactos,c,c,18449,2009
747,2475,atari800/atari800,c,,328,1892
616,2025,adamretter/exist,java,java,4601,1883


In [25]:
df.sort_values(by='distance_in_commits', ascending=False).head(10)

Unnamed: 0,bug_id,project,bfc_lang,bic_lang,distance_in_commits,distance_in_days
799,2636,OpenChannelSSD/linux,c,h,262977,1390
843,2814,GreyLeshy/android_kernel_sony_msm8994_kitakami,c,c,218845,1075
814,2706,ethan-halsall/Simple-Kernel,c,h,118019,548
576,1919,PeterHuewe/linux-tpmdd,h,c,110288,734
92,269,Tragetaschen/linux-stable-pconxs,c,c,76532,462
423,1407,SVB22/kernel_lenovo_msm8953,c,c,75158,691
39,113,yangdongsheng/linux,c,c,71264,347
199,639,torvalds/linux,c,c,59076,407
277,944,olafdietsche/linux-accessfs,c,txt,53773,301
533,1747,Chairshot215/android_kernel_lge_hammerhead-sta...,c,c,52213,407


In [26]:
df.describe().astype(int).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bug_id,1040,1727,1016,3,858,1690,2583,3595
distance_in_commits,1040,1678,13129,1,3,11,81,262977
distance_in_days,1040,106,367,0,1,9,51,6778


In [27]:
agg_by_lang = df.groupby('bfc_lang')['distance_in_commits','distance_in_days'].aggregate('mean')
agg_by_lang.round(decimals=2)

  agg_by_lang = df.groupby('bfc_lang')['distance_in_commits','distance_in_days'].aggregate('mean')


Unnamed: 0_level_0,distance_in_commits,distance_in_days
bfc_lang,Unnamed: 1_level_1,Unnamed: 2_level_1
c,5366.98,180.09
cpp,164.24,109.39
cs,61.96,30.07
cxx,11.17,8.0
h,3104.55,129.5
hh,2.0,0.0
hpp,7.33,7.33
java,431.73,144.19
js,130.09,38.49
php,101.8,107.62


In [28]:
df.groupby('project').aggregate('count')#.sort_values(by='bug_id', ascending=False).head(10)

Unnamed: 0_level_0,bug_id,bfc_lang,bic_lang,distance_in_commits,distance_in_days
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01org/gbs,1,1,1,1,1
0lvin/yquake2,1,1,1,1,1
4awpawz/trio,1,1,1,1,1
6WIND/os-vif-plugin-vhostuser-fp,1,1,1,1,1
AIFDR/inasafe,1,1,1,1,1
...,...,...,...,...,...
zhengqunkoo/taxibros,1,1,1,1,1
zizhengwu/Lumines-clone-Unity2D,1,1,1,1,1
zonque/linux,1,1,1,1,1
zstackio/zstack-utility,1,1,1,1,1


## Search for regression tests

In [29]:
language_filtered_dataset[0]

{'id': 3,
 'repository': 'DemocracyClub/yournextrepresentative',
 'fix': {'commit': {'hash': '0722309a9c242aac28d2cb33798abbd18b233aa0',
   'message': "Fix creation of duplicate PostExtraElection objects in tests\n\nThis fixes an error introduced in 2574243a39d90a2673cf56647c524 - the\nPostExtraElection objects *were* being created, because of specifying\nthe list of elections for the 'elections' kwarg to\nPostExtraFactory.create.",
   'author': 'Mark Longair',
   'url': 'https://api.github.com/repos/DemocracyClub/yournextrepresentative/commits/0722309a9c242aac28d2cb33798abbd18b233aa0'},
  'files': [{'name': 'uk_examples.py',
    'new_path': 'candidates/tests/uk_examples.py',
    'old_path': 'candidates/tests/uk_examples.py',
    'lang': 'py',
    'lines_added': [],
    'lines_deleted': [117, 118, 119, 120],
    'change_type': 'MODIFY'}]},
 'bugs': [{'commit': {'hash': '2574243a39d90a2673cf56647c524e268d7f169e',
    'message': "Fix the UK examples, which didn't create PostExtraElection

In [30]:
unique_links = set()
n=0
for bug in language_filtered_dataset:
    bfc_files = bug['fix']['files']
    for file in bfc_files:
        if file['new_path'] is not None and 'test' in file['new_path']:
            link = "https://github.com/"+bug['repository']+"/commit/"+bug['fix']['commit']['hash']
            if link not in unique_links:
                n+=1
                unique_links.add(link)

for link in unique_links:
    print(link)
print(n)

https://github.com/unshiftio/url-parse/commit/b723b90dcb7576904bd11d99e1dd0d7b397baca6
https://github.com/shamuproject/mavtables/commit/1b4c9abf0281d183c62cd431ecf8c6ce77a912e8
https://github.com/SAP/openui5/commit/ff6203240f54d07743f98bab4459a02b14c31c3d
https://github.com/wikimedia/pywikibot/commit/a142a7ebaaa65af121e3cf0a8b175e391f0e4046
https://github.com/cosmocode/dokuwiki-plugin-struct/commit/f0ee60b954b9647d3eb2249b991590b08ad39158
https://github.com/pdg137/bigdecimal/commit/0aa8f96519e1c61913ba45b05e2188c036ac1f01
https://github.com/spring-projects/spring-framework/commit/813108a928fd50a1fb87e9333b974b6e7242b117
https://github.com/sosy-lab/java-smt/commit/52f5ed2cae6c2e0958a47572b83e6b8b754f91ab
https://github.com/bloomberg/salt/commit/1e9dd7fd3f1fed843a61a916d95cc61ace48a2f7
https://github.com/openstack/nova/commit/0a09bf5faeb30b675fdb8f517c61166ef794bad3
https://github.com/nodejs/node/commit/f77555f792e8515192d6fe973d7c4e1532da9d5c
https://github.com/mozilla/fxa-content-serve

In [33]:
with open("RegressionTestsFound.csv", "r") as f:
    reader = csv.DictReader(f)
    regtest = list(reader)
regtest_df = pd.DataFrame.from_dict(regtest)
regtest_df.groupby('HAS_REGRESSION_TEST').aggregate('count')

Unnamed: 0_level_0,LINK_TO_BFC
HAS_REGRESSION_TEST,Unnamed: 1_level_1
FALSE,50
NOT EXIST,3
TRUE,55
