Library imports and loading of the MongoDB collections

In [13]:
# imports
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from pymongo import MongoClient
from bson.objectid import ObjectId
from scipy.stats import wilcoxon
from cliffs_delta import cliffs_delta
import numpy as np
import datetime

# Connect to MongoDB with the default client settings and select the smartshark_2_2 database
client = MongoClient()
db = client['smartshark_2_2']


def load_collection(collection_name):
    """Return every document of the named collection as a pandas DataFrame."""
    return pd.DataFrame(list(db[collection_name].find()))


# Load the collections used by the analysis (each one is read in full)
vcs_systems = load_collection('vcs_system')
travisBuilds = load_collection('travis_build')
tags = load_collection('tag')
issues = load_collection('issue')
issue_system = load_collection('issue_system')
projects = load_collection('project')


Selecting projects for analysis

In [14]:
# Select the projects to analyse: version-control systems that have Travis CI
# builds for at least two release tags.

# Version-control systems with at least one Travis CI build
projetosWithBuilds = travisBuilds['vcs_system_id'].unique()

# All tags of those version-control systems.
# NOTE(review): dropna() with default arguments discards every row that has a
# missing value in ANY column, not only in the columns used below.
vcsWithBuilds = vcs_systems[vcs_systems['_id'].isin(projetosWithBuilds)].drop_duplicates().dropna()._id
tagsProjectWithBuilds = tags[tags['vcs_system_id'].isin(vcsWithBuilds)].drop_duplicates().dropna()

# Keep release tags only, i.e. drop tags whose name looks like a pre-release.
# NOTE(review): the pattern is a case-sensitive substring match, so it also
# drops names that merely contain "rc" inside another word and keeps
# mixed-case spellings such as "Beta" — confirm this is intended.
preReleasePattern = 'rc|RC|candidate|CANDIDATE|beta|BETA|alpha|ALPHA'
# na=True marks names that cannot be matched as pre-releases, so they are
# excluded exactly as the former `== False` comparison excluded them.
isPreRelease = tagsProjectWithBuilds['name'].str.contains(preReleasePattern, na=True)
releaseTags = tagsProjectWithBuilds[~isPreRelease].drop_duplicates().dropna()

# Builds of the commits the release tags point at, without failed/canceled ones
relaseCommits = releaseTags.commit_id.unique()
integratedBuilds = travisBuilds[travisBuilds['commit_id'].isin(relaseCommits)]
stateUnwanted = ['failed', 'canceled']
validBuilds = integratedBuilds[~integratedBuilds['state'].isin(stateUnwanted)]

# Aggregate builds with the same commit into a single row.
# max() is taken column by column, so a resulting row may combine values that
# come from different builds of that commit.
validBuilds = validBuilds.groupby(['commit_id'], as_index=False, sort=False).max()

# Keep the version-control systems with at least 2 valid release builds
# (taken as evidence of the adoption and use of CD). After the count
# aggregation the `_id` column holds the number of builds per system.
vcsWithCD = (
    validBuilds[['vcs_system_id', '_id']]
    .groupby(['vcs_system_id'], as_index=False)
    .agg('count')
    .sort_values(by=['_id'], ascending=False)
)
vcsWithCD = vcsWithCD.query('_id > 1')
# vcsWithCD now lists the selected projects, most builds first

Getting data for the selected projects

In [25]:
# For every selected version-control system, gather the identifiers and the
# release that marks the CD adoption.
# NOTE(review): several lookups below are positional (iloc column numbers);
# they rely on the column order of the frames loaded from MongoDB.
listProjectsWithCD = []
issues_system_ids = []
for vcs in vcsWithCD.iloc[:, 0]:
    # Project that owns this version-control system (column 2 — presumably project_id)
    project_id = vcs_systems.query('_id == @vcs').iloc[0,2]

    # Project name (column 1 of the projects frame)
    name = projects.query('_id == @project_id').iloc[0,1]

    # Issue system of the project (column 0 of the first matching row)
    issues_system_id = issue_system.query('project_id == @project_id').iloc[0,0]
    issues_system_ids.append(issues_system_id)

    # Commit of the CD adoption build: the valid build with the earliest
    # started_at (column 0 of validBuilds is the grouped commit_id)
    builds_of_vcs = validBuilds.query('vcs_system_id == @vcs')
    commit_id_Builld = builds_of_vcs.sort_values(by=['started_at']).iloc[0,0]

    # First release tag on that commit: tag id, release name and adoption date
    # (columns 0, 1 and 6 of releaseTags)
    adoption_tag = releaseTags.query('commit_id == @commit_id_Builld').iloc[0]
    id_tag_CD_Adoption = adoption_tag.iloc[0]
    release_name = adoption_tag.iloc[1]
    adotion_date = adoption_tag.iloc[6]

    listProjectsWithCD.append([
        name,
        vcs,
        project_id,
        issues_system_id,
        commit_id_Builld,
        id_tag_CD_Adoption,
        release_name,
        adotion_date,
    ])

# Build the dataframe with one row per selected project
project_columns = [
    'name', 'vcs_system_id', 'project_id', 'issues_system_id',
    'commit_id', 'tag_id', 'adoption_release', 'cd_adoption_date',
]
dfProjectsWithCD = pd.DataFrame(listProjectsWithCD, columns=project_columns)

In [16]:
# Bug-fixing time per project, before and after the CD adoption date.
#
# The fixing time of an issue is measured as updated_at - created_at.
# "no CD" bugs are those last updated before the adoption date; "CD" bugs are
# those created on or after it. Bugs created before and updated after the
# adoption date fall in neither group.

issuesCDProjects = issues.query('issue_system_id.isin(@issues_system_ids)')

# Fixed and closed/resolved bug reports only. This filter does not depend on
# the project, so it is evaluated once instead of twice per loop iteration.
fixedBugs = issuesCDProjects.query(
    "issue_type == 'Bug' & resolution == 'Fixed' & status.isin(['Closed','Resolved'])"
)

mean_bugs_no_cd = []
median_bugs_no_cd = []
mean_bugs_cd = []
median_bugs_cd = []

for tmp_issue_sistem, tmp_date in zip(dfProjectsWithCD['issues_system_id'],
                                      dfProjectsWithCD['cd_adoption_date']):
    # Selecting the bugs of this project
    project_bugs = fixedBugs[fixedBugs['issue_system_id'] == tmp_issue_sistem]
    df_bugs_cd = project_bugs[project_bugs['created_at'] >= tmp_date]
    df_bugs_no_cd = project_bugs[project_bugs['updated_at'] < tmp_date]

    # Bug fixing time; assign() returns a new frame, so no column is written
    # onto a filtered view of the issues frame
    df_bugs_no_cd = df_bugs_no_cd.assign(delta=df_bugs_no_cd['updated_at'] - df_bugs_no_cd['created_at'])
    df_bugs_cd = df_bugs_cd.assign(delta=df_bugs_cd['updated_at'] - df_bugs_cd['created_at'])

    # Store the measurement (NaT when a project has no bug in a period)
    mean_bugs_no_cd.append(df_bugs_no_cd['delta'].mean())
    median_bugs_no_cd.append(df_bugs_no_cd['delta'].median())
    mean_bugs_cd.append(df_bugs_cd['delta'].mean())
    median_bugs_cd.append(df_bugs_cd['delta'].median())


dfProjectsWithCD['mean_delta_bugs_no_cd'] = mean_bugs_no_cd
dfProjectsWithCD['median_delta_bugs_no_cd'] = median_bugs_no_cd
dfProjectsWithCD['mean_delta_bugs_cd'] = mean_bugs_cd
dfProjectsWithCD['median_delta_bugs_cd'] = median_bugs_cd

Data for statistical analysis (Cliff's delta and Wilcoxon test)

In [20]:
# Cliff's delta on the per-project MEAN bug-fixing time, expressed in days:
# first sample = without CD, second sample = with CD
one_day = np.timedelta64(1,'D')
mean_days_no_cd = dfProjectsWithCD['mean_delta_bugs_no_cd'] / one_day
mean_days_cd = dfProjectsWithCD['mean_delta_bugs_cd'] / one_day
cliffs_delta(mean_days_no_cd, mean_days_cd)

(0.5904, 'large')

In [19]:
# Cliff's delta on the per-project MEDIAN bug-fixing time, expressed in days:
# first sample = without CD, second sample = with CD
one_day = np.timedelta64(1,'D')
median_days_no_cd = dfProjectsWithCD['median_delta_bugs_no_cd'] / one_day
median_days_cd = dfProjectsWithCD['median_delta_bugs_cd'] / one_day
cliffs_delta(median_days_no_cd, median_days_cd)

(0.4368, 'medium')

In [21]:
# One-sided Wilcoxon signed-rank test on the per-project MEAN bug-fixing time
# (days), with alternative 'greater' for the (without CD, with CD) pairs
one_day = np.timedelta64(1,'D')
mean_days_no_cd = dfProjectsWithCD['mean_delta_bugs_no_cd'] / one_day
mean_days_cd = dfProjectsWithCD['mean_delta_bugs_cd'] / one_day
wilcoxon(mean_days_no_cd, mean_days_cd, alternative='greater')

WilcoxonResult(statistic=294.0, pvalue=6.99460506439209e-05)

In [22]:
# One-sided Wilcoxon signed-rank test on the per-project MEDIAN bug-fixing time
# (days), with alternative 'greater' for the (without CD, with CD) pairs
one_day = np.timedelta64(1,'D')
median_days_no_cd = dfProjectsWithCD['median_delta_bugs_no_cd'] / one_day
median_days_cd = dfProjectsWithCD['median_delta_bugs_cd'] / one_day
wilcoxon(median_days_no_cd, median_days_cd, alternative='greater')

WilcoxonResult(statistic=275.0, pvalue=0.0008126795291900635)

Data for the paper's table

In [23]:
# Add a "<column>_min" column holding each mean/median fixing time in minutes
one_minute = datetime.timedelta(minutes=1)
for delta_column in ('mean_delta_bugs_no_cd', 'mean_delta_bugs_cd',
                     'median_delta_bugs_no_cd', 'median_delta_bugs_cd'):
    dfProjectsWithCD[delta_column + '_min'] = dfProjectsWithCD[delta_column] / one_minute

In [24]:
# Write the columns reported in the paper's table to tabela1.csv (no index column)
table_columns = [
    'name',
    'cd_adoption_date',
    'mean_delta_bugs_no_cd_min',
    'mean_delta_bugs_cd_min',
    'median_delta_bugs_no_cd_min',
    'median_delta_bugs_cd_min',
]
dfProjectsWithCD[table_columns].to_csv('tabela1.csv', index=False)