In [1]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import pingouin as pg
import pandas as pd  
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from warnings import filterwarnings
filterwarnings('ignore')


In [2]:
# Dataset
data = pd.read_excel("Dataset.xlsx")
data.shape

(330, 28)

In [3]:
data.head()

Unnamed: 0,Jira Id,Requirement CFP,Requirement E,Requirement X,Requirement W,Requirement R,Backend CFP,Backend E,Backend X,Backend W,...,Android Seniority,Android Bug Count,iOS CFP,iOS E,iOS X,iOS W,iOS R,iOS Seniority,iOS Bug Count,Total Bug Count
0,T-1,26,3,4,16,3,0,0,0,0,...,3,6,26,3,4,16,3,2,5,11
1,T-2,3,0,0,0,3,0,0,0,0,...,2,0,3,0,0,0,3,2,0,0
2,T-3,8,0,0,4,4,0,0,0,0,...,2,13,8,0,0,4,4,3,0,13
3,T-4,11,2,1,6,2,6,2,1,1,...,3,0,5,0,0,5,0,3,0,0
4,T-5,7,5,2,0,0,0,0,0,0,...,2,17,7,5,2,0,0,3,27,44


# Common Utility Function

In [5]:
# Required sample size calculation
def required_sample_size(r, alpha=0.05, power=0.80):
    sample_size = pg.power_corr(r=r, alpha=alpha, power=power)
    return int(np.ceil(sample_size))

# RQ1: Is there any correlation between CFP size and the number of code defects?

In [7]:
### CFP Size vs Defect Count Correlation on whole dataset

Requirement_CFP = data['Requirement CFP'].tolist()
total_bug_count = data['Total Bug Count'].tolist()
corr, pval = spearmanr(Requirement_CFP, total_bug_count)
MLmodelRequirementDefectPrediction=-1
sample_size=len(Requirement_CFP)
alpha = 0.05

# print the result
print("Sample Size",len(Requirement_CFP))
print("Spearman's correlation coefficient:", corr)
print("p-value:", pval)

if sample_size>=required_sample_size(corr):
    if pval < alpha:
        print("Result: The result is significant. We reject the H0 hypothesis.")
    else:
        print("Result: The result is not significant. We cannot reject the H0 hypothesis.")
else:
    print("Result: Insufficient sample size!!!")
    

Sample Size 330
Spearman's correlation coefficient: 0.3994569018559609
p-value: 4.509559408380957e-14
Result: The result is significant. We reject the H0 hypothesis.


# RQ2: How does the correlation between CFP size and the number of code defects differ across various platforms, levels of developer seniority, and CFP data movement characteristics?

In [9]:
### Function Purpose: CFP Size vs Defect Count Correlation on based on Platform, CFP data movement and Developer Seniority
#Input paramters:
#    platform: Development platform. Backend, iOS or Android in our case study.
#    data_movement: CFP data movement. It can be E for Entry,X for Exit,W for Write and R for Read
#    print_header: set =1 to print result header. default is 0
#    alpha: alpha value for spearman correlation. it is 0.005 as default

def get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement(platform,seniority,data_movement,print_header=0,alpha=0.05):
    developer_seniority_level=seniority # between 1-3
    CFP_data_movement=data_movement # All for total platform CFP else E,X,W,R
    seniority_label=platform+' Seniority'
    Bug_label=platform+' Bug Count'
    CFP_label=platform
    is_significant="False"
    MLmodelRequirementDefectPrediction=-1

    #Setup for data movement
    if CFP_data_movement=='ALL':
        CFP_label=CFP_label+' CFP'
    else:
        CFP_label=CFP_label+' '+CFP_data_movement
        
    
    Requirement_CFP= data.loc[(data[CFP_label] != 0) & (data[seniority_label] == developer_seniority_level), CFP_label].tolist()
    total_bug_count= data.loc[(data[CFP_label] != 0) & (data[seniority_label] == developer_seniority_level), Bug_label].tolist()
    sample_size=len(Requirement_CFP)

    corr=np.nan
    pval=np.nan
    if sample_size>0 and np.var(Requirement_CFP) != 0 and np.var(total_bug_count) != 0: 
        corr, pval = spearmanr(Requirement_CFP, total_bug_count)

    r_sample_size=-1
    if np.isnan(corr)!=True and corr!=0 and np.isnan(pval)!=True:
        if corr<0.98  and corr!=0:
            r_sample_size=required_sample_size(corr)
        else:
            r_sample_size=required_sample_size(0.98)
            
    
    test_R2="NA"

    if sample_size>=r_sample_size and r_sample_size!=-1:

        if pval < alpha:
            is_significant="True"
    else:
        
        if pval > alpha:
            is_significant="False"
        else:
            is_significant="Not Enough Sample"
        

    # print the result

    if print_header==1:
        print(' {:<10s}  {:<10}  {:<20}  {:<17s}  {:<19}  {:<15s} {:<15s}'.format('Platform','Seniority','Data Movement','correlation','p-value','is_significant','Sample Size'))
    
    print('-' * 150)
    print(' {:<10s}  {:<12s} {:<20s} {:<18.3f} {:<20.3f} {:<20s} {:<16d}'.format(platform,str(developer_seniority_level),data_movement,corr,pval,is_significant,sample_size))
    
    return is_significant,corr,pval,MLmodelRequirementDefectPrediction

In [10]:
#For Backend Platform
is_significant,corr,pval,MLmodel_Backend_1_ALL=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Backend',1,'ALL',1)
is_significant,corr,pval,MLmodel_Backend_2_All=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Backend',2,'ALL')
is_significant,corr,pval,MLmodel_Backend_3_ALL=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Backend',3,'ALL')
is_significant,corr,pval,MLmodel_Backend_1_E=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Backend',1,'E')
is_significant,corr,pval,MLmodel_Backend_1_X=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Backend',1,'X')
is_significant,corr,pval,MLmodel_Backend_1_W=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Backend',1,'W')
is_significant,corr,pval,MLmodel_Backend_1_R=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Backend',1,'R')
is_significant,corr,pval,MLmodel_Backend_2_E=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Backend',2,'E')
is_significant,corr,pval,MLmodel_Backend_2_X=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Backend',2,'X')
is_significant,corr,pval,MLmodel_Backend_2_W=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Backend',2,'W')
is_significant,corr,pval,MLmodel_Backend_2_R=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Backend',2,'R')
is_significant,corr,pval,MLmodel_Backend_3_E=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Backend',3,'E')
is_significant,corr,pval,MLmodel_Backend_2_X=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Backend',3,'X')
is_significant,corr,pval,MLmodel_Backend_2_W=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Backend',3,'W')
is_significant,corr,pval,MLmodel_Backend_2_R=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Backend',3,'R')
#For iOS Platform
is_significant,corr,pval,MLmodel_iOS_1_ALL=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('iOS',1,'ALL')
is_significant,corr,pval,MLmodel_iOS_2_All=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('iOS',2,'ALL')
is_significant,corr,pval,MLmodel_iOS_3_ALL=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('iOS',3,'ALL')
is_significant,corr,pval,MLmodel_iOS_1_E=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('iOS',1,'E')
is_significant,corr,pval,MLmodel_iOS_1_X=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('iOS',1,'X')
is_significant,corr,pval,MLmodel_iOS_1_W=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('iOS',1,'W')
is_significant,corr,pval,MLmodel_iOS_1_R=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('iOS',1,'R')
is_significant,corr,pval,MLmodel_iOS_2_E=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('iOS',2,'E')
is_significant,corr,pval,MLmodel_iOS_2_X=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('iOS',2,'X')
is_significant,corr,pval,MLmodel_iOS_2_W=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('iOS',2,'W')
is_significant,corr,pval,MLmodel_iOS_2_R=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('iOS',2,'R')
is_significant,corr,pval,MLmodel_iOS_3_E=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('iOS',3,'E')
is_significant,corr,pval,MLmodel_iOS_2_X=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('iOS',3,'X')
is_significant,corr,pval,MLmodel_iOS_2_W=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('iOS',3,'W')
is_significant,corr,pval,MLmodel_iOS_2_R=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('iOS',3,'R')
#For Android Platform
is_significant,corr,pval,MLmodel_Android_1_ALL=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Android',1,'ALL')
is_significant,corr,pval,MLmodel_Android_2_All=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Android',2,'ALL')
is_significant,corr,pval,MLmodel_Android_3_ALL=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Android',3,'ALL')
is_significant,corr,pval,MLmodel_Android_1_E=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Android',1,'E')
is_significant,corr,pval,MLmodel_Android_1_X=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Android',1,'X')
is_significant,corr,pval,MLmodel_Android_1_W=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Android',1,'W')
is_significant,corr,pval,MLmodel_Android_1_R=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Android',1,'R')
is_significant,corr,pval,MLmodel_Android_2_E=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Android',2,'E')
is_significant,corr,pval,MLmodel_Android_2_X=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Android',2,'X')
is_significant,corr,pval,MLmodel_Android_2_W=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Android',2,'W')
is_significant,corr,pval,MLmodel_Android_2_R=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Android',2,'R')
is_significant,corr,pval,MLmodel_Android_3_E=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Android',3,'E')
is_significant,corr,pval,MLmodel_Android_2_X=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Android',3,'X')
is_significant,corr,pval,MLmodel_Android_2_W=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Android',3,'W')
is_significant,corr,pval,MLmodel_Android_2_R=get_spearmanr_corr_based_on_platform_and_seniority_and_data_movement('Android',3,'R')

 Platform    Seniority   Data Movement         correlation        p-value              is_significant  Sample Size    
------------------------------------------------------------------------------------------------------------------------------------------------------
 Backend     1            ALL                  0.225              0.125                False                48              
------------------------------------------------------------------------------------------------------------------------------------------------------
 Backend     2            ALL                  0.187              0.062                False                100             
------------------------------------------------------------------------------------------------------------------------------------------------------
 Backend     3            ALL                  nan                nan                  Not Enough Sample    9               
-----------------------------------------------------

# RQ3: To what extent can we accurately predict the number of code defects for a new requirement?

In [12]:
# Train and validate ML models & Create Best ML Model to predict # of defects based on CFP size
def get_best_ML_model(X, y,cv=5):

    v_random_state=30
    
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge': Ridge(random_state=v_random_state),
        'Lasso': Lasso(random_state=v_random_state),
        'ElasticNet': ElasticNet(random_state=v_random_state),
        'Decision Tree (max_depth=3)': DecisionTreeRegressor(max_depth=3, random_state=v_random_state),
        'Decision Tree (max_depth=4)': DecisionTreeRegressor(max_depth=4, random_state=v_random_state),
        'Decision Tree (max_depth=5)': DecisionTreeRegressor(max_depth=5, random_state=v_random_state),
        'Random Forest': RandomForestRegressor(random_state=v_random_state),
        'Gradient Boosting': GradientBoostingRegressor(random_state=v_random_state),
        'AdaBoost': AdaBoostRegressor(random_state=v_random_state),
        'SVR': SVR(),
        'KNeighbors': KNeighborsRegressor(),
        'Bayesian Ridge': BayesianRidge(),
        'MLP Regressor': MLPRegressor(random_state=v_random_state)
    }
    
    model_scores = {}

    for name, model in models.items():
        scores = cross_val_score(model, X, y, cv=cv, scoring='r2')
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        model_scores[name] = mean_score
        print(f"{name}: Mean R² = {mean_score:.4f}, Standard Deviation = {std_score:.4f}")
    
    best_model_name = max(model_scores, key=model_scores.get)
    best_model_score = model_scores[best_model_name]
    best_model = models[best_model_name]
    
    print(f"\nBest Model: {best_model_name}")
    #print(f"Best Model R² score: {best_model_score:.4f}")
    
    best_model.fit(X, y)
    return best_model,best_model_score



In [13]:
#For iOS and seniority level is Senior
seniority=2
CFP_All= data.loc[data['iOS Seniority'] == seniority, 'iOS CFP'].tolist()
CFP_E= data.loc[data['iOS Seniority'] == seniority, 'iOS E'].tolist()
sample_size=len(CFP_All)

X = np.array([CFP_All,CFP_E]).T


y= data.loc[data['iOS Seniority'] == seniority, 'iOS Bug Count'].tolist()

MLmodel_for_iOS_and_seniority_2_with_All_E,CV_r2_score=get_best_ML_model(X,y)

print(f"iOS Senior Developer Mean R²: {CV_r2_score:.3f}")
print(f"Sample Size: {sample_size}")

Linear Regression: Mean R² = -1.8562, Standard Deviation = 2.9536
Ridge: Mean R² = -1.8387, Standard Deviation = 2.9214
Lasso: Mean R² = -1.5137, Standard Deviation = 2.4206
ElasticNet: Mean R² = -1.2638, Standard Deviation = 1.9368
Decision Tree (max_depth=3): Mean R² = -3.5540, Standard Deviation = 7.9544
Decision Tree (max_depth=4): Mean R² = -3.5532, Standard Deviation = 7.9489
Decision Tree (max_depth=5): Mean R² = -3.5549, Standard Deviation = 7.9482
Random Forest: Mean R² = -1.4471, Standard Deviation = 4.1278
Gradient Boosting: Mean R² = -3.3157, Standard Deviation = 7.5131
AdaBoost: Mean R² = -3.5551, Standard Deviation = 7.9725
SVR: Mean R² = 0.3069, Standard Deviation = 0.3753
KNeighbors: Mean R² = 0.0301, Standard Deviation = 0.8672
Bayesian Ridge: Mean R² = -1.6708, Standard Deviation = 2.6747
MLP Regressor: Mean R² = -0.2077, Standard Deviation = 1.6801

Best Model: SVR
iOS Senior Developer Mean R²: 0.307
Sample Size: 77


In [14]:
#For iOS and seniority level is Expert
seniority=3
CFP_All= data.loc[data['iOS Seniority'] == seniority, 'iOS CFP'].tolist()
CFP_E= data.loc[data['iOS Seniority'] == seniority, 'iOS E'].tolist()
sample_size=len(CFP_All)

X = np.array([CFP_All,CFP_E]).T

y= data.loc[data['iOS Seniority'] == seniority, 'iOS Bug Count'].tolist()

MLmodel_for_iOS_and_seniority_3_with_All_E,CV_r2_score=get_best_ML_model(X,y)

print(f"iOS Expert Developer Mean R²: {CV_r2_score:.3f}")
print(f"Sample Size: {sample_size}")

Linear Regression: Mean R² = 0.4063, Standard Deviation = 0.2945
Ridge: Mean R² = 0.4132, Standard Deviation = 0.2886
Lasso: Mean R² = 0.5795, Standard Deviation = 0.1864
ElasticNet: Mean R² = 0.6078, Standard Deviation = 0.1934
Decision Tree (max_depth=3): Mean R² = -0.3832, Standard Deviation = 1.7580
Decision Tree (max_depth=4): Mean R² = -0.3759, Standard Deviation = 1.7679
Decision Tree (max_depth=5): Mean R² = -0.3514, Standard Deviation = 1.7819
Random Forest: Mean R² = 0.2855, Standard Deviation = 0.5302
Gradient Boosting: Mean R² = -0.1452, Standard Deviation = 1.0496
AdaBoost: Mean R² = 0.1857, Standard Deviation = 0.6813
SVR: Mean R² = 0.2430, Standard Deviation = 0.2724
KNeighbors: Mean R² = 0.2861, Standard Deviation = 0.3470
Bayesian Ridge: Mean R² = 0.4195, Standard Deviation = 0.2845
MLP Regressor: Mean R² = 0.4373, Standard Deviation = 0.2730

Best Model: ElasticNet
iOS Expert Developer Mean R²: 0.608
Sample Size: 114


In [15]:
#For Android and seniority level is Senior
seniority=2
CFP_All= data.loc[data['Android Seniority'] == seniority, 'Android CFP'].tolist()
CFP_E= data.loc[data['Android Seniority'] == seniority, 'Android E'].tolist()
CFP_W= data.loc[data['Android Seniority'] == seniority, 'Android W'].tolist()
sample_size=len(CFP_All)

X = np.array([CFP_All,CFP_E,CFP_W]).T

y= data.loc[data['Android Seniority'] == seniority, 'Android Bug Count'].tolist()

MLmodel_for_Android_and_seniority_2_with_All_E_W,CV_r2_score=get_best_ML_model(X,y)

print(f"Android Senior Developer Mean R²: {CV_r2_score:.3f}")
print(f"Sample Size: {sample_size}")

Linear Regression: Mean R² = -0.6781, Standard Deviation = 1.2055
Ridge: Mean R² = -0.6662, Standard Deviation = 1.2025
Lasso: Mean R² = -0.3181, Standard Deviation = 0.8407
ElasticNet: Mean R² = -0.4207, Standard Deviation = 1.0000
Decision Tree (max_depth=3): Mean R² = -0.7734, Standard Deviation = 2.4641
Decision Tree (max_depth=4): Mean R² = -0.7961, Standard Deviation = 2.5018
Decision Tree (max_depth=5): Mean R² = -0.7815, Standard Deviation = 2.5119
Random Forest: Mean R² = -0.5715, Standard Deviation = 1.7653
Gradient Boosting: Mean R² = -1.0772, Standard Deviation = 2.5344
AdaBoost: Mean R² = -0.7744, Standard Deviation = 1.8766
SVR: Mean R² = -0.1674, Standard Deviation = 0.7079
KNeighbors: Mean R² = -0.1259, Standard Deviation = 0.6773
Bayesian Ridge: Mean R² = -0.3747, Standard Deviation = 0.7922
MLP Regressor: Mean R² = -0.3580, Standard Deviation = 0.8592

Best Model: KNeighbors
Android Senior Developer Mean R²: -0.126
Sample Size: 32


In [16]:
#For Android and seniority level is Expert
seniority=3
CFP_All= data.loc[data['Android Seniority'] == seniority, 'Android CFP'].tolist()
sample_size=len(CFP_All)

X = np.array([CFP_All]).T

y= data.loc[data['Android Seniority'] == seniority, 'Android Bug Count'].tolist()

MLmodel_for_Android_and_seniority_3_with_All=get_best_ML_model(X,y)

print(f"Android Expert Developer Mean R²: {CV_r2_score:.3f}")
print(f"Sample Size: {sample_size}")

Linear Regression: Mean R² = -1.2144, Standard Deviation = 2.0635
Ridge: Mean R² = -1.2144, Standard Deviation = 2.0633
Lasso: Mean R² = -1.1996, Standard Deviation = 1.9968
ElasticNet: Mean R² = -1.2047, Standard Deviation = 2.0241
Decision Tree (max_depth=3): Mean R² = -2.7917, Standard Deviation = 3.8340
Decision Tree (max_depth=4): Mean R² = -3.2518, Standard Deviation = 5.1043
Decision Tree (max_depth=5): Mean R² = -3.2606, Standard Deviation = 5.1247
Random Forest: Mean R² = -3.0456, Standard Deviation = 4.9096
Gradient Boosting: Mean R² = -3.2451, Standard Deviation = 5.0907
AdaBoost: Mean R² = -1.0535, Standard Deviation = 1.6956
SVR: Mean R² = -0.1224, Standard Deviation = 0.0674
KNeighbors: Mean R² = -4.4039, Standard Deviation = 5.5882
Bayesian Ridge: Mean R² = -1.2055, Standard Deviation = 1.7641
MLP Regressor: Mean R² = -0.8545, Standard Deviation = 1.7489

Best Model: SVR
Android Expert Developer Mean R²: -0.126
Sample Size: 92


# A new requirement prediction for iOS and senior level by providing the iOS CFP size and the CFP entry data movement size, respectively. 

In [18]:
new_Requirement= np.array([[20, 5]])

# Predict
result = MLmodel_for_iOS_and_seniority_2_with_All_E.predict(new_Requirement)

print(f"The number of defects predicted for the new requirement is: {result[0]:.0f}")

The number of defects predicted for the new requirement is: 3
