In [64]:
import os
import os.path as osp
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu
from cliffs_delta import cliffs_delta

from utils import helpers as hpr
from utils import constants
from utils import classifier_util as clas_util

### Dependent changes

In [5]:
# Load the source/target dependency pairs and keep only the pairs in which
# neither side has the status "NEW".
df_dependencies = (
    pd.read_csv(osp.join('.', 'Files', 'source_target_evolution_clean.csv'))
    .loc[lambda pairs: pairs['Source_status'].ne("NEW") & pairs['Target_status'].ne("NEW")]
)

In [6]:
# Every change number that appears on either side of a dependency pair.
dependent_changes = set(
    hpr.flatten_list(df_dependencies[['Source', 'Target']].to_numpy())
)

### Load changes

In [8]:
# Load the OpenStack changes into one frame through the project helper.
# NOTE(review): "/Changes3/" starts with a slash; how the helper resolves it is
# not visible here - confirm it is not treated as an absolute local path.
df_changes = hpr.combine_openstack_data(changes_path="/Changes3/")

Reading OpenStack changes...
OpenStack changes loaded successfully...


In [9]:
# Keep only the changes whose status is not "NEW".
# `.copy()` detaches the filtered rows from the original frame, so adding a
# column below does not raise pandas' SettingWithCopyWarning.
df_changes = df_changes[df_changes["status"] != "NEW"].copy()

# 1 when the change number appears in a dependency pair, 0 otherwise
# (vectorised membership test instead of a per-row lambda).
df_changes['is_dependent'] = df_changes['number'].isin(dependent_changes).astype(int)

In [8]:
# Combined file metrics, without the owner and status columns.
combined_output = hpr.combine_file_metrics().drop(
    columns=["owner_account_id", 'status']
)

In [9]:
def calc_mod_file_dep_cha(row):
    """Express ``num_mod_file_dep_cha`` as a percentage of the changed-file count.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``"changed_files"``; ``"num_mod_file_dep_cha"`` is read
        only when ``changed_files`` is a non-empty list.

    Returns
    -------
    float or int
        ``100 * num_mod_file_dep_cha / len(changed_files)`` rounded to two
        decimals, or ``0`` when ``changed_files`` is not a list or is empty.
    """
    files = row["changed_files"]
    # Anything that is not exactly a list (e.g. a missing value) counts as "no files".
    file_count = len(files) if type(files) is list else 0
    if file_count == 0:
        return 0
    return round(100 * row['num_mod_file_dep_cha'] / file_count, 2)

In [10]:
# Join the classifier features with the per-change file metrics on the change number.
df = clas_util.combine_features().merge(
    combined_output,
    on='number',
    how='left',
    suffixes=('_source', '_target'),
)

# Derived columns: the percentage computed by calc_mod_file_dep_cha and the
# 0/1 flag telling whether the change appears in a dependency pair.
df['pctg_mod_file_dep_cha'] = df.apply(calc_mod_file_dep_cha, axis=1)
df['is_dependent'] = df['number'].map(lambda change_nbr: int(change_nbr in dependent_changes))

# Drop the raw inputs of the derived column plus the build-failure count, then
# replace every remaining missing value with 0.
# Columns listed as drop candidates but currently kept:
#   cross_project_changes, cross_project_changes_owner, pctg_cross_project_changes_owner,
#   min/max/mean/median_num_mod_file_dep_cha, pctg_cross_project_changes, last_mth_cro_proj_nbr
df = df.drop(
    columns=["changed_files", "num_mod_file_dep_cha", 'num_build_failures']
).fillna(0)

In [60]:
def mann_u_test(sample1, sample2):
    """Run a Mann-Whitney U test on two samples and print the verdict.

    The test is run with ``scipy.stats.mannwhitneyu``'s default settings and
    evaluated at a 0.05 significance level. The p-value is printed only when
    the null hypothesis is rejected.

    Parameters
    ----------
    sample1, sample2 : array-like
        The two samples, passed unchanged to ``scipy.stats.mannwhitneyu``.

    Returns
    -------
    float
        The p-value of the test, so that callers writing
        ``p_value = mann_u_test(a, b)`` get the number rather than ``None``.
    """
    p_value = mannwhitneyu(sample1, sample2).pvalue
    # Significance level
    alpha = 0.05
    if p_value <= alpha:
        print("Reject the null hypothesis: There is a statistically significant difference between the two samples.")
        print(p_value)
    else:
        print("Fail to reject the null hypothesis: There is no statistically significant difference between the two samples.")
    return p_value

### Added lines: statistical significance

In [69]:
# Inserted-line counts of dependent (sample1) versus non-dependent (sample2) changes.
sample1 = df_changes.loc[df_changes['is_dependent'].eq(1), "insertions"].to_list()
sample2 = df_changes.loc[df_changes['is_dependent'].eq(0), "insertions"].to_list()
mann_u_test(sample1, sample2)
cliffs_delta(sample1, sample2)

Reject the null hypothesis: There is a statistically significant difference between the two samples.
1.702447600381793e-50


(0.040424623111997876, 'negligible')

### Description length: statistical significance

In [None]:
# Attach the description length to every change; the left join keeps all
# changes, including those without a matching row in the metrics file.
df_desc_length = pd.read_csv("./Files/Metrics/description_length.csv")
df_samples = df_changes[["number", "is_dependent"]].merge(
    df_desc_length, on="number", how="left"
)

In [77]:
# Description lengths of dependent (sample1) versus non-dependent (sample2) changes.
# df_samples comes from a left merge, so changes without a recorded description
# length hold NaN; drop them so missing values do not leak into the test or the
# effect size.
sample1 = df_samples.loc[df_samples['is_dependent'] == 1, "description_length"].dropna().tolist()
sample2 = df_samples.loc[df_samples['is_dependent'] == 0, "description_length"].dropna().tolist()
mann_u_test(sample1, sample2)
cliffs_delta(sample1, sample2)

Reject the null hypothesis: There is a statistically significant difference between the two samples.
0.0


(0.2177572845509798, 'small')

### The experience of the developers with dependent changes

In [85]:
# Owners who authored at least one dependent change.
dev_with_dep = df_changes.loc[df_changes['is_dependent'].eq(1), "owner_account_id"].unique()

# Per-owner count of non-null `id` values: owners with dependent changes
# (sample1) versus all other owners (sample2).
sample1 = (
    df_changes.loc[df_changes['owner_account_id'].isin(dev_with_dep)]
    .groupby("owner_account_id")['id']
    .count()
    .to_list()
)
sample2 = (
    df_changes.loc[~df_changes['owner_account_id'].isin(dev_with_dep)]
    .groupby("owner_account_id")['id']
    .count()
    .to_list()
)
p_value = mann_u_test(sample1, sample2)
cliffs_delta(sample1, sample2)

Reject the null hypothesis: There is a statistically significant difference between the two samples.
0.0


(0.7904687142349257, 'large')

### Age of the project

In [88]:
# Attach the project age to every change; the left join keeps all changes,
# including those without a matching row in the metrics file.
df_project_age = pd.read_csv("./Files/Metrics/project_age.csv")
df_samples = df_changes[["number", "is_dependent"]].merge(
    df_project_age, on="number", how="left"
)

In [89]:
# Project age of dependent (sample1) versus non-dependent (sample2) changes.
# df_samples comes from a left merge, so changes without a recorded project age
# hold NaN; drop them so missing values do not leak into the test or the
# effect size.
sample1 = df_samples.loc[df_samples["is_dependent"] == 1, "project_age"].dropna().tolist()
sample2 = df_samples.loc[df_samples["is_dependent"] == 0, "project_age"].dropna().tolist()
p_value = mann_u_test(sample1, sample2)
cliffs_delta(sample1, sample2)

Reject the null hypothesis: There is a statistically significant difference between the two samples.
0.0


(0.17472380212312147, 'small')

### changes with number of changed files

In [None]:
# Sorted changed-file counts of dependent (sample1) versus non-dependent (sample2) changes.
sample1 = df_changes.loc[df_changes["is_dependent"].eq(1), "files_count"].sort_values().to_list()
sample2 = df_changes.loc[df_changes["is_dependent"].eq(0), "files_count"].sort_values().to_list()
# Mann-Whitney U test on the two groups; only the p-value is kept.
_, p_value = mannwhitneyu(sample1, sample2)