In [1]:
import os
import os.path as osp
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import  precision_score, recall_score, roc_auc_score, brier_score_loss
from sklearn.base import clone
from sklearn.model_selection import TimeSeriesSplit

from imblearn.under_sampling import RandomUnderSampler

from utils import helpers as hpr
from utils import constants
from utils import classifier_util as clas_util

import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform
from scipy.stats import mannwhitneyu

### Constants

In [2]:
METRICS = constants.get_metrics()[:-3]

### Dependent changes

In [8]:
df_dependencies = pd.read_csv(osp.join('.', 'Files', 'source_target_evolution_clean.csv'))
# df_dependencies = df_dependencies[(df_dependencies['Source_status']!="NEW")&(df_dependencies['Target_status']!="NEW")]
dependent_changes = set(hpr.flatten_list(df_dependencies[['Source', 'Target']].values))

### Load changes

In [11]:
df_changes = hpr.combine_openstack_data(changes_path="/Changes3/")
# df_changes = df_changes[df_changes["status"]!="NEW"]
# df_changes['is_dependent'] = df_changes['number'].map(lambda nbr: 1 if nbr in dependent_changes else 0)

Reading OpenStack changes...
OpenStack changes loaded successfully...


In [27]:
df_changes["number"].nunique()

712079

In [29]:
df2 = pd.read_csv("./Files/Data/Test/0.csv")
df2 = pd.merge(
    left=df2, 
    right=df_changes[["number", "status"]], 
    left_on='Source', 
    right_on='number', 
    how='inner',
    suffixes=('_target', '_source')
)
df2 = pd.merge(
    left=df2, 
    right=df_changes[["number", "status"]], 
    left_on='Target', 
    right_on='number', 
    how='inner',
    suffixes=('_source', '_target')
)

In [31]:
df2[df2["status_target"]=="ABANDONED"]

Unnamed: 0,Source,Target,related,pctg_inter_dep_cha,src_trgt_co_changed_nbr,dev_in_src_change_nbr,rev_in_src_change_nbr,num_shrd_desc_tkns,num_shrd_file_tkns,desc_sim,...,code_churn_source,last_mth_dep_proj_nbr_source,ratio_dep_chan_owner_source,projects_contributed_owner_source,pctg_cross_project_changes_source,pctg_cross_project_changes_owner_source,number_source,status_source,number_target,status_target
365,273587,275389,0,57.14,2,4,9,0.027027,0.054054,0.147101,...,16,4,8.333333,52,0.489933,0.5,273587,MERGED,275389,ABANDONED
366,268249,275389,0,3.06,0,6,7,0.020833,0.000000,0.013383,...,16,4,8.333333,52,0.489933,0.5,268249,MERGED,275389,ABANDONED
367,265978,275389,0,60.00,6,3,10,0.019608,0.071429,0.110363,...,16,4,8.333333,52,0.489933,0.5,265978,MERGED,275389,ABANDONED
368,266536,275389,0,33.33,5,1,26,0.000000,0.000000,0.110214,...,16,4,8.333333,52,0.489933,0.5,266536,MERGED,275389,ABANDONED
369,269970,275389,0,32.56,10,14,47,0.000000,0.000000,0.179878,...,16,4,8.333333,52,0.489933,0.5,269970,MERGED,275389,ABANDONED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131051,322784,333037,0,50.00,5,10,20,0.032258,0.000000,0.480435,...,173,4,50.000000,9,0.361111,0.0,322784,MERGED,333037,ABANDONED
131052,320999,333037,0,20.00,3,3,5,0.000000,0.000000,0.385169,...,173,4,50.000000,9,0.361111,0.0,320999,MERGED,333037,ABANDONED
131053,319464,333037,0,1.41,0,1,4,0.000000,0.000000,0.136939,...,173,4,50.000000,9,0.361111,0.0,319464,MERGED,333037,ABANDONED
131054,317251,333037,0,66.67,5,4,10,0.000000,0.000000,0.605956,...,173,4,50.000000,9,0.361111,0.0,317251,MERGED,333037,ABANDONED


In [13]:
df_developer_exp = pd.read_csv(osp.join(".", "Files", "Metrics", "project_changes_owner.csv"))
df_developer_exp['is_dependent'] = df_developer_exp['number'].map(lambda nbr: 1 if nbr in dependent_changes else 0)

In [15]:
df_changes.loc[(df_changes["insertions"]==100)&(df_changes["is_dependent"]==0), ["number", "added_lines", "insertions"]].sort_values("insertions")

Unnamed: 0,number,added_lines,insertions
116,21225,import json from tempest.common.rest_client im...,100
378616,489219,,100
382320,493712,"""""""A decorator for verifying policy enforceme...",100
382771,494247,"# Licensed under the Apache License, Version 2...",100
384375,496203,"# # Licensed under the Apache License, Version...",100
...,...,...,...
176807,242556,@mock.patch('ceilometer.pipeline.LOG') def te...,100
176853,242615,new_domain = unit.new_domain_ref() domain = u...,100
177316,243221,"with _utils.shade_exceptions(""Error authentic...",100
173985,238965,"import json def get_property_value(self, obj, ...",100


In [3]:
# combined_output = hpr.combine_file_metrics()
path = osp.join(".", "Files", "file-metrics")

files = hpr.list_file(path)
combined_output = pd.DataFrame()

for file in files:
    df_item = pd.read_csv(osp.join(path, file))
    combined_output = pd.concat((combined_output, df_item))
combined_output.drop(columns=["owner_account_id", 'status'], inplace=True)
# combined_output["changed_files"] = combined_output["changed_files"].map(ast.literal_eval)

### Combine features

In [4]:
metric_path = osp.join('.', 'Files', 'Metrics')
metric_list = [f for f in hpr.list_file(metric_path) if f[:-4]]

In [5]:
def calc_mod_file_dep_cha(row):
    changed_files = row["changed_files"]
    if type(changed_files) is not list:
        changed_files = []
    return round(100*row['num_mod_file_dep_cha']/len(changed_files), 2) if len(changed_files) != 0 else 0

In [17]:
df = clas_util.combine_features()
df = pd.merge(
    left=df,
    right=combined_output,
    left_on='number',
    right_on='number',
    how='left',
    suffixes=('_source', '_target')
)
df['pctg_mod_file_dep_cha'] = df.apply(calc_mod_file_dep_cha, axis=1)
# df['is_dependent'] = df['number'].map(lambda nbr: 1 if nbr in dependent_changes else 0)
df = df.drop(columns=[
    "changed_files", "num_mod_file_dep_cha", 'num_build_failures', 'pctg_mod_file_dep_cha'
    ])
# df.fillna(0, inplace=True)

In [22]:
change_ids = df["number"].unique()

./Changes3/data_425.csv succeeded
./Changes3/data_343.csv succeeded
./Changes3/data_357.csv succeeded
./Changes3/data_431.csv succeeded
./Changes3/data_419.csv succeeded
./Changes3/data_1198.csv succeeded
./Changes3/data_1167.csv succeeded
./Changes3/data_380.csv succeeded
./Changes3/data_1173.csv succeeded
./Changes3/data_394.csv succeeded
./Changes3/data_141.csv succeeded
./Changes3/data_627.csv succeeded
./Changes3/data_633.csv succeeded
./Changes3/data_2.csv succeeded
./Changes3/data_155.csv succeeded
./Changes3/data_169.csv succeeded
./Changes3/data_26.csv succeeded
./Changes3/data_182.csv succeeded
./Changes3/data_1365.csv succeeded
./Changes3/data_1403.csv succeeded
./Changes3/data_1417.csv succeeded
./Changes3/data_828.csv succeeded
./Changes3/data_196.csv succeeded
./Changes3/data_1371.csv succeeded
./Changes3/data_32.csv succeeded
./Changes3/data_1359.csv succeeded
./Changes3/data_800.csv succeeded
./Changes3/data_814.csv succeeded
./Changes3/data_753.csv succeeded
./Changes3

In [24]:
df.dropna(axis=1, inplace=True)

In [None]:
df.loc[(df["description_length"]==325)&(df["is_dependent"]==1), ["number", "description_length"]]

In [162]:
dep_per_project = df.groupby(['project', 'is_dependent']).count().reset_index()[['project', 'is_dependent', 'number']]
dep_per_project.rename(columns={'number': 'count'}, inplace=True)
# dep_per_project[(dep_per_project['is_dependent']==1)&(dep_per_project['count']==0)]
projects_sample = dep_per_project.sort_values(['is_dependent', 'count'], ascending=False)
projects_sample = projects_sample[(projects_sample['is_dependent']==1)&(projects_sample['count']>=300)].iloc[:, 0].unique()

### Build dataset for the 1st model

In [25]:
y = df['is_dependent']
X = df.drop(columns=['is_dependent'])

### Base estimators

In [12]:
ensemble_classifiers = clas_util.load_classifiers()

In [13]:
df_features = pd.read_csv(osp.join(".", "Results", "Correlation", "first_model.csv"))

### Training ad evaluation of the first and second model

In [None]:
training_results = {key: None for key in ensemble_classifiers.keys()}
tscv = TimeSeriesSplit(n_splits = 10)

clf_path = osp.join('.', 'Results')
if not os.path.exists(clf_path):
    os.makedirs(clf_path)

df_feat_impo = pd.DataFrame()
# df_feat_impact = pd.DataFrame()

for label, ens_clf in ensemble_classifiers.items():
    print(f'Start training with {label} classifier...')

    # if label not in ['RF']:
    #     continue

    auc_scores = []
    precision_scores = []
    recall_scores = []
    brier_scores = []
    # feature_importances = np.zeros(len(METRICS))
    feature_importances = []
    corr_features = []
    redundant_features = []
    # df_features = pd.DataFrame({"Feat":X_train.columns.tolist()})
    # for f in range(0, 10):
    #     df_features[f"Fold{f}"] = [1]*len(df_features)
    # df_features.to_csv(osp.join(".", "Results", "Correlation", "first_model.csv"), index=None)

    for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
        # if fold < 7:
        #     continue
        clone_clf = clone(ens_clf)

        # Filter training set
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        
        # corr_path = osp.join('.', 'Results', 'Correlation', f'first_model')
        # if not os.path.exists(corr_path):
        #     os.makedirs(corr_path)
            
        corr_features = df_features.loc[df_features[f'Fold{fold}']==0, 'Feat'].tolist()
        # plt.figure(figsize=(6,12))
        # dissimilarity = 1 - abs(X_train.corr())
        # Z = linkage(squareform(dissimilarity), 'complete')

        # dendrogram(Z, labels=X_train.columns, orientation='left')

        # threshold = 0.3 * max(Z[:, 2])  # Scale threshold based on the maximum distance in the dendrogram
        # plt.axvline(x=threshold, color='r', linestyle='--')

        # # Adjust the layout to make sure labels fit
        # plt.tight_layout()
        # plt.savefig(f'{corr_path}/{fold}.pdf')

        # Remove highly correlated features from the training sets
        if len(corr_features) > 0:
            X_train = X_train.drop(columns=corr_features, errors='ignore')

        # Conduct redundancy analysis
        redundant_features = clas_util.redundancy_analysis(X_train)

        # Remove indepandent variables explained by others
        X_train = X_train.drop(columns=redundant_features)

        # Instantiate the UnderSampler class then fit it on the each fold training dataset
        ros = RandomUnderSampler(random_state=0)
        
        # Perform under-sampling of the majority class(es)
        X_train, y_train = ros.fit_resample(X_train, y_train)
    
        # Train the Random Forest Classifier on the training fold set 
        clone_clf.fit(X_train, y_train)

        # Columns to exclude in the test set
        cols_exluded = corr_features + redundant_features
        
        # print(highly_correlated_features, independent_features)

        # Filter training set
        X_test = X.iloc[test_idx]
        X_test = X_test.drop(columns=cols_exluded, errors='ignore')
        y_test = y.iloc[test_idx]

        # Test the Random Forest Classifier on the test fold set 
        y_pred = clone_clf.predict(X_test)
        
        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))
        auc_scores.append(roc_auc_score(y_test, y_pred))
        brier_scores.append(brier_score_loss(y_test, y_pred))

        # if label == 'RF':
        #     df_feat_imp_item = pd.DataFrame({name: [val] for name, val in zip(X_train.columns.to_list(), clone_clf.feature_importances_)})
        #     # if 'pctg_mod_file_dep_cha' in X_train.columns:
        #     # print(X_train.columns.sort_values())
        #     df_feat_impo = pd.concat((df_feat_impo, df_feat_imp_item))
        # if label not in ['MLP']:

        #     binary_cols = list(constants.DESCRIPTION.keys())

        #     # Step 1: Calculate the median of each column
        #     medians = X_train.median()

        #     # Step 2: Calculate the standard deviation of each column
        #     std_devs = X_train.std()

        #     # Step 3: Create a new dataframe starting with the median row
        #     df_feat_impact_item = pd.DataFrame([medians])
        #     for col in binary_cols:
        #         if col in X_train.columns.tolist():
        #             df_feat_impact_item[col] = 0
        
        #     df_feat_impact_item = pd.concat([df_feat_impact_item] * (len(X_train.columns.to_list()) + 1), ignore_index=True)

        #     # Step 4: Double the number of rows according to the number of features and add standard deviation to each column
        #     for idx, col in zip(range(1, len(df_feat_impact_item.columns)+1), df_feat_impact_item.columns):
        #         if col in binary_cols:
        #             df_feat_impact_item.iloc[idx, idx-1] = 1
        #         else:
        #             df_feat_impact_item.iloc[idx, idx-1] += std_devs[col]

        #     df_feat_impact_item['pred'] = clone_clf.predict_proba(df_feat_impact_item)[:,1]
        #     proba1 = df_feat_impact_item.iloc[0, -1]
        #     df_feat_impact_item['impact'] = None

        #     df_feat_impact_item.iloc[1:, -1] = 0 if proba1 == 0 else (df_feat_impact_item.iloc[1:, -2] - proba1) / proba1

        #     df_feat_impact_item['fold'] = fold
        #     df_feat_impact_item['Classifier'] = label

        #     df_feat_impact = pd.concat((df_feat_impact, df_feat_impact_item))

        print(f"{label}, Fold: {fold+1}, Precision: {precision_scores[-1]}, Recall: {recall_scores[-1]}, AUC: {auc_scores[-1]}, Brier: {brier_scores[-1]}")

    prec_avg = np.average(precision_scores)
    recall_avg = np.average(recall_scores)
    auc_avg = np.average(auc_scores)
    brier_avg = np.average(brier_scores)

    print(f"{label}, Precision: {prec_avg}, Recall: {recall_avg}, AUC: {auc_avg}, Brier: {brier_avg}")

    training_results[label] = {
        'Classifier': label,
        'Precision': prec_avg,
        'Recall': recall_avg,
        'AUC': auc_avg,
        'Brier': brier_avg,
        'AUC Scores': auc_scores,
        'Precision Scores': precision_scores,
        'Recall Scores': recall_scores,
        'Brier Scores': brier_scores,
        # 'feature_importances': feature_importances,
        # 'features': X_train.columns.tolist()
    }

In [265]:
df_training_results = pd.DataFrame(list(training_results.values()))
df_training_results.sort_values(by=['AUC', 'Brier'], ascending=[0, 1], inplace=True)
df_training_results.to_csv(osp.join('.', 'Results', 'first_model_perf.csv'), index=None)

In [5]:
df_training_results = pd.read_csv(osp.join('.', 'Results', 'first_model_perf.csv'))
# df_training_results.sort_values(by=['AUC', 'Brier'], ascending=False, inplace=True)

In [8]:
def latex_float(f):
    float_str = "{0:.2g}".format(f)
    if "e" in float_str:
        base, exponent = float_str.split("e")
        return r"{0} \times 10^{{{1}}}".format(base, int(exponent))
    else:
        return float_str

In [6]:
df_training_results_m2 = pd.read_csv(osp.join(".", "Results", "second_model_perf.csv"))

In [9]:
for idx, row in df_training_results.iterrows():
    m2_perf = df_training_results_m2[df_training_results_m2['Classifier']==row['Classifier']]
    print("\multirow{2}{*}"+"{"+row['Classifier']+"} & AUC & $"+str(round(row['AUC']*100, 2))+"\%$ & $"+str(round(m2_perf['AUC'].values[0]*100, 2))+"\%$\\" + "\\")
    print("& Brier & $"+latex_float(row['Brier'])+"$ & $"+latex_float(m2_perf['Brier'].values[0])+"$\\"+ "\\")
    print("\\noalign{\smallskip}")
    print("\hhline{~----}")

    print("\\noalign{\smallskip}")

\multirow{2}{*}{RF} & AUC & $82.86\%$ & $80.84\%$\\
& Brier & $0.28$ & $0.072$\\
\noalign{\smallskip}
\hhline{~----}
\noalign{\smallskip}
\multirow{2}{*}{AdaBoost} & AUC & $82.72\%$ & $77.16\%$\\
& Brier & $0.27$ & $0.16$\\
\noalign{\smallskip}
\hhline{~----}
\noalign{\smallskip}
\multirow{2}{*}{XGBoost} & AUC & $82.35\%$ & $84.49\%$\\
& Brier & $0.26$ & $0.099$\\
\noalign{\smallskip}
\hhline{~----}
\noalign{\smallskip}
\multirow{2}{*}{ET} & AUC & $70.74\%$ & $64.49\%$\\
& Brier & $0.35$ & $0.3$\\
\noalign{\smallskip}
\hhline{~----}
\noalign{\smallskip}
\multirow{2}{*}{MLP} & AUC & $60.02\%$ & $69.58\%$\\
& Brier & $0.61$ & $0.17$\\
\noalign{\smallskip}
\hhline{~----}
\noalign{\smallskip}


In [28]:
df_feat_impact = df_feat_impact.reset_index(drop=True)
# df_feat_impact.to_csv(osp.join('.', 'Results', 'Impact', 'first_model_feat_impact.csv'), index=None)

Unnamed: 0,has_feature_addition,deletions,insertions,num_file_types,is_preventive,project_changes_owner,is_refactoring,whole_within_project_changes,is_merge,num_directory_files,...,pctg_mod_file_dep_cha,pred,impact,fold,max_num_mod_file_dep_cha,cross_project_changes_owner,min_num_mod_file_dep_cha,cross_project_changes,whole_changes_owner,num_dev_modified_files
0,0,2.00000,13.000000,1.000000,0,5.0,0,7.5,0,1.0,...,0.000000,0.12,,1,,,,,,
1,1,2.00000,13.000000,1.000000,0,5.0,0,7.5,0,1.0,...,0.000000,0.13,0.083333,1,,,,,,
2,0,12.89036,13.000000,1.000000,0,5.0,0,7.5,0,1.0,...,0.000000,0.09,-0.25,1,,,,,,
3,0,2.00000,239.808907,1.000000,0,5.0,0,7.5,0,1.0,...,0.000000,0.18,0.5,1,,,,,,
4,0,2.00000,13.000000,1.637287,0,5.0,0,7.5,0,1.0,...,0.000000,0.12,0.0,1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,0,2.00000,7.000000,1.000000,0,,0,15357.0,0,1.0,...,60.000000,0.58,0.074074,10,,1.0,0.000000,42.0,389.0,
236,0,2.00000,7.000000,1.000000,0,,0,15357.0,0,1.0,...,60.000000,0.55,0.018519,10,,1.0,0.000000,42.0,389.0,
237,0,2.00000,7.000000,1.000000,0,,0,15357.0,0,1.0,...,60.000000,0.61,0.12963,10,,1.0,0.000000,42.0,389.0,
238,0,2.00000,7.000000,1.000000,0,,0,15357.0,0,1.0,...,60.000000,0.62,0.148148,10,,1.0,74.604505,42.0,389.0,


### Feature importance

In [155]:
df_feat_impo.fillna(0, inplace=True)
# df_feat_impo
df_feat_impo.to_csv(osp.join('.', 'Results', 'Feature_importance', 'first_feat_impo.csv'), index=None)

### Dimension importance

In [22]:
dimensions = {
    'Change': constants.CHANGE_METRICS,
    'Text': constants.TEXT_METRICS,
    'Developer': constants.DEVELOPER_METRICS,
    'Project': constants.PROJECT_METRICS,
    'File': constants.FILE_METRICS
}

In [27]:
dimension_results = {key: [] for key in dimensions.keys()}
# Change variable to 'keep' or 'discard
dimension_type = 'keep'

tscv = TimeSeriesSplit(n_splits = 10)

clf_path = osp.join('.', 'Results')
if not os.path.exists(clf_path):
    os.makedirs(clf_path)

for label, dimension in dimensions.items():
    print(f'Start training with {label} dimension...')

    features = []
    if dimension_type == 'keep':
        features = dimension
    else:
        for lab, dim in dimensions.items():
            if lab != label:
                features += dim

    features = [f for f in features if f in X.columns.tolist()]

    auc_scores = []
    precision_scores = []
    recall_scores = []
    brier_scores = []
    
    feature_importances = []

    kept_feat = []
    corr_features = []
    redundant_features = []

    for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):

        forest_clf = RandomForestClassifier(random_state=42)

        # Filter training set
        X_train = X.iloc[train_idx]
        X_train = X_train[features]
        y_train = y.iloc[train_idx]

        # conduct the correlation analysis

        # if fold == 0:
        # corr_path = osp.join('.', 'Results', 'Correlation', f'{model}_model')
            
        corr_features = df_features.loc[df_features[f'Fold{fold}']==0, 'Feat'].tolist()
        # corr_features = clas_util.correlation_analysis(X_train, metric_imp)

        features = [f for f in features if f not in corr_features]
        # # Remove highly correlated features from the training set
        X_train = X_train[features]

        # Conduct redundancy analysis
        # if fold == 0 and len(features) > 1:
        redundant_features = clas_util.redundancy_analysis(X_train)
            # features = [f for f in features if f not in corr_features]

        # Remove indepandent variables explained by others
        if len(redundant_features) != 0:
            X_train = X_train.drop(columns=redundant_features)

        # Instantiate the UnderSampler class then fit it on the each fold training dataset
        ros = RandomUnderSampler(random_state=0)

        # Perform under-sampling of the majority class(es)
        X_train, y_train = ros.fit_resample(X_train, y_train)
    
        # Train the Random Forest Classifier on the training fold set 
        forest_clf.fit(X_train, y_train)

        # Filter training set
        X_test = X.iloc[test_idx]
        X_test = X_test[X_train.columns.tolist()]
        y_test = y.iloc[test_idx]

        # Test the Random Forest Classifier on the test fold set 
        y_pred = forest_clf.predict(X_test)

        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_pred)
        brier = brier_score_loss(y_test, y_pred)

        precision_scores.append(precision)
        recall_scores.append(recall)
        auc_scores.append(auc)
        brier_scores.append(brier)

        # feature_importances.app

        print(f"Dimension: {label}, Fold: {fold+1}, Precision: {precision}, Recall: {recall}, AUC: {auc}, Brier: {brier}")

        kept_feat = X_train.columns.tolist()

    prec_avg = np.average(precision_scores)
    recall_avg = np.average(recall_scores)
    auc_avg = np.average(auc_scores)
    brier_avg = np.average(brier_scores)

    print(f"{label}, Precision: {prec_avg}, Recall: {recall_avg}, AUC: {auc_avg}")
    dimension_results[label] += [{
        'Dimension': label,
        'Precision': prec_avg,
        'Recall': recall_avg,
        'AUC': auc_avg,
        'Brier': brier_avg
    }]

Start training with Change dimension...
Dimension: Change, Fold: 1, Precision: 0.015423621381787449, Recall: 0.35138387484957884, AUC: 0.5298755534207842, Brier: 0.2962144964245448
Dimension: Change, Fold: 2, Precision: 0.05910109708623334, Recall: 0.5084573748308525, AUC: 0.5606082572556943, Brier: 0.39200271827266125
Dimension: Change, Fold: 3, Precision: 0.06943020916372471, Recall: 0.5462962962962963, AUC: 0.5735194897009421, Brier: 0.4020726829042272
Dimension: Change, Fold: 4, Precision: 0.07304228362082126, Recall: 0.5445234708392603, AUC: 0.5739177322840148, Brier: 0.3998795310979659
Dimension: Change, Fold: 5, Precision: 0.0955488938640762, Recall: 0.5478677110530896, AUC: 0.5758074735877574, Brier: 0.40021931518062615
Dimension: Change, Fold: 6, Precision: 0.1363697388735987, Recall: 0.5768848502058782, AUC: 0.5654856110172605, Brier: 0.44343367260259164
Dimension: Change, Fold: 7, Precision: 0.11440801990340992, Recall: 0.5274076572777872, AUC: 0.5579419870257819, Brier: 0.4

In [28]:
dim_imp = pd.DataFrame([item[0] for item in list(dimension_results.values())])
dim_imp.to_csv(osp.join('.', 'Results', 'Feature_importance', f'first_model_{dimension_type}_dim.csv'), index=None)

In [25]:
dim_imp

Unnamed: 0,Dimension,Precision,Recall,AUC,Brier
0,Change,0.184608,0.931897,0.819432,0.276269
1,Text,0.171462,0.941881,0.811367,0.299549
2,Developer,0.10439,0.860847,0.64606,0.541653
3,Project,0.176986,0.955364,0.817475,0.299555
4,File,0.180999,0.946864,0.821739,0.284503


In [4]:
def round_number(nbr):
    return round(100 * nbr, 2)

In [29]:
first_model_keep_dim = pd.read_csv(osp.join(".", "Results", "Feature_importance", "first_model_keep_dim.csv"))
first_model_discard_dim = pd.read_csv(osp.join(".", "Results", "Feature_importance", "first_model_discard_dim.csv"))
second_model_keep_dim = pd.read_csv(osp.join(".", "Results", "Feature_importance", "second_model_keep_dim.csv"))
second_model_discard_dim = pd.read_csv(osp.join(".", "Results", "Feature_importance", "second_model_discard_dim.csv"))

first_model_perf = pd.read_csv(osp.join(".", "Results", "first_model_perf.csv"))
second_model_perf = pd.read_csv(osp.join(".", "Results", "second_model_perf.csv")).sort_values(by='AUC', ascending=False)
first_model_auc = round_number(first_model_perf['AUC'].values[0])
second_model_auc = round_number(second_model_perf['AUC'].values[0])
only_pair_auc = round_number(second_model_keep_dim.loc[second_model_keep_dim['Dimension']=="Pairs", 'AUC'].values[0])
without_pair_auc = round_number(second_model_discard_dim.loc[second_model_discard_dim['Dimension']=="Pairs", 'AUC'].values[0])

In [31]:
for idx, row in first_model_keep_dim.iterrows():
    auc_m2_keep = round_number(second_model_keep_dim.loc[second_model_keep_dim['Dimension']==row['Dimension'], "AUC"].values[0])
    auc_m1_discard = round_number(first_model_discard_dim.loc[first_model_discard_dim['Dimension']==row['Dimension'], "AUC"].values[0])
    auc_m2_discard = round_number(second_model_discard_dim.loc[second_model_discard_dim['Dimension']==row['Dimension'], "AUC"].values[0])
    print("\\multirow{2}{*}{\\textbf{"+row['Dimension']+"}} & Only & "+str(round_number(row['AUC']))+"\\% & "+str(auc_m2_keep)+"\\% \\" + "\\")
    print("& Without & "+str(auc_m1_discard)+"\% &"+ str(auc_m2_discard)+"\%\\"+ "\\")
    print("\\noalign{\smallskip}")
    print("\hhline{~----}")
    print("\\noalign{\smallskip}")
print("\\multirow{2}{*}{\\textbf{Pair}} & Only & N/A &"+ str(only_pair_auc)+"\\%\\\\")
print("\\noalign{\smallskip}")
print("& Without & N/A & "+str(without_pair_auc)+"\\%\\\\")
print("\\noalign{\smallskip}")
print("\\hline")
print("\\noalign{\smallskip}")
print("\\textbf{All} & All & "+str(first_model_auc)+"\\% & "+str(second_model_auc)+"\\%\\\\")
print("\\noalign{\smallskip}")

\multirow{2}{*}{\textbf{Change}} & Only & 56.32\% & 62.58\% \\
& Without & 81.94\% &83.1\%\\
\noalign{\smallskip}
\hhline{~----}
\noalign{\smallskip}
\multirow{2}{*}{\textbf{Text}} & Only & 57.27\% & 58.7\% \\
& Without & 81.14\% &82.53\%\\
\noalign{\smallskip}
\hhline{~----}
\noalign{\smallskip}
\multirow{2}{*}{\textbf{Developer}} & Only & 79.5\% & 79.8\% \\
& Without & 64.61\% &80.9\%\\
\noalign{\smallskip}
\hhline{~----}
\noalign{\smallskip}
\multirow{2}{*}{\textbf{Project}} & Only & 54.35\% & 61.68\% \\
& Without & 81.75\% &83.64\%\\
\noalign{\smallskip}
\hhline{~----}
\noalign{\smallskip}
\multirow{2}{*}{\textbf{File}} & Only & 54.43\% & 52.6\% \\
& Without & 82.17\% &83.1\%\\
\noalign{\smallskip}
\hhline{~----}
\noalign{\smallskip}
\multirow{2}{*}{\textbf{Pair}} & Only & N/A &76.4\%\\
\noalign{\smallskip}
& Without & N/A & 77.11\%\\
\noalign{\smallskip}
\hline
\noalign{\smallskip}
\textbf{All} & All & 82.86\% & 84.49\%\\
\noalign{\smallskip}


### Impact

In [156]:
df_feat_impact = df_feat_impact[df_feat_impact['impact'].notnull()]
df_feat_impact = df_feat_impact.reset_index(drop=True)
# df_feat_impact.to_csv(osp.join('.', 'Results', 'Impact', 'first_model_feat_impact.csv'), index=None)

In [147]:
df_feat_impact_2 = pd.read_csv(osp.join('.', 'Results', 'Impact', 'first_model_feat_impact.csv'))
# df_feat_impact = df_feat_impact[df_feat_impact['impact'].notnull()]
# attr = "insertions"
# test = df_feat_impact.loc[(df_feat_impact[attr].duplicated()==False)&(df_feat_impact['fold']!=0), [attr, "fold", "impact"]].sort_values("impact")
# test = test[test['impact'].notnull()==True].iloc[:-1, -1]
# test = f" & {round(test.min(), 2)} & {round(test.median(), 2)} & {round(test.max(), 2)}"
# test

In [157]:
# Creating a DataFrame from the new set of data
data = {
    'Feature': [
        'ratio_dep_chan_owner', 'project_changes_owner', 'pctg_cross_project_changes',
        'description_length', 'whole_within_project_changes', 'project_age',
        'projects_contributed_owner', 'pctg_cross_project_changes_owner', 'subject_length',
        'cross_project_changes_owner', 'num_file_changes', 'insertions',
        'last_mth_dep_proj_nbr', 'deletions', 'max_num_mod_file_dep_cha',
        'num_directory_files', 'cross_project_changes', 'whole_changes_owner',
        'num_file_types', 'has_feature_addition', 'is_corrective',
        'avg_num_dev_modified_files', 'is_refactoring', 'min_num_mod_file_dep_cha',
        'is_preventive', 'is_non_functional', 'num_dev_modified_files',
        'is_merge'
    ],
    'Ranking': [
        1, 2, 2, 3, 4, 5, 6, 6, 7, 7, 7, 8, 9, 9, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 16, 17, 17, 18
    ]
}

# Creating the dataframe
df_most_impo_feat = pd.DataFrame(data)
classifiers = df_feat_impact['Classifier'].unique().tolist()
df_most_impo_feat['Classifier'] = [classifiers for _ in range(len(df_most_impo_feat))]
df_most_impo_feat = df_most_impo_feat.explode("Classifier")
df_most_impo_feat.reset_index(drop=True, inplace=True)

In [158]:
def map_impact_to_fold(row):
    # print(row)
    for i in range(10):
        row[f'fold{i}'] = df_feat_impact.loc[(df_feat_impact['Classifier']==row['Classifier'])&(df_feat_impact['fold']==i), [row['Feature'],  'fold', 'impact']].sort_values(by=row['Feature']).iloc[-1, -1]
    # print(row)
    return row

def retrieve_impact(row, func):
    df_sub = [row[f'fold{i}'] for i in range(10)]
    return func(df_sub)

In [159]:
df_most_impo_feat = df_most_impo_feat.apply(map_impact_to_fold, axis=1)
df_most_impo_feat['mean'] = df_most_impo_feat.apply(retrieve_impact, args=(np.mean,), axis=1) 
# df_most_impo_feat['median'] = df_most_impo_feat.apply(retrieve_impact, args=(np.median,), axis=1) 
# df_most_impo_feat['max'] = df_most_impo_feat.apply(retrieve_impact, args=(max,), axis=1) 
# df_most_impo_feat['max'] = df_most_impo_feat.apply(retrieve_impact, args=(max,), axis=1) 

In [161]:
df_most_impo_feat.to_csv(osp.join('.', 'Results', 'first_feat_impact_import.csv'), index=None)

In [None]:
for idx, row in df_most_impo_feat[:10].iterrows():
    print(row["Feature"].replace("_", "\\_")+" & "+str(row['Ranking'])+" & "+str(round(row['min'], 2))+" & "+str(round(row['median'], 2))+" & "+str(round(row['max'], 2))+" \\\\")