In [3]:
import os
import os.path as osp
import pandas as pd
import numpy as np
from datetime import datetime

from datetime import timedelta

from xgboost import XGBClassifier
from sklearn.metrics import  precision_score, recall_score, f1_score, fbeta_score, roc_auc_score, brier_score_loss
from sklearn.base import clone
from sklearn.model_selection import TimeSeriesSplit
from imblearn.under_sampling import RandomUnderSampler

# %matplotlib inline
import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split
# from alibi.explainers import ALE, plot_ale

from utils import helpers as hpr
from utils import constants
import utils.classifier_util as clas_util

from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform

### Load changes

In [None]:
df_changes = hpr.combine_openstack_data(changes_path="/Changes3/")
df_changes = df_changes[df_changes['status']!="NEW"]
# df_changes['changed_files'] = df_changes['changed_files'].map(hpr.combine_changed_file_names)
# df_changes['commit_message'] = df_changes['commit_message'].map(hpr.preprocess_change_description)

In [148]:
import ast

code = ["def purge_data(args):",
    "    \"\"\"Run data purge process",
    "    @raises Exception should the purge fail\"\"\"",
    "    hookenv.action_set({",
    "        'output': utils.purge_stale_soft_deleted_rows(",
    "            before=hookenv.action_get('before'),",
    "        )",
    "    })",
    "",
    ""
]

tree = ast.parse("\n".join(code))  # Generates the AST

In [149]:
class CodeAnalyzer(ast.NodeVisitor):
    def __init__(self):
        self.method_calls = 0
        self.variables = set()  # Stores unique variable names
        self.functions = []
        self.imports = []

    def visit_Call(self, node):
        # Count method/function calls (e.g., print(), foo())
        self.method_calls += 1
        self.generic_visit(node)  # Continue traversing child nodes

    def visit_Name(self, node):
        # Track variable names (e.g., x, y)
        if isinstance(node.ctx, ast.Load):  # Only count reads, not writes
            self.variables.add(node.id)
        self.generic_visit(node)

    def visit_FunctionDef(self, node):
        # Track function definitions (e.g., foo)
        self.functions.append(node.name)
        self.generic_visit(node)

    def visit_Import(self, node):
        # Track imports (e.g., import os)
        for alias in node.names:
            self.imports.append(alias.name)
        self.generic_visit(node)

    def visit_ImportFrom(self, node):
        # Track from-imports (e.g., from sys import path)
        self.imports.append(f"{node.module}.{[alias.name for alias in node.names][0]}")
        self.generic_visit(node)

# Analyze the code
analyzer = CodeAnalyzer()
analyzer.visit(tree)

print("Method calls:", analyzer.method_calls)
print("Variables used:", analyzer.variables)
print("Functions defined:", analyzer.functions)
print("Imports:", analyzer.imports)

Method calls: 3
Variables used: {'utils', 'hookenv'}
Functions defined: ['purge_data']
Imports: []


### Load dependencies

In [2]:
df_deps = pd.read_csv("./Files/source_target_evolution_clean.csv")

In [3]:
dependent_changes = set(hpr.flatten_list(df_deps[['Source', 'Target']].values))

In [118]:
df_changes['is_dependent'] = df_changes['number'].map(lambda nbr: 1 if nbr in dependent_changes else 0)

In [123]:
df_changes['is_dependent'].value_counts()

is_dependent
0    675706
1     49357
Name: count, dtype: int64

In [125]:
df_changes.loc[df_changes['is_dependent']!=2, "status"].value_counts()

status
MERGED       578933
ABANDONED    133290
NEW           12840
Name: count, dtype: int64

### Combine features

In [81]:
df = clas_util.combine_features()
df = df.drop(columns=['num_build_failures'])
# df = pd.merge(
#     left=df, 
#     right=df_changes[['number', 'created', 'project', 'owner_account_id']], 
#     left_on='number', 
#     right_on='number', 
#     how='inner',
#     suffixes=('_source', '_target')
# )
# df['is_dependent'] = df['number'].map(lambda nbr: 1 if nbr in dependent_changes else 0)
# df['is_cross'] = df['number'].map(is_cross_project)

In [4]:
ensemble_classifiers = clas_util.load_classifiers()

In [5]:
df_features = pd.read_csv(osp.join(".", "Results", "Correlation", "second_model.csv"))

In [11]:
df_changed_lines = pd.read_csv(osp.join(".", "Files", "changed_lines.csv"))

In [7]:
path = osp.join(".", "Files", "file-metrics")
combined_output = hpr.combine_file_metrics(path)

In [None]:
def fill_in_empty_values(row, c):
    # print(f"{row[c]=}")
    if not row[c]:
        splits = c.split("_")
        src_or_trgt = splits[-1].capitalize()
        metric = "_".join(splits[:-1])
        # print(row[src_or_trgt], metric)
        new_value = combined_output.loc[combined_output["number"]==row[src_or_trgt], metric].values[0]
        print(new_value)
        row[c] = new_value
    return row

In [6]:
# training_results = {key: None for key in ensemble_classifiers.keys()}
training_results = []
tscv = TimeSeriesSplit(n_splits = 10)
clf_path = osp.join('.', 'Results')

df_feat_impo = pd.DataFrame()
df_feat_impact = pd.DataFrame()

if not os.path.exists(clf_path):
    os.makedirs(clf_path)

for dev_type in ['All', 'Different', 'Same']:
    # if dev_type != 'All':
    #     continue

    for label, ens_clf in ensemble_classifiers.items():
        print(f'Start training with classifier: {label} and Developer type: {dev_type}...')

        if label != 'XGBoost':
            continue
        top3_results = {i: None for i in range(10)}
        top5_results = {i: None for i in range(10)}
        top7_results = {i: None for i in range(10)}
        top10_results = {i: None for i in range(10)}
        top20_results = {i: None for i in range(20)}
        top30_results = {i: None for i in range(20)}
        top40_results = {i: None for i in range(20)}
        top50_results = {i: None for i in range(20)}
        top60_results = {i: None for i in range(20)}
        top70_results = {i: None for i in range(20)}
        top80_results = {i: None for i in range(20)}
        top90_results = {i: None for i in range(20)}
        top100_results = {i: None for i in range(20)}

        auc_scores = []
        precision_scores = []
        recall_scores = []
        f1_scores = []
        f2_scores = []
        brier_scores = []
        feature_importances = []
        features = []
        corr_features = []
        redundant_features = []

        # for config in [
        #     'Source_train_dependent',
        #     # 'Source_train_independent',
        #     # 'Source_test_dependent',
        #     # 'Source_test_independent',
        #     # 'Source_train_test_dependent',
        #     # 'Source_train_test_independent',
        # ]:
        for fold in range(0, 10):

            # if fold not in [9]:
            #     continue

            clone_clf = clone(ens_clf)

            # Split training data into features and labels
            X_train = pd.read_csv(osp.join(".", "Files", "Data", "Train", f"{fold}.csv"))
            X_train.fillna(0, inplace=True)

            # if config in ['Source_train_dependent', 'Source_train_test_dependent']:
            #     X_train = X_train[X_train['Source'].isin(dependent_changes)]
            # elif config in ['Source_train_independent', 'Source_train_test_independent']:
            #     X_train = X_train[~X_train['Source'].isin(dependent_changes)]
            # if dev_type == "Different":
            #     X_train = X_train[X_train['Source_author']!=X_train['Target_author']]
            # elif dev_type == "Same":
            #     X_train = X_train[X_train['Source_author']==X_train['Target_author']]

            y_train = X_train['related']

            # X_train = X_train.drop(columns=["related"])
            # pd.DataFrame({'col': X_train.columns.tolist()}).to_csv("test.csv", index=None)
            # df_test = pd.concat((df_test, X_train.iloc[:1]))
            # desc_model = clas_util.doc2vec_model(df_changes, X_train[['Source', 'Target']].values, fold)
            # subject_model = clas_util.doc2vec_model(df_changes, X_train[['Source', 'Target']].values, fold, "subject")
            # add_lines_model = clas_util.doc2vec_model(df_changed_lines, X_train[['Source', 'Target']].values, fold, 'added_lines')
            # del_lines_model = clas_util.doc2vec_model(df_changed_lines, X_train[['Source', 'Target']].values, fold, 'deleted_lines')

            # X_train = clas_util.compute_embdedding_similarity(df_changes, desc_model, X_train, 'commit_message', 'desc')
            # X_train = clas_util.compute_embdedding_similarity(df_changes, subject_model, X_train, 'subject', 'subject')
            # X_train = clas_util.compute_embdedding_similarity(df_changed_lines, add_lines_model, X_train, 'added_lines', 'add_lines')
            # X_train = clas_util.compute_embdedding_similarity(df_changed_lines, del_lines_model, X_train, 'deleted_lines', 'del_lines')

            # X_train.to_csv(osp.join(".", "Files", "Data", "Train", f"{fold}.csv"), index=None)
            X_train = X_train.drop(columns=['Source', 'Target', 'related', 'Source_date', 'Source_author', 'Target_date', 'Target_author', 'add_lines_sim', 'del_lines_sim', 'is_cross'])
            ################# TO REMOVE
            # if dev_type != 'All':
            #     ros = RandomUnderSampler(random_state=0)
            
            #     # Perform under-sampling of the majority class(es)
            #     X_train, y_train = ros.fit_resample(X_train, y_train)
            #################
            # corr_path = osp.join('.', 'Results', 'Correlation')
            # if fold == 0:
            #     df_features = pd.DataFrame({'Feat': X_train.columns.tolist()})
            #     for f in range(0, 10):
            #         df_features[f"Fold{f}"] = [1]*len(df_features)
            #     df_features.to_csv(f"{corr_path}/second_model.csv", index=None)
            

            # conduct the correlation analysis
            # if fold == 0:
            

        
            # if not os.path.exists(corr_path):
            #     os.makedirs(corr_path)
                
            # pd.DataFrame({'Features': corr_features}).to_csv(f'{corr_path}/{fold+1}.csv', index=None)
            # plt.figure(figsize=(6,12))
            # dissimilarity = 1 - abs(X_train.corr())
            # Z = linkage(squareform(dissimilarity), 'complete')

            # dendrogram(Z, labels=X_train.columns, orientation='left')

            # threshold = 0.3 * max(Z[:, 2])  # Scale threshold based on the maximum distance in the dendrogram
            # plt.axvline(x=threshold, color='r', linestyle='--')

            # # # Adjust the layout to make sure labels fit
            # plt.tight_layout()
            # plt.savefig(f'{corr_path}/second_model/{fold}.pdf')

            corr_features = df_features.loc[df_features[f'Fold{fold}']==fold, 'Feat'].tolist() + ['rev_in_src_change_nbr']
            # Remove highly correlated features from the training set
            X_train = X_train.drop(columns=corr_features, errors='ignore')

            # Conduct redundancy analysis
            redundant_features = clas_util.redundancy_analysis(X_train)

            # Remove indepandent variables explained by others
            X_train = X_train.drop(columns=redundant_features)

            # if fold == 0:
            #     break

            # Instantiate the OverSampler class then fit it on the each fold training dataset
            # features = X_train.columns.tolist()
            # print(f'len(X_train) {len(X_train)}')

            # Columns to exclude in the test set
            # cols_exluded = corr_features + redundant_features 

            # Train the Random Forest Classifier on the training fold set 
            clone_clf.fit(X_train, y_train)

            X_test = pd.read_csv(osp.join(".", "Files", "Data", "Test", f"temp100_15_{fold}.csv"))
            
            # if config in ['Source_test_dependent', 'Source_train_test_dependent']:
            #     X_test = X_test[X_test['Source'].isin(dependent_changes)]
            # elif config in ['Source_test_independent', 'Source_train_test_independent']:
            #     X_test = X_test[~X_test['Source'].isin(dependent_changes)]

            # X_test = X_test[X_test['Source'].isin(dependent_changes)]
            if dev_type == "Different":
                X_test = X_test[X_test['Source_author']!=X_test['Target_author']]
            elif dev_type == "Same":
                X_test = X_test[X_test['Source_author']==X_test['Target_author']]

            # X_test.fillna(0, inplace=True)
            y_test = X_test['related']
            # if len(y_test[y_test == 1]) == 0:
            #     continue
            # X_test = X_test.drop(columns=["related"])
            # X_test_pairs = X_test[['Source', 'Target', 'related']]

            # X_test = clas_util.compute_embdedding_similarity(df_changes, desc_model, X_test, 'commit_message', 'desc')
            # X_test = clas_util.compute_embdedding_similarity(df_changes, desc_model, X_test, 'subject', 'subject')
            # X_test = clas_util.compute_embdedding_similarity(df_changed_lines, add_lines_model, X_test, 'added_lines', 'add_lines')
            # X_test = clas_util.compute_embdedding_similarity(df_changed_lines, del_lines_model, X_test, 'deleted_lines', 'del_lines')
            
            # X_test = X_test.drop(columns=cols_dropped+corr_features+redundant_features)
            # X_test = X_test.drop(columns=['Source', 'Target'])
            # X_test.to_csv(osp.join(".", "Files", "Data", "Test", f"{fold}.csv"), index=None)
            X_test_topk = X_test[['Source', 'Target', 'related']]
            X_test_topk.reset_index(drop=True, inplace=True)
            X_test = X_test[X_train.columns.tolist()]

            # print(X_train.columns, )
            # Test the Random Forest Classifier on the test fold set
            if label != "AdaBoost":
                y_probs = clone_clf.predict_proba(X_test)[:, 1]

                # Set custom threshold
                threshold = 0.75
                y_pred = [1 if p >= threshold else 0 for p in y_probs]
            else:
                y_pred = clone_clf.predict(X_test)

            # y_pred_prob = clone_clf.predict_proba(X_test)[:, 1]
            # X_test_pairs['pred'] = y_pred_prob
            # compute_top_k_prec_recal(X_test_pairs, label, fold+1)
            # Compute topk precision and recall
            df_topk = pd.merge(X_test_topk, pd.DataFrame({'pred': y_probs}), left_index=True, right_index=True)
            top3_results[fold] = clas_util.compute_topk_precision_recall(df_topk)
            top5_results[fold] = clas_util.compute_topk_precision_recall(df_topk, k=5)
            top7_results[fold] = clas_util.compute_topk_precision_recall(df_topk, k=7)
            top10_results[fold] = clas_util.compute_topk_precision_recall(df_topk, k=10)
            top20_results[fold] = clas_util.compute_topk_precision_recall(df_topk, k=20)
            top30_results[fold] = clas_util.compute_topk_precision_recall(df_topk, k=30)
            top40_results[fold] = clas_util.compute_topk_precision_recall(df_topk, k=40)
            top50_results[fold] = clas_util.compute_topk_precision_recall(df_topk, k=50)
            top60_results[fold] = clas_util.compute_topk_precision_recall(df_topk, k=60)
            top70_results[fold] = clas_util.compute_topk_precision_recall(df_topk, k=70)
            top80_results[fold] = clas_util.compute_topk_precision_recall(df_topk, k=80)
            top90_results[fold] = clas_util.compute_topk_precision_recall(df_topk, k=90)
            top100_results[fold] = clas_util.compute_topk_precision_recall(df_topk, k=100)

            precision_scores.append(precision_score(y_test, y_pred))
            recall_scores.append(recall_score(y_test, y_pred))
            f1_scores.append(f1_score(y_test, y_pred))
            f2_scores.append(fbeta_score(y_test, y_pred, beta=2))
            auc_scores.append(roc_auc_score(y_test, y_pred))
            brier_scores.append(brier_score_loss(y_test, y_pred))

            print(f"{label}, Fold: {fold+1}, Precision: {precision_scores[-1]}, Recall: {recall_scores[-1]}, F1-Score: {f1_scores[-1]}, F2-Score: {f2_scores[-1]}, AUC: {auc_scores[-1]}, Brier: {brier_scores[-1]}")

            # logging.debug(msg=f"{label}, Fold: {fold+1}, Precision: {precision_scores[-1]}, Recall: {recall_scores[-1]}, AUC: {auc_scores[-1]}")
            if label == 'XGBoost': 
                df_feat_imp_item = pd.DataFrame({name: [val] for name, val in zip(X_train.columns.to_list(), clone_clf.feature_importances_)})
                df_feat_impo = pd.concat((df_feat_impo, df_feat_imp_item))

            if label not in ['MLP']:

                binary_cols = list(constants.DESCRIPTION.keys())
                binary_cols = hpr.flatten_list([[f'{c}_source', f'{c}_target'] for c in binary_cols])

                # Step 1: Calculate the median of each column
                medians = X_train.median()

                # Step 2: Calculate the standard deviation of each column
                std_devs = X_train.std()

                # Step 3: Create a new dataframe starting with the median row
                df_feat_impact_item = pd.DataFrame([medians])
                for col in binary_cols:
                    if col in X_train.columns.tolist():
                        df_feat_impact_item[col] = 0
                df_feat_impact_item = pd.concat([df_feat_impact_item] * (len(X_train.columns.to_list()) + 1), ignore_index=True)


                # Step 4: Double the number of rows according to the number of features and add standard deviation to each column
                for idx, col in zip(range(1, len(df_feat_impact_item.columns)+1), df_feat_impact_item.columns):
                    # df_feat_impact_item.iloc[idx, idx-1] += std_devs[col]

                    if col in binary_cols:
                        df_feat_impact_item.iloc[idx, idx-1] = 1
                    else:
                        df_feat_impact_item.iloc[idx, idx-1] += std_devs[col]

                df_feat_impact_item['pred'] = clone_clf.predict_proba(df_feat_impact_item)[:,1]
                proba1 = df_feat_impact_item.iloc[0, -1]
                df_feat_impact_item['impact'] = None

                df_feat_impact_item.iloc[1:, -1] = 0 if proba1 == 0 else (df_feat_impact_item.iloc[1:, -2] - proba1) / proba1

                df_feat_impact_item['fold'] = fold
                df_feat_impact_item['Classifier'] = label
                df_feat_impact_item['pair_type'] = dev_type

                df_feat_impact = pd.concat((df_feat_impact, df_feat_impact_item))

        # feature_importances /= (fold+1)
        prec_avg = np.average(precision_scores)
        recall_avg = np.average(recall_scores)
        f1_avg = np.average(f1_scores)
        f2_avg = np.average(f2_scores)
        auc_avg = np.average(auc_scores)
        brier_avg = np.average(brier_scores)

        print(f"{label}, Precision: {prec_avg}, Recall: {recall_avg}, AUC: {auc_avg}, Brier: {brier_avg}")

        df_topk = pd.DataFrame({})
        for df_topk_result in [top3_results, top5_results, top7_results, top10_results, top20_results, top30_results, top40_results, top50_results, top60_results, top70_results, top80_results, top90_results, top100_results]:
            df_topk_item = pd.DataFrame({})
            for df_fold_topk in df_topk_result.values():
                df_topk_item = pd.concat((df_topk_item, df_fold_topk))
    
            if df_topk.empty:
                df_topk = df_topk_item
            else:
                df_topk = pd.merge(
                    df_topk, df_topk_item,
                    on="Target",
                    how="left"
                )
            

        # training_results[label] = 
        training_results.append({
            # 'Config': config,
            'Developer': dev_type,
            'Classifier': label,
            'Precision': prec_avg,
            'Recall': recall_avg,
            'F1': f1_avg,
            'F2': f2_avg,
            'AUC': auc_avg,
            'Brier': brier_avg,
            'AUC Scores': auc_scores,
            'Precision Scores': precision_scores,
            'F1 Scores': f1_scores,
            'F2 Scores': f2_scores,
            'Recall Scores': recall_scores,
            'Brier Scores': brier_scores
        } | {col: df_topk[col].median() for col in df_topk.columns[1:]})
        # training_results[label]['Precision Scores'] += precision_scores
        # training_results[label]['AUC Scores'] += auc_scores
        # training_results[label]['Recall Scores'] += recall_scores
        # training_results[label]['Brier Scores'] += brier_scores

Start training with classifier: ET and Developer type: All...
Start training with classifier: RF and Developer type: All...
Start training with classifier: XGBoost and Developer type: All...
XGBoost, Fold: 1, Precision: 0.0048226254700016345, Recall: 0.9076923076923077, F1-Score: 0.009594275957394911, F2-Score: 0.023611333440051223, AUC: 0.9300305754259042, Brier: 0.047642514911508754
XGBoost, Fold: 2, Precision: 0.0034291949185566205, Recall: 0.8, F1-Score: 0.006829116871022815, F2-Score: 0.01685694582790591, AUC: 0.8740813861097148, Brier: 0.051870255460400194
XGBoost, Fold: 3, Precision: 0.006536918734214827, Recall: 0.88, F1-Score: 0.01297743695620115, F2-Score: 0.031741451450007216, AUC: 0.9262800937229427, Brier: 0.027458799492916835
XGBoost, Fold: 4, Precision: 0.007109617373319545, Recall: 0.9016393442622951, F1-Score: 0.01410799025266128, F2-Score: 0.03446115288220551, AUC: 0.9358870739251589, Brier: 0.029881438289601554
XGBoost, Fold: 5, Precision: 0.007837226827430294, Recal

In [7]:
df_model2_perf = pd.DataFrame(training_results)
df_model2_perf.sort_values("AUC", ascending=0, inplace=True)
# df_model2_perf
# df_model2_perf.to_csv("./Results/second_model_perf_new.csv", index=None)

In [None]:
df_model2_perf.iloc[:,[0, 1]+[n for n in range(-26, 0, 1)]].T # Target is dependent and Source is ALL

In [9]:
df_model2_perf = pd.read_csv("./Results/second_model_perf_new.csv")
df_model2_perf.sort_values("AUC", ascending=0, inplace=True)

In [10]:
# Pivot the table to compute the difference
pivot_df = df_model2_perf.pivot(index='Classifier', columns='Developer', values='AUC')

# Calculate the AUC difference between All and Different
pivot_df['AUC Difference (All - Different)'] = 100*(pivot_df['All'] - pivot_df['Different'])

pivot_df.reset_index(inplace=True)
pivot_df[['Classifier', 'AUC Difference (All - Different)']]

Developer,Classifier,AUC Difference (All - Different)
0,AdaBoost,10.304237
1,ET,8.932527
2,MLP,15.978754
3,RF,15.848726
4,XGBoost,15.516515


In [13]:
# Generate LaTeX table
latex_code = """\\begin{table}[h]
\\centering
\\caption{Performance of the \\nth{2} model when the pairs are made by (1) \\textbf{all}, (2) \\textbf{same}, and (3) \\textbf{different} developers.}
\\label{tab:model2_perf_new}
\\begin{adjustbox}{width=\\textwidth}
\\begin{tabular}{llcccccccccc}
\\toprule
Classifier & Developer & top-3-prec (\\%) & top-3-recall (\\%) & top-5-prec (\\%) & top-5-recall (\\%) & top-7-prec (\\%) & top-7-recall (\\%) & top-10-prec (\\%) & top-10-recall (\\%) & AUC (\\%) & Brier Score \\\\
\\midrule
"""
top3_precision_max = 100 * df_model2_perf['Top3_Precision'].max()
top3_recall_max = 100 * df_model2_perf['Top3_Recall'].max()
top5_precision_max = 100 * df_model2_perf['Top5_Precision'].max()
top5_recall_max = 100 * df_model2_perf['Top5_Recall'].max()
top7_precision_max = 100 * df_model2_perf['Top7_Precision'].max()
top7_recall_max = 100 * df_model2_perf['Top7_Recall'].max()
top10_precision_max = 100 * df_model2_perf['Top10_Precision'].max()
top10_recall_max = 100 * df_model2_perf['Top10_Recall'].max()
auc_max = 100 * df_model2_perf['AUC'].max()
brier_min = df_model2_perf['Brier'].min()
# Group by classifier and add rows
classifiers = df_model2_perf['Classifier'].unique()
for i, clf in enumerate(classifiers):
    group = df_model2_perf[df_model2_perf['Classifier'] == clf]
    for j, (_, row) in enumerate(group.iterrows()):
        top3_precision_percent = 100 * row['Top3_Precision']  # Convert to percentage
        if top3_precision_percent == top3_precision_max:
            top3_precision_percent = "\\textbf{" + f"{top3_precision_percent:.2f}" + "}"
        else:
            top3_precision_percent = f"{top3_precision_percent:.2f}"
        
        top3_recall_percent = 100 * row['Top3_Recall']  # Convert to percentage
        if top3_recall_percent == top3_recall_max:
            top3_recall_percent = "\\textbf{" + f"{top3_recall_percent:.2f}" + "}"
        else:
            top3_recall_percent = f"{top3_recall_percent:.2f}"
        
        top5_precision_percent = 100 * row['Top5_Precision']  # Convert to percentage
        if top5_precision_percent == top5_precision_max:
            top5_precision_percent = "\\textbf{" + f"{top5_precision_percent:.2f}" + "}"
        else:
            top5_precision_percent = f"{top5_precision_percent:.2f}"
        
        top5_recall_percent = 100 * row['Top5_Recall']  # Convert to percentage
        if top5_recall_percent == top5_recall_max:
            top5_recall_percent = "\\textbf{" + f"{top5_recall_percent:.2f}" + "}"
        else:
            top5_recall_percent = f"{top5_recall_percent:.2f}"
        
        top7_precision_percent = 100 * row['Top7_Precision']  # Convert to percentage
        if top7_precision_percent == top7_precision_max:
            top7_precision_percent = "\\textbf{" + f"{top7_precision_percent:.2f}" + "}"
        else:
            top7_precision_percent = f"{top7_precision_percent:.2f}"
        
        top7_recall_percent = 100 * row['Top7_Recall']  # Convert to percentage
        if top7_recall_percent == top7_recall_max:
            top7_recall_percent = "\\textbf{" + f"{top7_recall_percent:.2f}" + "}"
        else:
            top7_recall_percent = f"{top7_recall_percent:.2f}"
        
        top10_precision_percent = 100 * row['Top10_Precision']  # Convert to percentage
        if top10_precision_percent == top10_precision_max:
            top10_precision_percent = "\\textbf{" + f"{top10_precision_percent:.2f}" + "}"
        else:
            top10_precision_percent = f"{top10_precision_percent:.2f}"
        
        top10_recall_percent = 100 * row['Top10_Recall']  # Convert to percentage
        if top10_recall_percent == top10_recall_max:
            top10_recall_percent = "\\textbf{" + f"{top10_recall_percent:.2f}" + "}"
        else:
            top10_recall_percent = f"{top10_recall_percent:.2f}"
        
        # f2_percent = row['Top3_Recall'] * 100  # Convert to percentage
        # if f2_percent == f2_max:
        #     f2_percent = "\\textbf{" + f"{f2_percent:.2f}" + "}"
        # else:
        #     f2_percent = f"{f2_percent:.2f}"
        
        auc_percent = row['AUC'] * 100  # Convert to percentage
        if auc_percent == auc_max:
            auc_percent = "\\textbf{" +f"{auc_percent:.2f}" + "}"
        else:
            auc_percent = f"{auc_percent:.2f}"
        
        brier_score = row['Brier']
        if brier_score == brier_min:
            brier_score = "\\textbf{" +f"{brier_score:.3f}" + "}"
        else:
            brier_score = f"{brier_score:.3f}"

        if j == 0:
            latex_code += (
                "\\multirow{2}{*}{" + clf + "}" +
                f" & {row['Developer']} & {top3_precision_percent}\\ & {top3_recall_percent}\\ & {top5_precision_percent}\\ & {top5_recall_percent}\\ & {top7_precision_percent}\\ & {top7_recall_percent}\\ & {top10_precision_percent}\\ & {top10_recall_percent} & "
                f"{auc_percent}\\ & {brier_score} \\\\\n"
            )
        else:
            latex_code += (
                f" & {row['Developer']} & {top3_precision_percent}\\ & {top3_recall_percent}\\ & {top5_precision_percent}\\ & {top5_recall_percent}\\ & {top7_precision_percent}\\ & {top7_recall_percent}\\ & {top10_precision_percent}\\ & {top10_recall_percent} & "
                f"{auc_percent}\\ & {brier_score} \\\\\n"
            )
    
    if i < len(classifiers) - 1:
        latex_code += "\\hhline{~-----------}\n"

latex_code += """\\bottomrule
\\end{tabular}
\\end{adjustbox}
\\end{table}
"""

print(latex_code.replace(".00", ""))

\begin{table}[h]
\centering
\caption{Performance of the \nth{2} model when the pairs are made by (1) \textbf{all}, (2) \textbf{same}, and (3) \textbf{different} developers.}
\label{tab:model2_perf_new}
\begin{adjustbox}{width=\textwidth}
\begin{tabular}{llcccccccccc}
\toprule
Classifier & Developer & top-3-prec (\%) & top-3-recall (\%) & top-5-prec (\%) & top-5-recall (\%) & top-7-prec (\%) & top-7-recall (\%) & top-10-prec (\%) & top-10-recall (\%) & AUC (\%) & Brier Score \\
\midrule
\multirow{2}{*}{XGBoost} & All & \textbf{33.33}\ & \textbf{100}\ & 20\ & \textbf{100}\ & 14.29\ & \textbf{100}\ & 10\ & \textbf{100} & \textbf{91.89}\ & 0.039 \\
 & Different & 0\ & 0\ & 0\ & 0\ & 14.29\ & \textbf{100}\ & 10\ & \textbf{100} & 76.38\ & 0.029 \\
 & Same & \textbf{33.33}\ & \textbf{100}\ & \textbf{25}\ & \textbf{100}\ & \textbf{25}\ & \textbf{100}\ & \textbf{20}\ & \textbf{100} & 73.07\ & 0.454 \\
\hhline{~-----------}
\multirow{2}{*}{AdaBoost} & All & 0\ & 0\ & 0\ & 0\ & 0\ & 0\ & 0\ & 0 &

### Feature importance

In [40]:
df_feat_impo.fillna(0, inplace=True)
# df_feat_impo["pair_type"] = "All"
# df_feat_impo.iloc[10:20, -1] = "Different"
# df_feat_impo.iloc[20:, -1] = "Same"
df_feat_impo.to_csv(osp.join('.', 'Results', 'Feature_importance', f'second_feat_impo_new.csv'), index=None)

In [39]:
df_feat_impo

Unnamed: 0,deletions_source,is_corrective_source,is_refactoring_source,is_preventive_source,is_non_functional_source,has_feature_addition_source,is_merge_source,cross_project_changes_owner_source,pctg_cross_project_changes_owner_source,projects_contributed_owner_source,...,within_project_changes_owner_source,whole_changes_owner_source,description_word_count_source,code_churn_target,whole_changes_owner_target,within_project_changes_target,whole_within_project_changes_target,description_word_count_target,num_directory_files_source,pair_type
0,0.010216,0.015564,0.00741,0.002218,0.00179,0.012592,0.0,0.016976,0.006765,0.023343,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,All
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.036368,0.084356,0.040331,0.035698,0.116567,0.047932,0.052224,0.03297,0.0,All
0,0.0,0.022261,0.009682,0.011731,0.008363,0.019637,0.005779,0.0,0.008714,0.01429,...,0.0,0.016476,0.0,0.0,0.020712,0.0,0.0,0.0,0.010621,All
0,0.0,0.01939,0.021497,0.009994,0.010119,0.008703,0.006483,0.0,0.00907,0.017468,...,0.0,0.017884,0.0,0.0,0.018023,0.0,0.0,0.0,0.0,All
0,0.0,0.026158,0.017643,0.012111,0.0077,0.010958,0.006627,0.0,0.007635,0.015985,...,0.0,0.018466,0.0,0.0,0.016272,0.0,0.0,0.0,0.0,All
0,0.0,0.021956,0.016794,0.011071,0.007995,0.009315,0.00477,0.0,0.006846,0.015956,...,0.0,0.018481,0.0,0.0,0.014136,0.0,0.0,0.012844,0.0,All
0,0.0,0.020439,0.013329,0.010088,0.007649,0.011596,0.005772,0.0,0.009027,0.015922,...,0.0,0.018891,0.0,0.0,0.016249,0.0,0.0,0.015365,0.008151,All
0,0.0,0.018234,0.010318,0.00993,0.004246,0.010326,0.005609,0.0,0.01111,0.016064,...,0.0,0.019249,0.0,0.0,0.014657,0.0,0.0,0.013794,0.009292,All
0,0.0,0.020512,0.011376,0.011428,0.008514,0.006473,0.004629,0.0,0.011665,0.018243,...,0.0,0.019059,0.0,0.0,0.016942,0.0,0.0,0.0,0.008149,All
0,0.0,0.014324,0.009389,0.01372,0.01001,0.006078,0.006949,0.0,0.009931,0.01822,...,0.0,0.019725,0.0,0.0,0.018037,0.0,0.0,0.0,0.007645,All


### Feature Impact

In [74]:
df_feat_impact = df_feat_impact.reset_index(drop=True)
df_feat_impact = df_feat_impact[df_feat_impact['impact'].notnull()]
df_feat_impact.to_csv(osp.join('.', 'Results', 'Impact', 'second_model_feat_impact_new.csv'), index=None)

In [31]:
df_feat_impact = pd.read_csv(osp.join('.', 'Results', 'Impact', 'second_model_feat_impact_new.csv'))

In [100]:
# Creating a DataFrame from the additional data
data = {
    "Feature": [
        "dev_in_src_change_nbr", "src_trgt_co_changed_nbr", "num_shrd_desc_tkns",
        "cmn_dev_pctg", "last_mth_cro_proj_nbr_source", "project_changes_owner_source",
        "changed_files_overlap", "last_mth_cro_proj_nbr_target", "whole_changes_owner_target",
        "whole_changes_owner_source", "last_mth_dep_proj_nbr_target", "is_corrective_source",
        "projects_contributed_owner_target", "projects_contributed_owner_source", "num_shrd_file_tkns",
        "is_preventive_target", "is_refactoring_source", "pctg_cross_project_changes_owner_target",
        "subject_length_target", "num_file_types_source", "project_age_source",
        "has_feature_addition_source", "subject_length_source", "desc_sim",
        "is_preventive_source", "pctg_cross_project_changes_source", "last_mth_dep_proj_nbr_source",
        "ratio_dep_chan_owner_source", "pctg_cross_project_changes_target", "pctg_cross_project_changes_owner_source",
        "has_feature_addition_target", "ratio_dep_chan_owner_target", "is_corrective_target",
        "description_word_count_target", "add_lines_sim", "is_non_functional_target",
        "subject_sim", "project_age_target", "num_file_types_target",
        "is_non_functional_source", "del_lines_sim", "project_changes_owner_target",
        "is_merge_target", "is_refactoring_target", "whole_within_project_changes_target",
        "within_project_changes_target", "is_merge_source", "num_directory_files_source",
        "description_word_count_source", "code_churn_source", "within_project_changes_owner_source",
        "code_churn_target", "description_length_target", "num_file_changes_source",
        "num_directory_files_target", "cross_project_changes_target", "cross_project_changes_owner_source",
        "deletions_source", "description_length_source", "insertions_target",
        "num_file_changes_target", "whole_within_project_changes_source", "deletions_target"
    ],
    "Ranking": [
        1, 2, 3,
        4, 4, 4,
        5, 5, 5,
        5, 6, 6,
        7, 7, 7,
        8, 8, 9,
        9, 10, 10,
        10, 10, 10,
        10, 11, 11,
        12, 12, 12,
        12, 12, 12,
        12, 13, 13,
        14, 14, 14,
        14, 14, 15,
        16, 16, 17,
        17, 17, 18,
        18, 18, 18,
        18, 19, 19,
        20, 20, 20,
        21, 21, 21,
        21, 21, 21
    ]
}

# Creating the dataframe
df_most_impo_feat = pd.DataFrame(data)
# df_most_impo_feat = df_most_impo_feat[df_most_impo_feat['Ranking']<=10]
# df_most_impo_feat['Classifier'] = "RF"
classifiers = df_feat_impact['Classifier'].unique().tolist()
df_most_impo_feat['Classifier'] = [classifiers for _ in range(len(df_most_impo_feat))]
df_most_impo_feat = df_most_impo_feat.explode("Classifier")
df_most_impo_feat.reset_index(drop=True, inplace=True)

In [101]:
df_most_impo_feat

Unnamed: 0,Feature,Ranking,Classifier
0,dev_in_src_change_nbr,1,ET
1,dev_in_src_change_nbr,1,RF
2,dev_in_src_change_nbr,1,XGBoost
3,dev_in_src_change_nbr,1,AdaBoost
4,src_trgt_co_changed_nbr,2,ET
...,...,...,...
247,whole_within_project_changes_source,21,AdaBoost
248,deletions_target,21,ET
249,deletions_target,21,RF
250,deletions_target,21,XGBoost


In [102]:
def map_impact_to_fold(row):
    # print(row)
    for i in range(10):
        row[f'fold{i}'] = df_feat_impact.loc[(df_feat_impact['Classifier']==row['Classifier'])&(df_feat_impact['fold']==i), [row['Feature'],  'fold', 'impact']].sort_values(by=row['Feature']).iloc[-1, -1]
    return row

def retrieve_impact(row, func):
    df_sub = [row[f'fold{i}'] for i in range(10) if row[f'fold{i}'] != 0]
    return func(df_sub)

In [103]:
df_most_impo_feat = df_most_impo_feat.apply(map_impact_to_fold, axis=1)
# df_most_impo_feat = df_most_impo_feat[df_most_impo_feat["Ranking"]<=10]
# df_most_impo_feat['mean'] = df_most_impo_feat.apply(retrieve_impact, args=(np.mean,), axis=1) 
df_most_impo_feat['median'] = df_most_impo_feat.apply(retrieve_impact, args=(np.median,), axis=1) 
# df_most_impo_feat['max'] = df_most_impo_feat.apply(retrieve_impact, args=(max,), axis=1) 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [106]:
df_most_impo_feat = df_most_impo_feat[df_most_impo_feat["Classifier"]=="XGBoost"]

In [113]:
df_most_impo_feat.iloc[-30:]

Unnamed: 0,Feature,Ranking,Classifier,fold0,fold1,fold2,fold3,fold4,fold5,fold6,fold7,fold8,fold9,median
134,description_word_count_target,12,XGBoost,0.0,-0.17515,0.0,0.059914,0.009181,-0.20901,-0.109815,-0.130275,-0.003708,0.042296,-0.056762
138,add_lines_sim,13,XGBoost,0.0,1.34913,0.003376,-0.196835,0.004759,-0.002686,0.077061,-0.076688,-0.004029,0.00918,0.003376
142,is_non_functional_target,13,XGBoost,0.0,1.34913,0.0,-0.161575,-0.122923,-0.059584,0.0,0.0,-0.050541,-0.143317,-0.091254
146,subject_sim,14,XGBoost,0.554272,1.34913,0.075565,0.148,0.056684,0.14173,0.149731,0.043662,0.057894,0.0,0.14173
150,project_age_target,14,XGBoost,-0.075919,1.34913,0.044405,-0.141143,-0.298137,-0.097118,-0.297101,-0.070888,-0.166303,-0.208395,-0.119131
154,num_file_types_target,14,XGBoost,0.0,1.34913,0.0,0.0,-0.001041,-0.045163,-0.004729,0.0,0.0,-0.090301,-0.004729
158,is_non_functional_source,14,XGBoost,0.0,1.34913,0.032676,-0.044911,-0.075405,0.0,0.0,0.0,-0.058707,-0.035292,-0.040102
162,del_lines_sim,14,XGBoost,0.0,1.34913,0.0,0.059914,0.009181,0.062222,0.028839,0.025486,-0.003708,0.042296,0.035568
166,project_changes_owner_target,15,XGBoost,-0.073777,1.34913,-0.199812,0.0318,-0.068741,-0.030921,-0.025926,-0.005728,-0.003335,0.021813,-0.015827
170,is_merge_target,16,XGBoost,0.0,1.34913,0.0,0.160743,0.114757,0.068376,0.28535,0.052335,-0.04626,0.12276,0.118758


In [105]:
df_most_impo_feat.to_csv(osp.join('.', 'Results', 'second_feat_impact_import_new.csv'), index=None)

In [89]:
# Generate LaTeX table
df_most_impo_feat = df_most_impo_feat[df_most_impo_feat["Classifier"]=="RF"]
# Escape underscores in feature names
df_most_impo_feat['Feature'] = df_most_impo_feat['Feature'].str.replace('_', r'\_', regex=False)
df_most_impo_feat["median"] = df_most_impo_feat["median"].apply(lambda x: f"{x:.2f}")
latex_table = df_most_impo_feat[["Feature", "Ranking", "median"]].to_latex(
    index=False,
    caption="Ranking of 10 top important features of \\nth{2} model and their impacts.",
    label="tab:feat-imp-model2",
    position='htbp',
    column_format='lcc',
    header=['Feature', 'Ranking', 'Median impact'],
    escape=False
)

# Add booktabs rules
latex_table = latex_table.replace('\\toprule', '\\toprule\n\\caption{Ranking of 10 top important features of \\nth{2} model and their impacts.} \\label{tab:feat-imp-model2}\n')
latex_table = latex_table.replace('\\midrule', '\\midrule')
latex_table = latex_table.replace('\\bottomrule', '\\bottomrule')

# Ensure we're using booktabs package
# latex_table = "latex_table

print(latex_table)

\begin{table}[htbp]
\caption{Ranking of 10 top important features of \nth{2} model and their impacts.}
\label{tab:feat-imp-model2}
\begin{tabular}{lcc}
\toprule
\caption{Ranking of 10 top important features of \nth{2} model and their impacts.} \label{tab:feat-imp-model2}

Feature & Ranking & Median impact \\
\midrule
dev\_in\_src\_change\_nbr & 1 & 0.40 \\
src\_trgt\_co\_changed\_nbr & 2 & 0.35 \\
num\_shrd\_desc\_tkns & 3 & 0.70 \\
cmn\_dev\_pctg & 4 & 0.40 \\
last\_mth\_cro\_proj\_nbr\_source & 4 & -0.20 \\
project\_changes\_owner\_source & 4 & -0.08 \\
changed\_files\_overlap & 5 & 0.18 \\
last\_mth\_cro\_proj\_nbr\_target & 5 & -0.13 \\
whole\_changes\_owner\_target & 5 & -0.18 \\
whole\_changes\_owner\_source & 5 & -0.08 \\
last\_mth\_dep\_proj\_nbr\_target & 6 & -0.10 \\
is\_corrective\_source & 6 & 0.10 \\
projects\_contributed\_owner\_target & 7 & -0.17 \\
projects\_contributed\_owner\_source & 7 & -0.07 \\
num\_shrd\_file\_tkns & 7 & 0.16 \\
is\_preventive\_target & 8 & 0.04 \

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_most_impo_feat['Feature'] = df_most_impo_feat['Feature'].str.replace('_', r'\_', regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_most_impo_feat["median"] = df_most_impo_feat["median"].apply(lambda x: f"{x:.2f}")


In [62]:
# Filter for classifier == "RF"
rf_df = df_most_impo_feat[df_most_impo_feat['Classifier'] == 'RF'].copy()

# List of fold columns
fold_cols = [f"fold{i}" for i in range(10)]

# Group by Feature, keep the first Ranking per feature and average the folds
rf_df = df_most_impo_feat.groupby('Feature').agg({
    'Ranking': 'first',      # or 'min'/'max' if you prefer
    **{col: 'max' for col in fold_cols}
}).reset_index()

# Define fold columns and select necessary ones
fold_columns = [f'fold{i}' for i in range(10)]
selected_columns = ['Feature', 'Ranking'] + fold_columns
rf_df = rf_df[selected_columns]

# Keep only top 10 ranked features
rf_df = rf_df[rf_df['Ranking'] <= 10]
rf_df.sort_values("Ranking", inplace=True)

# Escape underscores in feature names
rf_df['Feature'] = rf_df['Feature'].str.replace('_', r'\_', regex=False)

# Format fold values: 2 decimal places, replace 0.00 with "*"
for col in fold_columns:
    rf_df[col] = rf_df[col].apply(lambda x: "*" if round(x, 3) == 0 else f"{x:.2f}")

# Generate full LaTeX table (with tabular environment)
latex_full = rf_df.to_latex(
    index=False,
    header=False,
    column_format='lrc' + 'c' * 10,
    escape=False
)

# Split into lines
lines = latex_full.splitlines()

# Remove \begin{tabular} and \end{tabular} lines
body_lines = []
for line in lines:
    if line.strip().startswith(r'\begin{tabular}'):
        continue
    elif line.strip().startswith(r'\end{tabular}'):
        continue
    elif line.strip().startswith(r'\toprule'):
        continue
    elif line.strip().startswith(r'\midrule'):
        continue
    elif line.strip().startswith(r'\bottomrule'):
        continue
    else:
        body_lines.append(line)

# Insert \midrule after each data row line
new_body_lines = []
for idx, line in enumerate(body_lines):
    new_body_lines.append(line)
    # Each row ends with \\ and contains & separating columns
    if  '&' in line and line.strip().endswith(r'\\') and idx < len(body_lines) - 1 :
        new_body_lines.append(r'\hline')

latex_body_with_lines = "\n".join(new_body_lines)

# Define your custom header (with the tabular environment)
header = r"""
\begin{table}[h]
\caption{Ranking of 10 top important features of \nth{2} model and their impacts.}
\label{tab:feat-imp-model2}
\centering
\begin{adjustbox}{width=\textwidth}
\begin{tabular}{lrc*{10}{c}}
\toprule
\textbf{Feature} & \textbf{Rank} & \multicolumn{10}{c}{\textbf{Impact on fold ...}} \\
\cmidrule(l){3-12} & & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
\midrule
"""

# Footer closes the tabular environment
footer = r"""
\bottomrule
\end{tabular}
\end{adjustbox}
\end{table}
"""

# Combine all parts
full_latex = header + latex_body_with_lines + footer

print(full_latex)


\begin{table}[h]
\caption{Ranking of 10 top important features of \nth{2} model and their impacts.}
\label{tab:feat-imp-model2}
\centering
\begin{adjustbox}{width=\textwidth}
\begin{tabular}{lrc*{10}{c}}
\toprule
\textbf{Feature} & \textbf{Rank} & \multicolumn{10}{c}{\textbf{Impact on fold ...}} \\
\cmidrule(l){3-12} & & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 & 9 & 10\\
\midrule
src\_trgt\_co\_changed\_nbr & 1 & 0.43 & 2.75 & 0.18 & 0.54 & 0.37 & 0.31 & 0.38 & 0.31 & 0.21 & 0.48 \\
\hline
dev\_in\_src\_change\_nbr & 1 & 0.40 & 2.75 & 0.26 & 0.50 & 0.65 & 0.20 & 0.42 & 0.47 & 0.21 & 0.42 \\
\hline
cmn\_dev\_pctg & 2 & 0.38 & 2.75 & 0.16 & 0.24 & 0.31 & 0.24 & 0.28 & 0.27 & 0.11 & 0.44 \\
\hline
num\_shrd\_desc\_tkns & 2 & 1.07 & 2.75 & 0.47 & 0.89 & 0.78 & 0.58 & 0.86 & 0.90 & 0.50 & 0.81 \\
\hline
whole\_changes\_owner\_target & 3 & -0.02 & -0.46 & -0.16 & -0.11 & -0.18 & -0.08 & -0.20 & -0.27 & -0.23 & -0.25 \\
\hline
whole\_changes\_owner\_source & 3 & -0.02 & -0.33 & -0.05 & -0.02 & -0.08 &

### Dimension importance

In [82]:
# df.drop(columns=["number"], inplace=True)
M1_METRICS = df.columns.tolist()
CHANGE_METRICS = [col for col in constants.CHANGE_METRICS if col in M1_METRICS]
TEXT_METRICS = [col for col in constants.TEXT_METRICS if col in M1_METRICS]
DEVELOPER_METRICS = [col for col in constants.DEVELOPER_METRICS if col in M1_METRICS]
PROJECT_METRICS = [col for col in constants.PROJECT_METRICS if col in M1_METRICS]
FILE_METRICS = [col for col in constants.FILE_METRICS if col in M1_METRICS]
CHANGE_METRICS = [f'{cm}_source' for cm in CHANGE_METRICS] + [f'{cm}_target' for cm in CHANGE_METRICS]
TEXT_METRICS = [f'{cm}_source' for cm in TEXT_METRICS] + [f'{cm}_target' for cm in TEXT_METRICS]
DEVELOPER_METRICS = [f'{cm}_source' for cm in DEVELOPER_METRICS] + [f'{cm}_target' for cm in DEVELOPER_METRICS]
PROJECT_METRICS = [f'{cm}_source' for cm in PROJECT_METRICS] + [f'{cm}_target' for cm in PROJECT_METRICS]
FILE_METRICS = [f'{cm}_source' for cm in FILE_METRICS] + [f'{cm}_target' for cm in FILE_METRICS]

dimensions = {
    'Change': CHANGE_METRICS,
    'Text': TEXT_METRICS,
    'Developer': DEVELOPER_METRICS,
    'Project': PROJECT_METRICS,
    'File': FILE_METRICS,
    'Pairs': constants.PAIR_METRICS
}

In [86]:
dimension_results = {key: [] for key in dimensions.keys()}
tscv = TimeSeriesSplit(n_splits = 10)
clf_path = osp.join('.', 'Results')
dimension_type = 'discard'

if not os.path.exists(clf_path):
    os.makedirs(clf_path)
    
for dim_label, dim_feats in dimensions.items():
    print(f'Start training with {dim_label} dimension...')

    # if dim == 'Pairs':
    #     continue

    features = []
    if dimension_type == 'keep':
        features = dim_feats
    else:
        for lab, dim in dimensions.items():
            if lab != dim_label:
                features += dim

    auc_scores = []
    precision_scores = []
    recall_scores = []
    brier_scores = []
    feature_importances = []
    corr_features = []
    redundant_features = []

    for fold in range(0, 10):

        # if fold in [3, 4]:
        #     continue

        clone_clf = XGBClassifier(random_state=42)
        
        # Split training data into features and dims
        X_train = pd.read_csv(osp.join(".", "Files", "Data", "Train", f"{fold}.csv"))
        y_train = X_train['related']

        # df_test = pd.concat((df_test, X_train.iloc[:1]))
        # if (dimension_type == 'keep' and dim_label == 'Pairs') or (dimension_type == 'discard' and dim_label != 'Pairs'):
            # desc_model = clas_util.doc2vec_model(df_changes, X_train[['Source', 'Target']].values, fold)
        #     subject_model = clas_util.doc2vec_model(df_changes, X_train[['Source', 'Target']].values, fold, "subject")
        #     add_lines_model = clas_util.doc2vec_model(df_changes, X_train[['Source', 'Target']].values, fold, 'added_lines')
        #     del_lines_model = clas_util.doc2vec_model(df_changes, X_train[['Source', 'Target']].values, fold, 'deleted_lines')

        #     X_train = clas_util.compute_embdedding_similarity(df_changes, desc_model, X_train, 'commit_message', 'desc')
        #     X_train = clas_util.compute_embdedding_similarity(df_changes, subject_model, X_train, 'subject', 'subject')
        #     X_train = clas_util.compute_embdedding_similarity(df_changes, add_lines_model, X_train, 'added_lines', 'add_lines')
        #     X_train = clas_util.compute_embdedding_similarity(df_changes, del_lines_model, X_train, 'deleted_lines', 'del_lines')

        corr_features = df_features.loc[df_features[f'Fold{fold}']==0, 'Feat'].tolist()
        features = [c for c in features if c not in corr_features]

        X_train = X_train[features]

        # X_train = X_train.drop(columns=cols_dropped)

        # ros = RandomUnderSampler(random_state=0)
        
        # Perform under-sampling of the majority class(es)
        # X_train, y_train = ros.fit_resample(X_train, y_train)

        # conduct the correlation analysis
        # if fold == 0:
        

        # # Remove highly correlated features from the training set
        # X_train = X_train.drop(columns=corr_features)

        # Conduct redundancy analysis
        # if len(X_train.columns) > 1:
        #     redundant_features = clas_util.redundancy_analysis(X_train)

        # Remove indepandent variables explained by others
        # if len(redundant_features) != 0:
        #     X_train = X_train.drop(columns=redundant_features)

        # Instantiate the OverSampler class then fit it on the each fold training dataset
        # features = X_train.columns.tolist()
        # print(f'len(X_train) {len(X_train)}')

        # Columns to exclude in the test set
        # cols_exluded = corr_features + redundant_features 

        X_test = pd.read_csv(osp.join(".", "Files", "Data", "Test", f"{fold}.csv"))
        # X_test_pairs = X_test[['Source', 'Target', 'related']]
        y_test = X_test['related']

        # if (dimension_type == 'keep' and dim_label == 'Pairs') or (dimension_type == 'discard' and dim_label != 'Pairs'):
        #     X_test = clas_util.compute_embdedding_similarity(df_changes, desc_model, X_test, 'commit_message', 'desc')
        #     X_test = clas_util.compute_embdedding_similarity(df_changes, subject_model, X_test, 'subject', 'subject')
        #     X_test = clas_util.compute_embdedding_similarity(df_changes, add_lines_model, X_test, 'added_lines', 'add_lines')
        #     X_test = clas_util.compute_embdedding_similarity(df_changes, del_lines_model, X_test, 'deleted_lines', 'del_lines')
        
        X_test = X_test[X_train.columns.tolist()]

        # Train the Random Forest Classifier on the training fold set 
        clone_clf.fit(X_train, y_train)

        # Test the Random Forest Classifier on the test fold set 
        y_probs = clone_clf.predict_proba(X_test)[:,1]

        # Set custom threshold
        threshold = 0.75
        y_pred = [1 if p >= threshold else 0 for p in y_probs]

        # y_pred_prob = clone_clf.predict_proba(X_test)[:, 1]
        # X_test_pairs['pred'] = y_pred_prob
        # compute_top_k_prec_recal(X_test_pairs, dim, fold+1)

        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))
        auc_scores.append(roc_auc_score(y_test, y_pred))
        brier_scores.append(brier_score_loss(y_test, y_pred))

        print(f"{dim_label}, Fold: {fold+1}, Precision: {precision_scores[-1]}, Recall: {recall_scores[-1]}, AUC: {auc_scores[-1]}, Brier: {brier_scores[-1]}")

    # feature_importances /= (fold+1)
    prec_avg = np.average(precision_scores)
    recall_avg = np.average(recall_scores)
    auc_avg = np.average(auc_scores)
    brier_avg = np.average(brier_scores)

    print(f"{dim_label}, Precision: {prec_avg}, Recall: {recall_avg}, AUC: {auc_avg}, Brier: {brier_avg}")

    dimension_results[dim_label] += [{
        'Dimension': dim_label,
        'Precision': prec_avg,
        'Recall': recall_avg,
        'AUC': auc_avg,
        'Brier': brier_avg
    }]

Start training with Change dimension...
Change, Fold: 1, Precision: 0.03999379941094404, Recall: 0.8896551724137931, AUC: 0.922430242207128, Brier: 0.0449318983997748
Change, Fold: 2, Precision: 0.04195345788266142, Recall: 0.8648648648648649, AUC: 0.9106976664734036, Brier: 0.043670843813297125
Change, Fold: 3, Precision: 0.040658775090066906, Recall: 0.8116438356164384, AUC: 0.8846991473710323, Brier: 0.042567144827794154
Change, Fold: 4, Precision: 0.035505759823260215, Recall: 0.8302583025830258, AUC: 0.8911343648184168, Brier: 0.04824808825372947
Change, Fold: 5, Precision: 0.03969291081345965, Recall: 0.8496503496503497, AUC: 0.9034831580245533, Brier: 0.04290713597403256
Change, Fold: 6, Precision: 0.04164422904325974, Recall: 0.8315412186379928, AUC: 0.8943445030994459, Brier: 0.0431328581724994
Change, Fold: 7, Precision: 0.043258832011535686, Recall: 0.8275862068965517, AUC: 0.893187033431117, Brier: 0.041506890701619836
Change, Fold: 8, Precision: 0.04114420062695925, Recall

In [87]:
dim_imp = pd.DataFrame([item[0] for item in list(dimension_results.values())])
dim_imp.to_csv(osp.join('.', 'Results', 'Feature_importance', f'second_model_{dimension_type}_dim_new.csv'), index=None)

### Impact

In [11]:
df_feat_impact = pd.read_csv(osp.join('.', 'Results', 'Impact', f'third_model_feat_impact.csv'))

In [26]:
# attr = "last_mth_mod_uniq_proj_nbr_target"
# test = df_feat_impact#.loc[(df_feat_impact['fold']!=0)&(df_feat_impact[attr].duplicated()==False), [attr, "fold", "impact"]].sort_values("impact")
# test = test[test['impact'].notnull()==True].iloc[:-1, -1]
# test = f" & {round(test.min(), 2)} & {round(test.median(), 2)} & {round(test.max(), 2)}"