In [1]:
import pandas as pd
import seaborn as sns
import json

In [2]:
base_result_dir = '../benchmark_results'
TO_EXCLUDE_MODEL = ['lr']

In [3]:
def get_result_df(result_dir, target_metric):
    # print(result_dir)
    with open(f'{result_dir}/results_{target_metric}.json', 'r') as f:
        result = json.load(f)
        
    refined_result = {k: result[k]['cv_avg_scores'] for k in result.keys() if k not in TO_EXCLUDE_MODEL}
    return pd.DataFrame.from_dict(refined_result).T

In [4]:
from itertools import product

_token = 'words'
result_lst = []

_chained_lst = [None, 'chained', 'unchained']
_rule_lst = [False, True]
_feature_select_lst = [None, 'mutual_info_classif', 'chi2']
_sample_method_lst = [None, 'random_over', 'random_under', 'smote']

# get the combination of chained, rule, feature_select

for _sample_method, _chained, _rule, _feature_select in product(_sample_method_lst, _chained_lst, _rule_lst, _feature_select_lst):
    _rule_txt = '_rule_added' if _rule else ''
    
    if _chained is None:
        logging_nm = f'{_sample_method}/no_imputation_{_feature_select}'
        
    else:
        logging_nm = f'{_sample_method}/imputation_{_chained}_{_feature_select}{_rule_txt}'
        
    result_lst.append(logging_nm)
    print(logging_nm)

None/no_imputation_None
None/no_imputation_mutual_info_classif
None/no_imputation_chi2
None/no_imputation_None
None/no_imputation_mutual_info_classif
None/no_imputation_chi2
None/imputation_chained_None
None/imputation_chained_mutual_info_classif
None/imputation_chained_chi2
None/imputation_chained_None_rule_added
None/imputation_chained_mutual_info_classif_rule_added
None/imputation_chained_chi2_rule_added
None/imputation_unchained_None
None/imputation_unchained_mutual_info_classif
None/imputation_unchained_chi2
None/imputation_unchained_None_rule_added
None/imputation_unchained_mutual_info_classif_rule_added
None/imputation_unchained_chi2_rule_added
random_over/no_imputation_None
random_over/no_imputation_mutual_info_classif
random_over/no_imputation_chi2
random_over/no_imputation_None
random_over/no_imputation_mutual_info_classif
random_over/no_imputation_chi2
random_over/imputation_chained_None
random_over/imputation_chained_mutual_info_classif
random_over/imputation_chained_chi2
r

In [5]:
target_metric = 'AUC'
result_all = {}

for dir in result_lst:
    result_dir = f'{base_result_dir}/{dir}'
    result_df = get_result_df(result_dir, target_metric)
    best_ = result_df.sort_values(f"test_auc", ascending=False).reset_index().iloc[0, :]
    key_str = "-".join(dir.split('/')) + "-" + best_['index']
    # print("-".join(dir.split('/')))
    result_all[key_str] = {k: best_[k] for k in best_.keys() if 'test' in k}

In [15]:
result_df_original = pd.DataFrame.from_dict(result_all).T.sort_values('test_auc', ascending=False).reset_index()

In [45]:
# index name order: sample_method, imputation, feature_select, rule, model
# split the index name to get the information and create a new column for each information

index_nm_order = ['sample_method', 'imputation', 'feature_select', 'rule', 'model']
df = result_df_original.copy(deep=True)

# create new column for each information
for nm in index_nm_order:
    df[nm] = None

final_result_df_col = ["test_auc", "test_precision", "test_recall", "test_f1", "sample_method", "imputation", "featuer_select", "rule", "model"]
final_result_df = {}

for idx_num, row in df.iterrows():
    temp_dict = {}
    
    index_nm = row['index']
    # print(index_nm.split('-'))
    
    lst = index_nm.split('-')
    
    sampling = lst[0]
    
    l2_split = []
    
    l2 = lst[1].split("_")
    # print(l2)
    imputation = "_".join(l2[:2])
    l2_split.append(imputation)
    
    l2 = l2[2:]
    
    if l2[-1] == 'added':
        rule_added = "rule_added"
        l2 = l2[:-2]
        
    else:
        rule_added = "no_rule_added"
    
    l2_split.append(rule_added)    
    
    feat_sel = "_".join(l2)
    
    l2_split.append(feat_sel)   

    model = lst[2]
    
    for col in final_result_df_col:
        if col in row.keys():
            temp_dict[col] = row[col]
            
    temp_dict['sample_method'] = sampling
    temp_dict['imputation'] = imputation
    temp_dict['featuer_select'] = feat_sel
    temp_dict['rule'] = rule_added
    temp_dict['model'] = model
    
    final_result_df[idx_num] = temp_dict
        
final_result_df = pd.DataFrame.from_dict(final_result_df).T
final_result_df.to_csv(f'{base_result_dir}/final_result.csv', index=False)

In [49]:
final_result_df.head()

Unnamed: 0,test_auc,test_precision,test_recall,test_f1,sample_method,imputation,rule,model,featuer_select
0,0.97447,0.801016,0.970022,0.970022,random_over,imputation_chained,rule_added,et,mutual_info_classif
1,0.974363,0.801744,0.97047,0.97047,smote,imputation_chained,rule_added,et,mutual_info_classif
2,0.974326,0.801598,0.970022,0.970022,random_under,imputation_chained,rule_added,et,chi2
3,0.974326,0.801598,0.970022,0.970022,random_under,imputation_chained,rule_added,et,
4,0.974326,0.801598,0.970022,0.970022,,imputation_chained,rule_added,et,


In [48]:
final_result_df.groupby('sample_method').mean().sort_values('test_auc', ascending=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0_level_0,test_auc,test_precision,test_recall,test_f1
sample_method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
smote,0.968226,0.778934,0.967174,0.967174
,0.968192,0.779745,0.967487,0.967487
random_under,0.968123,0.778509,0.967233,0.967233
random_over,0.967687,0.775984,0.967248,0.967248


In [50]:
final_result_df.groupby('imputation').mean().sort_values('test_auc', ascending=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0_level_0,test_auc,test_precision,test_recall,test_f1
imputation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
imputation_chained,0.970981,0.785731,0.968111,0.968111
imputation_unchained,0.966818,0.783899,0.966937,0.966937
no_imputation,0.964689,0.752206,0.966331,0.966331


In [51]:
final_result_df.groupby('rule').mean().sort_values('test_auc', ascending=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0_level_0,test_auc,test_precision,test_recall,test_f1
rule,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rule_added,0.968057,0.778293,0.967286,0.967286
