In [39]:
import pandas as pd 
import glob 
import os
# Climb from the current working directory towards the filesystem root and
# stop at the first directory whose name begins with the repository name.
# If no ancestor matches, the climb stops once dirname() no longer changes the
# path, which leaves PATH pointing at the filesystem root.
PATH = os.getcwd()
while True:
    if os.path.basename(PATH).startswith('Optimal-Robust-Feature-Selection'):
        break
    parent = os.path.dirname(PATH)
    if parent == PATH:
        break
    PATH = parent

# Directory that holds the per-dataset experiment result workbooks.
results_dir = os.path.join(PATH, 'src', 'experiment', 'results')

# Names used to build the result file names (experiment_results_<name>.xlsx).
datasets = [
    'cleveland',
    'diabetes',
    'ionosphere',
    'sonar',
    'wdbc',
    'colon',
]
list = [os.path.join(results_dir, f'experiment_results_{dataset}.xlsx') for dataset in datasets]


In [40]:
df_list = [pd.read_excel(file) for file in list]

In [41]:
# Stack the per-dataset frames into one table with a fresh 0..n-1 index.
combined_df = pd.concat(objs=df_list, ignore_index=True)


In [42]:
# Keep the first eight columns plus the overlap ratio, renamed so it follows
# the "Average ..." naming of the other metric columns (no leading space).
# Renaming before selecting makes this cell safe to re-run: `rename` ignores
# labels that are no longer present, and the selection uses the final name.
OVERLAP_COLUMN = 'Average Overlap Ratio'
combined_df = combined_df.rename(columns={'Overlap Ratio': OVERLAP_COLUMN})
columns_to_select = combined_df.columns[:8].tolist() + [OVERLAP_COLUMN]
combined_df = combined_df[columns_to_select]

In [43]:
# Show the combined per-dataset results; as the last expression in the cell
# the frame is rendered by the notebook.
combined_df

Unnamed: 0,Model,Type of dataset,Average Accuracy,Average AUC,Average F1 Score,Average G-Mean,Average train_time,Average num_features,Average Overlap Ratio
0,L1SVM,Not noise,0.838280,0.838476,0.837574,0.833741,0.038930,8.5,0.7500
1,L1SVM,Noise,0.785591,0.784821,0.784129,0.777550,0.025879,10.8,0.5556
2,L1SVM,Outlier,0.785484,0.791390,0.786142,0.785037,0.025614,9.6,0.6000
3,L1SVM,Noise + Outlier,0.734946,0.738004,0.737131,0.734668,0.025873,11.1,0.6000
4,L2SVM,Not noise,0.838172,0.836778,0.837311,0.831671,0.040423,13.0,0.5385
...,...,...,...,...,...,...,...,...,...
163,FisherSVM,Noise + Outlier,0.758974,0.741389,0.758978,0.722948,4.715056,7.6,0.0000
164,RFESVM,Not noise,0.791026,0.782500,0.787402,0.766634,942.125757,4.8,0.6000
165,RFESVM,Noise,0.710256,0.711429,0.710099,0.689102,844.958844,4.8,0.2000
166,RFESVM,Outlier,0.692308,0.710833,0.697893,0.687868,965.411293,10.2,0.0000


In [44]:
# Calculate mean and std, then format as 'mean ± std'
def mean_std_format(x, decimals=4):
    """Summarise a numeric Series as the string ``'mean ± std'``.

    Parameters
    ----------
    x : pandas.Series
        Values to summarise; only ``x.mean()`` and ``x.std()`` are used.
    decimals : int, default 4
        Number of digits after the decimal point for both numbers.

    Returns
    -------
    str
        ``"<mean> ± <std>"`` with both values in fixed-point notation.

    Notes
    -----
    ``Series.std`` is the sample standard deviation (``ddof=1``), so a Series
    with a single value yields ``nan`` for the std part.
    """
    mean_val = x.mean()
    std_val = x.std()
    return f"{mean_val:.{decimals}f} ± {std_val:.{decimals}f}"

def _summarise_column(column):
    """Return 'mean ± std' for float64/int64 columns, else the group's first value."""
    if column.dtype in ['float64', 'int64']:
        return mean_std_format(column)
    return column.iloc[0]


# One row per (model, dataset type) pair, aggregated across all datasets.
avg_df = (
    combined_df
    .groupby(['Model', 'Type of dataset'])
    .agg(_summarise_column)
    .reset_index()
)


In [45]:
# Show the aggregated 'mean ± std' summary table.
avg_df

Unnamed: 0,Model,Type of dataset,Average Accuracy,Average AUC,Average F1 Score,Average G-Mean,Average train_time,Average num_features,Average Overlap Ratio
0,FisherSVM,Noise,0.7752 ± 0.0716,0.7715 ± 0.0615,0.7739 ± 0.0702,0.7599 ± 0.0665,1.0990 ± 1.4857,8.2500 ± 2.9208,0.5906 ± 0.2366
1,FisherSVM,Noise + Outlier,0.7422 ± 0.0685,0.7401 ± 0.0646,0.7424 ± 0.0678,0.7288 ± 0.0686,1.1828 ± 1.7367,8.7167 ± 2.3181,0.5510 ± 0.2795
2,FisherSVM,Not noise,0.8290 ± 0.0941,0.8248 ± 0.0857,0.8279 ± 0.0931,0.8168 ± 0.0887,1.0111 ± 1.2644,8.8000 ± 3.2766,0.5912 ± 0.2769
3,FisherSVM,Outlier,0.7800 ± 0.0846,0.7793 ± 0.0835,0.7800 ± 0.0842,0.7672 ± 0.0899,1.0582 ± 1.3988,7.6500 ± 2.0782,0.5278 ± 0.3112
4,L1SVM,Noise,0.7747 ± 0.0778,0.7728 ± 0.0659,0.7747 ± 0.0758,0.7627 ± 0.0706,0.1084 ± 0.1409,11.5000 ± 5.8227,0.4751 ± 0.2043
5,L1SVM,Noise + Outlier,0.7266 ± 0.0734,0.7307 ± 0.0692,0.7275 ± 0.0728,0.7182 ± 0.0749,0.1104 ± 0.1392,11.4833 ± 3.9463,0.4427 ± 0.2475
6,L1SVM,Not noise,0.8289 ± 0.0899,0.8242 ± 0.0845,0.8282 ± 0.0889,0.8172 ± 0.0872,0.1119 ± 0.1292,9.2667 ± 4.1195,0.5544 ± 0.1993
7,L1SVM,Outlier,0.7706 ± 0.0928,0.7749 ± 0.0876,0.7718 ± 0.0909,0.7632 ± 0.0940,0.1061 ± 0.1395,10.0833 ± 2.8477,0.4448 ± 0.2809
8,L2SVM,Noise,0.7682 ± 0.0883,0.7691 ± 0.0741,0.7681 ± 0.0867,0.7578 ± 0.0816,0.1086 ± 0.1093,357.5000 ± 804.8666,0.3063 ± 0.2013
9,L2SVM,Noise + Outlier,0.7271 ± 0.0714,0.7293 ± 0.0675,0.7284 ± 0.0702,0.7176 ± 0.0725,0.1159 ± 0.1235,357.5000 ± 804.8666,0.3063 ± 0.2013


In [46]:
# Write the summary table (without the row index) to a workbook; the relative
# file name is resolved against the current working directory.
output_file = "average_metrics_per_model_dataset.xlsx"
avg_df.to_excel(output_file, index=False)
