In [1]:
import pandas as pd
import os
import glob
import numpy as np
from pathlib import Path
# Result files are resolved relative to the directory the notebook is run from.
current_dir = os.getcwd()
BASE_DIR = current_dir

# Dataset folder names.
ALL_DATASETS = [
    'wdbc',
    'sonar',
    'ionosphere',
    'diabetes',
    'cleveland',
    'colon',
]

# Sub-folder names found under each dataset folder.
DATASET_TYPES = [
    'original',
    'noise',
    'outlier',
    'both',
]

# Model names; each one is the prefix of a "<model>_auc_folds.xlsx" result file.
ALL_MODELS = [
    'RFESVM',
    'PinFSSVM',
    'PinballSVM',
    'MILP1',
    'L2SVM',
    'L1SVM',
    'FisherSVM',
]


def load_excel_file(file_path):
    """Read an Excel file into a DataFrame, returning None on any failure.

    Args:
        file_path: Path of the Excel file to read.

    Returns:
        The DataFrame produced by ``pd.read_excel``, or ``None`` when the path
        does not exist or reading raises. A message is printed in both
        failure cases; no exception is propagated to the caller.
    """
    try:
        # The existence check stays inside the try so that an invalid path
        # argument is reported the same way as a read failure.
        if not os.path.exists(file_path):
            print(f"  Warning: File not found: {file_path}")
            return None
        return pd.read_excel(file_path)
    except Exception as e:
        print(f"  Error reading {file_path}: {e}")
        return None



def merge_single_dataset(dataset_name, models_list):
    """Load and stack the per-fold AUC files of several models for one dataset.

    For every model in ``models_list`` and every entry of ``DATASET_TYPES``,
    the file ``BASE_DIR/<dataset_name>/<dataset_type>/<model>_auc_folds.xlsx``
    is loaded with ``load_excel_file``. Each loaded frame is tagged with
    ``Model``, ``Dataset`` and ``Dataset_Type`` columns before being stacked.

    Args:
        dataset_name: Name of the dataset folder under ``BASE_DIR``.
        models_list: Iterable of model names to load.

    Returns:
        A tuple ``(merged_df, missing_files)``. ``merged_df`` is the
        row-wise concatenation of all loaded frames (with a fresh index), or
        ``None`` when nothing could be loaded. ``missing_files`` lists the
        paths for which ``load_excel_file`` returned ``None``.
    """
    frames = []
    missing_files = []

    print(f"\n--- Merging : {dataset_name} ---")

    for model in models_list:
        print(f"\nProcessing model: {model}")
        rows_for_model = 0

        for dataset_type in DATASET_TYPES:
            file_path = os.path.join(BASE_DIR, dataset_name, dataset_type, f"{model}_auc_folds.xlsx")

            print(f"  Loading: {dataset_name}/{dataset_type}/{model}_auc_folds.xlsx")

            loaded = load_excel_file(file_path)
            if loaded is None:
                missing_files.append(file_path)
                continue

            # Tag each row with its origin so the frames can be told apart after stacking.
            tagged = loaded.assign(
                Model=model,
                Dataset=dataset_name,
                Dataset_Type=dataset_type,
            )
            frames.append(tagged)
            rows_for_model += len(tagged)

        print(f"  Total samples for {model}: {rows_for_model}")

    if not frames:
        print("No data to merge!")
        return None, missing_files

    merged_df = pd.concat(frames, ignore_index=True)
    print(f"\n--- Kết quả merge cho dataset {dataset_name} ---")
    print(f"Total DataFrame shape: {merged_df.shape}")
    print(f"Total missing files: {len(missing_files)}")

    return merged_df, missing_files



In [2]:
from scipy.stats import wilcoxon,bootstrap
import scipy.stats as stats
import pandas as pd
import os
import glob
import numpy as np
from pathlib import Path
# Result files are resolved relative to the directory the notebook is run from.
BASE_DIR = current_dir = os.getcwd()

# Bootstrap settings used for the confidence intervals.
n_resamples = 10_000
confidence_level = 0.95

# Significance level used for the statistical tests.
alpha = 0.05
print(f'Alpha for testing: {alpha}')

def get_bootstrap_ci_median_diff(data1, data2, n_resamples, confidence_level):
    """Bootstrap confidence interval for the median of ``data1 - data2``.

    The element-wise differences are resampled with ``scipy.stats.bootstrap``
    using the percentile method and a fixed ``random_state`` of 42.

    Args:
        data1: First sequence of scores.
        data2: Second sequence of scores, paired element-wise with ``data1``.
        n_resamples: Number of bootstrap resamples.
        confidence_level: Confidence level of the interval (e.g. 0.95).

    Returns:
        A tuple ``(low, high)``. ``(nan, nan)`` is returned when there are
        fewer than two differences or when the bootstrap call raises (the
        error is printed in that case).
    """
    paired_diff = np.array(data1) - np.array(data2)

    if len(paired_diff) < 2:
        return (np.nan, np.nan)

    try:
        result = bootstrap(
            (paired_diff,),
            np.median,
            n_resamples=n_resamples,
            confidence_level=confidence_level,
            method='percentile',
            random_state=42,
        )
        interval = result.confidence_interval
        return (interval.low, interval.high)
    except Exception as e:
        print(f"  Error in bootstrap CI (median diff): {e}")
        return (np.nan, np.nan)

Alpha for testing: 0.05


In [3]:
ALL_DATASETS = ['wdbc', 'sonar', 'ionosphere', 'diabetes', 'cleveland', 'colon']
# The first entry is the main model; every remaining entry is compared against it.
MODELS_TO_COMPARE = ['PinFSSVM', 'MILP1', 'RFESVM', 'FisherSVM', 'L1SVM']

# One dict per (dataset, main-vs-other-model) comparison, accumulated across all datasets.
all_results_log = []

for TARGET_DATASET in ALL_DATASETS:
    # Load the merged per-fold results for this dataset.
    merged_data, missing_files = merge_single_dataset(TARGET_DATASET, MODELS_TO_COMPARE)
    if merged_data is None:
        continue

    results_log = []
    model1 = MODELS_TO_COMPARE[0]
    model1_data = merged_data.loc[merged_data['Model'] == model1, ['Dataset_Type', 'Fold', 'AUC']]

    for model2 in MODELS_TO_COMPARE[1:]:
        model2_data = merged_data.loc[merged_data['Model'] == model2, ['Dataset_Type', 'Fold', 'AUC']]

        # Pair the two models' scores on the same (Dataset_Type, Fold) so the tests are paired.
        comparison_df = pd.merge(
            model1_data,
            model2_data,
            on=['Dataset_Type', 'Fold'],
            suffixes=(f'_{model1}', f'_{model2}')
        )

        # Start from an all-NaN row; statistics are filled in only when they can be computed.
        result_row = {
            'Comparison': f"{model1} vs {model2}",
            'Dataset': TARGET_DATASET,
            'Alpha': alpha,
            'P_value_1sided': np.nan,
            'CI_Lower': np.nan,
            'CI_Upper': np.nan,
            'Mean_Difference': np.nan,
        }

        if len(comparison_df) > 0:
            model1_scores = comparison_df[f'AUC_{model1}'].values
            model2_scores = comparison_df[f'AUC_{model2}'].values

            result_row['Mean_Difference'] = np.mean(model1_scores - model2_scores)

            # NOTE: the interval is for the *median* paired difference, whereas
            # 'Mean_Difference' above is the mean of the same differences.
            ci_lower, ci_upper = get_bootstrap_ci_median_diff(
                model1_scores, model2_scores, n_resamples=n_resamples, confidence_level=confidence_level)
            result_row['CI_Lower'] = ci_lower
            result_row['CI_Upper'] = ci_upper

            # One-sided test (model1 > model2). Some SciPy versions raise ValueError
            # on degenerate input such as all-zero differences; record NaN for that
            # comparison instead of aborting the whole run.
            try:
                statistic_greater, p_value_greater = wilcoxon(
                    model1_scores, model2_scores, alternative='greater')
                result_row['P_value_1sided'] = p_value_greater
            except ValueError as e:
                print(f"  Wilcoxon test skipped for {model1} vs {model2} on {TARGET_DATASET}: {e}")

        results_log.append(result_row)

    # Append this dataset's rows to the overall list.
    all_results_log.extend(results_log)



--- Merging : wdbc ---

Processing model: PinFSSVM
  Loading: wdbc/original/PinFSSVM_auc_folds.xlsx
  Loading: wdbc/noise/PinFSSVM_auc_folds.xlsx
  Loading: wdbc/outlier/PinFSSVM_auc_folds.xlsx
  Loading: wdbc/both/PinFSSVM_auc_folds.xlsx
  Total samples for PinFSSVM: 40

Processing model: MILP1
  Loading: wdbc/original/MILP1_auc_folds.xlsx
  Loading: wdbc/noise/MILP1_auc_folds.xlsx
  Loading: wdbc/outlier/MILP1_auc_folds.xlsx
  Loading: wdbc/both/MILP1_auc_folds.xlsx
  Total samples for MILP1: 40

Processing model: RFESVM
  Loading: wdbc/original/RFESVM_auc_folds.xlsx
  Loading: wdbc/noise/RFESVM_auc_folds.xlsx
  Loading: wdbc/outlier/RFESVM_auc_folds.xlsx
  Loading: wdbc/both/RFESVM_auc_folds.xlsx
  Total samples for RFESVM: 40

Processing model: FisherSVM
  Loading: wdbc/original/FisherSVM_auc_folds.xlsx
  Loading: wdbc/noise/FisherSVM_auc_folds.xlsx
  Loading: wdbc/outlier/FisherSVM_auc_folds.xlsx
  Loading: wdbc/both/FisherSVM_auc_folds.xlsx
  Total samples for FisherSVM: 40

Pro

In [4]:
import pandas as pd
from statsmodels.stats.multitest import multipletests

all_results_df = pd.DataFrame(all_results_log)
output_filename = "statistical_summary_all_datasets.xlsx"

if all_results_df.empty:
    # An empty log produces a frame with no columns, so selecting
    # 'P_value_1sided' below would raise KeyError. Report and stop instead.
    print("No comparison results available; nothing to correct or save.")
else:
    # Only rows with a computed p-value take part in the multiple-testing correction.
    valid_p_indices = all_results_df['P_value_1sided'].notna()
    raw_p_values = all_results_df.loc[valid_p_indices, 'P_value_1sided'].tolist()

    if raw_p_values:
        # Benjamini-Hochberg FDR correction across all comparisons and datasets at once.
        reject_bh, p_corrected_bh, _, _ = multipletests(raw_p_values, alpha=alpha, method='fdr_bh')
        all_results_df.loc[valid_p_indices, 'Benjamini-Hochberg_Corrected_P_1sided_Greater'] = p_corrected_bh
        all_results_df.loc[valid_p_indices, 'Significant_Benjamini-Hochberg (Main > Baseline)'] = reject_bh
    else:
        print("No valid p-values found; skipping Benjamini-Hochberg correction.")

    # Save the aggregated results.
    all_results_df.to_excel(output_filename, index=False)
    print(f"Results saved to: {output_filename}")


Results saved to: statistical_summary_all_datasets.xlsx
