### Prepare the dataset


In [11]:
import pandas as pd
import os
import glob
import numpy as np
from pathlib import Path
current_dir = os.getcwd()
# Root path of the directory that holds the wilcoxon data
BASE_DIR = current_dir

# Lists of the available components
ALL_DATASETS = [
    'wdbc',
    'sonar',
    'ionosphere',
    'diabetes',
    'cleveland',
]
DATASET_TYPES = [
    'original',
    'noise',
    'outlier',
    'both',
]
ALL_MODELS = [
    'RFESVM',
    'PinFSSVM',
    'PinballSVM',
    'MILP1',
    'L2SVM',
    'L1SVM',
    'FisherSVM',
]

# Dataset and models selected for this run; the first model listed is the
# one the statistical tests compare every other model against.
TARGET_DATASET = 'diabetes'
MODELS_TO_COMPARE = [
    'PinFSSVM',
    'MILP1',
    'RFESVM',
    'FisherSVM',
    'L1SVM',
]

def load_excel_file(file_path):
    """Read an Excel file into a DataFrame.

    Returns the DataFrame on success. Returns None, after printing a
    warning or error line, when the file does not exist or when checking
    for or reading it raises an exception.
    """
    try:
        # Guard clause: report a missing file and bail out early.
        if not os.path.exists(file_path):
            print(f"  Warning: File not found: {file_path}")
            return None
        return pd.read_excel(file_path)
    except Exception as err:
        print(f"  Error reading {file_path}: {err}")
        return None



def merge_single_dataset(dataset_name, models_list):
    """Collect the per-model AUC fold files of one dataset into one table.

    For every model in ``models_list`` and every entry of ``DATASET_TYPES``
    the file ``<BASE_DIR>/<dataset_name>/<type>/<model>_auc_folds.xlsx`` is
    loaded. Each loaded frame is tagged with ``Model``, ``Dataset`` and
    ``Dataset_Type`` columns before being concatenated.

    Returns:
        ``(merged_df, missing_files)``. ``merged_df`` is None when no file
        could be loaded; ``missing_files`` lists every path that failed to
        load.
    """
    frames = []
    missing_files = []

    print(f"\n--- Bắt đầu merge cho dataset: {dataset_name} ---")

    for model in models_list:
        print(f"\nProcessing model: {model}")
        rows_for_model = 0

        for dataset_type in DATASET_TYPES:
            file_name = f"{model}_auc_folds.xlsx"
            file_path = os.path.join(BASE_DIR, dataset_name, dataset_type, file_name)

            print(f"  Loading: {dataset_name}/{dataset_type}/{file_name}")

            frame = load_excel_file(file_path)
            if frame is None:
                # Remember what could not be loaded and move on.
                missing_files.append(file_path)
                continue

            # Tag the rows so their origin survives the concatenation.
            frame['Model'] = model
            frame['Dataset'] = dataset_name
            frame['Dataset_Type'] = dataset_type

            frames.append(frame)
            rows_for_model += len(frame)

        print(f"  Total samples for {model}: {rows_for_model}")

    if not frames:
        print("No data to merge!")
        return None, missing_files

    merged_df = pd.concat(frames, ignore_index=True)
    print(f"\n--- Kết quả merge cho dataset {dataset_name} ---")
    print(f"Total DataFrame shape: {merged_df.shape}")
    print(f"Total missing files: {len(missing_files)}")

    return merged_df, missing_files

# Run the merge for the selected dataset
merged_data, missing_files = merge_single_dataset(TARGET_DATASET, MODELS_TO_COMPARE)


# Write the merged table to an Excel file
if merged_data is None:
    print("❌ Không có dữ liệu để lưu")

else:

    # The output file name carries the dataset name
    output_file = os.path.join(
        BASE_DIR,
        TARGET_DATASET,
        f"merged_{TARGET_DATASET}_data.xlsx",
    )

    try:
        # Save to Excel, then report where it went and how large it is
        merged_data.to_excel(output_file, index=False)
        print(f"✅ Đã lưu dữ liệu merged vào: {output_file}")
        print(f"   Shape: {merged_data.shape}")

    except Exception as e:
        print(f"❌ Lỗi khi lưu file: {e}")


--- Bắt đầu merge cho dataset: diabetes ---

Processing model: PinFSSVM
  Loading: diabetes/original/PinFSSVM_auc_folds.xlsx
  Loading: diabetes/noise/PinFSSVM_auc_folds.xlsx
  Loading: diabetes/outlier/PinFSSVM_auc_folds.xlsx
  Loading: diabetes/both/PinFSSVM_auc_folds.xlsx
  Total samples for PinFSSVM: 40

Processing model: MILP1
  Loading: diabetes/original/MILP1_auc_folds.xlsx
  Loading: diabetes/noise/MILP1_auc_folds.xlsx
  Loading: diabetes/outlier/MILP1_auc_folds.xlsx
  Loading: diabetes/both/MILP1_auc_folds.xlsx
  Total samples for MILP1: 40

Processing model: RFESVM
  Loading: diabetes/original/RFESVM_auc_folds.xlsx
  Loading: diabetes/noise/RFESVM_auc_folds.xlsx
  Loading: diabetes/outlier/RFESVM_auc_folds.xlsx
  Loading: diabetes/both/RFESVM_auc_folds.xlsx
  Total samples for RFESVM: 40

Processing model: FisherSVM
  Loading: diabetes/original/FisherSVM_auc_folds.xlsx
  Loading: diabetes/noise/FisherSVM_auc_folds.xlsx
  Loading: diabetes/outlier/FisherSVM_auc_folds.xlsx
  L

In [None]:
from scipy.stats import wilcoxon
import scipy.stats as stats
# Significance level that the two-sided p-values in this cell are compared against.
alpha = 0.05
print(f'Alpha for testing: {alpha}')
def calculate_confidence_interval(data1, data2, confidence=0.95):
    """Return the mean paired difference and its Student-t confidence interval.

    The samples are paired by position: element ``i`` of ``data1`` is
    compared with element ``i`` of ``data2``.

    Args:
        data1: Array-like of scores for the first sample (list, tuple,
            NumPy array, ...).
        data2: Array-like of scores for the second sample, same length
            as ``data1``.
        confidence: Two-sided confidence level; 0.95 by default.

    Returns:
        Tuple ``(mean_diff, ci_lower, ci_upper)`` where ``mean_diff`` is
        the mean of ``data1 - data2`` and the bounds are
        ``mean_diff -/+ t * s / sqrt(n)``, with ``s`` the sample standard
        deviation (``ddof=1``) of the differences and ``t`` the
        Student-t critical value with ``n - 1`` degrees of freedom.

    Note:
        With fewer than two pairs the sample standard deviation is
        undefined, so the bounds come back as NaN.
    """
    # Coerce to float arrays so plain Python sequences can be subtracted
    # element-wise (``list - list`` raises TypeError).
    differences = np.asarray(data1, dtype=float) - np.asarray(data2, dtype=float)
    n = len(differences)
    mean_diff = np.mean(differences)
    std_diff = np.std(differences, ddof=1)

    # Named ``tail_prob`` rather than ``alpha`` so it cannot be confused
    # with the module-level significance level of the same name.
    tail_prob = 1 - confidence
    t_critical = stats.t.ppf(1 - tail_prob / 2, n - 1)
    margin_error = t_critical * (std_diff / np.sqrt(n))

    ci_lower = mean_diff - margin_error
    ci_upper = mean_diff + margin_error

    return mean_diff, ci_lower, ci_upper

print(f"\n=== WILCOXON SIGNED-RANK TEST for dataset {TARGET_DATASET} ===")
print('='*70)

# The merge step leaves merged_data as None when no input file could be
# loaded; report that instead of failing on the first lookup.
if merged_data is None:
    print("❌ No merged data available for comparison")
    challengers = []
else:
    # The first configured model is the reference; every other configured
    # model is tested against it.
    challengers = MODELS_TO_COMPARE[1:]

for model2 in challengers:
    model1 = MODELS_TO_COMPARE[0]

    # Rows belonging to each of the two models
    model1_data = merged_data[merged_data['Model'] == model1].copy()
    model2_data = merged_data[merged_data['Model'] == model2].copy()

    print(f"\n{model1} vs {model2}")
    print('-' * 40)

    # Pair the two models' AUC values on (Dataset_Type, Fold); the AUC
    # columns are disambiguated with a per-model suffix.
    comparison_df = pd.merge(
        model1_data[['Dataset_Type', 'Fold', 'AUC']],
        model2_data[['Dataset_Type', 'Fold', 'AUC']],
        on=['Dataset_Type', 'Fold'],
        suffixes=(f'_{model1}', f'_{model2}')
    )

    if comparison_df.empty:
        print("❌ No matching data for comparison")
        continue

    auc1_col = f'AUC_{model1}'
    auc2_col = f'AUC_{model2}'

    # Paired score arrays for the tests
    model1_scores = comparison_df[auc1_col].values
    model2_scores = comparison_df[auc2_col].values

    # Mean difference and its confidence interval
    mean_diff, ci_lower, ci_upper = calculate_confidence_interval(model1_scores, model2_scores)

    # Wilcoxon signed-rank test: two-sided, plus one-sided for
    # "model1 scores are greater than model2 scores".
    statistic, p_value = wilcoxon(model1_scores, model2_scores, alternative='two-sided')
    statistic_greater, p_value_greater = wilcoxon(model1_scores, model2_scores, alternative='greater')

    # Compact output
    print(f"Mean Difference: {mean_diff:.4f}")
    print(f"95% CI: [{ci_lower:.4f}, {ci_upper:.4f}]")
    print(f"P-value (two-sided): {p_value:.4f}")
    print(f"P-value ({model1} > {model2}): {p_value_greater:.4f}")

    if p_value < alpha:
        # The direction of a significant difference is taken from the mean AUCs.
        if np.mean(model1_scores) > np.mean(model2_scores):
            print(f"✅ {model1} significantly better")
        else:
            print(f"✅ {model2} significantly better")
    else:
        print("❌ No significant difference")

    # Save the paired scores and their difference to Excel. A failed write
    # (for example a locked or unwritable file) is reported and must not
    # abort the remaining comparisons.
    comparison_df['Difference'] = comparison_df[auc1_col] - comparison_df[auc2_col]
    comparison_output = os.path.join(BASE_DIR, TARGET_DATASET, f"comparison_{model1}_vs_{model2}_{TARGET_DATASET}.xlsx")
    try:
        comparison_df.to_excel(comparison_output, index=False)
    except OSError as e:
        print(f"❌ Could not save {comparison_output}: {e}")

print('\n' + '='*70)


=== WILCOXON SIGNED-RANK TEST cho dataset diabetes ===

PinFSSVM vs MILP1
----------------------------------------
Mean Difference: 0.0042
95% CI: [-0.0016, 0.0101]
P-value (two-sided): 0.1494
P-value (PinFSSVM > MILP1): 0.0747
❌ No significant difference

PinFSSVM vs RFESVM
----------------------------------------
Mean Difference: 0.0034
95% CI: [-0.0056, 0.0124]
P-value (two-sided): 0.2188
P-value (PinFSSVM > RFESVM): 0.1094
❌ No significant difference

PinFSSVM vs FisherSVM
----------------------------------------
Mean Difference: 0.0151
95% CI: [0.0073, 0.0228]
P-value (two-sided): 0.0003
P-value (PinFSSVM > FisherSVM): 0.0001
✅ PinFSSVM significantly better

PinFSSVM vs L1SVM
----------------------------------------
Mean Difference: 0.0088
95% CI: [0.0016, 0.0160]
P-value (two-sided): 0.0143
P-value (PinFSSVM > L1SVM): 0.0071
✅ PinFSSVM significantly better

