In [101]:
import pandas as pd
from scipy.stats import wilcoxon, bootstrap # Thêm bootstrap
import os
import numpy as np

# --- CONFIGURATION: EDIT THESE VALUES ---
# Root directory of the experiment results; every score-file path is built from it.
BASE_RESULTS_DIR = r'D:\Pin_FS_SVM\src\experiment\results'
# Sub-directory of BASE_RESULTS_DIR that holds the per-fold score files.
WILCOXON_DATA_SUBDIR = 'wilcoxon'
# Spreadsheet column holding the per-fold metric; its lower-cased form is also
# part of the score file name ("<model>_<metric>_folds.xlsx").
METRIC_COLUMN_NAME = 'AUC'

# Dataset folder under WILCOXON_DATA_SUBDIR.
TARGET_DATASET_FOLDER = 'diabetes'  # Examples: 'wdbc', 'colon', 'ionosphere'
# Folder below the dataset folder.
# NOTE(review): the meaning of 'both' is not visible in this script - confirm.
TARGET_DATASET_TYPE_FOLDER = 'both'
# The two models being compared; each name is the prefix of its score file.
# Differences are computed as model 1 minus model 2.
MODEL1_CLASS_NAME = 'PinFSSVM'
MODEL2_CLASS_NAME = 'RFESVM'

ALPHA = 0.05 # Significance level
N_BOOTSTRAP_RESAMPLES = 9999 # Number of bootstrap resamples (should be > 1000)
CONFIDENCE_LEVEL_FOR_CI = 0.95 # Confidence level for the CI (corresponds to alpha 0.05)
# --- END OF CONFIGURATION ---

def load_scores_from_file(dataset_folder, dataset_type_folder, model_name):
    """Load the per-fold metric scores of one model from its Excel results file.

    The file is looked up at
    ``BASE_RESULTS_DIR/WILCOXON_DATA_SUBDIR/<dataset_folder>/<dataset_type_folder>/``
    under the name ``<model_name>_<metric>_folds.xlsx``, where ``<metric>`` is
    ``METRIC_COLUMN_NAME`` in lower case.

    Args:
        dataset_folder: Name of the dataset folder.
        dataset_type_folder: Name of the folder below the dataset folder.
        model_name: Model name used as the file-name prefix.

    Returns:
        A 1-D float NumPy array with the values of the ``METRIC_COLUMN_NAME``
        column, or ``None`` when the file is missing or unreadable, the column
        is absent or empty, or it contains non-numeric or NaN values. Every
        failure is reported on stdout; no exception is propagated for these
        cases.
    """
    file_path = os.path.join(BASE_RESULTS_DIR, WILCOXON_DATA_SUBDIR,
                             dataset_folder, dataset_type_folder,
                             f"{model_name}_{METRIC_COLUMN_NAME.lower()}_folds.xlsx")
    print(f"Attempting to read: {file_path}")
    if not os.path.exists(file_path):
        print(f"  ERROR: File not found: {file_path}")
        return None

    # Deliberately broad: reading Excel can fail in many ways (missing engine,
    # corrupt workbook, permissions). Only the read itself is guarded so that
    # validation problems below are not misreported as read failures.
    try:
        df = pd.read_excel(file_path)
    except Exception as e:
        print(f"  ERROR: Could not read file {file_path}. Error: {e}")
        return None

    if METRIC_COLUMN_NAME not in df.columns:
        print(f"  ERROR: Column '{METRIC_COLUMN_NAME}' not found in {file_path}")
        return None

    column = df[METRIC_COLUMN_NAME]
    if len(column) == 0:
        print(f"  ERROR: No data in '{METRIC_COLUMN_NAME}' column in {file_path}")
        return None

    # Coerce explicitly: np.isnan raises TypeError on object-dtype arrays, so a
    # column stored as objects would otherwise be rejected with a misleading
    # message even when every cell is a valid number.
    try:
        scores = pd.to_numeric(column, errors='raise').to_numpy(dtype=float)
    except (TypeError, ValueError) as e:
        print(f"  ERROR: Non-numeric values found in '{METRIC_COLUMN_NAME}' column in {file_path}. Error: {e}")
        return None

    if np.isnan(scores).any():
        print(f"  ERROR: NaN values found in '{METRIC_COLUMN_NAME}' column in {file_path}.")
        return None
    return scores

def get_bootstrap_ci(data1, data2, n_resamples, confidence_level, statistic_func=np.median,
                     random_state=None):
    """Bootstrap a confidence interval for a statistic of the paired differences.

    The differences ``data1 - data2`` are resampled with
    ``scipy.stats.bootstrap`` using the percentile method.

    Args:
        data1: Scores of the first model (sequence or array of numbers).
        data2: Scores of the second model, paired element-wise with ``data1``.
        n_resamples: Number of bootstrap resamples.
        confidence_level: Confidence level of the interval, e.g. ``0.95``.
        statistic_func: Statistic computed on the differences (default: median).
        random_state: Optional seed or generator forwarded to
            ``scipy.stats.bootstrap`` for reproducible intervals. ``None``
            (the default) keeps the unseeded behaviour.

    Returns:
        A ``(low, high)`` tuple. ``(nan, nan)`` is returned, after printing a
        message, when the inputs are not numeric, are not the same shape, hold
        fewer than 5 pairs, or the bootstrap itself fails.
    """
    nan_interval = (np.nan, np.nan)

    try:
        first = np.atleast_1d(np.asarray(data1, dtype=float))
        second = np.atleast_1d(np.asarray(data2, dtype=float))
    except (TypeError, ValueError) as e:
        print(f"  Error during bootstrap CI calculation: {e}")
        return nan_interval

    # A paired difference is only meaningful element-by-element. Without this
    # check mismatched lengths raise from the subtraction, and a length-1 input
    # would silently broadcast.
    if first.shape != second.shape:
        print("  Error during bootstrap CI calculation: "
              f"paired samples have different shapes {first.shape} vs {second.shape}.")
        return nan_interval

    differences = first - second
    # Very small samples give an uninformative interval; require at least 5 pairs.
    if len(differences) < 5:
        print("  Warning: Not enough data points for bootstrap CI on differences.")
        return nan_interval

    # Only forward the seed when one was given, so the default call is unchanged.
    extra_kwargs = {} if random_state is None else {"random_state": random_state}
    try:
        res = bootstrap((differences,), statistic_func, confidence_level=confidence_level,
                        n_resamples=n_resamples, method='percentile', **extra_kwargs)
    except Exception as e:
        # Best-effort: the caller reports a NaN interval rather than aborting.
        print(f"  Error during bootstrap CI calculation: {e}")
        return nan_interval
    return (res.confidence_interval.low, res.confidence_interval.high)

def perform_single_wilcoxon_test_with_ci(scores1, scores2, model1_name, model2_name, alpha_level,
                                         n_bootstrap_resamples, ci_confidence_level):
    """Run a two-sided paired Wilcoxon signed-rank test and print a report.

    The report lists both score vectors, their means, the paired differences
    (``scores1 - scores2``) and their median, the Wilcoxon statistic and
    p-value, a bootstrap confidence interval for the median difference, and a
    textual conclusion.

    Args:
        scores1: Per-fold scores of the first model (sequence or array).
        scores2: Per-fold scores of the second model, paired with ``scores1``.
        model1_name: Display name of the first model.
        model2_name: Display name of the second model.
        alpha_level: Significance level compared against the p-value.
        n_bootstrap_resamples: Number of bootstrap resamples for the interval.
        ci_confidence_level: Confidence level of the interval, e.g. ``0.95``.

    Returns:
        None. Everything is written to stdout. Empty or unequal-length inputs
        are reported and the function returns early.
    """
    # Accept plain sequences too: the element-wise subtraction needs arrays.
    scores1 = np.asarray(scores1)
    scores2 = np.asarray(scores2)

    print("-" * 40)
    print("Performing Wilcoxon Signed-Rank Test (Two-Sided) with Confidence Interval")
    print(f"Comparing: {model1_name} vs {model2_name}")
    print(f"Alpha level: {alpha_level}")
    print(f"Scores for {model1_name} (N={len(scores1)}): {scores1}")
    print(f"Scores for {model2_name} (N={len(scores2)}): {scores2}")

    # Without this guard an empty input would report "all differences are zero".
    if len(scores1) == 0 or len(scores2) == 0:
        print("ERROR: At least one score list is empty. Cannot perform paired test.")
        return

    print(f"{model1_name} Mean {METRIC_COLUMN_NAME}: {np.mean(scores1):.4f}")
    print(f"{model2_name} Mean {METRIC_COLUMN_NAME}:    {np.mean(scores2):.4f}")

    if len(scores1) != len(scores2):
        print("ERROR: Score lists have different lengths. Cannot perform paired test.")
        return
    if len(scores1) < 5:
        print(f"WARNING: Sample size (N={len(scores1)}) is small. Test results might not be very reliable.")

    differences = scores1 - scores2
    print(f"Differences ({model1_name} - {model2_name}): {differences}")
    median_of_differences = np.median(differences)
    print(f"Median of Differences: {median_of_differences:.4f}")

    if np.all(differences == 0):
        # The test is skipped when every pair is tied.
        print("All differences are zero. The two models performed identically on these folds.")
        conclusion = "No difference (p-value = 1.0)."
    else:
        try:
            statistic, p_value = wilcoxon(scores1, scores2, alternative='two-sided')
        except ValueError as e:
            print(f"ERROR during Wilcoxon test: {e}")
            conclusion = f"Wilcoxon test could not be performed: {e}"
        else:
            print(f"Wilcoxon Statistic: {statistic:.4f}")
            print(f"P-value (two-sided): {p_value:.4f}")

            # Confidence interval for the median of the paired differences.
            ci_low, ci_high = get_bootstrap_ci(scores1, scores2, n_bootstrap_resamples, ci_confidence_level)
            # ":g" avoids the truncation of int(): 0.999 is shown as 99.9, not 99.
            ci_label = f"{ci_confidence_level * 100:g}%"
            print(f"{ci_label} CI for Median Difference ({model1_name} - {model2_name}): ({ci_low:.4f}, {ci_high:.4f})")

            # A NaN interval (bootstrap unavailable) makes every comparison
            # below False, so no directional statement is added in that case.
            if p_value < alpha_level:
                conclusion = f"Statistically significant difference found (p < {alpha_level})."
                # The interval may touch zero but must extend to one side only.
                # A degenerate (0, 0) interval supports neither model.
                if ci_low >= 0 and ci_high > 0:
                    conclusion += f" {model1_name} is likely better than {model2_name}."
                elif ci_high <= 0 and ci_low < 0:
                    conclusion += f" {model2_name} is likely better than {model1_name}."
            else:
                conclusion = f"No statistically significant difference found (p >= {alpha_level})."
                if ci_low > 0 and ci_high > 0:
                    conclusion += (f" However, the CI is entirely positive, which suggests that "
                                   f"{model1_name} is likely better than {model2_name}.")
                elif ci_low < 0 and ci_high < 0:
                    conclusion += (f" However, the CI is entirely negative, which suggests that "
                                   f"{model2_name} is likely better than {model1_name}.")
                elif ci_low <= 0 <= ci_high:
                    conclusion += " CI for median difference includes zero, supporting no significant difference."

    print(f"Conclusion: {conclusion}")
    print("-" * 40)

if __name__ == '__main__':
    # Banner: one print call, one line per item, describing the comparison.
    print(
        "--- Wilcoxon Signed-Rank Test Script with Confidence Interval ---",
        f"Dataset: {TARGET_DATASET_FOLDER} - Type: {TARGET_DATASET_TYPE_FOLDER}",
        f"Comparing: {MODEL1_CLASS_NAME} vs {MODEL2_CLASS_NAME}",
        f"Metric: {METRIC_COLUMN_NAME}",
        "-" * 40,
        sep="\n",
    )

    # Load the per-fold scores of both models; a failed load yields None.
    scores_m1 = load_scores_from_file(
        dataset_folder=TARGET_DATASET_FOLDER,
        dataset_type_folder=TARGET_DATASET_TYPE_FOLDER,
        model_name=MODEL1_CLASS_NAME,
    )
    scores_m2 = load_scores_from_file(
        dataset_folder=TARGET_DATASET_FOLDER,
        dataset_type_folder=TARGET_DATASET_TYPE_FOLDER,
        model_name=MODEL2_CLASS_NAME,
    )

    # The paired test needs both score vectors; otherwise report and stop.
    if scores_m1 is None or scores_m2 is None:
        print("\nCould not perform test due to errors in loading data for one or both models.")
    else:
        perform_single_wilcoxon_test_with_ci(
            scores_m1,
            scores_m2,
            MODEL1_CLASS_NAME,
            MODEL2_CLASS_NAME,
            ALPHA,
            N_BOOTSTRAP_RESAMPLES,
            CONFIDENCE_LEVEL_FOR_CI,
        )

--- Wilcoxon Signed-Rank Test Script with Confidence Interval ---
Dataset: diabetes - Type: both
Comparing: PinFSSVM vs RFESVM
Metric: AUC
----------------------------------------
Attempting to read: D:\Pin_FS_SVM\src\experiment\results\wilcoxon\diabetes\both\PinFSSVM_auc_folds.xlsx
Attempting to read: D:\Pin_FS_SVM\src\experiment\results\wilcoxon\diabetes\both\RFESVM_auc_folds.xlsx
----------------------------------------
Performing Wilcoxon Signed-Rank Test (Two-Sided) with Confidence Interval
Comparing: PinFSSVM vs RFESVM
Alpha level: 0.05
Scores for PinFSSVM (N=10): [0.63807692 0.68891403 0.59692982 0.78615385 0.70219298 0.66287879
 0.67916667 0.67467949 0.68269231 0.72275641]
Scores for RFESVM (N=10): [0.63807692 0.68891403 0.58815789 0.76615385 0.70219298 0.64772727
 0.66354167 0.65384615 0.66185897 0.72275641]
PinFSSVM Mean AUC: 0.6834
RFESVM Mean AUC:    0.6733
Differences (PinFSSVM - RFESVM): [0.         0.         0.00877193 0.02       0.         0.01515152
 0.015625   0.0208