In [None]:
import os
import optuna
import numpy as np
import pandas as pd
import warnings
# from three_class_ncv_selectk import NestedCVOptimizer
from two_class_ncv_selectk import NestedCVOptimizer
warnings.filterwarnings('ignore')

In [None]:
def nested_cv_binary(X, y, groups, exp_name, positive_class="void", save_plots=True,
                     n_outer_folds=5, n_inner_folds=3, n_trials=50, random_state=42):
    """
    Run binary nested cross-validation and collect the optimizer's reports.

    A ``NestedCVOptimizer`` is built from the arguments, ``run_nested_cv()`` is
    executed, and every summary the optimizer exposes is gathered into a single
    dictionary. Optionally the optimizer's plots are written to a directory
    named ``plots_<exp_name>_binary`` (relative to the current working directory).

    Parameters:
    -----------
    X : DataFrame
        Feature matrix
    y : Series/array
        Target labels (binary)
    groups : Series/array
        Group labels, forwarded to the optimizer as ``groups``
        (presumably used for group-aware fold splitting -- confirm in
        ``two_class_ncv_selectk``)
    exp_name : str
        Experiment name; used to build the plot directory name
    positive_class : str
        Which class to treat as positive (default: "void")
    save_plots : bool
        Whether to save visualization plots
    n_outer_folds : int
        Forwarded to the optimizer as ``n_outer_folds`` (default: 5)
    n_inner_folds : int
        Forwarded to the optimizer as ``n_inner_folds`` (default: 3)
    n_trials : int
        Forwarded to the optimizer as ``n_trials`` (default: 50)
    random_state : int
        Forwarded to the optimizer as ``random_state`` (default: 42)

    Returns:
    --------
    dict with keys:
        'summary'                    - return value of ``run_nested_cv()``
        'results_dataframe'          - ``get_results_dataframe()``
        'auc_summary'                - ``get_auc_summary()``
        'feature_summary'            - ``get_feature_selection_summary()``
        'confusion_matrices'         - ``get_confusion_matrices_summary()``
        'confusion_matrix_dataframe' - ``get_confusion_matrix_dataframe()``
        'optimizer'                  - the optimizer instance itself
    """
    # Initialize binary classifier
    optimizer = NestedCVOptimizer(
        X=X,
        y=y,
        groups=groups,
        positive_class=positive_class,
        n_outer_folds=n_outer_folds,
        n_inner_folds=n_inner_folds,
        n_trials=n_trials,
        random_state=random_state,
    )

    print("\nRunning binary nested cross-validation...")
    print(f"Positive class: '{positive_class}'")
    summaries = optimizer.run_nested_cv()

    # Get comprehensive results
    results_df = optimizer.get_results_dataframe()
    auc_summary = optimizer.get_auc_summary()
    feature_summary = optimizer.get_feature_selection_summary()
    cm_summary = optimizer.get_confusion_matrices_summary()
    cm_dataframe = optimizer.get_confusion_matrix_dataframe()

    # Print key results
    print("\n📊 BINARY CLASSIFICATION RESULTS:")
    if 'best_model' in summaries:
        print(f"Best performing model: {summaries['best_model'].upper()}")
        print(f"Best F1 (positive): {summaries['best_f1_positive']:.4f}")
        print(f"Best accuracy: {summaries['best_accuracy']:.4f}")
        print(f"Best AUC: {summaries['best_auc']:.4f}")
    else:
        print("Error: Could not determine best model")

    # Generate and save plots if requested
    if save_plots:
        plot_dir = f"plots_{exp_name}_binary"
        os.makedirs(plot_dir, exist_ok=True)

        # ROC curves
        optimizer.plot_roc_curves(save_path=os.path.join(plot_dir, "roc_curves.png"))

        # AUC comparison
        optimizer.plot_auc_comparison(save_path=os.path.join(plot_dir, "auc_comparison.png"))

        # Feature selection analysis
        optimizer.plot_feature_selection_analysis(
            save_path=os.path.join(plot_dir, "feature_analysis.png")
        )

        # Confusion matrices - aggregated view
        optimizer.plot_confusion_matrices(
            save_path=os.path.join(plot_dir, "confusion_matrices.png")
        )

        # Individual fold confusion matrices for best model.
        # When the summary names no best model, 'rf' is plotted as a fallback.
        best_model = summaries.get('best_model', 'rf')
        optimizer.plot_individual_fold_cms(
            model_name=best_model,
            save_path=os.path.join(plot_dir, f"cm_folds_{best_model}.png"),
        )

        print(f"\n📁 Plots saved to: {plot_dir}/")

    return {
        'summary': summaries,
        'results_dataframe': results_df,
        'auc_summary': auc_summary,
        'feature_summary': feature_summary,
        'confusion_matrices': cm_summary,
        'confusion_matrix_dataframe': cm_dataframe,
        'optimizer': optimizer,  # Keep reference for additional analysis
    }

In [None]:
# Input CSVs are named two_class_pp_<N>s_<variant>.csv with N = 1..5 and
# variant in {'no', '0.5', '0.8'} (presumably window length in seconds and
# window overlap -- confirm against the feature-extraction step).
files = [
    f'two_class_pp_{seconds}s_{variant}.csv'
    for seconds in range(1, 6)
    for variant in ('no', '0.5', '0.8')
]

# Folder that holds the extracted-feature CSVs listed above.
base_path = '/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/pipeline/modified/data_extracted_features'


1. Loop through all feature files.
2. Perform nested cross-validation on each file.
3. Store the results in the format shown below.

![Output format](/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/output_format.png)


In [None]:
# Directory that receives the per-experiment CSV exports.
output_dir = '/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/pipeline/modified/nested_cv_results/two_class'
# Create it up front: to_csv does not create missing folders, and failing on
# the export would throw away a completed nested-CV run.
os.makedirs(output_dir, exist_ok=True)

# Maps experiment name (e.g. '1s_0.5') to the dict returned by nested_cv_binary.
all_results = {}
for file in files:
    data_path = os.path.join(base_path, file)
    # Timing columns are removed right after loading, before X/y/groups are split off.
    features = pd.read_csv(data_path).drop(columns=['center_time', 'start_time', 'end_time'])

    # 'two_class_pp_1s_0.5.csv' -> '1s_0.5'
    details = file.split('_')
    exp_name = f"{details[3]}_{details[-1].replace('.csv', '')}"
    print(f"Nested cross-validation for {exp_name}")

    X = features.drop(columns=['label', 'experiment_id'])
    y = features['label']
    groups = features['experiment_id']

    results = nested_cv_binary(X, y, groups, exp_name, positive_class="void", save_plots=True)
    # Keep every experiment's results; previously all_results stayed empty.
    all_results[exp_name] = results

    # Save to csv
    results['results_dataframe'].to_csv(
        os.path.join(output_dir, f'{exp_name}_two_class_nested_cv.csv')
    )
    results['feature_summary'].to_csv(
        os.path.join(output_dir, f'{exp_name}_two_class_feature_summary.csv')
    )
    results['confusion_matrix_dataframe'].to_csv(
        os.path.join(output_dir, f'{exp_name}_two_class_confusion_matrices.csv')
    )


## Three class

In [None]:
# def nested_cv_multiclass(X, y, groups, exp_name, save_plots=True):
#     """
#     Run 3-class nested cross-validation with comprehensive results including confusion matrices.
    
#     Parameters:
#     -----------
#     X : DataFrame
#         Feature matrix
#     y : Series/array 
#         Target labels (3 classes: 'pre-void', 'void', 'post-void')
#     groups : Series/array
#         Group labels for GroupKFold
#     exp_name : str
#         Experiment name for saving plots
#     save_plots : bool
#         Whether to save visualization plots
        
#     Returns:
#     --------
#     dict: Comprehensive results including DataFrames and confusion matrices
#     """
#     # Initialize 3-class classifier
#     optimizer = NestedCVOptimizer(  
#         X=X,
#         y=y,
#         groups=groups,
#         n_outer_folds=5,
#         n_inner_folds=3,
#         n_trials=50,
#         random_state=42
#     )
    
#     print(f"\nRunning 3-class nested cross-validation...")
#     print(f"Expected classes: {sorted(y.unique())}")
#     summaries = optimizer.run_nested_cv()
    
#     # Get comprehensive results
#     results_df = optimizer.get_results_dataframe()
#     auc_summary = optimizer.get_auc_summary()
#     per_class_summary = optimizer.get_per_class_summary()
#     feature_summary = optimizer.get_feature_selection_summary()
#     cm_summary = optimizer.get_confusion_matrices_summary()
#     cm_dataframe = optimizer.get_confusion_matrix_dataframe()
    
#     # Print key results
#     print(f"\n📊 3-CLASS CLASSIFICATION RESULTS:")
#     if 'best_model' in summaries:
#         print(f"Best performing model: {summaries['best_model'].upper()}")
#         print(f"Best F1 (macro): {summaries['best_f1_macro']:.4f}")
#         print(f"Best accuracy: {summaries['best_accuracy']:.4f}")
#         print(f"Best macro AUC: {summaries['best_macro_auc']:.4f}")
#     else:
#         print("Error: Could not determine best model")
    
#     # Generate and save plots if requested
#     if save_plots:
#         plot_dir = f"plots_{exp_name}_multiclass"
#         import os
#         os.makedirs(plot_dir, exist_ok=True)
        
#         # ROC curves (one-vs-rest for each class)
#         optimizer.plot_roc_curves(save_path=f"{plot_dir}/roc_curves.png")
        
#         # AUC comparison
#         optimizer.plot_auc_comparison(save_path=f"{plot_dir}/auc_comparison.png")
        
#         # Feature selection analysis
#         optimizer.plot_feature_selection_analysis(save_path=f"{plot_dir}/feature_analysis.png")
        
#         # Confusion matrices - aggregated view
#         optimizer.plot_confusion_matrices(save_path=f"{plot_dir}/confusion_matrices.png")
        
#         # Individual fold confusion matrices for best model
#         best_model = summaries.get('best_model', 'rf')
#         optimizer.plot_individual_fold_cms(
#             model_name=best_model, 
#             save_path=f"{plot_dir}/cm_folds_{best_model}.png"
#         )
        
#         print(f"\n📁 Plots saved to: {plot_dir}/")
    
#     return {
#         'summary': summaries,
#         'results_dataframe': results_df,
#         'auc_summary': auc_summary,
#         'per_class_summary': per_class_summary,
#         'feature_summary': feature_summary,
#         'confusion_matrices': cm_summary,
#         'confusion_matrix_dataframe': cm_dataframe,
#         'optimizer': optimizer  # Keep reference for additional analysis
#     }

In [None]:
# files = [
#     'three_class_pp_1s_no.csv',
#     'three_class_pp_1s_0.5.csv',
#     'three_class_pp_1s_0.8.csv',
#     'three_class_pp_2s_no.csv',
#     'three_class_pp_2s_0.5.csv',
#     'three_class_pp_2s_0.8.csv',
#     'three_class_pp_3s_no.csv',
#     'three_class_pp_3s_0.5.csv',
#     'three_class_pp_3s_0.8.csv',
#     'three_class_pp_4s_no.csv',
#     'three_class_pp_4s_0.5.csv',
#     'three_class_pp_4s_0.8.csv',
#     'three_class_pp_5s_no.csv',
#     'three_class_pp_5s_0.5.csv',
#     'three_class_pp_5s_0.8.csv'
# ]

# base_path = '/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/pipeline/modified/data_extracted_features'

In [None]:
# all_results = {}
# for file in files:
#     data_path = os.path.join(base_path, file)
#     features = pd.read_csv(data_path)
#     features.drop(columns=['center_time', 'start_time', 'end_time'], inplace=True)
#     details = file.split('_')
#     exp_name = f"{details[3]}_{details[-1].replace('.csv', '')}"
#     print(f"Nested cross-validation for {exp_name}")
    
#     X = features.drop(columns=['label', 'experiment_id'])
#     y = features['label']
#     groups = features['experiment_id']
    
#     results = nested_cv_multiclass(X, y, groups, exp_name, save_plots=True)

#     # Save to csv
#     results['results_dataframe'].to_csv(f'/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/pipeline/modified/nested_cv_results/three_class/{exp_name}_three_class_nested_cv.csv')
#     results['feature_summary'].to_csv(f'/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/pipeline/modified/nested_cv_results/three_class/{exp_name}_three_class_feature_summary.csv')
#     results['confusion_matrix_dataframe'].to_csv(f'/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/pipeline/modified/nested_cv_results/three_class/{exp_name}_three_class_confusion_matrices.csv')