In [None]:
import os
import optuna
import numpy as np
import pandas as pd
import warnings
from sb_norm_analysis.three_class.three_class_nested_cv import NestedCVOptimizer

warnings.filterwarnings('ignore')

In [None]:
def nested_cv(X, y, groups, n_outer_folds=5, n_inner_folds=3, n_trials=50, random_state=42):
    """
    Run nested cross-validation on one dataset and return the optimizer's results.

    Builds a ``NestedCVOptimizer``, runs it, prints the best model and its
    accuracy, and returns the optimizer's results table.

    Parameters
    ----------
    X, y, groups
        Forwarded unchanged to ``NestedCVOptimizer`` as ``X``, ``y`` and
        ``groups``. (Presumably features, labels and per-sample group ids
        used for group-aware splitting -- confirm against NestedCVOptimizer.)
    n_outer_folds, n_inner_folds, n_trials, random_state
        Forwarded unchanged to ``NestedCVOptimizer``. The defaults
        (5, 3, 50, 42) are the values that used to be hard-coded here, so
        existing three-argument calls behave exactly as before.

    Returns
    -------
    The object returned by ``optimizer.get_results_dataframe()``
    (presumably a pandas DataFrame -- confirm against NestedCVOptimizer).

    Raises
    ------
    KeyError
        If the summary returned by ``run_nested_cv()`` lacks the
        ``'best_model'`` or ``'best_accuracy'`` keys.
    """
    # Initialize and run optimizer
    optimizer = NestedCVOptimizer(
        X=X,
        y=y,
        groups=groups,
        n_outer_folds=n_outer_folds,
        n_inner_folds=n_inner_folds,
        n_trials=n_trials,
        random_state=random_state
    )

    print("\nRunning nested cross-validation...")
    summaries = optimizer.run_nested_cv()

    # Display summary (header emoji restored from its mis-decoded form;
    # plain string because there is nothing to interpolate).
    print("\n📊 RESULTS SUMMARY:")
    print(f"Best performing model: {summaries['best_model'].upper()}")
    print(f"Best accuracy: {summaries['best_accuracy']:.4f}")

    return optimizer.get_results_dataframe()

In [None]:
# Feature CSVs to evaluate. The names follow the pattern
# 'three_class_raw_<N>s_<tag>.csv' with N in 1..5 and tag in ('no', '0.5', '0.8')
# (presumably window length in seconds and overlap setting -- confirm).
# 'three_class_raw_1s_no.csv' was commented out of the original list and
# therefore stays excluded here.
files = [
    f'three_class_raw_{window}s_{tag}.csv'
    for window in range(1, 6)
    for tag in ('no', '0.5', '0.8')
    if (window, tag) != (1, 'no')
]

# Directory containing the feature CSVs listed above.
base_path = (
    '/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch'
    '/feature_datasets/subject_based_norm/feature_set_2'
)

1. Loop through all files.
2. Perform nested cross-validation on each file.
3. Store the results in the format shown below:

![Output format](/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/output_format.png)


In [None]:
# Run nested cross-validation on every feature file, keep each result in
# memory (all_results) and write one CSV per experiment.
results_dir = '/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/sb_norm_analysis/three_class/nested_cv_results/feature_set_2'
# Create the output folder up front so a long run cannot fail at the final to_csv call.
os.makedirs(results_dir, exist_ok=True)

all_results = {}  # exp_name -> object returned by nested_cv
for file in files:
    data_path = os.path.join(base_path, file)
    # Drop the timing columns before modelling; chained drop builds a new
    # frame instead of mutating the loaded one in place.
    features = pd.read_csv(data_path).drop(columns=['center_time', 'start_time', 'end_time'])

    # e.g. 'three_class_raw_1s_0.5.csv' -> '1s_0.5'
    details = file.split('_')
    exp_name = f"{details[3]}_{details[-1].replace('.csv', '')}"
    print(f"Nested cross-validation for {exp_name}")

    # 'label' is the target and 'experiment_id' the grouping key; every
    # remaining column is passed as a feature.
    X = features.drop(columns=['label', 'experiment_id'])
    y = features['label']
    groups = features['experiment_id']

    results_df = nested_cv(X, y, groups)
    # Previously the dict was created but never filled.
    all_results[exp_name] = results_df

    # Save to csv (same file name and location as before)
    results_df.to_csv(os.path.join(results_dir, f'all_metrics_nested_cv_{exp_name}.csv'))
    


Single-file test (commented out): runs the same nested cross-validation on `files[0]` only.

In [None]:
# data_path = os.path.join(base_path, files[0])
# features = pd.read_csv(data_path)
# features.drop(columns=['center_time', 'start_time', 'end_time'], inplace=True)
# details = files[0].split('_')
# exp_name = f"{details[3]}_{details[-1].replace('.csv', '')}"
# print(f"Nested cross-validation for {exp_name}")
    
# X = features.drop(columns=['label', 'experiment_id'])
# y = features['label']
# groups = features['experiment_id']


# # Initialize and run optimizer
# optimizer = NestedCVOptimizer(
#         X=X,
#         y=y,
#         groups=groups,
#         n_outer_folds=5,
#         n_inner_folds=3,
#         n_trials=50,
#         random_state=42
#     )

# print("\nRunning nested cross-validation...")
# summaries = optimizer.run_nested_cv()

# # Display summary
# print(f"\n📊 RESULTS SUMMARY:")
# print(f"Best performing model: {summaries['best_model'].upper()}")
# print(f"Best accuracy: {summaries['best_accuracy']:.4f}")

# results = optimizer.get_results_dataframe()


# results.to_csv(f'/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/sb_norm_analysis/three_class/nested_cv_results/feature_set_2/all_metrics_nested_cv_{exp_name}_2.csv')


In [None]:
# def nested_cv(X, y, groups):
#     # Initialize optimizer
#     optimizer = NestedCVOptimizer(
#         X=X,
#         y=y,
#         groups=groups,
#         n_outer_folds=5,
#         n_inner_folds=3,
#         n_trials=50,
#         random_state=42
#     )
    
#     # Run nested CV
#     print("\nRunning nested cross-validation...")
#     results = optimizer.run_nested_cv()
    
    
#     # Get fold accuracies
#     fold_accuracies = optimizer.get_fold_results()
    
    
#     # Create comprehensive results DataFrame
#     results_data = []
    
#     for model_name in optimizer.models:
#         for fold_idx in range(optimizer.n_outer_folds):
#             # Get accuracy for this fold
#             accuracy = fold_accuracies[model_name][fold_idx]
            
#             # Get best parameters for this fold
#             best_params = optimizer.best_params_per_fold[model_name][fold_idx]
            
#             # Create row
#             row = {
#                 'Model': model_name.upper(),
#                 'Fold': fold_idx + 1,
#                 'Accuracy': round(accuracy, 4)
#             }
            
#             # Add hyperparameters as separate columns
#             for param_name, param_value in best_params.items():
#                 row[f'best_{param_name}'] = param_value
            
#             results_data.append(row)
    
#     # Convert to DataFrame
#     results_df = pd.DataFrame(results_data)
    
#     # Add summary statistics
#     summary_stats = []
#     for model_name in optimizer.models:
#         accuracies = fold_accuracies[model_name]
#         summary_stats.append({
#             'Model': model_name.upper(),
#             'Fold': 'MEAN',
#             'Accuracy': round(np.mean(accuracies), 4)
#         })
#         summary_stats.append({
#             'Model': model_name.upper(), 
#             'Fold': 'STD',
#             'Accuracy': round(np.std(accuracies), 4)
#         })
    
#     summary_df = pd.DataFrame(summary_stats)
    
#     # Combine results
#     final_df = pd.concat([results_df, summary_df], ignore_index=True)
    
#     print(f"\n📊 RESULTS SUMMARY:")
#     print(f"Best performing model: {results['best_model'].upper()}")
#     print(f"Best accuracy: {results['best_accuracy']:.4f}")
    
#     print(f"\n📈 PERFORMANCE SUMMARY:")
#     for model_name in optimizer.models:
#         model_scores = [score for score in optimizer.outer_scores[model_name]]
        
#         if model_scores:
#             # Calculate means across folds
#             accuracies = [score['accuracy'] for score in model_scores]
#             precision_macro = [score['precision_macro'] for score in model_scores]
#             recall_macro = [score['recall_macro'] for score in model_scores]
#             f1_macro = [score['f1_macro'] for score in model_scores]
            
#             print(f"\n{model_name.upper()}:")
#             print(f"  Accuracy:        {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
#             print(f"  Precision Macro: {np.mean(precision_macro):.4f} ± {np.std(precision_macro):.4f}")
#             print(f"  Recall Macro:    {np.mean(recall_macro):.4f} ± {np.std(recall_macro):.4f}")
#             print(f"  F1 Macro:        {np.mean(f1_macro):.4f} ± {np.std(f1_macro):.4f}")
#             print(f"  Fold Accuracies: {[round(acc, 4) for acc in accuracies]}")
    
#     return final_df

In [None]:
# results_df = pd.read_csv('/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/sb_norm_analysis/three_class/nested_cv_results/feature_set_2/nested_cv_1s_no.csv')
# print(f"\n" + "="*80)
# print("RESULTS DATAFRAME:")
# print("="*80)
# print(results_df.to_string(index=False))

# # Example: Show key metrics comparison
# print(f"\n" + "="*70)
# print("KEY METRICS COMPARISON:")
# print("="*70)
        
# # Filter to fold results only (exclude MEAN/STD rows)
# fold_results = results_df[results_df['Fold'].isin([1,2,3,4,5])]

# # Create summary pivot tables
# metrics_to_show = ['Accuracy', 'Precision_Macro', 'Recall_Macro', 'F1_Macro']

# for metric in metrics_to_show:
#     print(f"\n{metric.replace('_', ' ').upper()}:")
#     pivot_df = fold_results.pivot(index='Fold', columns='Model', values=metric)
#     print(pivot_df)
        
# print(f"\n" + "="*70)
# print("SUMMARY STATISTICS:")
# print("="*70)
        
# # Show mean ± std for each metric
# stats_results = results_df[results_df['Fold'].isin(['MEAN', 'STD'])]
        
# for metric in metrics_to_show:
#     print(f"\n{metric.replace('_', ' ').upper()}:")
#     pivot_stats = stats_results.pivot(index='Fold', columns='Model', values=metric)
#     print(pivot_stats)