In [15]:
import pandas as pd
import numpy as np
import pickle
import os
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                        roc_curve, auc, roc_auc_score, log_loss)
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneOut
from essentials import complete_preprocessing_pipeline
import copy
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from typing import Tuple, Dict
from essentials import normalization
from feature_sets_center_less import GenerateFeatures
from three_class_ncv_selectk_gn import ModifiedNestedCVOptimizer


In [None]:
# Load the three pickled inputs used by this notebook.
# NOTE(review): `urine_estimates_dict` is not referenced in any other cell
# visible here — confirm whether it is still needed.
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


imu_dict = _load_pickle('../data/df_dict_imu.pkl')
urine_estimates_dict = _load_pickle('../data/df_dict_urineestimate_method1.pkl')
ground_truth_dict = _load_pickle('../data/df_minze_dict.pkl')

In [None]:
# Remove two recordings from the IMU dictionary before preprocessing.
# NOTE(review): the reason these two voids are excluded is not recorded in
# this notebook — TODO document it.
# `pop(key, None)` is used instead of `del` so the cell is idempotent:
# re-running it on the same kernel does not raise KeyError for keys that
# were already removed.
for excluded_key in ('subj_9_void4', 'subj_11_void2'):
    imu_dict.pop(excluded_key, None)

In [None]:
# Hand the pipeline a deep copy so that whatever it does to its input cannot
# alter `imu_dict`.
data_dict = copy.deepcopy(imu_dict)

# Run the project preprocessing pipeline on the copy together with the ground
# truth. Normalization is switched off here (normalize_data=False).
# NOTE(review): presumably resamples to target_fs and attaches three-class
# labels — confirm in essentials.complete_preprocessing_pipeline.
labelled_imu_dict = complete_preprocessing_pipeline(
    data_dict,
    ground_truth_dict,
    target_fs=60,
    normalize_data=False,
    use_three_classes=True,
)

In [None]:
# Give every labelled recording a 1-based integer `experiment_id` (assigned in
# dictionary order) and collect the frames in `imu_list`.
# The column is written onto the frames held in `labelled_imu_dict` itself,
# so those frames are modified in place.
imu_list = []
for experiment_number, recording_df in enumerate(labelled_imu_dict.values(), start=1):
    recording_df['experiment_id'] = experiment_number
    imu_list.append(recording_df)

In [None]:
# Stack the per-experiment frames collected in `imu_list` into a single
# DataFrame. ignore_index=True discards the per-frame row labels and renumbers
# the rows consecutively; experiments stay distinguishable via `experiment_id`.
main_df = pd.concat(imu_list, ignore_index=True)

In [None]:
# Window configurations as (window length, overlap) pairs: every window length
# from 1 to 5 combined with the overlaps 0.0, 0.5 and 0.8, ordered by window
# length first (15 pairs in total).
window_configs = [
    (duration, overlap_value)
    for duration in range(1, 6)
    for overlap_value in (0.0, 0.5, 0.8)
]

def dataframe_to_dict_by_experiment(df: pd.DataFrame) -> dict:
    """Split a combined DataFrame into one DataFrame per experiment.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing an 'experiment_id' column. It is not modified.

    Returns
    -------
    dict
        Maps 'exp_<experiment_id>' to the rows of that experiment, in order of
        first appearance of each id. Each value has the 'experiment_id' column
        removed and a fresh 0..n-1 index. Rows whose 'experiment_id' is
        missing (NaN) are skipped.

    Raises
    ------
    KeyError
        If `df` has no 'experiment_id' column.
    """
    experiment_dict = {}
    # A single groupby pass replaces one boolean mask over the whole frame per
    # experiment id. sort=False keeps the groups in order of first appearance,
    # which is the order `Series.unique()` would have produced.
    for exp_id, exp_data in df.groupby('experiment_id', sort=False):
        experiment_dict[f'exp_{exp_id}'] = (
            exp_data
            .drop(columns=['experiment_id'])
            # IMPORTANT: Reset index to avoid KeyError in feature extraction
            .reset_index(drop=True)
        )
    return experiment_dict

def extract_features_from_dict(data_dict: dict, window_size: float, overlap: float,
                               fs: int = 60) -> pd.DataFrame:
    """Extract windowed features for every experiment in `data_dict`.

    Parameters
    ----------
    data_dict : dict
        Maps keys of the form 'exp_<int>' to one DataFrame per experiment.
        The integer after the first underscore is written to the output as
        'experiment_id'.
    window_size : float
        Passed to GenerateFeatures as `window_duration`.
    overlap : float
        Passed to GenerateFeatures as `overlap`.
    fs : int, default 60
        Sampling rate passed to GenerateFeatures. The default matches the
        value that used to be hard-coded here.

    Returns
    -------
    pd.DataFrame
        The per-experiment summary tables concatenated with a fresh index,
        each tagged with an 'experiment_id' column. An empty DataFrame is
        returned when `data_dict` is empty.

    Raises
    ------
    ValueError
        If the part of a key after the first underscore is not an integer.
    """
    # pd.concat raises on an empty list; return an empty frame instead.
    if not data_dict:
        return pd.DataFrame()

    all_features = []

    for exp_key, df in data_dict.items():
        actual_exp_id = int(exp_key.split('_')[1])

        analyzer = GenerateFeatures(fs=fs, window_duration=window_size, overlap=overlap)
        # The return values are not needed here; the call is kept because
        # create_summary_table() presumably reads state it populates
        # — TODO confirm in feature_sets_center_less.GenerateFeatures.
        analyzer.analyze_multi_axis_imu(df)

        table = analyzer.create_summary_table()
        table['experiment_id'] = actual_exp_id
        all_features.append(table)

    return pd.concat(all_features, ignore_index=True)


In [None]:
# Store results for all window configurations.
# Maps a configuration key (e.g. '1s_no', '2s_0.5') to the list of per-fold
# result dicts filled in by the extraction loop. Re-running this cell resets
# the dictionary to empty.
all_global_norm_results = {}

In [None]:
n_outer_splits = 5
n_inner_splits = 3

outer_cv = StratifiedGroupKFold(n_splits=n_outer_splits, shuffle=True, random_state=42)
# NOTE(review): inner_cv is not referenced in any other cell visible here; it
# is kept in case code outside this view relies on it.
inner_cv = StratifiedGroupKFold(n_splits=n_inner_splits, shuffle=True, random_state=42)

# The outer split depends only on main_df (labels and experiment groups), not
# on the window configuration, and the fixed integer random_state makes every
# split() call yield the same folds. Compute the fold indices once instead of
# once per configuration.
outer_folds = list(
    outer_cv.split(main_df, y=main_df['label'], groups=main_df['experiment_id'])
)

# Loop through each window configuration
for window_size, overlap in window_configs:
    print(f"\n{'='*80}")
    print(f"PROCESSING: {window_size}s window, {overlap} overlap")
    print(f"{'='*80}")

    config_fold_results = []

    # Rows are grouped by experiment_id, so all rows of one experiment land on
    # the same side (train or test) of a fold.
    for fold_id, (train_id, test_id) in enumerate(outer_folds):
        print(f"Fold {fold_id + 1}")
        data_train, data_test = main_df.iloc[train_id], main_df.iloc[test_id]
        groups_train = main_df['experiment_id'].iloc[train_id]
        groups_test = main_df['experiment_id'].iloc[test_id]

        # Apply global normalization
        # NOTE(review): presumably the statistics are fitted on the training
        # rows and applied to both sets — confirm in essentials.normalization.
        data_train_norm, data_test_norm = normalization(data_train, data_test)

        print("✓ Applied global normalization")
        print(f"Train experiments: {sorted(groups_train.unique())}")
        print(f"Test experiments: {sorted(groups_test.unique())}")

        # Convert normalized DataFrames to dictionaries for feature extraction
        train_dict = dataframe_to_dict_by_experiment(data_train_norm)
        test_dict = dataframe_to_dict_by_experiment(data_test_norm)

        # Extract features
        print("Extracting features...")
        train_features = extract_features_from_dict(train_dict, window_size, overlap)
        test_features = extract_features_from_dict(test_dict, window_size, overlap)

        print(f"Train features shape: {train_features.shape}")
        print(f"Test features shape: {test_features.shape}")

        # Store this fold's results
        fold_result = {
            'fold': fold_id + 1,
            'window_size': window_size,
            'overlap': overlap,
            'train_features': train_features,
            'test_features': test_features,
            'original_train_groups': groups_train,
            'original_test_groups': groups_test
        }
        config_fold_results.append(fold_result)

    # Store results for this configuration: one list entry per outer fold.
    # Zero overlap is spelled 'no' in the key (e.g. '1s_no', '1s_0.5').
    overlap_str = 'no' if overlap == 0.0 else str(overlap)
    config_key = f"{window_size}s_{overlap_str}"
    all_global_norm_results[config_key] = config_fold_results

    print(f"\n✓ Completed {config_key}: {len(config_fold_results)} folds")

print(f"\n{'='*80}")
print("FEATURE EXTRACTION COMPLETED FOR ALL CONFIGURATIONS")
print(f"{'='*80}")

# Show summary and save each fold's train/test features to CSV for inspection.
# NOTE(review): absolute, machine-specific path — consider moving it to a
# configurable location.
features_output_dir = '/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/global_normalization/extracted_features/three_class'
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before the first write.
os.makedirs(features_output_dir, exist_ok=True)

for config_key, fold_results in all_global_norm_results.items():
    print(f"\n{config_key}:")
    for fold_result in fold_results:
        fold_id = fold_result['fold']
        train_shape = fold_result['train_features'].shape
        test_shape = fold_result['test_features'].shape
        print(f"  Fold {fold_id}: Train{train_shape}, Test{test_shape}")

        # File names: train_<config>_fold<k>.csv / test_<config>_fold<k>.csv
        fold_result['train_features'].to_csv(
            os.path.join(features_output_dir, f'train_{config_key}_fold{fold_id}.csv'),
            index=False
        )
        fold_result['test_features'].to_csv(
            os.path.join(features_output_dir, f'test_{config_key}_fold{fold_id}.csv'),
            index=False
        )
        


In [12]:
# Persist `all_global_norm_results` to disk as a pickle so the extracted
# features can be reloaded without repeating the extraction.
features_pickle_path = '/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/global_normalization/three_class_global_norm_extracted_features.pkl'
with open(features_pickle_path, 'wb') as pickle_file:
    pickle.dump(all_global_norm_results, pickle_file)

# Nested CV with pre-split data

In [None]:
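# NOTE: this entire cell is disabled (commented-out) legacy code. It calls
# `run_modified_nested_cv`, which is not imported in the import cell at the
# top of this notebook, so it cannot be re-enabled as-is. The class-based
# `run_global_norm_nested_cv_with_class` cell further down appears to be the
# active replacement.
# # Import the modified optimizer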

# # Run the modified nested CV on your extracted features
# print(f"\n{'='*80}")
# print("STARTING MODIFIED NESTED CV WITH HYPERPARAMETER OPTIMIZATION")
# print(f"{'='*80}")

# # Check what you have
# print(f"Available configurations: {list(all_global_norm_results.keys())}")
# print(f"Total folds per configuration: {len(list(all_global_norm_results.values())[0])}")

# # Run the evaluation
# detailed_results, summary_results, optimizer = run_modified_nested_cv(
#     all_global_norm_results,
#     positive_class="void",
#     n_inner_folds=3,  # Same as your original
#     n_trials=50       # Same as your original
# )

# # Save detailed results
# detailed_results.to_csv(
#     '/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/global_normalization/nested_cv_results/global_norm_detailed_results.csv',
#     index=False
# )

# # Save summary results
# summary_results.to_csv(
#     '/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/global_normalization/nested_cv_results/global_norm_summary_results.csv',
#     index=False
# )

# print(f"\n{'='*80}")
# print("RESULTS SUMMARY - AVERAGES ACROSS 5 OUTER FOLDS")
# print(f"{'='*80}")

# # Verify we have 5 folds per configuration
# folds_per_config = detailed_results.groupby(['config', 'model'])['fold'].nunique()
# print(f"Verification - Folds per config/model: {folds_per_config.iloc[0]} (should be 5)")
# print(f"Total evaluations: {len(detailed_results)} (should be {15 * 5 * 3} = 225)")

# # Show best configurations with explicit fold averaging
# print(f"\n{'='*50}")
# print("TOP 10 CONFIGURATIONS (Mean ± Std across 5 folds)")
# print(f"{'='*50}")
# top_configs = summary_results.sort_values('f1_positive_mean', ascending=False).head(10)

# print(f"{'Config':<12} {'Model':<4} {'F1(+)':<12} {'Accuracy':<12} {'AUC':<12}")
# print("-" * 60)
# for _, row in top_configs.iterrows():
#     print(f"{row['config']:<12} {row['model']:<4} "
#           f"{row['f1_positive_mean']:.3f}±{row['f1_positive_std']:.3f}  "
#           f"{row['accuracy_mean']:.3f}±{row['accuracy_std']:.3f}  "
#           f"{row['auc_mean']:.3f}±{row['auc_std']:.3f}")

# # Show best model overall
# best_result = summary_results.loc[summary_results['f1_positive_mean'].idxmax()]
# print(f"\n{'='*50}")
# print("BEST OVERALL RESULT (Mean ± Std across 5 folds)")
# print(f"{'='*50}")
# print(f"Configuration: {best_result['config']}")
# print(f"Model: {best_result['model']}")
# print(f"F1 (positive): {best_result['f1_positive_mean']:.4f} ± {best_result['f1_positive_std']:.4f}")
# print(f"Accuracy: {best_result['accuracy_mean']:.4f} ± {best_result['accuracy_std']:.4f}")
# print(f"AUC: {best_result['auc_mean']:.4f} ± {best_result['auc_std']:.4f}")
# print(f"Precision (pos): {best_result['precision_positive_mean']:.4f} ± {best_result['precision_positive_std']:.4f}")
# print(f"Recall (pos): {best_result['recall_positive_mean']:.4f} ± {best_result['recall_positive_std']:.4f}")

# # Model comparison (averaged across ALL configurations and folds)
# print(f"\n{'='*50}")
# print("MODEL COMPARISON (Mean ± Std across all configs & folds)")
# print(f"{'='*50}")
# model_stats = []
# for model in detailed_results['model'].unique():
#     model_data = detailed_results[detailed_results['model'] == model]
#     model_stats.append({
#         'Model': model,
#         'F1_Pos_Mean': model_data['f1_positive'].mean(),
#         'F1_Pos_Std': model_data['f1_positive'].std(),
#         'Accuracy_Mean': model_data['accuracy'].mean(),
#         'Accuracy_Std': model_data['accuracy'].std(),
#         'AUC_Mean': model_data['auc'].mean(),
#         'AUC_Std': model_data['auc'].std(),
#         'N_Evaluations': len(model_data)
#     })

# model_comparison_df = pd.DataFrame(model_stats)
# print(model_comparison_df.round(4))

# # Configuration comparison (averaged across models and folds)
# print(f"\n{'='*50}")
# print("TOP 5 WINDOW CONFIGURATIONS (Mean ± Std across models & folds)")
# print(f"{'='*50}")
# config_stats = []
# for config in detailed_results['config'].unique():
#     config_data = detailed_results[detailed_results['config'] == config]
#     config_stats.append({
#         'Config': config,
#         'F1_Pos_Mean': config_data['f1_positive'].mean(),
#         'F1_Pos_Std': config_data['f1_positive'].std(),
#         'Accuracy_Mean': config_data['accuracy'].mean(),
#         'AUC_Mean': config_data['auc'].mean(),
#         'N_Evaluations': len(config_data)
#     })

# config_comparison_df = pd.DataFrame(config_stats).sort_values('F1_Pos_Mean', ascending=False)
# print(config_comparison_df.head().round(4))

# print(f"\n{'='*80}")
# print("ANALYSIS COMPLETE")
# print(f"Detailed results saved to: global_normalization/nested_cv_results/global_norm_detailed_results.csv")
# print(f"Summary results saved to: global_normalization/nested_cv_results/global_norm_summary_results.csv")
# print(f"{'='*80}")

# Nested CV with pre-split data - save each dataset separately

In [13]:
# Reload the pickled per-configuration fold results into
# `all_global_norm_results`, replacing whatever the name currently holds.
# Only load pickle files you produced yourself: unpickling can execute code.
features_pickle_path = '/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/global_normalization/three_class_global_norm_extracted_features.pkl'
with open(features_pickle_path, 'rb') as pickle_file:
    all_global_norm_results = pickle.load(pickle_file)

In [16]:
def run_global_norm_nested_cv_with_class(all_global_norm_results: Dict, base_save_path: str, 
                                        n_inner_folds: int = 3, 
                                        n_trials: int = 50) -> ModifiedNestedCVOptimizer:
    """
    Convenience function to run global normalization nested CV using the class-based approach.

    Builds a ModifiedNestedCVOptimizer (random_state fixed to 42), runs it over
    every configuration, prints the best configuration/model/F1 and returns
    the optimizer so the caller can query further results.

    Parameters:
    -----------
    all_global_norm_results : Dict
        Results from the global normalization pipeline: configuration key
        mapped to the list of per-fold result dicts. Passed unchanged to
        `ModifiedNestedCVOptimizer.run_all_configurations`.
    base_save_path : str
        Directory to save results; passed unchanged to
        `run_all_configurations`.
    n_inner_folds : int
        Number of inner CV folds for hyperparameter optimization
    n_trials : int
        Number of optimization trials per model

    Returns:
    --------
    ModifiedNestedCVOptimizer instance with all results
    """

    # Initialize the optimizer
    optimizer = ModifiedNestedCVOptimizer(
        n_inner_folds=n_inner_folds,
        n_trials=n_trials,
        random_state=42
    )

    # Run all configurations. The return value is not needed here: results are
    # read back through the optimizer below and by the caller.
    optimizer.run_all_configurations(all_global_norm_results, base_save_path)

    # Print summary
    best_config, best_model, best_f1 = optimizer.get_best_configuration()
    print("\nBEST OVERALL RESULT:")
    print(f"Configuration: {best_config}")
    print(f"Model: {best_model}")
    print(f"F1 Score (positive class): {best_f1:.4f}")

    return optimizer


In [17]:
# Directory handed to the optimizer as its base save path.
base_save_path = '/home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/global_normalization/nested_cv_results/three_class'

# Run the nested CV over every configuration in one call
# (3 inner folds, 50 optimization trials per model).
optimizer = run_global_norm_nested_cv_with_class(
    all_global_norm_results=all_global_norm_results,
    base_save_path=base_save_path,
    n_inner_folds=3,
    n_trials=50,
)

# Query the best configuration/model from the returned optimizer.
best_config, best_model, best_f1 = optimizer.get_best_configuration()
print(f"Best: {best_config} with {best_model} - F1: {best_f1:.4f}")

# Side-by-side comparison of the three 0.5-overlap configurations.
comparison = optimizer.compare_configurations(['1s_0.5', '2s_0.5', '3s_0.5'])
print(comparison)

# Detailed summary for the best configuration found above.
summary = optimizer.get_configuration_summary(best_config)
print(summary)

[I 2025-09-20 16:41:38,615] A new study created in memory with name: no-name-8b27e755-c754-4558-b35a-82a06a330fdd



GLOBAL NORMALIZATION NESTED CV - CLASS-BASED APPROACH
Base save path: /home/edumaba/Public/MPhil_Thesis/Code/wear_uropatch/global_normalization/nested_cv_results/three_class
Configurations to process: 15

PROCESSING CONFIGURATION: 1s_no

  OUTER FOLD 1
  Train: (2020, 107), Test: (378, 107)
    Optimizing RF...


[I 2025-09-20 16:41:39,918] Trial 0 finished with value: -0.6633519959316031 and parameters: {'selector__k': 21, 'selector__score_func': 'f_classif', 'clf__n_estimators': 369, 'clf__max_depth': 3, 'clf__min_samples_split': 20, 'clf__min_samples_leaf': 9, 'clf__max_features': 'sqrt', 'clf__bootstrap': False, 'clf__class_weight': None}. Best is trial 0 with value: -0.6633519959316031.
[I 2025-09-20 16:41:40,297] Trial 1 finished with value: -0.6547456198398659 and parameters: {'selector__k': 73, 'selector__score_func': 'f_classif', 'clf__n_estimators': 70, 'clf__max_depth': 13, 'clf__min_samples_split': 5, 'clf__min_samples_leaf': 1, 'clf__max_features': 'log2', 'clf__bootstrap': True, 'clf__class_weight': None}. Best is trial 1 with value: -0.6547456198398659.
[I 2025-09-20 16:41:42,446] Trial 2 finished with value: -4.734611287458243 and parameters: {'selector__k': 42, 'selector__score_func': 'f_classif', 'clf__n_estimators': 133, 'clf__max_depth': 20, 'clf__min_samples_split': 16, 'cl

KeyboardInterrupt: 