In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                        roc_curve, auc, roc_auc_score, log_loss)
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneOut
from essentials import complete_preprocessing_pipeline
import copy
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from typing import Tuple
from essentials import normalization



In [2]:
# Load the three pickled dictionaries used by this notebook.
# Each file is opened in binary mode and closed again as soon as it has been read.
with open('../data/df_dict_imu.pkl', 'rb') as imu_file:
    imu_dict = pickle.load(imu_file)

with open('../data/df_dict_urineestimate_method1.pkl', 'rb') as urine_file:
    urine_estimates_dict = pickle.load(urine_file)

with open('../data/df_minze_dict.pkl', 'rb') as truth_file:
    ground_truth_dict = pickle.load(truth_file)

In [3]:
# Recordings removed from imu_dict before preprocessing.
# NOTE(review): the reason these two recordings are excluded is not recorded in
# this notebook — confirm and document it.
excluded_recordings = ('subj_9_void4', 'subj_11_void2')
for recording_key in excluded_recordings:
    # pop(..., None) instead of `del`: re-running this cell without reloading
    # the pickles must not raise KeyError for keys that are already gone.
    imu_dict.pop(recording_key, None)

In [4]:
# Hand the pipeline a deep copy rather than imu_dict itself.
data_dict = copy.deepcopy(imu_dict)

# Pipeline settings for this run: 60 Hz target rate, no normalisation inside
# the pipeline, and the three-class option switched off.
labelled_imu_dict = complete_preprocessing_pipeline(
    data_dict,
    ground_truth_dict,
    target_fs=60,
    normalize_data=False,
    use_three_classes=False,
)

Step 1: Resampling data to 60 Hz...


100%|██████████| 41/41 [00:00<00:00, 595.13it/s]
Step 2: Processing each instance: 100%|██████████| 41/41 [00:00<00:00, 1201.26it/s]


In [5]:
# Give every labelled dataframe a 1-based experiment_id column (in dictionary
# order) and collect the frames in imu_list.
# The column is written onto the frames held in labelled_imu_dict themselves.
imu_list = []
for experiment_id, frame in enumerate(labelled_imu_dict.values(), start=1):
    frame['experiment_id'] = experiment_id
    imu_list.append(frame)

In [6]:
# Stack the per-experiment frames from imu_list row-wise into one dataframe;
# the original row indices are discarded in favour of a fresh 0..n-1 index.
main_df = pd.concat(objs=imu_list, axis=0, ignore_index=True)

In [7]:
# Display the combined dataframe as this cell's output.
main_df

Unnamed: 0,time,acc_x,acc_y,acc_z,gyr_x,gyr_y,gyr_z,label,experiment_id
0,0.000000,0.066242,-13.923373,-17.075456,-38.322404,8.725663,83.141039,non-void,1
1,0.016672,0.042377,-13.943841,-17.017971,-26.512188,2.859905,81.112387,non-void,1
2,0.033344,0.008036,-13.984501,-16.977426,-13.290214,-2.208047,90.963856,non-void,1
3,0.050016,-0.011408,-14.014916,-16.958092,-5.035968,-4.649128,128.658540,non-void,1
4,0.066688,0.012667,-13.938258,-16.923885,3.089309,-1.511407,184.109181,non-void,1
...,...,...,...,...,...,...,...,...,...
145221,41.109668,-0.414715,-9.773258,-5.399930,-5.878189,-39.491056,5.752304,non-void,41
145222,41.126339,-0.446581,-9.757047,-5.420230,-14.090114,-42.481000,-8.307202,non-void,41
145223,41.143010,-0.432605,-9.771430,-5.389436,-23.564834,-46.225333,-23.997358,non-void,41
145224,41.159680,-0.406158,-9.756612,-5.327400,-19.459784,-51.782481,-38.238428,non-void,41


In [8]:
# Fold counts for the outer and the inner cross-validation loop.
n_outer_splits, n_inner_splits = 5, 3

# Both splitters are configured identically (shuffled, seed 42) and differ
# only in their number of folds.
outer_cv, inner_cv = [
    StratifiedGroupKFold(n_splits=fold_count, shuffle=True, random_state=42)
    for fold_count in (n_outer_splits, n_inner_splits)
]

In [None]:
# Outer cross-validation loop over main_df.
# The splitter receives experiment_id as the group key and label as the
# stratification target, so rows of one experiment stay on the same side of a fold.
outer_splits = outer_cv.split(main_df, y=main_df['label'], groups=main_df['experiment_id'])
for fold_id, (train_id, test_id) in enumerate(outer_splits):
    print(f"Fold {fold_id + 1}")
    data_train, data_test = main_df.iloc[train_id], main_df.iloc[test_id]
    # Labels get real names: binding them to `_` would shadow IPython's
    # last-output variable for the rest of the kernel session.
    labels_train, labels_test = main_df['label'].iloc[train_id], main_df['label'].iloc[test_id]
    groups_train, groups_test = main_df['experiment_id'].iloc[train_id], main_df['experiment_id'].iloc[test_id]

    # Apply global normalization.
    # NOTE(review): `normalization` comes from the project's `essentials` module;
    # presumably it derives its statistics from the training frame only — confirm.
    data_train_norm, data_test_norm = normalization(data_train, data_test)

    
    
    

Fold 1
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
Fold 2
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
Fold 3
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
Fold 4
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
Fold 5
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [None]:
def extract_features_labels(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Split a labelled dataframe into a feature matrix and a label vector.

    Every column except the bookkeeping columns becomes a feature. The
    bookkeeping columns are the time axis (``time``, or ``timestamp`` if a
    frame uses that name), the target ``label`` and the grouping key
    ``experiment_id``. Excluded names that are absent from ``data`` are
    simply ignored.

    Args:
        data: Dataframe that contains at least a ``label`` column.

    Returns:
        A tuple ``(X, y)`` where ``X`` holds the feature columns in their
        original order and ``y`` is the ``label`` column.

    Raises:
        KeyError: If ``data`` has no ``label`` column.
    """
    # The combined frame in this notebook names its time axis `time`, so it has
    # to be excluded explicitly; `timestamp` stays in the set for frames that
    # use that name instead.
    non_feature_cols = {'time', 'timestamp', 'label', 'experiment_id'}
    feature_cols = [col for col in data.columns if col not in non_feature_cols]
    return data[feature_cols], data['label']

In [10]:
class NestedCVOptimizer:
    """Container for the inputs of a nested cross-validation run.

    This cell defines only the constructor, which stores each argument
    unchanged on the instance under the same name.

    Args:
        data: Dataset to work on; stored as-is.
        positive_class: Label value treated as the positive class
            (default ``"void"``).
        n_outer_folds: Number of outer folds (default 3).
        n_inner_folds: Number of inner folds (default 2).
        n_trials: Number of trials (default 50).
        random_state: Seed value (default 42).
    """

    def __init__(self, data, positive_class="void", n_outer_folds=3, n_inner_folds=2,
                 n_trials=50, random_state=42):
        # Dataset and the label regarded as positive.
        self.data, self.positive_class = data, positive_class
        # Fold counts for the outer and inner loops.
        self.n_outer_folds, self.n_inner_folds = n_outer_folds, n_inner_folds
        # Search budget and seed.
        self.n_trials, self.random_state = n_trials, random_state
        
        
        

In [11]:
# The integers 1 through 10 as a list.
num = list(range(1, 11))