In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('../../data/processed/WESAD/data_processed/30s/eda_data.csv', index_col=0)
data.head()

Unnamed: 0,ACC_x_mean,ACC_y_mean,ACC_z_mean,net_acc_mean,net_acc_std,EDA_mean,EDA_std,EDA_slope,RESP_rate,RESP_regularity,HR,IBI,RMSSD,SDNN,pNN50,lf/hf,label,subject,focus_label
0,39.252,-47.664667,7.822,62.549339,0.544205,0.563131,0.014406,-0.000412,21.083279,1.630067,63.966173,958.669355,213.058396,133.561891,90.0,5.933398,1,8,1
1,39.628667,-46.246667,9.764,62.469299,1.120992,0.523506,0.009367,-0.000265,21.327375,1.591487,66.372243,911.132812,134.469912,80.095987,74.193548,44.207315,1,8,1
2,38.927333,-48.439333,6.083333,62.442314,0.428267,0.520051,0.018623,4.8e-05,22.168906,1.35259,78.588847,803.631757,159.088795,155.42606,58.333333,707.930928,1,8,1
3,38.919333,-47.006667,-0.914667,62.576766,1.166077,0.493109,0.007353,-0.000203,24.011434,1.809602,65.088073,940.020161,199.098718,134.709634,63.333333,6.149627,1,8,1
4,49.157333,-18.907333,31.472,62.762208,1.500866,0.470519,0.010338,-0.000283,20.909046,1.358827,65.761617,924.316406,145.87533,94.891277,54.83871,1.541767,1,8,1


In [3]:
subjects = data['subject'].unique()

In [4]:
def safe_slope(x):
    if len(x) < 2 or np.all(np.isnan(x)):
        return np.nan
    try:
        return np.polyfit(range(len(x)), x, 1)[0]
    except np.linalg.LinAlgError:
        return np.nan

def add_features_mean_based(df):
    df['hr_diff'] = df['HR'].diff()
    df['hr_center'] = df['HR'] - df['HR'].mean()
    df['hr_slope'] = df['HR'].rolling(window=3, min_periods=2).apply(safe_slope, raw=True)
    df['hr_zscore'] = (df['HR'] - df['HR'].mean()) / df['HR'].std()

    df['lf_hf_log'] = np.log1p(df['lf/hf'])

    df['sdnn_diff'] = df['SDNN'].diff()
    df['sdnn_slope'] = df['SDNN'].rolling(window=3, min_periods=2).apply(safe_slope, raw=True)
    df['sdnn_zscore'] = (df['SDNN'] - df['SDNN'].mean()) / df['SDNN'].std()
    df['hr_sdnn_ratio'] = df['HR'] / (df['SDNN'] + 1e-6)

    df['resp_zscore'] = (df['RESP_rate'] - df['RESP_rate'].mean()) / df['RESP_rate'].std()

    df['EDA_slope_clipped'] = df['EDA_slope'].clip(-0.002, 0.005)
    df['EDA_slope_log'] = np.sign(df['EDA_slope_clipped']) * np.log1p(np.abs(df['EDA_slope_clipped']))
    df['EDA_slope_pos'] = (df['EDA_slope'] > 0).astype(int)

    df['eda_hr_interaction'] = df['EDA_mean'] * df['HR']
    df['eda_resp_ratio'] = df['EDA_std'] / df['RESP_regularity']
    df['hr_resp_interaction'] = df['HR'] * df['RESP_rate']
    df['hr_resp_z_interaction'] = df['hr_zscore'] * df['resp_zscore']
    df['hrv_composite'] = (df['RMSSD'] + df['SDNN'] + df['pNN50']) / 3
    df['hrv_stress_index'] = df['SDNN'] / df['RMSSD']

    df['arousal_index'] = df['HR'] * df['EDA_mean'] * df['RESP_rate']

    return df

In [5]:
data_feature_addon = []

In [6]:
for s in subjects:
    df = data[data['subject'] == s].copy().reset_index(drop=True)
    df = add_features_mean_based(df)
    df.dropna(inplace=True)
    data_feature_addon.append(df)

In [7]:
df = pd.concat(data_feature_addon, ignore_index=True)

In [8]:
df.head()

Unnamed: 0,ACC_x_mean,ACC_y_mean,ACC_z_mean,net_acc_mean,net_acc_std,EDA_mean,EDA_std,EDA_slope,RESP_rate,RESP_regularity,...,EDA_slope_clipped,EDA_slope_log,EDA_slope_pos,eda_hr_interaction,eda_resp_ratio,hr_resp_interaction,hr_resp_z_interaction,hrv_composite,hrv_stress_index,arousal_index
0,39.628667,-46.246667,9.764,62.469299,1.120992,0.523506,0.009367,-0.000265,21.327375,1.591487,...,-0.000265,-0.000265,0,34.746298,0.005886,1415.545722,-0.035662,96.253149,0.595642,741.047339
1,38.927333,-48.439333,6.083333,62.442314,0.428267,0.520051,0.018623,4.8e-05,22.168906,1.35259,...,4.8e-05,4.8e-05,1,40.8702,0.013768,1742.228749,0.00707,124.28273,0.976977,906.047629
2,38.919333,-47.006667,-0.914667,62.576766,1.166077,0.493109,0.007353,-0.000203,24.011434,1.809602,...,-0.000203,-0.000203,0,32.095491,0.004064,1562.857969,-1.403354,132.380562,0.676597,770.658757
3,49.157333,-18.907333,31.472,62.762208,1.500866,0.470519,0.010338,-0.000283,20.909046,1.358827,...,-0.000283,-0.000283,0,30.942103,0.007608,1375.012643,0.16433,98.535106,0.650496,646.969837
4,46.228,-39.174667,13.782,62.674127,1.910619,0.442887,0.007276,-0.000196,22.165715,1.926374,...,-0.000196,-0.000196,0,33.749942,0.003777,1689.123832,-0.076409,116.329704,1.112837,748.091592


In [9]:
df.to_csv('../../data/processed/WESAD/data_processed/30s/addon_feature_data.csv')

In [10]:
df.columns.unique

<bound method Index.unique of Index(['ACC_x_mean', 'ACC_y_mean', 'ACC_z_mean', 'net_acc_mean', 'net_acc_std',
       'EDA_mean', 'EDA_std', 'EDA_slope', 'RESP_rate', 'RESP_regularity',
       'HR', 'IBI', 'RMSSD', 'SDNN', 'pNN50', 'lf/hf', 'label', 'subject',
       'focus_label', 'hr_diff', 'hr_center', 'hr_slope', 'hr_zscore',
       'lf_hf_log', 'sdnn_diff', 'sdnn_slope', 'sdnn_zscore', 'hr_sdnn_ratio',
       'resp_zscore', 'EDA_slope_clipped', 'EDA_slope_log', 'EDA_slope_pos',
       'eda_hr_interaction', 'eda_resp_ratio', 'hr_resp_interaction',
       'hr_resp_z_interaction', 'hrv_composite', 'hrv_stress_index',
       'arousal_index'],
      dtype='object')>

# Feature selection using this flow
1. plot heatmap correlation

2. cut-off feature that have correlate > 0.9 left just one feature

3. Random forest training and using SHAP to understand feature importance

4. Top-N feature selection
### By using this flow we will got top-N feature ;)

In [None]:
selector = SelectKBest(score_func=f_classif, k=18)
X_selected = selector.fit_transform(X, y)
selected_columns = X.columns[selector.get_support()]