In [1]:
import pandas as pd
import numpy as np
from data import POG4_Dataset
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.feature_selection import RFECV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy

import arfs
import arfs.feature_selection as arfsfs
import arfs.feature_selection.allrelevant as arfsgroot
from arfs.feature_selection import (
    MinRedundancyMaxRelevance,
    GrootCV,
    MissingValueThreshold,
    UniqueValuesThreshold,
    CollinearityThreshold,
    make_fs_summary,
)
from arfs.utils import LightForestClassifier, LightForestRegressor
from arfs.benchmark import highlight_tick, compare_varimp, sklearn_pimp_bench
from arfs.utils import load_data
from arfs.preprocessing import OrdinalEncoderPandas

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


In [2]:
data = POG4_Dataset()

INFO - Creating XML data
INFO - Creating activity data
INFO - Missing days: 87
INFO - Featurizing time series data
INFO - Creating interactions...


In [3]:
for i in data.train.columns:
    print(i)

date
sleep_hours
workout_duration
workout_totalDistance
workout_totalDistanceUnit
workout_totalEnergyBurned
workout_totalEnergyBurnedUnit
WalkingSpeed
slp_WalkingSpeed_max_hrs_between
slp_WalkingSpeed_sum_hrs_between
slp_WalkingSpeed_count_hrs_between
slp_WalkingSpeed_hrs_min_min
slp_WalkingSpeed_hrs_min_max
slp_WalkingSpeed_hrs_max_min
slp_WalkingSpeed_hrs_max_max
slp_WalkingSpeed_min_endDate_hr
slp_WalkingSpeed_max_endDate_hr
slp_WalkingSpeed_min_startDate_hr
slp_WalkingSpeed_max_startDate_hr
slp_WalkingSpeed_21_00_00
slp_WalkingSpeed_21_05_00
slp_WalkingSpeed_21_10_00
slp_WalkingSpeed_21_15_00
slp_WalkingSpeed_21_20_00
slp_WalkingSpeed_21_25_00
slp_WalkingSpeed_21_30_00
slp_WalkingSpeed_21_35_00
slp_WalkingSpeed_21_40_00
slp_WalkingSpeed_21_45_00
slp_WalkingSpeed_21_50_00
slp_WalkingSpeed_21_55_00
slp_WalkingSpeed_22_00_00
slp_WalkingSpeed_22_05_00
slp_WalkingSpeed_22_10_00
slp_WalkingSpeed_22_15_00
slp_WalkingSpeed_22_20_00
slp_WalkingSpeed_22_25_00
slp_WalkingSpeed_22_30_00
slp_Wa

In [4]:
train = data.train[(data.train['date'] >= pd.to_datetime('2020-09-25').date()) & (data.train['date'] <= pd.to_datetime('2021-11-30').date())]


In [5]:
train

Unnamed: 0,date,sleep_hours,workout_duration,workout_totalDistance,workout_totalDistanceUnit,workout_totalEnergyBurned,workout_totalEnergyBurnedUnit,WalkingSpeed,slp_WalkingSpeed_max_hrs_between,slp_WalkingSpeed_sum_hrs_between,...,min_startDate_max_hr,avg_startDate_min_hr,max_startDate_min_hr,min_startDate_min_hr,avg_endDate_max_hr,max_endDate_max_hr,min_endDate_max_hr,avg_endDate_min_hr,max_endDate_min_hr,min_endDate_min_hr
1936,2020-09-25,,,,,,,47.49016,13.186389,13.186389,...,7.0,18.083333,21.0,7.0,13.807692,23.0,7.0,18.384615,23.0,8.0
1937,2020-09-26,7.216667,46.032538,0.0,0.0,0.0,0.0,137.21369,19.325556,23.556667,...,7.0,12.695652,18.0,10.0,12.880000,23.0,7.0,13.560000,23.0,10.0
1938,2020-09-27,6.616667,,,,,,189.31194,11.176944,20.018611,...,7.0,12.105263,13.0,12.0,11.666667,23.0,7.0,13.190476,23.0,12.0
1939,2020-09-28,6.950000,46.266667,0.0,0.0,0.0,0.0,184.25647,16.380000,23.176667,...,6.0,12.736842,22.0,12.0,12.523810,22.0,6.0,13.476190,22.0,12.0
1940,2020-09-29,8.025000,,,,,,20.87063,15.120278,20.001111,...,6.0,12.900000,19.0,12.0,11.454545,19.0,6.0,13.000000,19.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2363,2021-11-26,8.716667,80.947758,0.0,0.0,0.0,0.0,105.18075,20.645278,26.924722,...,8.0,13.800000,22.0,12.0,12.954545,23.0,8.0,14.545455,23.0,12.0
2364,2021-11-27,6.341667,69.831432,0.0,0.0,0.0,0.0,145.84824,11.058889,17.676944,...,7.0,14.150000,19.0,12.0,13.086957,23.0,7.0,15.000000,23.0,12.0
2365,2021-11-28,7.158333,244.530898,0.0,0.0,0.0,0.0,251.52111,16.835000,18.743611,...,7.0,13.565217,21.0,11.0,12.320000,23.0,7.0,14.280000,23.0,11.0
2366,2021-11-29,6.041667,97.006833,0.0,0.0,0.0,0.0,71.82803,8.885833,20.324444,...,7.0,13.636364,19.0,12.0,12.960000,23.0,7.0,14.208333,22.0,12.0


In [6]:

# Using cross-validation so concat the train and test sets
X = train.drop(['sleep_hours', 'date'], axis=1, errors='ignore')
y = train.sleep_hours.fillna(method="ffill")


In [7]:
lgb_kwargs = {"objective": "rmse", "zero_as_missing": False}

basic_fs_pipeline = Pipeline(
    [
        ("missing", arfsfs.MissingValueThreshold(threshold=0.05)),
        ("unique", arfsfs.UniqueValuesThreshold(threshold=1)),
        ("cardinality", arfsfs.CardinalityThreshold(threshold=10)),
        #("collinearity", arfsfs.CollinearityThreshold(threshold=0.99)),
        
    ]
)

X_trans = basic_fs_pipeline.fit_transform(
    X=X, y=y
)

In [8]:
imputer = SimpleImputer(strategy="median")
scaler = RobustScaler()

preprocessor = Pipeline(steps=[("imputer", imputer), ("scaler", scaler)])

X_scaled = pd.DataFrame(preprocessor.fit_transform(X_trans), columns=X_trans.columns)


In [9]:
X_scaled #1518 columns (264 with 0.95)

Unnamed: 0,WalkingSpeed,slp_WalkingSpeed_max_hrs_between,slp_WalkingSpeed_sum_hrs_between,slp_WalkingSpeed_count_hrs_between,ActiveEnergyBurned,slp_ActiveEnergyBurned_max_hrs_between,slp_ActiveEnergyBurned_sum_hrs_between,slp_ActiveEnergyBurned_count_hrs_between,slp_ActiveEnergyBurned_hrs_min_min,slp_ActiveEnergyBurned_hrs_min_max,...,min_startDate_max_hr,avg_startDate_min_hr,max_startDate_min_hr,min_startDate_min_hr,avg_endDate_max_hr,max_endDate_max_hr,min_endDate_max_hr,avg_endDate_min_hr,max_endDate_min_hr,min_endDate_min_hr
0,-0.791460,-0.232136,-1.289443,-1.416667,-1.750887,0.016757,-2.079281,-3.196676,0.190212,1.024476,...,0.0,3.583498,0.666667,-5.0,0.741617,0.333333,0.0,3.447306,0.666667,-4.0
1,0.129233,0.923109,0.670166,0.583333,0.381503,-0.329825,0.329846,0.360111,-1.481071,0.127668,...,0.0,-0.780680,-0.333333,-2.0,0.154256,0.333333,0.0,-0.535166,0.666667,-2.0
2,0.663836,-0.610266,0.001601,1.000000,-0.544577,0.196996,-0.021555,0.459834,0.910434,0.636773,...,0.0,-1.258912,-2.000000,0.0,-0.613956,0.333333,0.0,-0.840189,0.666667,0.0
3,0.611959,0.368825,0.598360,0.958333,2.487004,0.455298,-0.643394,12.349030,-0.188366,-0.234972,...,-1.0,-0.747315,1.000000,0.0,-0.071263,0.000000,-1.0,-0.604346,0.333333,0.0
4,-1.064615,0.131776,-0.001706,-1.333333,-1.302102,0.126290,0.688607,-0.293629,-0.212373,-0.440963,...,-1.0,-0.615152,0.000000,0.0,-0.748258,-1.000000,-1.0,-0.997417,-0.666667,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,-0.199472,1.171449,1.306607,-0.041667,1.170169,-0.232145,0.473477,-0.814404,-0.378578,-0.384570,...,1.0,0.113874,1.000000,0.0,0.201454,0.333333,1.0,0.278276,0.666667,0.0
428,0.217835,-0.632481,-0.440890,0.166667,1.922021,1.399407,-1.657431,1.290859,-0.131117,-0.237321,...,0.0,0.397384,0.000000,0.0,0.285289,0.333333,0.0,0.653480,0.666667,0.0
429,1.302192,0.454446,-0.239328,1.250000,9.435820,-0.382140,-2.000841,-1.590028,-0.092336,0.682984,...,0.0,-0.076307,0.666667,-1.0,-0.200303,0.333333,0.0,0.059157,0.666667,-1.0
430,-0.541719,-1.041399,0.059392,-0.708333,1.764955,2.035762,0.351086,-0.847645,-5.850416,0.314862,...,0.0,-0.018677,0.000000,0.0,0.204908,0.333333,0.0,0.000000,0.333333,0.0


In [10]:
model = XGBRegressor(tree_method="gpu_hist", gpu_id=0, verbosity=0)

In [11]:
# Groota
feat_selector_groota = arfsgroot.BoostAGroota(
    est=model, iters=100, importance="shap"
)
feat_selector_groota.fit(X_scaled, y)

groota_features = feat_selector_groota.get_feature_names_out()

print(f"The selected features: {groota_features}")
# fig = feat_selector_groota.plot_importance(n_feat_per_inch=5)


BoostaGRoota round:   0%|          | 1/500 [03:29<28:59:39, 209.18s/it]


The selected features: ['slp_ActiveEnergyBurned_sum_hrs_between'
 'slp_ActiveEnergyBurned_count_hrs_between'
 'slp_ActiveEnergyBurned_23_45_00' 'slp_ActiveEnergyBurned_00_00_00'
 'slp_ActiveEnergyBurned_00_05_00' 'slp_ActiveEnergyBurned_00_30_00'
 'slp_ActiveEnergyBurned_00_45_00' 'slp_ActiveEnergyBurned_01_10_00'
 'slp_ActiveEnergyBurned_06_40_00' 'slp_ActiveEnergyBurned_07_10_00'
 'slp_ActiveEnergyBurned_07_15_00' 'slp_ActiveEnergyBurned_07_20_00'
 'slp_ActiveEnergyBurned_07_35_00' 'slp_ActiveEnergyBurned_07_40_00'
 'slp_EnvironmentalAudioExposure_23_20_00'
 'slp_EnvironmentalAudioExposure_00_05_00'
 'slp_EnvironmentalAudioExposure_00_40_00'
 'slp_EnvironmentalAudioExposure_00_45_00'
 'slp_EnvironmentalAudioExposure_00_50_00'
 'slp_EnvironmentalAudioExposure_00_55_00' 'BasalEnergyBurned'
 'slp_BasalEnergyBurned_23_25_00' 'slp_BasalEnergyBurned_00_10_00'
 'slp_BasalEnergyBurned_04_15_00' 'slp_BasalEnergyBurned_08_35_00'
 'slp_StairAscentSpeed_sum_hrs_between'
 'slp_FlightsClimbed_max_

In [12]:
# Leshy
feat_selector_leshy = arfsgroot.Leshy(
    model, max_iter=100, random_state=42, importance="shap"
)
feat_selector_leshy.fit(X_scaled, y)

leshy_features = feat_selector_leshy.get_feature_names_out()

print(f"The selected features: {leshy_features}")
# fig = feat_selector_leshy.plot_importance(n_feat_per_inch=5)

Leshy iteration:  99%|█████████▉| 99/100 [04:44<00:02,  2.87s/it]

All relevant predictors selected in 00:04:44.34
The selected features: ['WalkingSpeed' 'slp_ActiveEnergyBurned_sum_hrs_between'
 'slp_ActiveEnergyBurned_count_hrs_between'
 'slp_ActiveEnergyBurned_hrs_max_min' 'slp_ActiveEnergyBurned_22_15_00'
 'slp_ActiveEnergyBurned_22_45_00' 'slp_ActiveEnergyBurned_23_05_00'
 'slp_ActiveEnergyBurned_23_45_00' 'slp_ActiveEnergyBurned_00_00_00'
 'slp_ActiveEnergyBurned_00_05_00' 'slp_ActiveEnergyBurned_00_15_00'
 'slp_ActiveEnergyBurned_01_10_00' 'slp_ActiveEnergyBurned_01_35_00'
 'slp_ActiveEnergyBurned_04_20_00' 'slp_ActiveEnergyBurned_06_25_00'
 'slp_ActiveEnergyBurned_06_40_00' 'slp_ActiveEnergyBurned_07_10_00'
 'slp_ActiveEnergyBurned_07_15_00' 'slp_ActiveEnergyBurned_07_20_00'
 'slp_ActiveEnergyBurned_07_35_00' 'slp_ActiveEnergyBurned_07_40_00'
 'slp_EnvironmentalAudioExposure_00_05_00'
 'slp_EnvironmentalAudioExposure_00_45_00'
 'slp_EnvironmentalAudioExposure_03_25_00' 'BasalEnergyBurned'
 'slp_BasalEnergyBurned_23_25_00' 'slp_BasalEnergyBurne




In [13]:
#GrootCV

# GrootCV
feat_selector_gcv = arfsgroot.GrootCV(
    objective="rmse", n_iter=200, silent=True
)
feat_selector_gcv.fit(X_scaled, y)

grootcv_features = feat_selector_gcv.get_feature_names_out()

print(f"The selected features: {grootcv_features}")
#fig = feat_selector.plot_importance(n_feat_per_inch=5)

Repeated k-fold: 100%|██████████| 1000/1000 [15:32<00:00,  1.07it/s]


The selected features: ['slp_ActiveEnergyBurned_count_hrs_between'
 'slp_ActiveEnergyBurned_23_35_00' 'slp_ActiveEnergyBurned_23_45_00'
 'slp_ActiveEnergyBurned_23_50_00' 'slp_ActiveEnergyBurned_00_00_00'
 'slp_ActiveEnergyBurned_00_05_00'
 'slp_EnvironmentalAudioExposure_00_05_00' 'BasalEnergyBurned'
 'slp_FlightsClimbed_sum_hrs_between' 'slp_HeartRate_06_55_00'
 'slp_HeartRate_07_00_00' 'slp_AppleStandTime_max_hrs_between'
 'slp_AppleStandTime_sum_hrs_between' 'doy_cos']


In [14]:
# Combined unique features
combined_features = list(set(groota_features) | set(leshy_features) | set(grootcv_features))

print(f"The selected features: {combined_features}")

The selected features: ['DistanceWalkingRunning', 'slp_ActiveEnergyBurned_07_20_00', 'doy_cos', 'slp_EnvironmentalAudioExposure_00_55_00', 'slp_ActiveEnergyBurned_06_40_00', 'slp_HeartRate_06_30_00', 'slp_ActiveEnergyBurned_sum_hrs_between', 'slp_OxygenSaturation_01_00_00', 'WalkingSpeed', 'slp_StairAscentSpeed_sum_hrs_between', 'slp_HeartRate_01_30_00', 'slp_EnvironmentalAudioExposure_00_45_00', 'slp_ActiveEnergyBurned_22_15_00', 'slp_ActiveEnergyBurned_01_10_00', 'slp_ActiveEnergyBurned_07_10_00', 'slp_BasalEnergyBurned_04_15_00', 'slp_ActiveEnergyBurned_06_25_00', 'slp_EnvironmentalAudioExposure_00_50_00', 'slp_FlightsClimbed_max_hrs_between', 'slp_ActiveEnergyBurned_23_50_00', 'slp_HeartRate_08_05_00', 'slp_StepCount_07_40_00', 'slp_ActiveEnergyBurned_22_45_00', 'slp_ActiveEnergyBurned_00_30_00', 'slp_HeartRate_06_55_00', 'slp_EnvironmentalAudioExposure_03_25_00', 'slp_HeartRate_01_00_00', 'slp_ActiveEnergyBurned_23_35_00', 'slp_AppleStandTime_sum_hrs_between', 'slp_EnvironmentalAu

In [15]:

# Save the selected features
np.save("selected_features_short_term.npy", combined_features)