In [18]:
import pandas as pd
import numpy as np
from data import POG4_Dataset
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.feature_selection import RFECV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy

import arfs
import arfs.feature_selection as arfsfs
import arfs.feature_selection.allrelevant as arfsgroot
from arfs.feature_selection import (
    MinRedundancyMaxRelevance,
    GrootCV,
    MissingValueThreshold,
    UniqueValuesThreshold,
    CollinearityThreshold,
    make_fs_summary,
)
from arfs.utils import LightForestClassifier, LightForestRegressor
from arfs.benchmark import highlight_tick, compare_varimp, sklearn_pimp_bench
from arfs.utils import load_data
from arfs.preprocessing import OrdinalEncoderPandas

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


In [19]:
data = POG4_Dataset()

INFO - Creating XML data
INFO - Creating activity data
INFO - Missing days: 87
INFO - Featurizing time series data
INFO - Creating interactions...


In [20]:
for i in data.train.columns:
    print(i)

date
sleep_hours
workout_duration
workout_totalDistance
workout_totalDistanceUnit
workout_totalEnergyBurned
workout_totalEnergyBurnedUnit
WalkingSpeed
slp_WalkingSpeed_max_hrs_between
slp_WalkingSpeed_sum_hrs_between
slp_WalkingSpeed_count_hrs_between
slp_WalkingSpeed_hrs_min_min
slp_WalkingSpeed_hrs_min_max
slp_WalkingSpeed_hrs_max_min
slp_WalkingSpeed_hrs_max_max
slp_WalkingSpeed_min_endDate_hr
slp_WalkingSpeed_max_endDate_hr
slp_WalkingSpeed_min_startDate_hr
slp_WalkingSpeed_max_startDate_hr
slp_WalkingSpeed_21_00_00
slp_WalkingSpeed_21_10_00
slp_WalkingSpeed_21_20_00
slp_WalkingSpeed_21_30_00
slp_WalkingSpeed_21_40_00
slp_WalkingSpeed_21_50_00
slp_WalkingSpeed_22_00_00
slp_WalkingSpeed_22_10_00
slp_WalkingSpeed_22_20_00
slp_WalkingSpeed_22_30_00
slp_WalkingSpeed_22_40_00
slp_WalkingSpeed_22_50_00
slp_WalkingSpeed_23_00_00
slp_WalkingSpeed_23_10_00
slp_WalkingSpeed_23_20_00
slp_WalkingSpeed_23_30_00
slp_WalkingSpeed_23_40_00
slp_WalkingSpeed_23_50_00
slp_WalkingSpeed_00_00_00
slp_Wa

In [21]:
train = data.train[(data.train['date'] >= pd.to_datetime('2015-06-01').date()) & (data.train['date'] <= pd.to_datetime('2021-11-30').date())]


In [22]:
train

Unnamed: 0,date,sleep_hours,workout_duration,workout_totalDistance,workout_totalDistanceUnit,workout_totalEnergyBurned,workout_totalEnergyBurnedUnit,WalkingSpeed,slp_WalkingSpeed_max_hrs_between,slp_WalkingSpeed_sum_hrs_between,...,min_startDate_max_hr,avg_startDate_min_hr,max_startDate_min_hr,min_startDate_min_hr,avg_endDate_max_hr,max_endDate_max_hr,min_endDate_max_hr,avg_endDate_min_hr,max_endDate_min_hr,min_endDate_min_hr
0,2015-06-08,6.283333,43.816667,0.0,0.0,0.0,0.0,,,,...,11.0,13.500000,18.0,12.0,21.000000,23.0,11.0,18.250000,23.0,12.0
1,2015-06-09,5.833333,,,,,,,,,...,8.0,13.000000,15.0,12.0,20.857143,23.0,8.0,18.714286,23.0,12.0
2,2015-06-10,10.033333,43.133333,0.0,0.0,0.0,0.0,,,,...,10.0,13.500000,18.0,12.0,20.875000,23.0,10.0,18.250000,23.0,12.0
3,2015-06-11,,,,,,,,,,...,11.0,13.333333,14.0,13.0,21.285714,23.0,11.0,18.857143,23.0,13.0
4,2015-06-12,,35.450000,0.0,0.0,0.0,0.0,,,,...,11.0,13.500000,18.0,12.0,21.000000,23.0,11.0,18.250000,23.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2363,2021-11-26,8.716667,80.947758,0.0,0.0,0.0,0.0,105.18075,20.645278,26.924722,...,8.0,13.800000,22.0,12.0,12.954545,23.0,8.0,14.545455,23.0,12.0
2364,2021-11-27,6.341667,69.831432,0.0,0.0,0.0,0.0,145.84824,11.058889,17.676944,...,7.0,14.150000,19.0,12.0,13.086957,23.0,7.0,15.000000,23.0,12.0
2365,2021-11-28,7.158333,244.530898,0.0,0.0,0.0,0.0,251.52111,16.835000,18.743611,...,7.0,13.565217,21.0,11.0,12.320000,23.0,7.0,14.280000,23.0,11.0
2366,2021-11-29,6.041667,97.006833,0.0,0.0,0.0,0.0,71.82803,8.885833,20.324444,...,7.0,13.636364,19.0,12.0,12.960000,23.0,7.0,14.208333,22.0,12.0


In [23]:

# Using cross-validation so concat the train and test sets
X = train.drop(['sleep_hours', 'date'], axis=1, errors='ignore')
y = train.sleep_hours.fillna(method="ffill")


In [24]:
lgb_kwargs = {"objective": "rmse", "zero_as_missing": False}

basic_fs_pipeline = Pipeline(
    [
        ("missing", arfsfs.MissingValueThreshold(threshold=0.05)),
        ("unique", arfsfs.UniqueValuesThreshold(threshold=1)),
        ("cardinality", arfsfs.CardinalityThreshold(threshold=10)),
        #("collinearity", arfsfs.CollinearityThreshold(threshold=0.75)),
        
    ]
)

X_trans = basic_fs_pipeline.fit_transform(
    X=X, y=y
)

In [25]:
imputer = SimpleImputer(strategy="median")
scaler = RobustScaler()

preprocessor = Pipeline(steps=[("imputer", imputer), ("scaler", scaler)])

X_scaled = pd.DataFrame(preprocessor.fit_transform(X_trans), columns=X_trans.columns)


In [26]:
X_scaled

Unnamed: 0,BasalEnergyBurned,slp_BasalEnergyBurned_max_hrs_between,slp_BasalEnergyBurned_sum_hrs_between,slp_BasalEnergyBurned_count_hrs_between,slp_BasalEnergyBurned_min_endDate_hr,slp_BasalEnergyBurned_max_endDate_hr,slp_BasalEnergyBurned_21_10_00,slp_BasalEnergyBurned_21_20_00,slp_BasalEnergyBurned_21_30_00,slp_BasalEnergyBurned_21_40_00,...,min_startDate_max_hr,avg_startDate_min_hr,max_startDate_min_hr,min_startDate_min_hr,avg_endDate_max_hr,max_endDate_max_hr,min_endDate_max_hr,avg_endDate_min_hr,max_endDate_min_hr,min_endDate_min_hr
0,-1.131579,0.000000,0.000000,0.0,0.0,0.0,-0.738095,-0.738095,-0.738095,-0.738095,...,0.666667,0.346044,0.666667,0.0,-2.800000,0.0,-2.4,0.628205,0.0,0.0
1,-1.131579,0.000000,0.000000,0.0,0.0,0.0,-0.738095,-0.738095,-0.738095,-0.738095,...,-0.333333,0.083050,0.166667,0.0,-3.000000,0.0,-3.0,1.128205,0.0,0.0
2,-1.131579,0.000000,0.000000,0.0,0.0,0.0,-0.738095,-0.738095,-0.738095,-0.738095,...,0.333333,0.346044,0.666667,0.0,-2.975000,0.0,-2.6,0.628205,0.0,0.0
3,-1.131579,0.000000,0.000000,0.0,0.0,0.0,-0.738095,-0.738095,-0.738095,-0.738095,...,0.666667,0.258379,0.000000,1.0,-2.400000,0.0,-2.4,1.282051,0.0,1.0
4,-1.131579,0.000000,0.000000,0.0,0.0,0.0,-0.738095,-0.738095,-0.738095,-0.738095,...,0.666667,0.346044,0.666667,0.0,-2.800000,0.0,-2.4,0.628205,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2363,2.797211,0.002778,0.017500,101.0,-1.0,-22.0,-45.186310,-45.186310,-45.188000,-45.187571,...,-0.333333,0.503839,1.333333,0.0,-14.063636,0.0,-3.0,-3.361305,0.0,0.0
2364,6.947447,0.002778,0.035556,146.0,-1.0,-22.0,-45.027571,-45.027571,-45.051643,-45.214911,...,-0.666667,0.687935,0.833333,0.0,-13.878261,0.0,-3.2,-2.871795,0.0,0.0
2365,3.251921,0.002778,0.013333,120.0,-1.0,-22.0,-45.026881,-45.406262,-45.057619,-45.365667,...,-0.666667,0.380347,1.166667,-1.0,-14.952000,0.0,-3.2,-3.647179,0.0,-1.0
2366,3.685053,0.002778,0.030278,141.0,-1.0,-22.0,-43.797381,-43.797381,-43.797381,-43.797381,...,-0.666667,0.417769,0.833333,0.0,-14.056000,0.0,-3.2,-3.724359,-1.0,0.0


In [27]:
model = XGBRegressor(tree_method="gpu_hist", gpu_id=0, verbosity=0)

In [28]:
# Groota
feat_selector_groota = arfsgroot.BoostAGroota(
    est=model, iters=100, importance="shap"
)
feat_selector_groota.fit(X_scaled, y)

groota_features = feat_selector_groota.get_feature_names_out()

print(f"The selected features: {groota_features}")
# fig = feat_selector_groota.plot_importance(n_feat_per_inch=5)


BoostaGRoota round:   0%|          | 1/500 [02:42<22:32:09, 162.58s/it]

The selected features: ['BasalEnergyBurned' 'slp_BasalEnergyBurned_21_10_00'
 'slp_BasalEnergyBurned_00_10_00' 'slp_BasalEnergyBurned_01_00_00'
 'FlightsClimbed' 'slp_FlightsClimbed_max_hrs_between'
 'slp_FlightsClimbed_sum_hrs_between' 'slp_FlightsClimbed_00_00_00'
 'slp_FlightsClimbed_07_00_00' 'slp_StepCount_max_hrs_between'
 'slp_StepCount_sum_hrs_between' 'slp_StepCount_count_hrs_between'
 'slp_StepCount_max_endDate_hr' 'slp_StepCount_23_00_00'
 'slp_StepCount_23_30_00' 'slp_StepCount_00_00_00'
 'slp_StepCount_07_00_00' 'slp_StepCount_07_10_00'
 'slp_StepCount_07_30_00' 'slp_StepCount_07_50_00'
 'slp_StepCount_08_00_00' 'slp_DistanceWalkingRunning_count_hrs_between'
 'slp_DistanceWalkingRunning_05_10_00'
 'slp_DistanceWalkingRunning_06_50_00'
 'slp_DistanceWalkingRunning_07_00_00'
 'slp_DistanceWalkingRunning_07_30_00'
 'slp_DistanceWalkingRunning_07_50_00' 'day_of_week' 'day_of_year'
 'doy_sin' 'avg_startDate_max_hr' 'min_endDate_max_hr'
 'avg_endDate_min_hr']





In [29]:
# Leshy
feat_selector_leshy = arfsgroot.Leshy(
    model, max_iter=100, random_state=42, importance="shap"
)
feat_selector_leshy.fit(X_scaled, y)

leshy_features = feat_selector_leshy.get_feature_names_out()

print(f"The selected features: {leshy_features}")
# fig = feat_selector_leshy.plot_importance(n_feat_per_inch=5)

Leshy iteration:  99%|█████████▉| 99/100 [04:56<00:02,  2.99s/it]

All relevant predictors selected in 00:04:56.27
The selected features: ['BasalEnergyBurned' 'slp_BasalEnergyBurned_21_10_00'
 'slp_BasalEnergyBurned_00_10_00' 'FlightsClimbed'
 'slp_FlightsClimbed_max_hrs_between' 'slp_FlightsClimbed_sum_hrs_between'
 'slp_StepCount_max_hrs_between' 'slp_StepCount_21_20_00'
 'slp_StepCount_22_30_00' 'slp_StepCount_00_00_00'
 'slp_StepCount_07_00_00' 'slp_StepCount_07_30_00'
 'slp_StepCount_08_00_00' 'slp_DistanceWalkingRunning_07_30_00'
 'day_of_week' 'day_of_year' 'avg_startDate_max_hr' 'avg_endDate_min_hr']





In [30]:
#GrootCV

# GrootCV
feat_selector_gcv = arfsgroot.GrootCV(
    objective="rmse", n_iter=200, silent=True
)
feat_selector_gcv.fit(X_scaled, y)

grootcv_features = feat_selector_gcv.get_feature_names_out()

print(f"The selected features: {grootcv_features}")
#fig = feat_selector.plot_importance(n_feat_per_inch=5)

Repeated k-fold: 100%|██████████| 1000/1000 [11:15<00:00,  1.48it/s]


The selected features: ['BasalEnergyBurned' 'slp_BasalEnergyBurned_21_10_00'
 'slp_FlightsClimbed_sum_hrs_between' 'slp_StepCount_max_hrs_between'
 'slp_StepCount_07_30_00' 'slp_DistanceWalkingRunning_07_20_00'
 'day_of_week' 'day_of_year']


In [31]:
# Combined unique features
combined_features = list(set(groota_features) | set(leshy_features) | set(grootcv_features))

print(f"The selected features: {combined_features}")

The selected features: ['slp_DistanceWalkingRunning_count_hrs_between', 'slp_StepCount_07_00_00', 'slp_StepCount_08_00_00', 'slp_StepCount_count_hrs_between', 'slp_DistanceWalkingRunning_07_50_00', 'avg_endDate_min_hr', 'avg_startDate_max_hr', 'slp_BasalEnergyBurned_01_00_00', 'slp_StepCount_07_30_00', 'slp_StepCount_23_00_00', 'slp_StepCount_sum_hrs_between', 'day_of_year', 'FlightsClimbed', 'slp_DistanceWalkingRunning_07_20_00', 'min_endDate_max_hr', 'slp_BasalEnergyBurned_21_10_00', 'slp_StepCount_22_30_00', 'slp_StepCount_max_endDate_hr', 'slp_StepCount_23_30_00', 'slp_FlightsClimbed_max_hrs_between', 'slp_FlightsClimbed_00_00_00', 'slp_StepCount_07_10_00', 'slp_DistanceWalkingRunning_07_00_00', 'day_of_week', 'slp_DistanceWalkingRunning_06_50_00', 'slp_StepCount_00_00_00', 'slp_DistanceWalkingRunning_07_30_00', 'doy_sin', 'slp_StepCount_max_hrs_between', 'slp_StepCount_07_50_00', 'slp_StepCount_21_20_00', 'slp_FlightsClimbed_sum_hrs_between', 'BasalEnergyBurned', 'slp_DistanceWalk

In [32]:

# Save the selected features
np.save("selected_features_long_term.npy", combined_features)

In [34]:
long_term_features = np.load("selected_features_long_term.npy", allow_pickle=True)
short_term_features = np.load("selected_features_short_term.npy", allow_pickle=True)

total_features = list(set(long_term_features) | set(short_term_features))

for i in total_features:
    print(i)

slp_BasalEnergyBurned_04_15_00
slp_StepCount_07_00_00
slp_DistanceWalkingRunning_07_50_00
slp_ActiveEnergyBurned_07_10_00
slp_ActiveEnergyBurned_00_45_00
slp_StepCount_07_30_00
slp_ActiveEnergyBurned_06_25_00
slp_StepCount_23_00_00
slp_ActiveEnergyBurned_22_15_00
slp_EnvironmentalAudioExposure_00_55_00
slp_ActiveEnergyBurned_04_20_00
slp_HeartRate_06_55_00
slp_BasalEnergyBurned_23_25_00
FlightsClimbed
slp_ActiveEnergyBurned_01_35_00
slp_HeartRate_06_30_00
slp_ActiveEnergyBurned_sum_hrs_between
slp_HeartRate_03_45_00
slp_EnvironmentalAudioExposure_00_05_00
slp_ActiveEnergyBurned_01_10_00
min_endDate_max_hr
slp_ActiveEnergyBurned_07_20_00
slp_EnvironmentalAudioExposure_23_20_00
slp_ActiveEnergyBurned_22_45_00
slp_StepCount_hrs_min_max
slp_StepCount_hrs_max_max
slp_HeartRate_01_30_00
slp_DistanceWalkingRunning_07_00_00
day_of_week
slp_ActiveEnergyBurned_07_40_00
slp_EnvironmentalAudioExposure_00_50_00
slp_ActiveEnergyBurned_hrs_max_min
slp_HeartRate_07_00_00
slp_DistanceWalkingRunning_06_