In [1]:
import json
import logging
import pandas as pd
import numpy as np

from sklearn.linear_model import RidgeCV, RANSACRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import KFold
from sklearn.base import TransformerMixin, BaseEstimator, RegressorMixin

In [2]:
from data import POG4_Dataset

data = POG4_Dataset()

data.train.head()


INFO - Creating XML data
INFO - Creating activity data
INFO - Missing days: 87
INFO - Featurizing time series data
INFO - Creating interactions...
INFO - Dropped null columns: ['workout_duration', 'workout_totalDistance', 'workout_totalDistanceUnit', 'workout_totalEnergyBurned', 'workout_totalEnergyBurnedUnit', 'WalkingSpeed', 'slp_WalkingSpeed_max_hrs_between', 'slp_WalkingSpeed_sum_hrs_between', 'WalkingSpeed_slp_hrs_min_min', 'WalkingSpeed_slp_hrs_min_max', 'WalkingSpeed_slp_hrs_max_min', 'WalkingSpeed_slp_hrs_max_max', 'WalkingSpeed_slp_min_endDate_hr', 'WalkingSpeed_slp_max_endDate_hr', 'WalkingSpeed_slp_min_startDate_hr', 'WalkingSpeed_slp_max_startDate_hr', 'ActiveEnergyBurned_slp_hrs_min_min', 'ActiveEnergyBurned_slp_hrs_min_max', 'ActiveEnergyBurned_slp_hrs_max_min', 'ActiveEnergyBurned_slp_hrs_max_max', 'ActiveEnergyBurned_slp_min_startDate_hr', 'ActiveEnergyBurned_slp_max_startDate_hr', 'RunningSpeed', 'slp_RunningSpeed_max_hrs_between', 'slp_RunningSpeed_sum_hrs_between', '

Unnamed: 0,date,sleep_hours,ActiveEnergyBurned,slp_ActiveEnergyBurned_max_hrs_between,slp_ActiveEnergyBurned_sum_hrs_between,ActiveEnergyBurned_slp_min_endDate_hr,ActiveEnergyBurned_slp_max_endDate_hr,BasalEnergyBurned,slp_BasalEnergyBurned_max_hrs_between,slp_BasalEnergyBurned_sum_hrs_between,...,min_startDate_max_hr,avg_startDate_min_hr,max_startDate_min_hr,min_startDate_min_hr,avg_endDate_max_hr,max_endDate_max_hr,min_endDate_max_hr,avg_endDate_min_hr,max_endDate_min_hr,min_endDate_min_hr
0,2015-06-08,6.283333,1316.0,0.000278,0.0,23.0,23.0,1887.0,0.000278,0.0,...,11.0,13.5,18.0,12.0,21.0,23.0,11.0,18.25,23.0,12.0
1,2015-06-09,5.833333,1031.0,0.000278,0.0,23.0,23.0,1887.0,0.000278,0.0,...,8.0,13.0,15.0,12.0,20.857143,23.0,8.0,18.714286,23.0,12.0
2,2015-06-10,10.033333,1356.0,0.000278,0.0,23.0,23.0,1887.0,0.000278,0.0,...,10.0,13.5,18.0,12.0,20.875,23.0,10.0,18.25,23.0,12.0
3,2015-06-11,,1624.0,0.000278,0.0,23.0,23.0,1887.0,0.000278,0.0,...,11.0,13.333333,14.0,13.0,21.285714,23.0,11.0,18.857143,23.0,13.0
4,2015-06-12,,3099.0,0.000278,0.0,23.0,23.0,1887.0,0.000278,0.0,...,11.0,13.5,18.0,12.0,21.0,23.0,11.0,18.25,23.0,12.0


In [3]:
# Long time series data between 6/1/2015 and 11/30/2021
long_ts = data.train[(data.train['date'] >= pd.to_datetime('2015-06-01').date()) & (data.train['date'] <= pd.to_datetime('2021-11-30').date())]
X_long = long_ts.drop(['sleep_hours'], axis=1)
y_long = long_ts['sleep_hours'].fillna(method="ffill")

# Short time series data between 9/25/2018 and 11/30/2021 (with recent apple health data)
short_ts = data.train[(data.train['date'] >= pd.to_datetime('2018-09-25').date()) & (data.train['date'] <= pd.to_datetime('2021-11-30').date())]
X_short = short_ts.drop(['sleep_hours'], axis=1)
y_short = short_ts['sleep_hours'].fillna(method="ffill")

# Test data between 12/1/2021 and 12/31/2022
test = data.train[(data.train['date'] >= pd.to_datetime('2021-12-01').date()) & (data.train['date'] <= pd.to_datetime('2022-12-31').date())]
X_test = test.drop(['sleep_hours'], axis=1)
y_test = test['sleep_hours'].fillna(method="ffill")


In [27]:
# Check if X_test has same columns as X_short
(X_test.columns == X_short.columns).all()

(X_test.columns == X_long.columns).all()


True

In [4]:
imputer = SimpleImputer(strategy='median')
scaler = RobustScaler()

In [5]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    """To remove date column and/or slp-prefixed columns for specified models"""
    def __init__(self, include_date = True, include_slp = True):
        self.include_date = include_date
        self.include_slp = include_slp

    def fit(self, X, y=None):
        self.dummy_fitted_ = True
        return self

    def transform(self, X, y=None):
        X = X.copy()
        if not self.include_date and 'date' in X.columns:
            X = X.drop(columns=['date'])
        
        if not self.include_slp:
            slp_columns = [col for col in X.columns if col.startswith('slp')]
            X = X.drop(columns=slp_columns)
        
        return X

## Simple Models

In [6]:


# Original Dataset used for simple models
orig = pd.read_csv("./data/train.csv", low_memory=False)
orig["date"] = pd.to_datetime(orig["date"])
orig["day_of_week"] = orig["date"].dt.dayofweek
orig["is_workday"] = orig["day_of_week"] < 5
orig["day_of_year"] = orig["date"].dt.dayofyear

# Simple Models
class MedianDOWModel(BaseEstimator, RegressorMixin):
    """ Predict using median sleep hours for each day of the week"""
    def __init__(self):
        self.median_dow = orig.groupby("day_of_week")["sleep_hours"].median()
        self.global_mean = orig["sleep_hours"].mean() 
        pass

    def fit(self, X, y):
        # print('nans in median_dow:', np.isnan(MedianDOWModel().predict(X)).sum())
        self.dummy_fitted_ = True
        return self

    def predict(self, X):
        pred = X["day_of_week"].map(self.median_dow)
        return pred.to_numpy()

class MeanDOWModel(BaseEstimator, RegressorMixin):
    """ Predict using median sleep hours for each day of the week"""
    def __init__(self):
        self.mean_dow = orig.groupby("day_of_week")["sleep_hours"].mean()
        self.global_mean = orig["sleep_hours"].mean() 
        pass

    def fit(self, X, y):
        # print('nans in mean_dow:', np.isnan(MeanDOWModel().predict(X)).sum())
        self.dummy_fitted_ = True
        return self

    def predict(self, X):
        pred = X["day_of_week"].map(self.mean_dow)
        return pred.to_numpy()


class MedianWorkdayModel(BaseEstimator, RegressorMixin):
    """ Predict using median sleep hours for workdays and weekends"""
    def __init__(self):
        self.median_workday = orig.groupby("is_workday")["sleep_hours"].median()
        self.global_mean = orig["sleep_hours"].mean() 
        pass

    def fit(self, X, y):
        # print('nans in median_wd:', np.isnan(MedianWorkdayModel().predict(X)).sum())
        self.dummy_fitted_ = True
        return self

    def predict(self, X):
        pred = X["is_workday"].map(self.median_workday)
        return pred.to_numpy()

class MeanWorkdayModel(BaseEstimator, RegressorMixin):
    """ Predict using median sleep hours for workdays and weekends"""
    def __init__(self):
        self.mean_workday = orig.groupby("is_workday")["sleep_hours"].mean()
        self.global_mean = orig["sleep_hours"].mean() 
        pass

    def fit(self, X, y):
        # print('nans in mean_wd:', np.isnan(MeanWorkdayModel().predict(X)).sum())
        self.dummy_fitted_ = True
        return self

    def predict(self, X):
        pred = X["is_workday"].map(self.mean_workday)
        return pred.to_numpy()
    
class MeanDOYModel(BaseEstimator, RegressorMixin):
    """ Predict using mean sleep hours for each day of the year (1-365)"""
    def __init__(self):
        self.mean_day_of_year = orig.groupby("day_of_year")["sleep_hours"].mean()
        self.global_mean = orig["sleep_hours"].mean() # Sometimes issues with missing days of the year
        pass

    def fit(self, X, y):
        # print('nans in mean_doy_model:', np.isnan(MeanDOYModel().predict(X)).sum())
        self.dummy_fitted_ = True
        return self

    def predict(self, X):
        pred = X["day_of_year"].map(self.mean_day_of_year).fillna(self.global_mean)
        return pred.to_numpy()

median_dow = MedianDOWModel()
median_dow.fit(X_long, y_long)

mean_dow = MeanDOWModel()
mean_dow.fit(X_long, y_long)

median_wd = MedianWorkdayModel()
median_wd.fit(X_long, y_long)

mean_wd = MeanWorkdayModel()
mean_wd.fit(X_long, y_long)

mean_doy_model = MeanDOYModel()
mean_doy_model.fit(X_long, y_long)


simple_models = {
    'median_dow': median_dow,
    'mean_dow': mean_dow,
    'median_wd': median_wd,
    'mean_wd': mean_wd,
    'mean_doy_model': mean_doy_model
}

## Complex Regression Models

### Prophet by META

In [7]:
%%capture
!pip install prophet

In [8]:
from prophet import Prophet

with open("prophet_best_params.json", "r") as f:
    prophet_best_params = json.load(f)
    
print("Best Parameters for Prophet: ", prophet_best_params)

class ProphetModel(BaseEstimator, RegressorMixin):
    """ Predict using Prophet"""
    def __init__(self):
        logging.getLogger('fbprophet').setLevel(logging.ERROR)
        self.m = Prophet(**prophet_best_params)
        pass

    def fit(self, X, y):
        X = X.fillna(method="ffill").fillna(method="bfill").fillna(0) #Ok for Time Series
        X = X.rename(columns={'date': 'ds'})
        X["y"] = y
        
        # Add the additional regressors
        for col in [col for col in X.columns if col not in ["ds", "y"]]:
            self.m.add_regressor(col)
        
        self.m.fit(X)
        self.dummy_fitted_ = True
        return self

    def predict(self, X):
        X = X.fillna(method="ffill").fillna(method="bfill").fillna(0) #Ok for Time Series
        X = X.rename(columns={'date': 'ds', 'sleep_hours': 'y'})

        return self.m.predict(X)['yhat'].to_numpy()

prophet_model = ProphetModel()

prophet_model.fit(X_long, y_long)

INFO - Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG - input tempfile: /tmp/tmps8xyhe0v/ame991o5.json
DEBUG - input tempfile: /tmp/tmps8xyhe0v/sozfpprc.json
DEBUG - idx 0
DEBUG - running CmdStan, num_threads: None
DEBUG - CmdStan args: ['/home/sebastian/mambaforge/envs/pog4-sleep/lib/python3.9/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=95902', 'data', 'file=/tmp/tmps8xyhe0v/ame991o5.json', 'init=/tmp/tmps8xyhe0v/sozfpprc.json', 'output', 'file=/tmp/tmps8xyhe0v/prophet_modelahc05gjn/prophet_model-20230407101523.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
10:15:23 - cmdstanpy - INFO - Chain [1] start processing


Best Parameters for Prophet:  {'changepoint_prior_scale': 0.01, 'seasonality_prior_scale': 6, 'holidays_prior_scale': 0.01, 'seasonality_mode': 'additive'}


INFO - Chain [1] start processing
10:15:23 - cmdstanpy - INFO - Chain [1] done processing
INFO - Chain [1] done processing


### XGBoost

In [9]:
from xgboost import XGBRegressor

with open("xgb_best_params.json", "r") as f:
    xgb_best_params = json.load(f)
    
print("Best Parameters for XGBoost: ", xgb_best_params)

xgb_params = {
    "learning_rate": xgb_best_params["learning_rate"],
    "max_depth": xgb_best_params["max_depth"],
    'lambda': xgb_best_params["lmbda"],
    'alpha': xgb_best_params["alpha"],
    "n_estimators": xgb_best_params["n_estimators"],
    "subsample": xgb_best_params["subsample"],
    "colsample_bytree": xgb_best_params["colsample_bytree"],
    'eta': xgb_best_params["eta"],
    "objective": "reg:squarederror",
    "seed": 42
}

xgb_model = XGBRegressor(**xgb_params, 
                            gpu_id=0, 
                            tree_method="gpu_hist",
                            random_state=42
                            )

xgb_pipeline = Pipeline([
    ("date_filter", ColumnDropper(include_date=False, include_slp=True)),
    ('imputer', imputer),
    ('scaler', scaler),
    ('xgb', xgb_model)
])

xgb_pipeline.fit(X_long, y_long)

Best Parameters for XGBoost:  {'eta': 0.17075594525437782, 'alpha': 9.041693109990469, 'lmbda': 1.3001428098901135, 'scaler': 'minmax', 'imputer': 'mean', 'max_depth': 3, 'subsample': 0.10598059245687068, 'n_estimators': 1458, 'learning_rate': 0.1557093298606608, 'colsample_bytree': 0.40406706027916595}


### Extra Trees

In [10]:
from sklearn.ensemble import ExtraTreesRegressor

with open("et_best_params.json", "r") as f:
    et_best_params = json.load(f)
    
print("Best Parameters for ExtraTrees: ", et_best_params)

et_params = {
    "n_estimators": et_best_params["n_estimators"],
    "max_depth": et_best_params["max_depth"],
    "min_samples_split": et_best_params["min_samples_split"],
    "min_samples_leaf": et_best_params["min_samples_leaf"],
}

et_model = ExtraTreesRegressor(**et_params, random_state=42)

et_pipeline = Pipeline([
    ("date_filter", ColumnDropper(include_date=False, include_slp=True)),
    ('imputer', imputer),
    ('scaler', scaler),
    ('et', et_model)
])

et_pipeline.fit(X_short, y_short)

Best Parameters for ExtraTrees:  {'scaler': 'robust', 'imputer': 'most_frequent', 'bootstrap': True, 'max_depth': 15, 'n_estimators': 1969, 'min_samples_leaf': 9, 'min_samples_split': 5}


### K Nearest Neighbors

In [11]:
from sklearn.neighbors import KNeighborsRegressor

with open("knn_best_params.json", "r") as f:
    knn_best_params = json.load(f)
    
print("Best Parameters for KNN: ", knn_best_params)

knn_params = {
    "n_neighbors": knn_best_params["n_neighbors"],
    "weights": knn_best_params["weights"],
    "p": knn_best_params["p"]
}

knn_model = KNeighborsRegressor(**knn_params)

knn_pipeline = Pipeline([
    ("date_filter", ColumnDropper(include_date=False, include_slp=True)),
    ('imputer', imputer),
    ('scaler', scaler),
    ('knn', knn_model)
])

knn_pipeline.fit(X_short, y_short)

Best Parameters for KNN:  {'p': 1, 'scaler': 'standard', 'imputer': 'most_frequent', 'weights': 'uniform', 'n_neighbors': 44}


### Random Forest

In [12]:
from sklearn.ensemble import RandomForestRegressor

with open("rf_best_params.json", "r") as f:
    rf_best_params = json.load(f)

print("Best Parameters for RF: ", rf_best_params)

rf_params = {
    "n_estimators": rf_best_params["n_estimators"],
    "max_depth": rf_best_params["max_depth"],
    "min_samples_split": rf_best_params["min_samples_split"],
    "min_samples_leaf": rf_best_params["min_samples_leaf"]
}

rf_model = RandomForestRegressor(**rf_params, random_state=42, n_jobs=-1)

rf_pipeline = Pipeline([
    ("date_filter", ColumnDropper(include_date=False, include_slp=True)),
    ('imputer', imputer),
    ('scaler', scaler),
    ('rf', rf_model)
])

rf_pipeline.fit(X_short, y_short)


Best Parameters for RF:  {'scaler': 'robust', 'imputer': 'most_frequent', 'bootstrap': True, 'max_depth': 51, 'max_features': 'auto', 'n_estimators': 702, 'min_samples_leaf': 10, 'min_samples_split': 7}


### GRU Model

In [13]:
from gru import TimeSeriesDataset, GRUModel, GRUTrainer

import torch
import torch.nn as nn
from torch.optim import Adam, SGD, RMSprop, AdamW
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

with open("gru_best_params.json", "r") as f:
    gru_best_params = json.load(f)

print("Best Parameters for GRU: ", gru_best_params)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # CUDA support

input_size = X_long.shape[1] - 14 # Minus 1 for date, +1 for sleep hours, minus 14 for sleep features

gru_target_scaler = RobustScaler() 
gru_model = GRUModel(device, input_size,
                     gru_best_params["hidden_size"],
                     gru_best_params["num_layers"],
                     1,
                     gru_best_params["dropout_rate"],
                     gru_best_params["activation_function"]).to(device)
gru_trainer = GRUTrainer(gru_model, device,
                         gru_best_params["learning_rate"], 
                         gru_best_params["criterion"], 
                         gru_best_params["optimizer"], 
                         None)

class NNModel(BaseEstimator, RegressorMixin):
    """ Predict using Prophet"""
    def __init__(self, lookback, batch_size, target_scaler, model, trainer, num_epochs=500):
        self.lookback = lookback
        self.batch_size = batch_size
        
        self.target_scaler = target_scaler
        
        self.model = model
        
        self.output_size = 1
        self.num_epochs = num_epochs
        self.trainer = trainer
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # CUDA support

        pass

    def fit(self, X, y):
        X = pd.DataFrame(X)
        
        y_scaled = self.target_scaler.fit_transform(data.y.values.reshape(-1, 1))
        y_scaled = pd.DataFrame(y_scaled, columns=["sleep_hours"])
        
        train = pd.concat([X, y_scaled], axis=1).to_numpy() # X is already scaled
        train_ds = TimeSeriesDataset(train, self.lookback, self.output_size)
        train_dl = DataLoader(train_ds, batch_size=self.batch_size, shuffle=False)
        
        for epoch in range(self.num_epochs):
            train_loss = self.trainer.train(train_dl)
            val_loss = self.trainer.evaluate(train_dl)
        
        print(f"Final Epoch {epoch+1}, train_loss: {train_loss:.4f}, valid_rmse: {val_loss:.4f}")
        
        self.dummy_fitted_ = True
        return self

    def predict(self, X):
        num_predictions = X.shape[0]
        X = torch.from_numpy(X).float() #.to_numpy()
        y_pred = self.trainer.predict(X, self.lookback, 1, num_predictions)
        y_pred = self.target_scaler.inverse_transform(y_pred) # Inverse scaling is used to calculate validation RMSE

        return np.nan_to_num(y_pred, nan=7.0)


GRU = NNModel(gru_best_params["lookback"], gru_best_params["batch_size"], gru_target_scaler, gru_model, gru_trainer)

GRU_pipeline = Pipeline([
    ("date_filter", ColumnDropper(include_date=False, include_slp=False)),
    ('imputer', imputer),
    ('scaler', scaler),
    ('gru', GRU)
])

GRU_pipeline.fit(X_long, y_long)


Best Parameters for GRU:  {'lookback': 1, 'criterion': 'mae', 'optimizer': 'adamw', 'batch_size': 210, 'num_epochs': 291, 'num_layers': 1, 'hidden_size': 287, 'dropout_rate': 0.5964791189037534, 'learning_rate': 0.007953492092921025, 'activation_function': 'sigmoid'}
Final Epoch 500, train_loss: nan, valid_rmse: nan


In [14]:

# X_scaled = RobustScaler().fit_transform(SimpleImputer(strategy="median").fit_transform(X.drop(columns="date")))

# GRU.fit(X_scaled, y)

## Complex Classification Models

### XGBoost - Classification

In [15]:
from xgboost import XGBClassifier

with open("xgb_classifier_best_params.json", "r") as f:
    xgb_classifier_best_params = json.load(f)
    
print("Best Parameters for XGBoost: ", xgb_classifier_best_params)

xgb_clf_params = {
    "learning_rate": xgb_classifier_best_params["learning_rate"],
    "max_depth": xgb_classifier_best_params["max_depth"],
    'lambda': xgb_classifier_best_params["lmbda"],
    'alpha': xgb_classifier_best_params["alpha"],
    "n_estimators": xgb_classifier_best_params["n_estimators"],
    "subsample": xgb_classifier_best_params["subsample"],
    "colsample_bytree": xgb_classifier_best_params["colsample_bytree"],
    'eta': xgb_classifier_best_params["eta"],
    "objective": "reg:squarederror",
    "seed": 42
}

class XGBClassifierModel(BaseEstimator, RegressorMixin):
    """ Predicts Median unless high probability of above or below median"""
    def __init__(self):
        self.model = XGBClassifier(**xgb_clf_params, gpu_id=0, tree_method="gpu_hist", random_state=42)
        pass

    def fit(self, X, y):
        self.y_med = np.median(y)
        self.y_std = y.std()
        self.y_low = self.y_med - 1*self.y_std
        self.y_high = self.y_med + 1*self.y_std
        
        y_classifier = (y > self.y_med).astype(int)
        self.model.fit(X, y_classifier)
        self.dummy_fitted_ = True
        return self

    def predict(self, X):
        probabilities = self.model.predict_proba(X)[:, 1]
        preds = np.array([self.y_high if x > 0.75 else 
                          (self.y_low if x < 0.25 else 
                           self.y_med) 
                          for x in probabilities])
        return preds

xgb_clf_model = XGBClassifierModel()

xgb_clf_pipeline = Pipeline([
    ("date_filter", ColumnDropper(include_date=False, include_slp=False)),
    ('imputer', imputer),
    ('scaler', scaler),
    ('xgb_clf', xgb_clf_model)
])

xgb_clf_pipeline.fit(X_long, y_long)

Best Parameters for XGBoost:  {'eta': 0.2189867593417301, 'alpha': 1.798190574808438, 'lmbda': 3.8748760484151914, 'scaler': 'minmax', 'imputer': 'median', 'max_depth': 3, 'subsample': 0.5256694730321446, 'n_estimators': 1642, 'learning_rate': 0.1682025111157145, 'colsample_bytree': 0.6490424765035034}


### XGBoost - Sleep Times

In [16]:
import pytz
import datetime

with open("sleep_model_best_params.json", "r") as f:
    sleep_model_best_params = json.load(f)
    
print("Best Parameters for XGBoost Sleep Model: ", sleep_model_best_params)

xgb_params = {
    "learning_rate": sleep_model_best_params["learning_rate"],
    "max_depth": sleep_model_best_params["max_depth"],
    "n_estimators": sleep_model_best_params["n_estimators"],
    "subsample": sleep_model_best_params["subsample"],
    "colsample_bytree": sleep_model_best_params["colsample_bytree"],
    "gamma": sleep_model_best_params["gamma"],
    "min_child_weight": sleep_model_best_params["min_child_weight"],
    "objective": "binary:logistic",
    "seed": 42
}

class XGBSleepTimesModel(BaseEstimator, RegressorMixin):
    """ Predict using median sleep hours for workdays and weekends"""
    def __init__(self, data_path, data_name):
        self.model = XGBClassifier(**xgb_clf_params, gpu_id=0, tree_method="gpu_hist", random_state=42)
        
        self.data_path = data_path
        self.data_name = data_name
        
        self.df_sleep = pd.read_csv('./data/train_detailed.csv', low_memory=False)
        self.df_sleep = self._process_sleep_data(self.df_sleep)
        
        self.lags = 60
        
        self.df_hr = pd.read_csv(self.data_path, low_memory=False)
        self.df_hr = self._preprocess_feature_data(self.df_hr, self.data_name, smoothing = 2, freq="1min", start_date='2020-09-26 00:00:00', end_date='2023-03-17 00:00:00')
        self.df_hr = self._create_lags(self.df_hr, self.data_name, self.lags)
        
        self.expanded_data  = pd.merge(self.df_sleep, self.df_hr, on='date', how='outer')
        self.expanded_data  = self.expanded_data.set_index("date")
        self.expanded_data  = self.expanded_data.astype('float32')
        self.expanded_data  = self.expanded_data.fillna(method='ffill').fillna(method='bfill')
        
        pass
    
    def _process_sleep_data(self, df, freq='1min', start_date='2020-09-26 00:00:00', end_date='2023-03-17 00:00:00'):
        #exclude where valud is HKCategoryValueSleepAnalysisInBed
        df = df.drop(df[df['value'] == 'HKCategoryValueSleepAnalysisInBed'].index)
        
        # Parse dates and times
        df['startDate'] = pd.to_datetime(df['startDate'])
        df['endDate'] = pd.to_datetime(df['endDate'])

        # Create the date range
        expanded_df = pd.DataFrame()
        expanded_df["date"] = pd.date_range(start_date, end_date, freq=freq, tz=pytz.FixedOffset(-240))

        # 1 if between startDate and endDate, 0 otherwise
        expanded_df["value"] = 0
        for _, row in df.iterrows():
            mask = (expanded_df['date'] >= row['startDate']) & (expanded_df['date'] <= row['endDate'])
            expanded_df.loc[mask, 'value'] = 1
            
        expanded_df = expanded_df.rename(columns={'value': 'sleep'})

        return expanded_df
    
    def _preprocess_feature_data(self, df, col_name, freq='1min', smoothing = 2, start_date='2020-09-26 00:00:00', end_date='2023-03-17 00:00:00'):
        df = df[(df['startDate'] >= start_date) & (df['startDate'] <= end_date)]
        
        df = pd.melt(df, id_vars=['value'], value_vars=['startDate', 'endDate'], value_name='date')
        df = df.groupby('date', as_index=False).mean(numeric_only=True)
        df = df.sort_values(by='date')
        
        df['date'] = pd.to_datetime(df['date'])
        df = df.set_index('date')
        df = df.resample(freq).mean()
        
        df = df.interpolate().rolling(smoothing).mean()
        df = df.fillna(method="bfill")
        
        df = df.reset_index()
        df = df.rename(columns={'date': 'date', 'value': col_name})
        
        return df

    def _create_lags(self, df, column_name, n_lags):
        bckwd_columns = [df[column_name].shift(i).fillna(method="bfill").fillna(method="ffill") for i in range(1, n_lags+1)]
        fwd_columns = [df[column_name].shift(-i).fillna(method="bfill").fillna(method="ffill") for i in range(1, n_lags+1)]
        bckwd_names = [f"{column_name}_bckwd_{i}" for i in range(1, n_lags+1)]
        fwd_names = [f"{column_name}_fwd_{i}" for i in range(1, n_lags+1)]
        df_lags = pd.concat(bckwd_columns + fwd_columns, axis=1, keys=bckwd_names + fwd_names)
        return pd.concat([df, df_lags], axis=1)

    def fit(self, X, y):
        min_date = X['date'].min()
        max_date = X['date'].max()
        
        timezone = pytz.FixedOffset(-240)
        min_date = datetime.datetime.combine(min_date, datetime.time()).replace(tzinfo=timezone)
        max_date = datetime.datetime.combine(max_date, datetime.time()).replace(tzinfo=timezone)

                                                            
        train = self.expanded_data[(self.expanded_data.index >= min_date) & (self.expanded_data.index <= max_date)]
        X, y = train.drop(columns=["sleep"]), train["sleep"]

        self.scaler = RobustScaler()
        
        self.pipeline = Pipeline(steps=[("scaler", self.scaler), ("model", self.model)])
        self.pipeline.fit(X, y)
        self.dummy_fitted_ = True
        return self

    def predict(self, X):
        min_date = X.date.min() - datetime.timedelta(days=2)
        max_date = X.date.max() + datetime.timedelta(days=2)

        timezone = pytz.FixedOffset(-240)
        min_date = datetime.datetime.combine(min_date, datetime.time()).replace(tzinfo=timezone)
        max_date = datetime.datetime.combine(max_date, datetime.time()).replace(tzinfo=timezone)
        
        model_data = self.expanded_data[(self.expanded_data.index >= min_date) & (self.expanded_data.index <= max_date)]
        model_data = model_data.drop(columns=["sleep"], errors='ignore')
        
        preds = self.pipeline.predict_proba(model_data)[:, 1]
        
        preds = pd.DataFrame({"date": model_data.index.date, "sleep_prob": preds}) # Need to fix
        
        preds = preds.groupby("date")["sleep_prob"].sum().reset_index() # Sum Probabilities by date
        preds["sleep_prob"] = preds["sleep_prob"] * 1 / 60

        preds = pd.merge(X[['date']], preds, on='date', how='left')
        
        return preds.sleep_prob.fillna(7.0).to_numpy()




Best Parameters for XGBoost Sleep Model:  {'gamma': 0.7483232914389041, 'scaler': 'none', 'max_depth': 6, 'subsample': 0.6298206100536499, 'n_estimators': 830, 'learning_rate': 0.3017020376693815, 'colsample_bytree': 0.7979468286602337, 'min_child_weight': 5}


In [17]:
xgb_sleep_hr = XGBSleepTimesModel('./data/xml_export/HeartRate.csv', "hr").fit(X_short, y_short)

In [18]:
xgb_sleep_steps = XGBSleepTimesModel('./data/xml_export/StepCount.csv', "steps").fit(X_short, y_short)

In [19]:

xgb_sleep_dist = XGBSleepTimesModel('./data/xml_export/DistanceWalkingRunning.csv', "dist").fit(X_short, y_short)

In [20]:
sleep_models = {
    "xgb_sleep_hr": xgb_sleep_hr,
    "xgb_sleep_steps": xgb_sleep_steps,
    "xgb_sleep_dist": xgb_sleep_dist
}

In [21]:
# test = XGBSleepTimesModel()
# xgb_sleep_model.fit(X, y)
# xgb_sleep_model.predict(X)

### HR Model

In [22]:
complex_models = {
    'xgb': xgb_pipeline,
    'et': et_pipeline,
    'knn': knn_pipeline,
    'rf': rf_pipeline,
    'GRU': GRU_pipeline,
    'xgb_cf': xgb_clf_pipeline
}

In [23]:
kf = KFold(n_splits=3, shuffle=True, random_state=42) #tscv = TimeSeriesSplit(n_splits=5).split(data.X)


# Create pipelines for complex models
# complex_models_pipelines = {
#     model_name: Pipeline(steps=[("date_filter", ColumnDropper(include_date=False)),
#                                 ("imputer", SimpleImputer(strategy="median")),
#                                ("scaler", RobustScaler()),
#                                ('model', model)])
#     for model_name, model in complex_models.items()
# }


# Merge simple and complex models into a single dictionary and add prophet model
models = {**simple_models, **complex_models, **sleep_models, "prophet": prophet_model}

from sklearn.linear_model import Ridge

xgb = XGBRegressor(tree_method = 'gpu_hist', gpu_id = 0, random_state = 42)

stacked_regressor = StackingRegressor(estimators=list(models.items()), cv="prefit", verbose=1, final_estimator=Ridge())

In [24]:
stacked_regressor.fit(X_test, y_test)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- slp_ActiveEnergyBurned_max_hrs_between
- slp_ActiveEnergyBurned_sum_hrs_between
- slp_BasalEnergyBurned_max_hrs_between
- slp_BasalEnergyBurned_sum_hrs_between
- slp_BodyMassIndex_max_hrs_between
- ...


In [None]:
print('training R2: ', stacked_regressor.score(X_test, y_test))
#0.76
for model_name, coef in zip(models.keys(), stacked_regressor.final_estimator_.coef_):
    print(f'Model: {model_name:<15} {round(coef,4):<10}')

In [None]:
stacked_regressor.predict(X)

In [None]:
df = pd.read_csv("./data/test.csv", low_memory=False)
df["date"] = pd.to_datetime(df["date"]).dt.date

df = df.merge(data.xml_data, on="date", how="left")
df = df.merge(data.activity_data, on="date", how="left") #Add Activity data
df = data._feature_engineering(df)
df = df[data.columns] # Keep only columns that are in the train data
print(df.columns)
sub = pd.DataFrame({"date": df["date"]}) # Create dataframe with date column from df

df = df.drop(columns=["sleep_hours"], errors = 'ignore') # Drop date column from df

preds = stacked_regressor.predict(df) # predictions

# Create submission dataframe with date and predictions
sub["sleep_hours"] = preds


In [None]:
sub.to_csv('submission_stacked.csv', index=False)

In [None]:
sub.plot()

In [None]:
sub.hist()