## Sleep Quality Prediction - Supervised Machine Learning

---

by: Cody Hill

date: 1/18/2023

### Data Source Information

### [ ADD LICENSE INFORMATION AND FAIR USE INFO]

In [276]:
import os
import numpy as np
import pandas as pd
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

### Setup and Data Cleaning

In [277]:
# Import data
# TODO: Switch this to the github urls for the data so others can use.
data_folder = '/Users/chill/GitHub/Supervised-Sleep/Data'
sleep_data_folder = '/Users/chill/GitHub/Supervised-Sleep/Data/Sleep_Data'

# Iterate through each file in .Data/ and add it to a dataframe.
file_path = [f'{data_folder}/{file}' for file in os.listdir(data_folder) if '.csv' in file]
# Using a list to concat the dfs with index_col allows to easily merge based on 'day'. More memory usage but fine for this project.
biometric_df = pd.concat([pd.read_csv(file, index_col = 'day') for file in file_path], join = 'outer', ignore_index = False, axis = 1).reset_index()

# Iterate through each file in .Data/Sleep_Data and add it to a dataframe.
# Separated sleep data as it potentially has multiple entries per day. Will merge them later.
file_path_sleep = [f'{sleep_data_folder}/{file}' for file in os.listdir(sleep_data_folder) if '.csv' in file and 'daily' not in file]
sleep_df = pd.concat(map(pd.read_csv, file_path_sleep), join = 'outer', ignore_index = False, axis = 1)

# Import ground truth label sleep score.
file_path_daily_sleep_score = [f'{sleep_data_folder}/{file}' for file in os.listdir(sleep_data_folder) if '.csv' in file and 'daily' in file]
daily_sleep_score_df = pd.read_csv(file_path_daily_sleep_score[0])
daily_sleep_score_df = daily_sleep_score_df[['score', 'day']]

In [278]:
print(biometric_df.shape)
display(biometric_df.head(2))


(350, 42)


Unnamed: 0,day,spo2_percentage,active_calories,average_met_minutes,equivalent_walking_distance,high_activity_met_minutes,high_activity_time,inactivity_alerts,low_activity_met_minutes,low_activity_time,...,temperature_deviation,temperature_trend_deviation,contributors_activity_balance,contributors_hrv_balance,contributors_previous_day_activity,contributors_previous_night,contributors_recovery_index,contributors_resting_heart_rate,contributors_sleep_balance,contributors_body_temperature
0,2023-02-04,98.523,650,1.59375,10728,16,120,0,277,23220,...,-0.38,,,,96.0,74.0,97.0,94.0,,90.0
1,2023-02-05,97.181,498,1.4375,8458,8,60,0,204,15000,...,-0.04,0.15,,,82.0,79.0,100.0,59.0,,100.0


In [279]:
print(sleep_df.shape)
display(sleep_df.head(4))

(685, 47)


Unnamed: 0,average_breath,average_heart_rate,average_hrv,awake_time,bedtime_end,bedtime_start,day,deep_sleep_duration,efficiency,latency,...,readiness_contributors_body_temperature,readiness_contributors_hrv_balance,readiness_contributors_previous_day_activity,readiness_contributors_previous_night,readiness_contributors_recovery_index,readiness_contributors_resting_heart_rate,readiness_contributors_sleep_balance,readiness_score,readiness_temperature_deviation,readiness_temperature_trend_deviation
0,13.625,63.25,77.0,4440.0,2023-02-04T07:08:22.000-06:00,2023-02-03T22:40:22.000-06:00,2023-02-04,4650.0,85.0,990.0,...,90.0,,96.0,74.0,97.0,94.0,,89.0,-0.38,
1,15.125,92.49,19.0,6210.0,2023-02-05T09:37:28.000-06:00,2023-02-04T23:54:28.000-06:00,2023-02-05,4590.0,82.0,180.0,...,100.0,,82.0,79.0,100.0,59.0,,78.0,-0.04,0.15
2,15.125,87.67,,1920.0,2023-02-05T20:10:57.000-06:00,2023-02-05T19:36:57.000-06:00,2023-02-06,30.0,6.0,0.0,...,,,,,,,,,,
3,15.125,82.5,37.0,720.0,2023-02-05T20:36:02.000-06:00,2023-02-05T20:20:02.000-06:00,2023-02-06,0.0,25.0,0.0,...,,,,,,,,,,


In [280]:
biometric_df.columns

Index(['day', 'spo2_percentage', 'active_calories', 'average_met_minutes',
       'equivalent_walking_distance', 'high_activity_met_minutes',
       'high_activity_time', 'inactivity_alerts', 'low_activity_met_minutes',
       'low_activity_time', 'medium_activity_met_minutes',
       'medium_activity_time', 'meters_to_target', 'non_wear_time',
       'resting_time', 'sedentary_met_minutes', 'sedentary_time', 'steps',
       'target_calories', 'target_meters', 'total_calories', 'score',
       'class_5_min', 'contributors_meet_daily_targets',
       'contributors_move_every_hour', 'contributors_recovery_time',
       'contributors_stay_active', 'contributors_training_frequency',
       'contributors_training_volume', 'met_1_min', 'ring_met_1_min', 'score',
       'temperature_deviation', 'temperature_trend_deviation',
       'contributors_activity_balance', 'contributors_hrv_balance',
       'contributors_previous_day_activity', 'contributors_previous_night',
       'contributors_recov

As we can see, the Oura Ring tracks and records many biometrics, and with that raw biometric data they use different equations and feature engineering to assign a daily score to categories such as sleep, recovery, readiness, activity, etc. totalling 86 features. Since the purpose of this is to create our own sleep score predictive model, we can use Oura's sleep score as our ground truth label (y_train) in training and validation. Furthermore, we must remove the features with labels that include "contributors" as these columns contain normalized scores which Oura's models have output that are then used to average out into the final score.

In [281]:
# Remove contributor and score columns (multiple scores involved, will add truth label later).
biometric_df = biometric_df.loc[:, ~biometric_df.columns.str.contains('contributors|score')]
biometric_df.shape

sleep_df = sleep_df.loc[:, ~sleep_df.columns.str.contains('contributors|score')]
sleep_df.shape

(685, 30)

We've removed 31 feature columns containing the word contributors.

Now we will remove all columns that won't be used in feature engineering or the final model.
Then complete data munging by formatting all the features (float, int, dates, dummy/indicator encoding).

In [282]:
biometric_df.columns

Index(['day', 'spo2_percentage', 'active_calories', 'average_met_minutes',
       'equivalent_walking_distance', 'high_activity_met_minutes',
       'high_activity_time', 'inactivity_alerts', 'low_activity_met_minutes',
       'low_activity_time', 'medium_activity_met_minutes',
       'medium_activity_time', 'meters_to_target', 'non_wear_time',
       'resting_time', 'sedentary_met_minutes', 'sedentary_time', 'steps',
       'target_calories', 'target_meters', 'total_calories', 'class_5_min',
       'met_1_min', 'ring_met_1_min', 'temperature_deviation',
       'temperature_trend_deviation'],
      dtype='object')

In [283]:
# # TODO: oura_sleep_2024-01.csv
# X - Nap on day encoding
#  X - list(where [type] != long_sleep && between 10 AM - 7 PM)
#       Does the nap affect day of or next day? I chose day of
#  X - sum(types of sleep duration)
# - restless_periods vs sum(movement_30_sec)??
# X - Only one day per entry
#   X - Sum each day sleep durations, restless_periods, awake_time, time_in_bed, total_sleep_duration
#       - awake_time = time_in_bed - total_sleep_duration ??
#   X - Save only the [type] == long_sleep, average_breath, average_heart_rate, average_hrv, latency, 
#       lowest_heart_rate, betime_start_delta
# X - Remove: efficiency, period, score, segment_state, sleep_midpoint, sleep_phase_5_min, movement_30_sec, timezone, 
#       betime_end_delta, midpoint_at_delta, heart_rate_5_min, hrv_5_min

# # TODO: oura_daily-activity_2024-01.csv
# X - Remove: average_met_minutes, equivalent_walking_distance, high_activity_met_minutes, inactivity_alerts, 
#       low_activity_met_minutes, medium_activity_met_minutes, sedentary_met_minutes, target_calories, target_meters, score,
#       class_5_min, met_1_min, ring_met_1_min
# - Workout daily boolean (Maybe unnecessary with other metrics compare models w/ and w/o)

# # TODO: oura_daily-readiness_2024-01.csv
# X - Remove: score, temperature_trend_deviation

# # TODO: oura_daily--spo2_2024-01.csv
# - N/A
# -
# -

# # TODO: General
# - Collinearity between features checks in model selection
# - Any features need to be normalized?
# - Create Readme
# - Create systeminfo output
# - Reroute filepath to github url
# - Deal with NaNs!

In [284]:
biometric_df.loc[biometric_df['day'] == '2024-01-14']

Unnamed: 0,day,spo2_percentage,active_calories,average_met_minutes,equivalent_walking_distance,high_activity_met_minutes,high_activity_time,inactivity_alerts,low_activity_met_minutes,low_activity_time,...,sedentary_time,steps,target_calories,target_meters,total_calories,class_5_min,met_1_min,ring_met_1_min,temperature_deviation,temperature_trend_deviation
324,2024-01-14,,439,1.375,7630,0,0,0,212,15000,...,20100,7651,500,10000,2713,1111111111111111111111111111111111111111111111...,1.2;1.2;1.0;1.0;0.9;0.9;1.1;0.9;0.9;1.0;0.9;0....,1.2;1.2;1.0;1.0;0.9;0.9;1.1;0.9;0.9;1.0;0.9;0....,-0.12,0.1


In [285]:
biometric_df['day'].duplicated().sum()

0

In [286]:
# Columns to drop
drop_col_sleep = ['efficiency', 'period', 'segment_state',
            'sleep_midpoint', 'sleep_phase_5_min', 'movement_30_sec',
            'timezone', 'bedtime_end_delta', 'midpoint_at_delta',
            'heart_rate_5_min', 'hrv_5_min'] # timezone? might use later
drop_col_bio = ['average_met_minutes', 'equivalent_walking_distance', 
                'high_activity_met_minutes', 'inactivity_alerts', 
                'low_activity_met_minutes', 'medium_activity_met_minutes', 
                'sedentary_met_minutes', 'target_calories', 
                'target_meters', 'class_5_min', 
                'met_1_min', 'ring_met_1_min', 
                'temperature_trend_deviation']
# Columns to format to dates in sleep_df
date_col = ['bedtime_end', 'bedtime_start', 'day']

sleep_df.drop(drop_col_sleep, axis = 1, inplace = True)
sleep_df[date_col] = sleep_df[date_col].apply(pd.to_datetime, utc = True, errors = 'coerce')
biometric_df.drop(drop_col_bio, axis = 1, inplace = True)
biometric_df['day'] = biometric_df['day'].apply(pd.to_datetime, utc = True, errors = 'coerce')
daily_sleep_score_df['day'] = daily_sleep_score_df['day'].apply(pd.to_datetime, utc = True, errors = 'coerce')

In [287]:
tzone = -240
f"{int(tzone / 10 / 6)}:00"

'-4:00'

In [288]:
print(sleep_df.shape)
display(sleep_df.head(4))
sleep_df.dtypes

(685, 19)


Unnamed: 0,average_breath,average_heart_rate,average_hrv,awake_time,bedtime_end,bedtime_start,day,deep_sleep_duration,latency,light_sleep_duration,lowest_heart_rate,rem_sleep_duration,restless_periods,time_in_bed,total_sleep_duration,type,bedtime_start_delta,readiness_temperature_deviation,readiness_temperature_trend_deviation
0,13.625,63.25,77.0,4440.0,2023-02-04 13:08:22+00:00,2023-02-04 04:40:22+00:00,2023-02-04 00:00:00+00:00,4650.0,990.0,15570.0,56.0,5820.0,282.0,30480,26040.0,long_sleep,-4778,-0.38,
1,15.125,92.49,19.0,6210.0,2023-02-05 15:37:28+00:00,2023-02-05 05:54:28+00:00,2023-02-05 00:00:00+00:00,4590.0,180.0,14970.0,75.0,9210.0,240.0,34980,28770.0,long_sleep,-332,-0.04,0.15
2,15.125,87.67,,1920.0,2023-02-06 02:10:57+00:00,2023-02-06 01:36:57+00:00,2023-02-06 00:00:00+00:00,30.0,0.0,90.0,86.0,0.0,0.0,2040,120.0,,-15783,,
3,15.125,82.5,37.0,720.0,2023-02-06 02:36:02+00:00,2023-02-06 02:20:02+00:00,2023-02-06 00:00:00+00:00,0.0,0.0,240.0,82.0,0.0,8.0,960,240.0,,-13198,,


average_breath                                       float64
average_heart_rate                                   float64
average_hrv                                          float64
awake_time                                           float64
bedtime_end                              datetime64[ns, UTC]
bedtime_start                            datetime64[ns, UTC]
day                                      datetime64[ns, UTC]
deep_sleep_duration                                  float64
latency                                              float64
light_sleep_duration                                 float64
lowest_heart_rate                                    float64
rem_sleep_duration                                   float64
restless_periods                                     float64
time_in_bed                                            int64
total_sleep_duration                                 float64
type                                                  object
bedtime_start_delta     

In [289]:
# Initialize a time frame we can consider a nap/rest period (UTC format).
nap_upper = pd.to_datetime('23:59:00').time()
nap_lower = pd.to_datetime('14:00:00').time()
# Condition on the start and end time, total sleep duration to filter out false-positives, remove any long_sleep types.
nap_bool = ((sleep_df['bedtime_start'].dt.time >= nap_lower) & 
            (sleep_df['bedtime_end'].dt.time <= nap_upper) & 
            (sleep_df['total_sleep_duration'] > 600) &
            (sleep_df['type'] != 'long_sleep'))
display(sleep_df[nap_bool])
# Insert a nap_today column and binary yes/no for each day.
# Initialize column with zeroes.
sleep_df['nap_today'] = np.zeros_like(sleep_df.shape[0])
# Use boolean array to identify nap days and iterate through to change nap_today to 1.
nap_days = sleep_df[nap_bool]['day']
for day in nap_days:
    sleep_df.loc[sleep_df['day'] == day, 'nap_today'] = 1
    
display(sleep_df.loc[sleep_df['nap_today'] == 1])

Unnamed: 0,average_breath,average_heart_rate,average_hrv,awake_time,bedtime_end,bedtime_start,day,deep_sleep_duration,latency,light_sleep_duration,lowest_heart_rate,rem_sleep_duration,restless_periods,time_in_bed,total_sleep_duration,type,bedtime_start_delta,readiness_temperature_deviation,readiness_temperature_trend_deviation
31,13.5,74.47,34.0,6270.0,2023-02-20 21:16:13+00:00,2023-02-20 19:05:13+00:00,2023-02-20 00:00:00+00:00,420.0,570.0,840.0,67.0,330.0,25.0,7860,1590.0,,47113,,
34,13.875,70.14,58.0,3540.0,2023-02-21 23:27:03+00:00,2023-02-21 21:59:03+00:00,2023-02-21 00:00:00+00:00,120.0,3600.0,1140.0,60.0,480.0,23.0,5280,1740.0,,57543,,
44,14.75,90.5,25.0,3690.0,2023-02-27 01:11:23+00:00,2023-02-26 23:54:23+00:00,2023-02-27 00:00:00+00:00,300.0,2760.0,540.0,90.0,90.0,24.0,4620,930.0,,60863,,
53,14.75,,,1140.0,2023-03-01 23:27:00+00:00,2023-03-01 22:51:00+00:00,2023-03-01 00:00:00+00:00,570.0,750.0,420.0,,30.0,11.0,2160,1020.0,,60660,,
63,14.625,71.67,,930.0,2023-03-06 22:03:32+00:00,2023-03-06 21:30:32+00:00,2023-03-06 00:00:00+00:00,630.0,390.0,420.0,70.0,0.0,31.0,1980,1050.0,,55832,,
88,13.75,65.15,75.0,1530.0,2023-03-17 21:33:33+00:00,2023-03-17 20:17:33+00:00,2023-03-17 00:00:00+00:00,90.0,450.0,2940.0,62.0,0.0,88.0,4560,3030.0,,55053,,
89,13.75,68.33,63.0,2070.0,2023-03-17 23:52:33+00:00,2023-03-17 22:48:33+00:00,2023-03-18 00:00:00+00:00,540.0,1410.0,780.0,63.0,450.0,12.0,3840,1770.0,late_nap,64113,,
116,13.625,70.4,70.0,3030.0,2023-03-30 20:27:01+00:00,2023-03-30 19:26:01+00:00,2023-03-30 00:00:00+00:00,180.0,690.0,450.0,68.0,0.0,10.0,3660,630.0,,51961,,
152,14.125,61.2,107.0,750.0,2023-04-23 21:10:49+00:00,2023-04-23 20:29:49+00:00,2023-04-23 00:00:00+00:00,1260.0,330.0,450.0,56.0,0.0,15.0,2460,1710.0,sleep,55789,,
180,14.875,71.2,80.0,1080.0,2023-05-10 18:45:42+00:00,2023-05-10 17:44:42+00:00,2023-05-10 00:00:00+00:00,810.0,570.0,1770.0,67.0,0.0,29.0,3660,2580.0,sleep,45882,,


Unnamed: 0,average_breath,average_heart_rate,average_hrv,awake_time,bedtime_end,bedtime_start,day,deep_sleep_duration,latency,light_sleep_duration,lowest_heart_rate,rem_sleep_duration,restless_periods,time_in_bed,total_sleep_duration,type,bedtime_start_delta,readiness_temperature_deviation,readiness_temperature_trend_deviation,nap_today
30,13.500,65.80,78.0,7350.0,2023-02-20 12:52:29+00:00,2023-02-20 04:44:29+00:00,2023-02-20 00:00:00+00:00,5010.0,1590.0,13770.0,55.0,3150.0,254.0,29280,21930.0,long_sleep,-4531,-0.10,0.15,1
31,13.500,74.47,34.0,6270.0,2023-02-20 21:16:13+00:00,2023-02-20 19:05:13+00:00,2023-02-20 00:00:00+00:00,420.0,570.0,840.0,67.0,330.0,25.0,7860,1590.0,,47113,,,1
32,13.500,77.00,33.0,1410.0,2023-02-21 03:25:01+00:00,2023-02-21 02:54:01+00:00,2023-02-21 00:00:00+00:00,60.0,570.0,210.0,76.0,180.0,9.0,1860,450.0,,-11159,,,1
33,13.875,61.19,109.0,3360.0,2023-02-21 13:55:00+00:00,2023-02-21 05:51:00+00:00,2023-02-21 00:00:00+00:00,7620.0,180.0,12510.0,50.0,5550.0,302.0,29040,25680.0,long_sleep,-540,-0.17,0.02,1
34,13.875,70.14,58.0,3540.0,2023-02-21 23:27:03+00:00,2023-02-21 21:59:03+00:00,2023-02-21 00:00:00+00:00,120.0,3600.0,1140.0,60.0,480.0,23.0,5280,1740.0,,57543,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
616,14.875,83.27,15.0,4163.0,2023-12-16 13:31:27+00:00,2023-12-16 05:01:04+00:00,2023-12-16 00:00:00+00:00,4290.0,1230.0,13860.0,79.0,8310.0,218.0,30623,26460.0,long_sleep,-3536,0.17,0.16,1
617,14.750,74.90,45.0,1830.0,2023-12-16 22:11:30+00:00,2023-12-16 21:04:30+00:00,2023-12-16 00:00:00+00:00,150.0,1260.0,2040.0,66.0,0.0,38.0,4020,2190.0,,54270,,0.10,1
652,14.250,,,1261.0,2024-01-02 05:10:34+00:00,2024-01-02 04:43:03+00:00,2024-01-02 00:00:00+00:00,0.0,1110.0,390.0,,0.0,8.0,1651,390.0,,-4617,,0.27,1
653,14.375,75.64,62.0,2958.0,2024-01-02 14:20:52+00:00,2024-01-02 06:33:04+00:00,2024-01-02 00:00:00+00:00,4620.0,450.0,15360.0,60.0,5130.0,169.0,28068,25110.0,long_sleep,1984,0.03,0.22,1


In [290]:
sleep_df_long = sleep_df.loc[sleep_df['type'] == 'long_sleep']

# Use merge() to find the differences in dataframes indicating days that didn't record a long sleep.
# Requires a method where the two df indexes don't match.
# Method credit to: https://stackoverflow.com/questions/48647534/find-difference-between-two-data-frames
no_sleep = pd.DataFrame(sleep_df['day']).merge(sleep_df_long['day'], indicator = True, how='left').loc[lambda x : x['_merge']!='both']
display(no_sleep)

# We can remove these days from the main dataframes as they won't have scores anyways.
# Sleep DF
sleep_df.drop(no_sleep.index, axis = 0, inplace = True)

# Biometric DF
# We can use our previous work and just remove the days that the two DFs don't share.
no_sleep_bio_days =  pd.DataFrame(biometric_df['day']).merge(sleep_df['day'], indicator = True, how='left').loc[lambda x : x['_merge']!='both']['day']
no_sleep_bio_index = biometric_df[biometric_df['day'].isin(no_sleep_bio_days)].index

biometric_df.drop(index = no_sleep_bio_index, axis = 0, inplace = True)

# Daily Sleep Score DF
no_sleep_score_index = pd.DataFrame(daily_sleep_score_df['day']).merge(biometric_df['day'], indicator = True, how='left').loc[lambda x : x['_merge']!='both']['day'].index
daily_sleep_score_df.drop(index = no_sleep_score_index, axis = 0, inplace = True)
# We can now remove the day column from the daily_sleep_score_df
daily_sleep_score_df = daily_sleep_score_df['score']

Unnamed: 0,day,_merge
103,2023-03-24 00:00:00+00:00,left_only
140,2023-04-15 00:00:00+00:00,left_only
161,2023-05-01 00:00:00+00:00,left_only
162,2023-05-01 00:00:00+00:00,left_only
163,2023-05-02 00:00:00+00:00,left_only
173,2023-05-08 00:00:00+00:00,left_only
223,2023-05-28 00:00:00+00:00,left_only
224,2023-05-28 00:00:00+00:00,left_only
326,2023-07-24 00:00:00+00:00,left_only
402,2023-08-31 00:00:00+00:00,left_only


In [291]:
# Sum each day's sleep metrics (for days with interrupted sleep or naps).
sleep_df = sleep_df[['day', 
         'deep_sleep_duration', 
         'light_sleep_duration', 
         'rem_sleep_duration', 
         'restless_periods', 
         'awake_time', 
         'time_in_bed', 
         'total_sleep_duration']].groupby(['day']).sum().reset_index()

In [292]:
# Features extracted only from the long sleep to include in the final dateframe.
sleep_df_long = sleep_df_long[['day', 
                               'average_breath', 
                               'average_heart_rate', 
                               'average_hrv', 
                               'latency', 
                               'lowest_heart_rate', 
                               'bedtime_start_delta',
                               'nap_today']]

In [293]:
sleep_df = sleep_df.merge(sleep_df_long, on = 'day', how = 'outer')
sleep_df.head(2)

Unnamed: 0,day,deep_sleep_duration,light_sleep_duration,rem_sleep_duration,restless_periods,awake_time,time_in_bed,total_sleep_duration,average_breath,average_heart_rate,average_hrv,latency,lowest_heart_rate,bedtime_start_delta,nap_today
0,2023-02-04 00:00:00+00:00,4650.0,15570.0,5820.0,282.0,4440.0,30480,26040.0,13.625,63.25,77.0,990.0,56.0,-4778,0
1,2023-02-05 00:00:00+00:00,4590.0,14970.0,9210.0,240.0,6210.0,34980,28770.0,15.125,92.49,19.0,180.0,75.0,-332,0


In [294]:
sleep_df.shape

(334, 15)

In [295]:
biometric_df.shape

(334, 13)

In [296]:
daily_sleep_score_df.shape

(334,)

In [297]:
bio_sleep_df = sleep_df.merge(biometric_df, on = 'day', how = 'outer')
bio_sleep_df.drop(['day'], axis = 1, inplace = True)
display(bio_sleep_df)

Unnamed: 0,deep_sleep_duration,light_sleep_duration,rem_sleep_duration,restless_periods,awake_time,time_in_bed,total_sleep_duration,average_breath,average_heart_rate,average_hrv,...,high_activity_time,low_activity_time,medium_activity_time,meters_to_target,non_wear_time,resting_time,sedentary_time,steps,total_calories,temperature_deviation
0,4650.0,15570.0,5820.0,282.0,4440.0,30480,26040.0,13.625,63.25,77.0,...,120,23220,2940,0,0,25740,34380,11638,3075,-0.38
1,4590.0,14970.0,9210.0,240.0,6210.0,34980,28770.0,15.125,92.49,19.0,...,60,15000,2580,2700,0,35280,33480,9001,2814,-0.04
2,8580.0,15210.0,7350.0,302.0,6240.0,37380,31140.0,13.625,68.76,59.0,...,120,13440,2160,5300,0,31080,39600,7544,2748,-0.22
3,4500.0,17700.0,6030.0,284.0,5010.0,33240,28230.0,13.500,67.25,54.0,...,0,16560,3180,2300,4800,31020,30840,8668,2852,-0.25
4,4920.0,15630.0,6420.0,245.0,6210.0,33180,26970.0,13.875,69.50,49.0,...,300,18840,3300,-200,120,28260,35580,11605,3027,0.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,3090.0,19680.0,5760.0,253.0,9469.0,37999,28530.0,13.625,66.03,67.0,...,0,15000,1860,1100,14580,34860,20100,7651,2713,-0.12
330,5610.0,17970.0,5340.0,288.0,6456.0,35376,28920.0,13.875,69.69,67.0,...,240,18840,3300,-300,540,27960,35520,11561,3064,0.05
331,5910.0,11310.0,4230.0,240.0,4734.0,26184,21450.0,13.500,57.40,106.0,...,0,19920,1740,900,300,26880,37560,8525,2879,-0.33
332,5640.0,10230.0,7020.0,221.0,3768.0,26658,22890.0,13.875,57.75,140.0,...,60,17940,2160,2800,4800,28620,32820,9267,2846,-0.46


In [298]:
bio_sleep_df.columns

Index(['deep_sleep_duration', 'light_sleep_duration', 'rem_sleep_duration',
       'restless_periods', 'awake_time', 'time_in_bed', 'total_sleep_duration',
       'average_breath', 'average_heart_rate', 'average_hrv', 'latency',
       'lowest_heart_rate', 'bedtime_start_delta', 'nap_today',
       'spo2_percentage', 'active_calories', 'high_activity_time',
       'low_activity_time', 'medium_activity_time', 'meters_to_target',
       'non_wear_time', 'resting_time', 'sedentary_time', 'steps',
       'total_calories', 'temperature_deviation'],
      dtype='object')

In [299]:
# Explore and deal with NaN values.
# A good option to deal with each feature individually.
# >>> values = {{"A": 0, "B": 1, "C": 2, "D": 3}}
# >>> df.fillna(value=values)
display(bio_sleep_df.isna().any())
bio_sleep_df.fillna(bio_sleep_df.median(), inplace = True)

deep_sleep_duration      False
light_sleep_duration     False
rem_sleep_duration       False
restless_periods         False
awake_time               False
time_in_bed              False
total_sleep_duration     False
average_breath           False
average_heart_rate       False
average_hrv              False
latency                  False
lowest_heart_rate        False
bedtime_start_delta      False
nap_today                False
spo2_percentage           True
active_calories          False
high_activity_time       False
low_activity_time        False
medium_activity_time     False
meters_to_target         False
non_wear_time            False
resting_time             False
sedentary_time           False
steps                    False
total_calories           False
temperature_deviation    False
dtype: bool

### Exploratory Data Analysis (EDA)

In [300]:
daily_sleep_score_df.describe()

count    334.000000
mean      75.688623
std        8.384459
min       36.000000
25%       71.000000
50%       77.000000
75%       81.000000
max       94.000000
Name: score, dtype: float64

### Model Selection

In [301]:
X_train, X_test, y_train, y_test = train_test_split(bio_sleep_df, daily_sleep_score_df, test_size = 0.1, random_state = 11)

In [302]:
dt = DecisionTreeClassifier(max_depth = None, max_leaf_nodes = None, random_state = 21).fit(X_train, y_train)
print(dt.score(X_train, y_train))
print(dt.score(X_test, y_test))
#path = dt.cost_complexity_pruning_path(X_train, y_train)
#ccp_alphas, impurities = path.ccp_alphas, path.impurities

1.0
0.17647058823529413
