## Sleep Quality Prediction - Supervised Machine Learning

---

by: Cody Hill

date: 1/18/2023

### Data Source Information

### [ ADD LICENSE INFORMATION AND FAIR USE INFO]

In [117]:
import os
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

### Exploratory Data Analysis (EDA)

In [118]:
# Import data
# TODO: Switch this to the github urls for the data so others can use.
# The data location defaults to the author's local checkout; set the SUPERVISED_SLEEP_DATA
# environment variable to point the notebook at a copy of the data on another machine.
data_folder = os.environ.get('SUPERVISED_SLEEP_DATA', '/Users/chill/GitHub/Supervised-Sleep/Data')
sleep_data_folder = f'{data_folder}/Sleep_Data'

# Collect each CSV in .Data/ and add it to a dataframe.
# os.listdir returns entries in arbitrary order, so sort them to keep the column order of the
# concatenated frame reproducible between runs. endswith avoids picking up names such as 'x.csv.bak'.
file_path = [f'{data_folder}/{file}' for file in sorted(os.listdir(data_folder)) if file.endswith('.csv')]
if not file_path:
    raise FileNotFoundError(f'No .csv files found in {data_folder}')
# Using a list to concat the dfs with index_col allows to easily merge based on 'day'. More memory usage but fine for this project.
biometric_df = pd.concat([pd.read_csv(file, index_col = 'day') for file in file_path], join = 'outer', ignore_index = False, axis = 1).reset_index()

# Collect each non-daily CSV in .Data/Sleep_Data and add it to a dataframe.
# Separated sleep data as it potentially has multiple entries per day. Will merge them later.
file_path_sleep = [
    f'{sleep_data_folder}/{file}'
    for file in sorted(os.listdir(sleep_data_folder))
    if file.endswith('.csv') and 'daily' not in file
]
if not file_path_sleep:
    raise FileNotFoundError(f'No sleep .csv files found in {sleep_data_folder}')
# NOTE(review): axis = 1 places files side by side, aligned on the default row index. That is harmless
# with a single sleep export, but several exports would probably need axis = 0 (stacking) - confirm.
sleep_df = pd.concat(map(pd.read_csv, file_path_sleep), join = 'outer', ignore_index = False, axis = 1)

In [119]:
# Report the dimensions of the merged daily biometrics, then render its first two rows.
print(biometric_df.shape)
display(biometric_df.iloc[:2])


(350, 42)


Unnamed: 0,day,spo2_percentage,active_calories,average_met_minutes,equivalent_walking_distance,high_activity_met_minutes,high_activity_time,inactivity_alerts,low_activity_met_minutes,low_activity_time,...,temperature_deviation,temperature_trend_deviation,contributors_activity_balance,contributors_hrv_balance,contributors_previous_day_activity,contributors_previous_night,contributors_recovery_index,contributors_resting_heart_rate,contributors_sleep_balance,contributors_body_temperature
0,2023-02-04,98.523,650,1.59375,10728,16,120,0,277,23220,...,-0.38,,,,96.0,74.0,97.0,94.0,,90.0
1,2023-02-05,97.181,498,1.4375,8458,8,60,0,204,15000,...,-0.04,0.15,,,82.0,79.0,100.0,59.0,,100.0


In [120]:
# Report the dimensions of the sleep-session data, then render its first two rows.
print(sleep_df.shape)
display(sleep_df.iloc[:2])

(685, 47)


Unnamed: 0,average_breath,average_heart_rate,average_hrv,awake_time,bedtime_end,bedtime_start,day,deep_sleep_duration,efficiency,latency,...,readiness_contributors_body_temperature,readiness_contributors_hrv_balance,readiness_contributors_previous_day_activity,readiness_contributors_previous_night,readiness_contributors_recovery_index,readiness_contributors_resting_heart_rate,readiness_contributors_sleep_balance,readiness_score,readiness_temperature_deviation,readiness_temperature_trend_deviation
0,13.625,63.25,77.0,4440.0,2023-02-04T07:08:22.000-06:00,2023-02-03T22:40:22.000-06:00,2023-02-04,4650.0,85.0,990.0,...,90.0,,96.0,74.0,97.0,94.0,,89.0,-0.38,
1,15.125,92.49,19.0,6210.0,2023-02-05T09:37:28.000-06:00,2023-02-04T23:54:28.000-06:00,2023-02-05,4590.0,82.0,180.0,...,100.0,,82.0,79.0,100.0,59.0,,78.0,-0.04,0.15


In [121]:
# Show the column labels of biometric_df as loaded.
biometric_df.columns

Index(['day', 'spo2_percentage', 'active_calories', 'average_met_minutes',
       'equivalent_walking_distance', 'high_activity_met_minutes',
       'high_activity_time', 'inactivity_alerts', 'low_activity_met_minutes',
       'low_activity_time', 'medium_activity_met_minutes',
       'medium_activity_time', 'meters_to_target', 'non_wear_time',
       'resting_time', 'sedentary_met_minutes', 'sedentary_time', 'steps',
       'target_calories', 'target_meters', 'total_calories', 'score',
       'class_5_min', 'contributors_meet_daily_targets',
       'contributors_move_every_hour', 'contributors_recovery_time',
       'contributors_stay_active', 'contributors_training_frequency',
       'contributors_training_volume', 'met_1_min', 'ring_met_1_min', 'score',
       'temperature_deviation', 'temperature_trend_deviation',
       'contributors_activity_balance', 'contributors_hrv_balance',
       'contributors_previous_day_activity', 'contributors_previous_night',
       'contributors_recov

As we can see, the Oura Ring tracks and records many biometrics. From that raw data, Oura uses its own equations and feature engineering to assign daily scores to categories such as sleep, recovery, readiness, and activity, totalling 86 features. Since the purpose of this project is to build our own sleep score prediction model, we can use Oura's sleep score as the ground truth label (y_train) for training and validation. Furthermore, we must remove the features whose names include "contributors": these columns hold normalized scores output by Oura's models, which are then averaged into the final score, so keeping them would leak the target into our features.

In [122]:
# Drop every column whose name mentions "contributors" or "score".
# Several score columns exist, so the truth label is re-attached later.
is_derived_column = biometric_df.columns.str.contains('contributors|score')
biometric_df = biometric_df.loc[:, ~is_derived_column]
biometric_df.shape

(350, 26)

We've removed 16 columns (42 → 26) whose names contain "contributors" or "score".

Now we will remove all columns that won't be used in feature engineering or the final model.
Then complete data munging by formatting all the features (float, int, dates, dummy/indicator encoding).

In [123]:
# Show the column labels that remain in biometric_df at this point.
biometric_df.columns

Index(['day', 'spo2_percentage', 'active_calories', 'average_met_minutes',
       'equivalent_walking_distance', 'high_activity_met_minutes',
       'high_activity_time', 'inactivity_alerts', 'low_activity_met_minutes',
       'low_activity_time', 'medium_activity_met_minutes',
       'medium_activity_time', 'meters_to_target', 'non_wear_time',
       'resting_time', 'sedentary_met_minutes', 'sedentary_time', 'steps',
       'target_calories', 'target_meters', 'total_calories', 'class_5_min',
       'met_1_min', 'ring_met_1_min', 'temperature_deviation',
       'temperature_trend_deviation'],
      dtype='object')

In [124]:
# # TODO: oura_sleep_2024-01.csv
# - Nap on day encoding
#   - list(where [type] != long_sleep && between 10 AM - 7 PM)
#   - sum(types of sleep duration)
# - restless_periods vs sum(movement_30_sec)??
# - Only one day per entry
#   - Sum each day sleep durations, restless_periods, awake_time, time_in_bed, total_sleep_duration
#       - awake_time = time_in_bed - total_sleep_duration ??
#   - Save only the [type] == long_sleep, average_breath, average_heart_rate, average_hrv, latency, 
#       lowest_heart_rate, bedtime_start_delta
# - Remove: efficiency, period, score, segment_state, sleep_midpoint, sleep_phase_5_min, movement_30_sec, timezone, 
#       bedtime_end_delta, midpoint_at_delta, heart_rate_5_min, hrv_5_min

# # TODO: oura_daily-activity_2024-01.csv
# - Remove: average_met_minutes, equivalent_walking_distance, high_activity_met_minutes, inactivity_alerts, 
#       low_activity_met_minutes, medium_activity_met_minutes, sedentary_met_minutes, target_calories, target_meters, score,
#       class_5_min, met_1_min, ring_met_1_min

# # TODO: oura_daily-readiness_2024-01.csv
# - Remove: score, temperature_trend_deviation

# # TODO: oura_daily--spo2_2024-01.csv
# - N/A
# -
# -

# # TODO: General
# - Collinearity between features checks in model selection
# - 
# - 

In [125]:
# Pull the row(s) recorded for 2024-01-14 to inspect their values.
biometric_df[biometric_df['day'].eq('2024-01-14')]

Unnamed: 0,day,spo2_percentage,active_calories,average_met_minutes,equivalent_walking_distance,high_activity_met_minutes,high_activity_time,inactivity_alerts,low_activity_met_minutes,low_activity_time,...,sedentary_time,steps,target_calories,target_meters,total_calories,class_5_min,met_1_min,ring_met_1_min,temperature_deviation,temperature_trend_deviation
324,2024-01-14,,439,1.375,7630,0,0,0,212,15000,...,20100,7651,500,10000,2713,1111111111111111111111111111111111111111111111...,1.2;1.2;1.0;1.0;0.9;0.9;1.1;0.9;0.9;1.0;0.9;0....,1.2;1.2;1.0;1.0;0.9;0.9;1.1;0.9;0.9;1.0;0.9;0....,-0.12,0.1


In [126]:
# Count 'day' values that repeat an earlier row; a result of 0 means no day appears more than once.
biometric_df['day'].duplicated().sum()

0