##### This notebook splits the initial dataframe into two sub-dataframes: one used to label the clusters and one used to train the clustering algorithm

In [24]:
import warnings
import numpy as np
import pandas as pd
from dataprep.eda import create_report

# Suppress every warning for the rest of the kernel session: no category,
# module or message filter is given, so the "ignore" action applies to all.
# NOTE(review): this also hides pandas deprecation and copy-related warnings;
# consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings("ignore")

Load the initial dataframe, which contains all the data from the MongoDB database

In [25]:
# Read the pickled source dataframe from the project's data directory.
# NOTE(review): unpickling can execute arbitrary code, so this file must only
# ever come from a trusted source.
df = pd.read_pickle('../data/fitbit_surveys_semas.pkl')
# Bare expression on the last line so the notebook renders the frame as output.
df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,sensor_type,nightly_temperature,nremhr,spo2,rmssd,...,self_reevaluation_category,social_liberation_category,counterconditioning_category,helping_relationships_category,reinforcement_management_category,self_liberation_category,stimulus_control_category,step_goal,mood,place
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,SKIN,35.02573,,,,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,SKIN,34.866951,,,,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,SKIN,35.349583,,,,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,SKIN,34.495486,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165086,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,,,,,,,TIRED,WORK/SCHOOL
165087,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,,,,,,,RESTED/RELAXED,HOME
165088,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,,,,,,,HAPPY,TRANSIT
165089,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,,,,,,,RESTED/RELAXED,HOME


Define the lists of features for the training and labeling dataframes

In [26]:
# Column names kept in the dataframe used to train the clustering algorithm.
training_features = [
    # row keys (also present in labeling_features)
    'id', 'date', 'hour',
    'exertion_points',
    'altitude',
    'distance',
    'lightly_active_minutes',
    'moderately_active_minutes',
    'sedentary_minutes',
    'steps',
    'very_active_minutes',
    # exercise_* columns
    'exercise',
    'exercise_calories',
    'exercise_duration',
    'exercise_steps',
    'exercise_sedentary_minutes',
    'exercise_lightly_minutes',
    'exercise_fairly_minutes',
    'exercise_very_minutes',
    'step_goal',
    # heart-rate zone minute counters
    'minutes_below_zone_1',
    'minutes_in_zone_1',
    'minutes_in_zone_2',
    'minutes_in_zone_3',
    'exercise_avg_hr',
    # per-zone exercise columns (names contain spaces and capitals on purpose)
    'exercise_Out of Range_zone_minutes',
    'exercise_Out of Range_zone_calories',
    'exercise_Fat Burn_zone_minutes',
    'exercise_Fat Burn_zone_calories',
    'exercise_Cardio_zone_minutes',
    'exercise_Cardio_zone_calories',
    'exercise_Peak_zone_minutes',
    'exercise_Peak_zone_calories',
    # sleep-related columns
    'sleep_points',
    'deep',
    'light',
    'rem',
    'wake',
    'sleep_duration',
    'minutes_to_fall_asleep',
    'minutes_asleep',
    'minutes_awake',
    'minutes_after_wakeup',
    'time_in_bed',
    'sleep_efficiency',
    'main_sleep',
    # remaining columns
    'calories',
    'water_amount',
    'badge_type',
    'badge_value',
    'mindfulness_goal',
    'place',
    'consciousness_raising_category',
    'counterconditioning_category',
    'helping_relationships_category',
    'stimulus_control_category',
]

# Column names kept in the dataframe used to label the clusters.
labeling_features = [
    # row keys (also present in training_features)
    'id', 'date', 'hour',
    'ecg',
    'heart_rate_alert',
    'nremhr',
    'spo2',
    'rmssd',
    'wrist_temperature',
    'oxygen_variation',
    'resting_heart_rate',
    'bpm',
    'stress_score',
    'responsiveness_points',
    'scl_avg',
    'mindfulness_start_heart_rate',
    'mindfulness_end_heart_rate',
    'mood_value',
    'positive_affect_score',
    'negative_affect_score',
    'stai_stress',
    'mood',
    'vo2max',
    'nightly_temperature',
    # *_breathing_rate columns
    'full_sleep_breathing_rate',
    'deep_sleep_breathing_rate',
    'light_sleep_breathing_rate',
    'rem_sleep_breathing_rate',
    'gender',
    'age',
    'bmi',
    'self_determination',
    'ttm_stage',
    # *_category columns
    'dramatic_relief_category',
    'environmental_reevaluation_category',
    'self_reevaluation_category',
    'social_liberation_category',
    'reinforcement_management_category',
    'self_liberation_category',
    'extraversion',
    'agreeableness',
    'conscientiousness',
    'stability',
    'intellect',
]

Create the two sub-dataframes

In [27]:
# Keep only the columns of `df` whose name appears in `training_features`,
# in df's own column order. Names in the list that do not exist in `df` are
# silently ignored by `isin`.
# The boolean mask is applied directly with .loc rather than converted back to
# labels: label-based reselection would return each duplicated column name
# several times if `df` ever contained repeated column names.
# .copy() makes the result an independent frame, so later edits to it neither
# touch `df` nor depend on pandas' view-vs-copy rules (the related warning is
# silenced in this notebook).
training_df = df.loc[:, df.columns.isin(training_features)].copy()
# Bare expression on the last line so the notebook renders the frame as output.
training_df

Unnamed: 0,id,date,hour,sleep_points,exertion_points,altitude,badge_type,badge_value,calories,distance,...,minutes_after_wakeup,time_in_bed,sleep_efficiency,main_sleep,consciousness_raising_category,counterconditioning_category,helping_relationships_category,stimulus_control_category,step_goal,place
0,621e2ff067b776a2403eb737,2021-12-22,19,,,10.0,,,27.35,19650.0,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,,,,2.35,0.0,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,20.0,,,44.50,26880.0,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,20.0,,,46.06,32050.0,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,,,,2.35,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165086,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,,,,,,,,WORK/SCHOOL
165087,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,,,,,,,,HOME
165088,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,,,,,,,,TRANSIT
165089,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,,,,,,,,HOME


In [28]:
# Keep only the columns of `df` whose name appears in `labeling_features`,
# in df's own column order. Names in the list that do not exist in `df` are
# silently ignored by `isin`.
# The boolean mask is applied directly with .loc rather than converted back to
# labels: label-based reselection would return each duplicated column name
# several times if `df` ever contained repeated column names.
# .copy() makes the result an independent frame, so later edits to it neither
# touch `df` nor depend on pandas' view-vs-copy rules (the related warning is
# silenced in this notebook).
labeling_df = df.loc[:, df.columns.isin(labeling_features)].copy()
# Bare expression on the last line so the notebook renders the frame as output.
labeling_df

Unnamed: 0,id,date,hour,ecg,heart_rate_alert,nightly_temperature,nremhr,spo2,rmssd,full_sleep_breathing_rate,...,negative_affect_score,stai_stress,ttm_stage,dramatic_relief_category,environmental_reevaluation_category,self_reevaluation_category,social_liberation_category,reinforcement_management_category,self_liberation_category,mood
0,621e2ff067b776a2403eb737,2021-12-22,19,NSR,NONE,,,,,,...,,,,,,,,,,
1,621e2ff067b776a2403eb737,2021-11-18,0,,,35.02573,,,,,...,,,,,,,,,,
2,621e2ff067b776a2403eb737,2021-11-18,21,,,34.866951,,,,,...,,,,,,,,,,
3,621e2ff067b776a2403eb737,2021-11-20,0,,,35.349583,,,,,...,,,,,,,,,,
4,621e2ff067b776a2403eb737,2021-11-20,23,,,34.495486,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165086,621e36f967b776a240e5e7c9,2021-05-20,16,,,,,,,,...,,,,,,,,,,TIRED
165087,621e36f967b776a240e5e7c9,2021-05-20,20,,,,,,,,...,,,,,,,,,,RESTED/RELAXED
165088,621e36f967b776a240e5e7c9,2021-05-21,17,,,,,,,,...,,,,,,,,,,HAPPY
165089,621e36f967b776a240e5e7c9,2021-05-21,22,,,,,,,,...,,,,,,,,,,RESTED/RELAXED
