## Import Packages

In [1]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn'

## Import Simple Data

#### Distance

In [2]:
# Daily distance: load the raw log, collapse every timestamp to its calendar
# date (stored as a string), then total the readings per day.
df_distance_raw = pd.read_csv('usable_data/simple/distance.csv')
df_distance_raw['dateTime'] = [
    str(stamp.date()) for stamp in pd.to_datetime(df_distance_raw['dateTime'])
]
df_distance_grouped = (
    df_distance_raw
    .groupby('dateTime')
    .sum()
    .rename(columns={'value': 'distance_traveled_miles'})
)
# NOTE(review): 160934 is presumably centimetres per mile (i.e. raw values are
# in cm) — confirm the unit of the source data.
df_distance_grouped['distance_traveled_miles'] = df_distance_grouped['distance_traveled_miles'] / 160934
# df_distance_grouped.head()

#### Lightly active

In [3]:
# Lightly-active log: reduce each timestamp to a date string and sum the
# readings per day into 'lightly_active_min'.
df_lightly_active_raw = pd.read_csv('usable_data/simple/lightly_active.csv')
df_lightly_active_raw['dateTime'] = [
    str(stamp.date()) for stamp in pd.to_datetime(df_lightly_active_raw['dateTime'])
]
df_lightly_active_grouped = (
    df_lightly_active_raw
    .groupby('dateTime')
    .sum()
    .rename(columns={'value': 'lightly_active_min'})
)
# df_lightly_active_grouped.head()

#### Moderately active

In [4]:
# Moderately-active log: reduce each timestamp to a date string and sum the
# readings per day into 'moderately_active_min'.
df_moderately_active_raw = pd.read_csv('usable_data/simple/moderately_active.csv')
df_moderately_active_raw['dateTime'] = [
    str(stamp.date()) for stamp in pd.to_datetime(df_moderately_active_raw['dateTime'])
]
df_moderately_active_grouped = (
    df_moderately_active_raw
    .groupby('dateTime')
    .sum()
    .rename(columns={'value': 'moderately_active_min'})
)
# df_moderately_active_grouped.head()

#### Very active

In [5]:
# Very-active log: reduce each timestamp to a date string and sum the
# readings per day into 'very_active_min'.
df_very_active_raw = pd.read_csv('usable_data/simple/very_active.csv')
df_very_active_raw['dateTime'] = [
    str(stamp.date()) for stamp in pd.to_datetime(df_very_active_raw['dateTime'])
]
df_very_active_grouped = (
    df_very_active_raw
    .groupby('dateTime')
    .sum()
    .rename(columns={'value': 'very_active_min'})
)
# df_very_active_grouped.head()

#### Sedentary

In [6]:
# Sedentary log: reduce each timestamp to a date string and sum the readings
# per day into 'sedentary_min'.
df_sedentary_raw = pd.read_csv('usable_data/simple/sedentary.csv')
df_sedentary_raw['dateTime'] = [
    str(stamp.date()) for stamp in pd.to_datetime(df_sedentary_raw['dateTime'])
]
df_sedentary_grouped = (
    df_sedentary_raw
    .groupby('dateTime')
    .sum()
    .rename(columns={'value': 'sedentary_min'})
)
# df_sedentary_grouped.head()

#### Steps

In [7]:
# Step log: reduce each timestamp to a date string and sum the readings per
# day into 'steps_taken'.
df_steps_raw = pd.read_csv('usable_data/simple/steps.csv')
df_steps_raw['dateTime'] = [
    str(stamp.date()) for stamp in pd.to_datetime(df_steps_raw['dateTime'])
]
df_steps_grouped = (
    df_steps_raw
    .groupby('dateTime')
    .sum()
    .rename(columns={'value': 'steps_taken'})
)
# df_steps_grouped.head()

## Import Complex Data

#### Weight

In [8]:
# Weight log: derive a date-string key from the 'date' column, keep the
# per-day maximum of every column, then discard the bookkeeping columns.
df_weight_raw = pd.read_csv('usable_data/complex/weight.csv')
df_weight_raw['dateTime'] = [
    str(stamp.date()) for stamp in pd.to_datetime(df_weight_raw['date'])
]
df_weight_grouped = (
    df_weight_raw
    .groupby('dateTime')
    .max()
    .drop(columns=['logId', 'time', 'source', 'date'])
)
# df_weight_grouped.head()

#### Swim

In [9]:
# Swim log: each row's 'value' is a serialized record holding 'lapDurationSec'
# and 'strokeCount'. Unpack both fields and total them per calendar day.
df_swim_raw = pd.read_csv('usable_data/complex/swim.csv')
df_swim_raw['dateTime'] = pd.to_datetime(df_swim_raw['dateTime'])
df_swim_raw['dateTime'] = df_swim_raw['dateTime'].apply(lambda x: str(x.date()))
# NOTE(review): eval() executes whatever text the CSV contains. Acceptable only
# for a trusted local export; prefer json.loads / ast.literal_eval once the
# exact serialization of this column is confirmed.
df_swim_raw['value'] = df_swim_raw['value'].apply(lambda x: eval(x))
df_swim_raw['lapDurationSec'] = df_swim_raw['value'].apply(lambda x: x['lapDurationSec'])
df_swim_raw['strokeCount'] = df_swim_raw['value'].apply(lambda x: x['strokeCount'])
# Aggregate only the two numeric fields: summing the dict-valued 'value' column
# is not meaningful and fails on pandas versions that no longer drop such
# columns silently.
df_swim_grouped = df_swim_raw.groupby('dateTime')[['lapDurationSec', 'strokeCount']].sum()
# The output column is named 'swim_min', so convert the summed seconds to minutes.
df_swim_grouped['lapDurationSec'] = df_swim_grouped['lapDurationSec'] / 60
df_swim_grouped = df_swim_grouped.rename(columns={'lapDurationSec':'swim_min', 'strokeCount':'stroke_count'})
# df_swim_grouped.head()

#### Resting Heart Rate

In [10]:
# Resting heart rate: every row's 'value' is a serialized record with 'date',
# 'value' and 'error'. Placeholder rows (null date, zero reading) are removed
# first, then the fields are unpacked and averaged per calendar day.
resting_hr_csv = pd.read_csv('usable_data/complex/resting_hr.csv')
has_reading = resting_hr_csv['value'] != '{"date":null,"value":0,"error":0}'
df_resting_hr_raw = resting_hr_csv.loc[has_reading, ['value']]
# NOTE(review): eval() runs arbitrary text from the CSV — only safe for a
# trusted local export.
resting_hr_records = df_resting_hr_raw['value'].apply(lambda payload: eval(payload))
df_resting_hr_raw['dateTime'] = resting_hr_records.apply(
    lambda record: str(pd.to_datetime(record['date']).date())
)
df_resting_hr_raw['error'] = resting_hr_records.apply(lambda record: record['error'])
df_resting_hr_raw['value'] = resting_hr_records.apply(lambda record: record['value'])
df_resting_hr_grouped = (
    df_resting_hr_raw
    .groupby('dateTime')
    .mean()
    .rename(columns={'value': 'resting_heart_rate', 'error': 'error_margin'})
)
# df_resting_hr_grouped.head()

#### Sleep Score

In [11]:
# Sleep score: average the score columns per day and prefix them with 'sleep_'.
df_sleep_score = pd.read_csv('usable_data/complex/sleep_score.csv')
df_sleep_score['timestamp'] = pd.to_datetime(df_sleep_score['timestamp'])
# Key each score by the calendar day *before* its timestamp.
df_sleep_score['dateTime'] = df_sleep_score['timestamp'].apply(lambda x: str(x.date() - pd.Timedelta(days=1)))
df_sleep_score = (
    df_sleep_score
    .groupby('dateTime')
    # numeric_only=True keeps the datetime 'timestamp' column (and any text
    # column) out of the average on every pandas version; relying on the
    # default would carry an averaged 'timestamp' into the merged table on
    # newer releases.
    .mean(numeric_only=True)
    .drop(columns=['sleep_log_entry_id'])
    .rename(columns={
        'overall_score': 'sleep_overall_score',
        'composition_score': 'sleep_composition_score',
        'revitalization_score': 'sleep_revitalization_score',
        'duration_score': 'sleep_duration_score',
        'deep_sleep_in_minutes': 'sleep_deep_sleep_min',
        'resting_heart_rate': 'sleep_resting_heart_rate',
        'restlessness': 'sleep_restlessness_score',
    })
)
# df_sleep_score.head()

#### Sleep Metrics

In [12]:
# Sleep metrics: combine both sleep exports, de-duplicate by log id, unpack the
# per-stage minutes for 'stages' logs, and total everything per day.
df_sleep_1 = pd.read_csv('usable_data/complex/sleep.csv')
df_sleep_2 = pd.read_csv('usable_data/complex/sleep_v2.csv')
df_sleep = pd.concat([df_sleep_1, df_sleep_2], axis=0).reset_index(drop=True)
df_sleep = df_sleep.drop_duplicates(subset=['logId'])
# Shift the day of sleep back by one so it lines up with the day of activity.
df_sleep['dateTime'] = df_sleep['dateOfSleep'].apply(lambda x: str(pd.to_datetime(x).date() - pd.Timedelta(days=1)))
df_sleep = df_sleep.drop(columns=['startTime', 'endTime', 'minutesAfterWakeup', 'timeInBed', 'infoCode', 'mainSleep', 'efficiency', 'dateOfSleep']).set_index('dateTime')
# Scale 'duration' by 1/(1000*60) (milliseconds -> minutes, assuming the raw
# value is in ms — TODO confirm).
df_sleep['duration'] = df_sleep['duration'] / (1000 * 60)
df_sleep = df_sleep.sort_values(by='dateTime', ascending=True)
# NOTE(review): eval() executes whatever text the CSV contains — only safe for
# a trusted local export.
df_sleep['levels'] = df_sleep['levels'].apply(lambda x: eval(x)['summary'])
df_sleep_classic = df_sleep[df_sleep['type'] == 'classic']
# Work on an explicit copy so the new stage columns are written to an
# independent frame rather than to a slice of df_sleep.
df_sleep_stages = df_sleep[df_sleep['type'] == 'stages'].copy()
df_sleep_stages['deep'] = df_sleep_stages['levels'].apply(lambda x: x['deep']['minutes'])
df_sleep_stages['light'] = df_sleep_stages['levels'].apply(lambda x: x['light']['minutes'])
df_sleep_stages['rem'] = df_sleep_stages['levels'].apply(lambda x: x['rem']['minutes'])
# 'classic' rows have no deep/light/rem columns, so they become NaN here.
df_sleep = pd.concat([df_sleep_classic, df_sleep_stages], axis=0).sort_values('dateTime', ascending=True).drop(columns=['levels', 'type', 'logId'])
df_sleep = df_sleep.rename(columns={'duration':'time_sleeping', 'minutesToFallAsleep':'minutes_to_fall_asleep', 'minutesAsleep':'minutes_asleep', 'minutesAwake':'minutes_awake', 'deep':'deep_sleep_min', 'light':'light_sleep_min', 'rem':'rem_sleep_min'})
# min_count=1 keeps a day's stage minutes as NaN when no log that day carried
# stage data; the default would report such days as 0 minutes.
df_sleep_grouped = df_sleep.groupby('dateTime').sum(min_count=1)
# df_sleep_grouped.head()

## Merging DFs

In [13]:
# Outer-merge every per-day table on 'dateTime', in this fixed order, so a day
# present in any source is kept; then order the result by date.
daily_frames = [
    df_distance_grouped,
    df_lightly_active_grouped,
    df_moderately_active_grouped,
    df_very_active_grouped,
    df_sedentary_grouped,
    df_steps_grouped,
    df_weight_grouped,
    df_swim_grouped,
    df_sleep_score,
    df_resting_hr_grouped,
    df_sleep_grouped,
]
df = daily_frames[0]
for daily_frame in daily_frames[1:]:
    df = pd.merge(df, daily_frame, how='outer', on='dateTime')
df = df.sort_values('dateTime', ascending=True)

In [14]:
# Print, for each merged column, the percentage of rows holding a non-null value.
row_total = len(df)
for column in df.columns:
    filled_pct = round(df[column].notna().sum() / row_total * 100, 4)
    print(column, f'{filled_pct}% filled')

distance_traveled_miles 43.3987% filled
lightly_active_min 100.0% filled
moderately_active_min 100.0% filled
very_active_min 100.0% filled
sedentary_min 100.0% filled
steps_taken 44.3137% filled
weight 3.0719% filled
bmi 3.0719% filled
fat 2.8105% filled
swim_min 28.2353% filled
stroke_count 28.2353% filled
sleep_overall_score 9.3464% filled
sleep_composition_score 9.3464% filled
sleep_revitalization_score 9.3464% filled
sleep_duration_score 9.3464% filled
sleep_deep_sleep_min 9.3464% filled
sleep_resting_heart_rate 9.3464% filled
sleep_restlessness_score 9.3464% filled
resting_heart_rate 31.2418% filled
error_margin 31.2418% filled
time_sleeping 31.8954% filled
minutes_to_fall_asleep 31.8954% filled
minutes_asleep 31.8954% filled
minutes_awake 31.8954% filled
deep_sleep_min 31.8954% filled
light_sleep_min 31.8954% filled
rem_sleep_min 31.8954% filled


In [15]:
# Persist the merged per-day table (index included) for downstream use.
df.to_csv(path_or_buf='usable_data/merged_data.csv', index=True)