In [None]:
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
# plt.style.use("fivethirtyeight")
#plt.style.use('plot-style.mplstyle')

import seaborn as sns
import math

In [None]:
# Alternative input (kept for reference, currently disabled): a pre-aggregated
# daily CSV covering only the period since the Apple Watch purchase.
#   daily_df = pd.read_csv('daily_health_agg_new.csv').set_index('day')
#   daily_df.index = pd.DatetimeIndex(daily_df.index)

# Full export: one parquet file for health records, one for workouts.
RECORD_DATA_PATH = 'record_data.parquet'
WORKOUT_DATA_PATH = 'workout_data.parquet'

record_data = pd.read_parquet(RECORD_DATA_PATH)
workout_data = pd.read_parquet(WORKOUT_DATA_PATH)


In [None]:
# Aggregation rule per metric, looked up by get_data() when agg=True.
# A string is a single pandas aggregation; a list requests several at once
# (which yields one output column per statistic).
# Key order is significant: it determines the column order of daily_df.
_SUMMARY_STATS = ['mean', 'min', 'median', 'max']

agg_types = {
    'ActiveEnergyBurned': 'sum',
    'AppleExerciseTime': 'sum',
    'AppleStandHour': 'sum',
    'AppleStandTime': 'sum',
    'AppleWalkingSteadiness': 'mean',
    'AudioExposureEvent': 'sum',
    'BasalEnergyBurned': 'sum',
    'BloodPressureDiastolic': list(_SUMMARY_STATS),
    'BloodPressureSystolic': list(_SUMMARY_STATS),
    'BodyFatPercentage': 'mean',
    'BodyMass': 'mean',
    'BodyMassIndex': 'mean',
    'DietaryWater': 'sum',
    'DistanceCycling': 'sum',
    'DistanceSwimming': 'sum',
    'DistanceWalkingRunning': 'sum',
    'EnvironmentalAudioExposure': 'mean',
    'FlightsClimbed': 'sum',
    'HKDataTypeSleepDurationGoal': 'last',
    'HeadphoneAudioExposure': 'mean',
    'HeadphoneAudioExposureEvent': 'sum',
    'HeartRate': list(_SUMMARY_STATS),
    'HeartRateVariabilitySDNN': list(_SUMMARY_STATS),
    'Height': 'mean',
    'HighHeartRateEvent': 'sum',
    'LeanBodyMass': 'mean',
    'MindfulSession': 'sum',
    'OxygenSaturation': 'mean',
    'RespiratoryRate': ['mean', 'min', 'max'],
    'RestingHeartRate': 'mean',
    'SixMinuteWalkTestDistance': 'mean',
    'SleepAnalysis': 'sum',
    'StairAscentSpeed': 'mean',
    'StairDescentSpeed': 'mean',
    'StepCount': 'sum',
    'SwimmingStrokeCount': 'sum',
    'VO2Max': 'mean',
    'WalkingAsymmetryPercentage': 'mean',
    'WalkingDoubleSupportPercentage': 'mean',
    'WalkingHeartRateAverage': 'mean',
    'WalkingSpeed': 'mean',
    'WalkingStepLength': 'mean',
}

def get_data(metric:str,agg=False,agg_field='day'):
    """Return one metric from ``record_data`` as a time series or an aggregate.

    Reads the module-level ``record_data`` frame, keeps the timestamp columns
    plus ``metric``, drops rows with any missing value among them and sorts by
    ``startDate``.

    Parameters
    ----------
    metric : str
        Name of the column of ``record_data`` to extract.
    agg : bool, default False
        If False, return the raw values as a Series indexed by ``startDate``.
        If True, group by ``agg_field`` and aggregate using the rule stored
        for ``metric`` in the module-level ``agg_types`` (looked up at call
        time, so a later rebinding of ``agg_types`` changes the result).
    agg_field : str, default 'day'
        Column of ``record_data`` to group by, e.g. 'day' or 'week'. It must
        exist in ``record_data``.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Series of raw values when ``agg`` is False, otherwise the DataFrame
        produced by ``groupby(agg_field).agg(...)``.

    Raises
    ------
    KeyError
        If ``agg`` is True and ``metric`` has no entry in ``agg_types``, or if
        a requested column is missing from ``record_data``.
    """
    if agg:
        how = agg_types.get(metric)
        if how is None:
            # Fail with a readable message instead of handing None to .agg().
            raise KeyError(f"no aggregation configured for {metric!r} in agg_types")

    columns = ['creationDate','startDate','endDate','day'] + [metric]
    # The grouping column has to be carried along; previously only 'day' was
    # selected, so any other agg_field (e.g. 'week') failed in groupby().
    if agg and agg_field not in columns:
        columns.append(agg_field)

    tmp = record_data[columns].dropna().sort_values('startDate')

    if agg:
        return tmp.groupby(agg_field).agg({metric: how})

    return tmp.set_index('startDate')[metric]



In [None]:
# Length of each record in seconds: endDate minus startDate.
record_span = record_data['endDate'] - record_data['startDate']
record_data['measure_time_secs'] = record_span.dt.total_seconds()

GROUP DAILY!

In [None]:
# Aggregate every configured metric with get_data() and place the resulting
# frames side by side (axis=1), aligned on their group index.
output = [get_data(metric, agg=True) for metric in agg_types]

daily_df = pd.concat(output, axis=1)

correlations:

In [None]:
# Rank every daily column by its correlation with the daily maximum
# diastolic blood pressure, strongest positive correlation first.
bp_dia_max = ('BloodPressureDiastolic', 'max')

bp_dia_corr = daily_df.corr().loc[[bp_dia_max]].T
bp_dia_corr.sort_values(bp_dia_max, ascending=False)

In [None]:
# Aggregated diastolic blood pressure (default grouping: 'day').
bp_dia_agg = get_data('BloodPressureDiastolic', agg=True)
bp_dia_agg.plot()
plt.show()

In [None]:
# Raw respiratory-rate readings indexed by startDate (agg defaults to False).
get_data('RespiratoryRate')

In [None]:
# Aggregated respiratory rate (default grouping: 'day'), one line per statistic.
resp_rate_agg = get_data('RespiratoryRate', agg=True)
resp_rate_agg.plot()

In [None]:
# FIND OUT WHAT ELSE IS GOING ON WHEN RESP RATE IS HIGH

# WORKOUTS

In [None]:
# Re-read the workout export from disk (the same file is also loaded near the
# top of the notebook), then preview the first two rows.
workout_data = pd.read_parquet('workout_data.parquet')
workout_data.head(2)

In [None]:
# 2x2 grid of box plots: one panel per workout measure, split by workoutType.
fig,axes = plt.subplots(ncols=2, nrows=2, figsize=(14,10))

# Derived measure: energy burned per unit of duration.
# NOTE(review): the column name implies duration is in minutes - confirm.
workout_data['energy_per_min'] = (workout_data.totalEnergyBurned / workout_data.duration)

# (axes position, column to plot); the column name doubles as the panel title.
panels = [
    (axes[0][0], 'duration'),
    (axes[1][0], 'totalDistance'),
    (axes[0][1], 'totalEnergyBurned'),
    (axes[1][1], 'energy_per_min'),
]

for ax, measure in panels:
    sns.boxplot(
        x='workoutType',
        y=measure,
        data=workout_data,
        ax=ax)
    ax.set_title(measure)
    ax.xaxis.set_tick_params(rotation=45)

plt.suptitle('Workouts',fontsize='xx-large',fontweight='bold')

In [None]:
# Min / mean / max workout duration per workout type, ordered by longest max.
fig,ax = plt.subplots()

duration_stats = (
    workout_data
    .groupby('workoutType')['duration']
    .agg(['min','mean','max'])
    .sort_values('max',ascending=False)
)
duration_stats.plot(ax=ax)

ax.xaxis.set_tick_params(rotation=45)

# Reverse the legend so its order (max, mean, min) matches the lines top-down.
handles,labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1],labels[::-1])
ax.set_ylabel('workout duration')

ax.set_title('Workout Duration by Type')
# plt.show() rather than fig.show(): Figure.show() warns on non-GUI
# (notebook) backends, and plt.show() is what the earlier cells use.
plt.show()

In [None]:

# Raw environmental audio exposure readings over time (indexed by startDate).
env_audio = get_data('EnvironmentalAudioExposure')
env_audio.plot()




In [None]:

# Small multiples: one time-series panel per column of daily_df.
n_plots = len(daily_df.columns)
n_cols = 4

fig,axes = plt.subplots(
    ncols=n_cols, 
    nrows=math.ceil(n_plots/n_cols),
    figsize=(14,14),
    squeeze=False)  # keep axes 2-D even if only one row is needed

# Loop-invariant, so built once: daily_df reindexed onto a DatetimeIndex
# constructed from its own index.
d = daily_df.reindex(pd.DatetimeIndex(daily_df.index)).copy()

for idx, c in enumerate(daily_df):
    # Fill the grid row by row.
    row_ix, col_ix = divmod(idx, n_cols)
    ax = axes[row_ix][col_ix]

    d[c].plot(ax=ax, label=c)
    ax.set_title(c)

fig.tight_layout()
# plt.show() instead of fig.show(), which warns on non-GUI notebook backends.
plt.show()

In [None]:

# Small multiples: one kernel-density estimate per column of daily_df.
n_plots = len(daily_df.columns)
n_cols = 4

fig,axes = plt.subplots(
    ncols=n_cols, 
    nrows=math.ceil(n_plots/n_cols),
    figsize=(16,16),
    squeeze=False)  # keep axes 2-D even if only one row is needed

# Loop-invariant, so built once: daily_df reindexed onto a DatetimeIndex
# constructed from its own index.
d = daily_df.reindex(pd.DatetimeIndex(daily_df.index)).copy()

for idx, c in enumerate(daily_df):
    # Fill the grid row by row.
    row_ix, col_ix = divmod(idx, n_cols)
    ax = axes[row_ix][col_ix]

    sns.kdeplot(d[c], ax=ax)

    # Column labels can be tuples such as ('HeartRate', 'mean') when a metric
    # is aggregated with several statistics; `c + '...'` raises TypeError for
    # those, so format the label instead of concatenating.
    ax.set_title(f'{c} \nKDE PLOT')

fig.tight_layout()
# plt.show() instead of fig.show(), which warns on non-GUI notebook backends.
plt.show()

In [None]:
# Overlay one BodyFatPercentage density curve per calendar month (1-12),
# pooling the same month across all years present in daily_df.
month_numbers = pd.DatetimeIndex(daily_df.index).month

for month in range(1,13):
    rows_in_month = daily_df.loc[month_numbers == month]
    sns.kdeplot(rows_in_month['BodyFatPercentage'], label=f'month {month}')

# plt.title('')
plt.legend()

In [None]:
#PLOT ALL DISTRIBUTIONS BY MONTH 
# One panel per column of daily_df; each panel overlays twelve density
# curves, one per calendar month.

n_plots = len(daily_df.columns)
n_cols = 4

fig,axes = plt.subplots(
    ncols=n_cols, 
    nrows=math.ceil(n_plots/n_cols),
    figsize=(16,16),
    squeeze=False)  # keep axes 2-D even if only one row is needed

# Loop-invariant: month number of every row, computed once.
month_numbers = pd.DatetimeIndex(daily_df.index).month

for idx, c in enumerate(daily_df):
    # Fill the grid row by row.
    row_ix, col_ix = divmod(idx, n_cols)
    ax = axes[row_ix][col_ix]

    for month in range(1,13):
        # NOTE: `d` is a notebook-level name; once this cell finishes it holds
        # only the month-12 rows.
        d = daily_df.loc[month_numbers == month]
        sns.kdeplot(d[c], ax=ax)

    # Column labels can be tuples such as ('HeartRate', 'mean'); `c + '...'`
    # raises TypeError for those, so format the label instead of concatenating.
    ax.set_title(f'{c} \nKDE PLOT')

fig.tight_layout()
# plt.show() instead of fig.show(), which warns on non-GUI notebook backends.
plt.show()

In [None]:
# Heatmap of pairwise correlations between daily metrics, showing only the
# cells whose absolute correlation is at least `thresh`.
thresh = 0.4

fig,ax = plt.subplots(figsize=(16,16))

# Correlate the full daily table explicitly. The scratch name `d` is reassigned
# by other cells (most recently to a single-month subset), so reading it here
# would silently restrict the correlations to whatever it last held.
corr = daily_df.corr()

# True where |correlation| >= thresh; NaN correlations compare as False.
d_filt_corr = corr.abs() >= thresh

# Boolean-frame indexing leaves NaN in the masked-out cells, which the
# heatmap renders as blank.
sns.heatmap(corr[d_filt_corr], annot=True, ax=ax)

ax.grid()
ax.set_title('correlations\n')

POTENTIALLY INTERESTING:

BloodPressure Diastolic has an inverse relationship with VO2 max. Is VO2 max a fitness measure? Probably: it goes up as you get fitter, hence the relationship? Or is VO2 max calculated based on recovery time after workouts, so high blood pressure might make recovery slower?

Stand and step count correlated, no surprise here really. Days I barely move I also barely stand.

Audio event and blood pressure? warnings occur when I'm:
    - on the tube sometimes (work /stress?  /commute /exercise/higher heart rate related)
    - at a loud music event - higher heartrate/exercise dancing linked / alcohol
    - using a hand drier (not work related, more likely bars/pubs etc or traveling)
    - swimming/in water (exercise related)

In [None]:
#BLOOD PRESSURE
# Records from the '1byone Health' source, with calendar buckets attached:
#   day   - calendar date of startDate
#   week  - date of the Monday of that week (date minus weekday offset)
#   month - first day of that month
bp = record_data[record_data.sourceName == '1byone Health'].assign(
    day=lambda frame: frame['startDate'].dt.date,
    week=lambda frame: frame['startDate'].apply(lambda x:x.date() - pd.Timedelta(days=x.weekday())),
    month=lambda frame: pd.to_datetime(frame['startDate'].dt.strftime('%Y-%m-01')).dt.date,
)

#WEEKLY BP
# Empty frame indexed by every week start in range, so that weeks without
# readings still get a row after the concat below.
min_dt, max_dt = bp.week.min(), bp.week.max()
week_starts = pd.DataFrame(pd.date_range(min_dt, max_dt,freq='7d'))
daterange_full = week_starts.set_index(week_starts[0].dt.date).drop(columns=[0])

# Mean value per week, one column per record type.
bp_weekly_means = bp.groupby(['week','type'])['value'].mean().unstack()
bp_weekly = pd.concat([daterange_full, bp_weekly_means], axis=1)

bp_weekly.plot()

In [None]:
# RESTING HR
# Resting heart rate records from the watch, with calendar buckets attached:
#   day   - calendar date of startDate
#   week  - date of the Monday of that week (date minus weekday offset)
#   month - first day of that month
from_watch = record_data.sourceName == 'George’s Apple\xa0Watch'
is_resting_hr = record_data.type == 'RestingHeartRate'

rest_hr = record_data[from_watch & is_resting_hr].assign(
    day=lambda frame: frame['startDate'].dt.date,
    week=lambda frame: frame['startDate'].apply(lambda x:x.date() - pd.Timedelta(days=x.weekday())),
    month=lambda frame: pd.to_datetime(frame['startDate'].dt.strftime('%Y-%m-01')).dt.date,
)

#WEEKLY
# Empty frame indexed by every week start in range, so that weeks without
# readings still get a row after the concat below.
min_dt, max_dt = rest_hr.week.min(), rest_hr.week.max()
week_starts = pd.DataFrame(pd.date_range(min_dt, max_dt,freq='7d'))
daterange_full = week_starts.set_index(week_starts[0].dt.date).drop(columns=[0])

# Mean value per week, one column per record type.
rest_hr_weekly_means = rest_hr.groupby(['week','type'])['value'].mean().unstack()
rest_hr_weekly = pd.concat([daterange_full, rest_hr_weekly_means], axis=1)

rest_hr_weekly.plot()

In [None]:
# BODY MASS
# Body mass records, with calendar buckets attached:
#   day   - calendar date of startDate
#   week  - date of the Monday of that week (date minus weekday offset)
#   month - first day of that month
bm = record_data[record_data.type == 'BodyMass'].assign(
    day=lambda frame: frame['startDate'].dt.date,
    week=lambda frame: frame['startDate'].apply(lambda x:x.date() - pd.Timedelta(days=x.weekday())),
    month=lambda frame: pd.to_datetime(frame['startDate'].dt.strftime('%Y-%m-01')).dt.date,
)

#DAILY (the scaffold runs from the first to the last week start, one row per day)
min_dt, max_dt = bm.week.min(), bm.week.max()
all_days = pd.DataFrame(pd.date_range(min_dt, max_dt,freq='1d'))
daterange_full = all_days.set_index(all_days[0].dt.date).drop(columns=[0])

# Mean value per day, one column per record type.
bm_daily_means = bm.groupby(['day','type'])['value'].mean().unstack()
bm_daily = pd.concat([daterange_full, bm_daily_means], axis=1)

# Plot only the last 350 rows.
bm_daily.tail(350).plot()

BELOW IS INCORRECT - I'VE DEFINITELY HAD MANY ALERTS ABOVE 100 dB, but this indicates the max was 90-ish??

In [None]:
#AUDIO EXPOSURE EVENTS;
# HeadphoneAudioExposure: daily mean vs daily max of the recorded values.

ae = record_data[(
    record_data.type == 'HeadphoneAudioExposure')].copy()

# Calendar buckets: date, Monday of the week, first day of the month.
ae['day'] = ae['startDate'].dt.date
ae['week'] = ae['startDate'].apply(lambda x:x.date() - pd.Timedelta(days=x.weekday()))
ae['month'] = pd.to_datetime(ae['startDate'].dt.strftime('%Y-%m-01')).dt.date

#DAILY scaffold: one row per day between the first and the last week start,
# so days without readings still get a row after the concat below.
min_dt = ae.week.min()
max_dt = ae.week.max()
daterange_full = pd.DataFrame(pd.date_range(min_dt, max_dt,freq='1d'))
daterange_full = daterange_full.set_index(daterange_full[0].dt.date).drop(columns=[0])

# Group once and reuse the grouping for both statistics.
ae_by_day = ae.groupby(['day','type'])['value']

ae_mean_daily = pd.concat([
    daterange_full,
    ae_by_day.mean().unstack()
], axis=1)

ae_max_daily = pd.concat([
    daterange_full,
    ae_by_day.max().unstack()
], axis=1)

fig,ax = plt.subplots(figsize=(12,5))

# Plot only the last 400 rows of each series.
ae_mean_daily['HeadphoneAudioExposure'].tail(400).plot(label='mean', ax=ax)
ae_max_daily['HeadphoneAudioExposure'].tail(400).plot(label='max', ax=ax)

ax.legend()
# plt.show() instead of fig.show(), which warns on non-GUI notebook backends.
plt.show()


In [None]:
# Distribution of blood-pressure values per month, one box per record type.
# Relies on the `bp` frame (with its 'month' column) built in an earlier cell.
fig,ax = plt.subplots(figsize=(14,5))
sns.boxplot(
    x='month',
    y='value',
    hue='type',
    data=bp,
    ax=ax  # draw on the axes created above instead of the implicit current axes
)

In [None]:
# bp.groupby(['type','month'])['value'].mean()

In [None]:
# bp.groupby(['type','month'])['value'].mean().unstack().T.plot()

In [None]:
# bp.groupby(['type','startDate'])['value'].mean().unstack().T

# REAGG TO GET MAX BEFORE/INSTEAD OF THE MEAN AGG (useful for audio exposure etc)

In [None]:
# Second set of per-day aggregation rules, adding min/median/max for the
# metrics where the extremes matter.
# NOTE: this rebinds the notebook-level name `agg_types`; anything that looks
# it up afterwards sees this smaller mapping.
agg_types = {
    'BodyMass': 'mean',
    'LeanBodyMass': 'mean',
    'BodyFatPercentage': 'mean',
    'OxygenSaturation': ['mean', 'min'],
    'AppleStandHour': 'sum',
    'EnvironmentalAudioExposure': ['mean', 'median', 'max'],
    'HeadphoneAudioExposure': ['mean', 'median', 'max'],
    'VO2Max': 'mean',
    'HeartRate': ['mean', 'median', 'max', 'min'],
    'RestingHeartRate': 'mean',
    'WalkingHeartRateAverage': 'mean',
    'BloodPressureSystolic': 'mean',
    'BloodPressureDiastolic': 'mean',
    'HeartRateVariabilitySDNN': 'mean',
    'StepCount': 'sum',
    'AppleStandTime': 'sum',
    'SleepAnalysis': 'sum',
    'HighHeartRateEvent': 'sum',
    'AudioExposureEvent': 'sum',
    'HeadphoneAudioExposureEvent': 'sum',
}

# Work on a copy of the records with 'day' set to the calendar date of
# startDate, then aggregate every configured column per day in one pass.
re_agg = record_data.assign(day=record_data['startDate'].dt.date)
re_agg_daily = re_agg.groupby('day').agg(agg_types)

# PROPER HEART RATE FROM RAW DATA

In [None]:
# Heart rate max / mean / min at three granularities (day, week, month),
# one panel each, stacked vertically.
fig,axes = plt.subplots(nrows=3, figsize=(12,10))

# HeartRate records from 2021-07-01 onwards. The 'day', 'week' and 'month'
# columns are read from record_data here; they are not created in this cell.
d = record_data[(record_data['type']=='HeartRate') & (record_data['day'] >= pd.Timestamp('2021-07-01'))].copy()

axes[0].set_title('Heart Rate')

# (grouping column, legend prefix) per panel, top to bottom.
granularities = [('day', 'daily'), ('week', 'weekly'), ('month', 'monthly')]

for ax, (period, prefix) in zip(axes, granularities):
    values_by_period = d.groupby(period)['value']
    values_by_period.max().plot(label=f'{prefix} max', ax=ax, color='red', alpha=0.3)
    values_by_period.mean().plot(label=f'{prefix} mean', ax=ax, color='black')
    values_by_period.min().plot(label=f'{prefix} min', ax=ax, color='blue', alpha=0.3)
    ax.legend()
    ax.set_ylim(0)  # anchor the y-axis at zero on every panel

fig.tight_layout()
# plt.show() instead of fig.show(), which warns on non-GUI notebook backends.
plt.show()

In [None]:
# For each granularity and statistic, print how much that per-period
# statistic varies (standard deviation across periods) in the frame `d`.
for period in ('day', 'week', 'month'):
    values_by_period = d.groupby(period)['value']
    for agg_type in ('max', 'mean', 'min'):
        spread = values_by_period.agg(agg_type).std()
        print(f"{period} {agg_type} | std: {spread:.1f}")