In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Prerequisite: run the fitbit_data_exploration and fitbit_get_daily_hr scripts before this notebook.

In [2]:
# Load the cleaned heart-rate readings and parse the 'Time' column into datetimes.
hr_df = (
    pd.read_csv('../../data/public_data/cleaned/fitbit/cleaned_heart_rate.csv')
    .assign(Time=lambda readings: pd.to_datetime(readings['Time']))
)

In [3]:
# Resample the heart-rate readings (recorded roughly every 15-20 s) to one mean value per minute
def get_mean_min_vals(group):
    """Average the 'Value' column of ``group`` within 1-minute bins of 'Time'.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a datetime 'Time' column and a numeric 'Value' column.

    Returns
    -------
    pandas.DataFrame
        Single-column ('Value') frame indexed by the minute bins of 'Time'.
    """
    minute_bins = group.resample('1Min', on='Time')
    minute_means = minute_bins['Value'].mean()
    return minute_means.to_frame()
    
# Apply the per-minute averaging separately to each user's readings.
hr_per_min_df = (
    hr_df
    .groupby('Id')
    .apply(get_mean_min_vals)
)
hr_per_min_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Value
Id,Time,Unnamed: 2_level_1
2022484408,2016-04-01 07:54:00,99.6
2022484408,2016-04-01 07:55:00,110.111111
2022484408,2016-04-01 07:56:00,92.555556
2022484408,2016-04-01 07:57:00,90.4
2022484408,2016-04-01 07:58:00,97.875


In [4]:
# Attach the per-minute activity intensity to the per-minute heart rate.
# Calories, METs and Steps are not needed here, so they are dropped on load;
# the remaining frame is keyed by (Id, ActivityMinute) for the merge.
mins_df = (
    pd.read_csv('../../data/public_data/cleaned/fitbit/cleaned_activity_minutes.csv')
    .drop(columns=['Calories', 'METs', 'Steps'])
    .assign(ActivityMinute=lambda minutes: pd.to_datetime(minutes['ActivityMinute']))
    .set_index(['Id', 'ActivityMinute'])
)

# Give the heart-rate index the same level names as mins_df.
hr_per_min_df.index.names = ['Id', 'ActivityMinute']

# Index-on-index merge, then keep only 'Id' as the index.
hr_intensity_df = (
    pd.merge(hr_per_min_df, mins_df, left_index=True, right_index=True)
    .reset_index()
    .set_index('Id')
)
hr_intensity_df.head()

Unnamed: 0_level_0,ActivityMinute,Value,Intensity
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022484408,2016-04-01 07:54:00,99.6,1
2022484408,2016-04-01 07:55:00,110.111111,1
2022484408,2016-04-01 07:56:00,92.555556,1
2022484408,2016-04-01 07:57:00,90.4,0
2022484408,2016-04-01 07:58:00,97.875,1


In [5]:
# Count missing entries per column.
hr_intensity_df.isnull().sum()

ActivityMinute         0
Value             293706
Intensity              0
dtype: int64

In [6]:
# Drop rows with missing values: minutes without a heart-rate reading
# are of no use for heart-rate indicators.
hr_intensity_df = hr_intensity_df.dropna(axis=0, how='any')
hr_intensity_df.isnull().sum()

ActivityMinute    0
Value             0
Intensity         0
dtype: int64

In [7]:
def get_max_hr(group):
    """Return the largest entry of the 'Value' column of ``group``."""
    hr_values = group['Value']
    return hr_values.max()

def get_min_hr(group):
    """Return the smallest entry of the 'Value' column of ``group``."""
    hr_values = group['Value']
    return hr_values.min()

# Group the per-minute heart rates by user and calendar day once, and use the
# built-in grouped max/min instead of calling a Python function per group.
daily_hr_values = hr_intensity_df.groupby(
    ['Id', hr_intensity_df['ActivityMinute'].dt.date]
)['Value']

max_hr = daily_hr_values.max().rename('Max Hr')
max_hr_df = max_hr.to_frame()

min_hr = daily_hr_values.min().rename('Min Hr')
min_hr_df = min_hr.to_frame()

# Both frames share the same (Id, ActivityMinute) index, so merge on it.
min_max_hr_df = pd.merge(max_hr_df, min_hr_df, left_index=True, right_index=True)

min_max_hr_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Max Hr,Min Hr
Id,ActivityMinute,Unnamed: 2_level_1,Unnamed: 3_level_1
2022484408,2016-04-01,174.600000,55.500000
2022484408,2016-04-02,98.333333,50.333333
2022484408,2016-04-03,124.333333,50.000000
2022484408,2016-04-04,147.375000,55.000000
2022484408,2016-04-05,140.500000,54.333333
...,...,...,...
8877689391,2016-05-08,128.600000,55.571429
8877689391,2016-05-09,173.521739,52.500000
8877689391,2016-05-10,137.666667,52.142857
8877689391,2016-05-11,176.565217,49.555556


In [8]:
def get_daily_total_intensity(group):
    """Return the sum of the 'Intensity' column of ``group``."""
    intensity_values = group['Intensity']
    return intensity_values.sum()

# Sum the per-minute 'Intensity' values for each user and calendar day with the
# built-in grouped sum rather than calling a Python function per group.
total_daily_intensity = hr_intensity_df.groupby(
    ['Id', hr_intensity_df['ActivityMinute'].dt.date]
)['Intensity'].sum()
# The grouped sum carries the column name; clear it so the Series is unnamed.
total_daily_intensity.name = None
total_daily_intensity

Id          ActivityMinute
2022484408  2016-04-01        459
            2016-04-02        236
            2016-04-03        398
            2016-04-04        455
            2016-04-05        480
                             ... 
8877689391  2016-05-08        304
            2016-05-09        474
            2016-05-10        299
            2016-05-11        501
            2016-05-12        188
Length: 469, dtype: int64

In [9]:
# Name the Series so that it becomes a labelled column when converted to a frame.
total_daily_intensity = total_daily_intensity.rename('Daily Total Intensity')
total_daily_intensity_df = total_daily_intensity.to_frame()
total_daily_intensity_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Daily Total Intensity
Id,ActivityMinute,Unnamed: 2_level_1
2022484408,2016-04-01,459
2022484408,2016-04-02,236
2022484408,2016-04-03,398
2022484408,2016-04-04,455
2022484408,2016-04-05,480
...,...,...
8877689391,2016-05-08,304
8877689391,2016-05-09,474
8877689391,2016-05-10,299
8877689391,2016-05-11,501


In [10]:
# Combine the daily max/min heart rate with the daily intensity total,
# matching rows on the shared index.
min_max_hr_intensity_df = pd.merge(
    left=min_max_hr_df,
    right=total_daily_intensity_df,
    left_index=True,
    right_index=True,
)
min_max_hr_intensity_df.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Max Hr,Min Hr,Daily Total Intensity
Id,ActivityMinute,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022484408,2016-04-01,174.6,55.5,459
2022484408,2016-04-02,98.333333,50.333333,236
2022484408,2016-04-03,124.333333,50.0,398
2022484408,2016-04-04,147.375,55.0,455
2022484408,2016-04-05,140.5,54.333333,480


In [11]:
# Count missing entries per column of the merged daily frame.
min_max_hr_intensity_df.isnull().sum()

Max Hr                   0
Min Hr                   0
Daily Total Intensity    0
dtype: int64

In [12]:
# Move the index levels back into ordinary columns (they are kept, not dropped)
# and relabel the date column, which now holds days rather than minutes.
min_max_hr_intensity_df = (
    min_max_hr_intensity_df
    .reset_index()
    .rename(columns={'ActivityMinute': 'ActivityDay'})
)
min_max_hr_intensity_df.head()

Unnamed: 0,Id,ActivityDay,Max Hr,Min Hr,Daily Total Intensity
0,2022484408,2016-04-01,174.6,55.5,459
1,2022484408,2016-04-02,98.333333,50.333333,236
2,2022484408,2016-04-03,124.333333,50.0,398
3,2022484408,2016-04-04,147.375,55.0,455
4,2022484408,2016-04-05,140.5,54.333333,480


In [13]:
# Summary statistics for the numeric columns of the daily frame.
min_max_hr_intensity_df.describe()

Unnamed: 0,Id,Max Hr,Min Hr,Daily Total Intensity
count,469.0,469.0,469.0,469.0
mean,5580065000.0,132.708367,54.796699,345.328358
std,2088188000.0,21.663556,7.555009,167.582773
min,2022484000.0,76.714286,36.8,0.0
25%,4388162000.0,118.625,50.333333,232.0
50%,5577150000.0,129.125,54.166667,359.0
75%,6962181000.0,146.416667,58.5,452.0
max,8877689000.0,202.166667,83.571429,904.0


In [14]:
# Write the daily min/max heart rate and intensity table next to the other cleaned Fitbit files.
# save_folder_path already ends with a separator, so the file name is appended
# without a leading '/' to avoid a doubled slash in the output path.
save_folder_path = '../../data/public_data/cleaned/fitbit/'
min_max_hr_intensity_df.to_csv(save_folder_path + 'cleaned_daily_min_max_intensity.csv', index=False)