In [1]:
import pandas as pd
import numpy as np

import json
from pandas.io.json import json_normalize

import datetime
from pytz import timezone

from os import listdir

Fitbit data arrives broken up into a series of JSON files, each containing a different range of dates and the corresponding values on a type of metric (exercise, heart rate, sleep, etc.). 

To speed construction of the data into a single dataframe, the following code pulls similarly labeled files (such as all of those with 'steps' in the filename), normalizes the JSON into a 'flat' dataframe and each of the desired values are appended to lists. When each file has been read, the lists are zipped into a single pandas dataframe.

Additional processing steps, such as converting time zones to local (from UTC), ensuring the dates are in a consistent format (for matching the dataframes by date), and converting some metrics to more readily interpretable forms (milliseconds to seconds).

## Daily Steps Taken

In [2]:
dir_path = 'health/Data/'
matching_phrase = 'steps-'

date = []
value = []

for file in listdir(dir_path):
    if matching_phrase in file:
        with open(dir_path + file) as f:
            data = json.load(f)
            df = pd.json_normalize(data)
            df.columns = ['date', 'value']

            for i in df['date']:
                date.append(i)

            for i in df['value']:
                value.append(i)

# zip lists into single dataframe
df = pd.DataFrame(list(zip(date, value)), 
   columns =['date', 'value'])

# convert time data to local timezone (CST)
df['date'] = pd.to_datetime(df['date']).dt.tz_localize('America/Chicago', 
                                                       ambiguous = 'NaT',
                                                       nonexistent='shift_forward')

# remove time from date for grouping
df['day'] = pd.to_datetime(df['date']).dt.date

# convert value to numeric
df['value'] = pd.to_numeric(df['value'], errors = 'coerce')

Steps data from Fitbit come in minute intervals. For analysis, only the daily level is of interest so the dataframe to be returned needs these values aggregated by date.

In [3]:
steps = df.copy()

steps = steps.groupby(["day"]).agg(steps=("value","sum")).reset_index()

steps = steps.rename(columns={'day': 'date'})

steps

Unnamed: 0,date,steps
0,2018-02-09,10
1,2018-02-10,7964
2,2018-02-11,7582
3,2018-02-12,12535
4,2018-02-13,18006
...,...,...
625,2019-11-04,0
626,2019-11-05,32
627,2019-11-06,0
628,2019-11-07,0


## Daily Average Resting Heart Rate

In [None]:
matching_phrase = 'heart'

date = []
value = []

for file in listdir(dir_path):
    if matching_phrase in file:
        with open(dir_path + file) as f:
            data = json.load(f)
            df = pd.json_normalize(data)
            # this json has four columns and the JSON is formatted differently than the steps data
            df.columns = ['date_utc', 'date', 'rest_avg_hr', 'error']

            for i in df['date']:
                date.append(i)

            for i in df['rest_avg_hr']:
                value.append(i)

df = pd.DataFrame(list(zip(date, value)), 
   columns =['date', 'rest_avg_hr'])

# gather only dates with data
heart_rate = df.copy()

heart_rate = heart_rate[heart_rate['date'].notna()]

heart_rate['date'] = pd.to_datetime(heart_rate['date']).dt.date

In [None]:
heart_rate

## Daily Sleep Data

In [None]:
matching_phrase = 'sleep'

date = []
start_time = []
end_time = []
duration = []
asleep_min = []
awake_min = []
rem_min = []
rem_30_day_avg = []

for file in listdir(dir_path):
    if matching_phrase in file:
        with open(dir_path + file) as f:
            data = json.load(f)
            df = pd.json_normalize(data)
            # this json has several levels of nested data, select only those desired for analysis later
            df = df[['dateOfSleep', 'startTime', 'endTime','duration','minutesAsleep','minutesAwake',
               'levels.summary.rem.minutes','levels.summary.rem.thirtyDayAvgMinutes']]
            
            for i in df['dateOfSleep']:
                date.append(i)
                
            for i in df['startTime']:
                start_time.append(i)
                
            for i in df['endTime']:
                end_time.append(i)
                
            for i in df['duration']:
                duration.append(i)
                
            for i in df['minutesAsleep']:
                asleep_min.append(i)
                
            for i in df['minutesAwake']:
                awake_min.append(i)

            for i in df['levels.summary.rem.minutes']:
                rem_min.append(i)
                
            for i in df['levels.summary.rem.thirtyDayAvgMinutes']:
                rem_30_day_avg.append(i)         
                
sleep = pd.DataFrame(list(zip(date, start_time, end_time, duration, asleep_min, awake_min, rem_min, rem_30_day_avg)), 
   columns =['date', 'sleep_start_time', 'sleep_end_time', 'sleep_duration', 'asleep_min', 'awake_min', 'rem_min', 'rem_30_day_avg'])

# duration is in ms (divide by 60000 to obtain minutes)
sleep['sleep_duration'] = sleep['sleep_duration'] / 60000

# ensure date formatted similarly
sleep['date'] = pd.to_datetime(sleep['date']).dt.date

In [None]:
sleep

## Combined Data

In [None]:
del(dfs)

In [None]:
from functools import reduce
dfs = [steps, heart_rate, sleep]
#df_final = reduce(lambda left,right: pd.merge(left,right,on='date'), dfs)
#df_final

In [None]:
[dfs.set_index('date') for df in dfs]
pd.DataFrame().join(dfs, on='date', how="outer")

In [None]:
#dfs = [df1, df2, df3]
[dfz.set_index('date') for df in dfz]
dfz = dfz[0].join(dfz[1:])
dfz

In [None]:
fitbit_df = pd.merge(steps, heart_rate, on='date', how = 'left')

In [None]:
fitbit_df = pd.merge(fitbit_df, sleep, on='date', how = 'left')

In [None]:
fitbit_df

In [None]:
fitbit_df.to_csv('fitbit_clean.csv', index=False)