### Contents

- [Header](#Header)
- [Functions](#Functions)


- [Import Json and Create df](#Import-Json-and-Create-df)
- [Verify Data](#Verify-Data)
- [Output Data](#Output-Data)





### Header

In [1]:
# import libraries

# maths
import scipy.stats as stats
import numpy as np
import pandas as pd
#from pandas.api.types import is_numeric_dtype

# visual
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

# html
from IPython.display import Image
from IPython.core.display import HTML

# web
import json

# others
from tqdm import tqdm
from datetime import datetime
#import re
import os

In [2]:
# file paths

raw_path = '../../data/0_raw/fitrec/' 
input_path = '../../data/1_input/fitrec/'
clean_path = '../../data/2_clean/fitrec/' 
preprocess_path = '../../data/3_preprocess/fitrec/' 
output_path = '../../data/4_output/fitrec/' 

### Functions

In [3]:
# source: https://help.fitbit.com/articles/en_US/Help_article/1565

# peak zone: hr >= 85%
# cardio zone: 70% <= hr < 85%
# fat burn zone: 50% <= hr < 69%
# out of zone: hr < 50%

avg_age = 40

max_hr = 220 - avg_age
peak_hr = int(max_hr * 0.85)
cardio_hr = int(max_hr * 0.70)
fatburn_hr = int(max_hr * 0.50)

print('max_hr',max_hr)
print('peak_hr',peak_hr)
print('cardio_hr',cardio_hr)
print('fatburn_hr',fatburn_hr)

max_hr 180
peak_hr 153
cardio_hr 125
fatburn_hr 90


In [4]:
# def compute_hr_zone(heart_rate):
    
#     zone = [0,0,0,0]
    
#     for hr in heart_rate:
        
#         if hr < fatburn_hr:
#             zone[0] += 1
#         elif hr < cardio_hr:
#             zone[1] += 1
#         elif hr < peak_hr:
#             zone[2] += 1
#         else:
#             zone[3] += 1
            
#     total = sum(zone)
    
#     zone[0] /= round(total,3)
#     zone[1] /= round(total,3)
#     zone[2] /= round(total,3)
#     zone[3] /= round(total,3)
                      
#     return zone           

In [5]:
def compute_hr_zone(heart_rate,time_diff):
    
    zone = [0,0,0,0]
    
    for idx,hr in enumerate(heart_rate):       
        
        td = time_diff[idx]
        #print(idx,hr,td)        
        
        if hr < fatburn_hr:
            zone[0] += td

        elif hr < cardio_hr:
            zone[1] += td
        elif hr < peak_hr:
            zone[2] += td
        else:
            zone[3] += td
            
    total = sum(zone)
    
    zone[0] /= round(total,3)
    zone[1] /= round(total,3)
    zone[2] /= round(total,3)
    zone[3] /= round(total,3)
                      
    return zone           

In [6]:
def create_df(data):
    
    #cols = ['id','userId','gender','timestamp','sport','speed','heart_rate','latitude','longitude','altitude']
    df = pd.DataFrame()
    
    index = 0
    
    data_pts = 450 # omit last 50 points (exercise cooldown threshold)
    #data_pts = 500 # all points 

    for key,val in data.items():
    
        #print(index)

        df.at[index,'id'] = key
        df.at[index,'userId'] = val['userId']
        df.at[index,'gender'] = val['gender']
        df.at[index,'sport'] = val['sport']
        df.at[index,'url'] = val['url']

        timestamp = val['timestamp'][:data_pts]
        heart_rate = val['heart_rate'][:data_pts]
        latitude = val['latitude'][:data_pts]
        longitude = val['longitude'][:data_pts]
        altitude = val['altitude'][:data_pts]
        time_diff = val['time_diff'][:data_pts]

        df.at[index,'time_start'] = np.min(timestamp)
        df.at[index,'time_end'] = np.max(timestamp)
        df.at[index,'time_dur'] = (np.max(timestamp) - np.min(timestamp))/60 # minutes

        df.at[index,'lat_start'] = latitude[0]
        df.at[index,'lon_start'] = longitude[0]
        
        df.at[index,'lat_end'] = latitude[-1]
        df.at[index,'lon_end'] = longitude[-1]

        df.at[index,'alt_avg'] = np.mean(altitude)
        df.at[index,'alt_min'] = np.min(altitude)
        df.at[index,'alt_05'] = np.quantile(altitude,0.05)
        df.at[index,'alt_25'] = np.quantile(altitude,0.25)
        df.at[index,'alt_75'] = np.quantile(altitude,0.75)
        df.at[index,'alt_95'] = np.quantile(altitude,0.95)
        df.at[index,'alt_max'] = np.max(altitude)

        df.at[index,'hr_avg'] = np.mean(heart_rate)
        df.at[index,'hr_min'] = np.min(heart_rate)
        df.at[index,'hr_05'] = np.quantile(heart_rate,0.05)
        df.at[index,'hr_25'] = np.quantile(heart_rate,0.25)
        df.at[index,'hr_75'] = np.quantile(heart_rate,0.75)
        df.at[index,'hr_95'] = np.quantile(heart_rate,0.95)
        df.at[index,'hr_max'] = np.max(heart_rate)
        
        #hr_zone = compute_hr_zone(heart_rate)
        hr_zone = compute_hr_zone(heart_rate,time_diff)
        #print(hr_zone)
        
        df.at[index,'hr_outof'] = hr_zone[0]
        df.at[index,'hr_fatburn'] = hr_zone[1]
        df.at[index,'hr_cardio'] = hr_zone[2]
        df.at[index,'hr_peak'] = hr_zone[3]

        if 'speed' in val.keys():
            speed = val['speed'][:data_pts]

            df.at[index,'spd_avg'] = np.mean(speed)
            df.at[index,'spd_min'] = np.min(speed)
            df.at[index,'spd_05'] = np.quantile(speed,0.05)
            df.at[index,'spd_25'] = np.quantile(speed,0.25)
            df.at[index,'spd_75'] = np.quantile(speed,0.75)
            df.at[index,'spd_95'] = np.quantile(speed,0.95)
            df.at[index,'spd_max'] = np.max(speed)
            
        if 'distance' in val.keys():
            df.at[index,'impute'] = 1
        else:
            df.at[index,'impute'] = 0

        index += 1
        
    return df

### Import Json and Create df

In [7]:
# select json file for splitting

file_index = 4

input_filepath = [
            "endomondoHR/",
            "endomondoMeta/",    
            "endomondoHR_proper/",
            "endomondoHR_proper_dist_spd/",
            "endomondoHR_proper_dist_spd_time/", 
            "processed_endomondoHR_proper/",
            "processed_endomondoHR_proper_interpolate/"        
            ]


output_filepath = [
            "endomondoHR_summary.csv",
            "endomondoMeta_summary.csv",    
            "endomondoHR_proper_summary.csv",
            "endomondoHR_proper_dist_spd_summary.csv",
            "endomondoHR_proper_dist_spd_time_summary.csv",    
            "processed_endomondoHR_proper_summary.csv",
            "processed_endomondoHR_proper_interpolate_summary.csv"        
            ]

# init filepaths

in_path = input_path + input_filepath[file_index]
out_path = clean_path + output_filepath[file_index]

print(in_path)
print(out_path)

../../data/1_input/fitrec/endomondoHR_proper_dist_spd_time/
../../data/2_clean/fitrec/endomondoHR_proper_dist_spd_time_summary.csv


In [8]:
start_time = datetime.now()

df_all = pd.DataFrame()

files = os.listdir(in_path)

for file in files:
    
    #print(file)
    path = in_path + file
    
    with open(path, "r") as read_file:
        
        # load each json file
        data = json.load(read_file)
        
        # create summary for json
        df_json = create_df(data)
        
        # combine summary to df_all
        df_all = df_all.append(df_json,ignore_index=True)
        
end_time = datetime.now()
cell_time = end_time - start_time
print('')
print('cell completed in:',cell_time)


cell completed in: 0:26:52.028811


### Verify Data

In [9]:
print(df_all.shape)
df_all.head()

(167783, 38)


Unnamed: 0,id,userId,gender,sport,url,time_start,time_end,time_dur,lat_start,lon_start,...,hr_cardio,hr_peak,spd_avg,spd_min,spd_05,spd_25,spd_75,spd_95,spd_max,impute
0,1,10921915.0,male,bike,https://www.endomondo.com/users/10921915/worko...,1408899000.0,1408905000.0,112.133333,60.173349,24.64977,...,0.507878,0.484096,26.152328,6.8652,13.9491,19.9044,31.7025,41.02704,57.4596,1.0
1,2,10921915.0,male,bike,https://www.endomondo.com/users/10921915/worko...,1408222000.0,1408226000.0,65.15,60.173248,24.649855,...,0.612177,0.376567,27.636272,9.0792,15.47802,21.9303,32.4558,43.17102,54.7704,1.0
2,3,10921915.0,male,bike,https://www.endomondo.com/users/10921915/worko...,1407858000.0,1407865000.0,100.833333,60.173262,24.649957,...,0.790413,0.123802,26.159896,4.0464,14.4144,20.7909,31.05,39.8304,59.2092,1.0
3,4,10921915.0,male,bike,https://www.endomondo.com/users/10921915/worko...,1407432000.0,1407436000.0,65.633333,60.173286,24.649874,...,0.674708,0.294058,27.135904,4.3272,15.84306,21.3957,31.8411,42.42096,57.9852,1.0
4,5,10921915.0,male,bike (transport),https://www.endomondo.com/users/10921915/worko...,1406909000.0,1406911000.0,20.333333,60.173293,24.649798,...,0.07623,0.918033,31.241183,0.0,18.3528,24.47906,37.426941,46.51405,51.635122,1.0


### Output Data

In [10]:
# output combined df to csv

df_all.to_csv(out_path,index=False)