### Contents

- [Header](#Header)
- [Functions](#Functions)


- [Import Json and Create df](#Import-Json-and-Create-df)
- [Verify Data](#Verify-Data)
- [Output Data](#Output-Data)





### Header

In [1]:
# import libraries

# maths
import scipy.stats as stats
import numpy as np
import pandas as pd
#from pandas.api.types import is_numeric_dtype

# visual
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

# html
from IPython.display import Image
from IPython.core.display import HTML

# web
import json

# others
from tqdm import tqdm
from datetime import datetime
#import re
import os

In [2]:
# file paths

raw_path = '../../data/0_raw/fitrec/' 
input_path = '../../data/1_input/fitrec/'
clean_path = '../../data/2_clean/fitrec/' 
preprocess_path = '../../data/3_preprocess/fitrec/' 
output_path = '../../data/4_output/fitrec/' 

### Functions

In [3]:
# source: https://help.fitbit.com/articles/en_US/Help_article/1565

# peak zone: hr >= 85%
# cardio zone: 70% <= hr < 85%
# fat burn zone: 50% <= hr < 69%
# out of zone: hr < 50%

avg_age = 40

max_hr = 220 - avg_age
peak_hr = int(max_hr * 0.85)
cardio_hr = int(max_hr * 0.70)
fatburn_hr = int(max_hr * 0.50)

print('max_hr',max_hr)
print('peak_hr',peak_hr)
print('cardio_hr',cardio_hr)
print('fatburn_hr',fatburn_hr)

max_hr 180
peak_hr 153
cardio_hr 125
fatburn_hr 90


In [4]:
# def compute_hr_zone(heart_rate):
    
#     zone = [0,0,0,0]
    
#     for hr in heart_rate:
        
#         if hr < fatburn_hr:
#             zone[0] += 1
#         elif hr < cardio_hr:
#             zone[1] += 1
#         elif hr < peak_hr:
#             zone[2] += 1
#         else:
#             zone[3] += 1
            
#     total = sum(zone)
    
#     zone[0] /= round(total,3)
#     zone[1] /= round(total,3)
#     zone[2] /= round(total,3)
#     zone[3] /= round(total,3)
                      
#     return zone           

In [5]:
def compute_hr_zone(heart_rate,time_diff):
    
    zone = [0,0,0,0]
    
    for idx,hr in enumerate(heart_rate):       
        
        td = time_diff[idx]
        print(idx,hr,td)        
        
        if hr < fatburn_hr:
            zone[0] += td

        elif hr < cardio_hr:
            zone[1] += td
        elif hr < peak_hr:
            zone[2] += td
        else:
            zone[3] += td
            
    total = sum(zone)
    
    zone[0] /= round(total,3)
    zone[1] /= round(total,3)
    zone[2] /= round(total,3)
    zone[3] /= round(total,3)
                      
    return zone           

In [6]:
def create_df(data):
    
    #cols = ['id','userId','gender','timestamp','sport','speed','heart_rate','latitude','longitude','altitude']
    df = pd.DataFrame()
    
    index = 0
    
    data_pts = 450 # omit last 50 points (exercise cooldown threshold)
    #data_pts = 500 # all points 

    for key,val in data.items():
    
        #print(index)

        df.at[index,'id'] = key
        df.at[index,'userId'] = val['userId']
        df.at[index,'gender'] = val['gender']
        df.at[index,'sport'] = val['sport']
        df.at[index,'url'] = val['url']

        timestamp = val['timestamp'][:data_pts]
        heart_rate = val['heart_rate'][:data_pts]
        latitude = val['latitude'][:data_pts]
        longitude = val['longitude'][:data_pts]
        altitude = val['altitude'][:data_pts]
        time_diff = val['time_diff'][:data_pts]

        df.at[index,'time_start'] = np.min(timestamp)
        df.at[index,'time_end'] = np.max(timestamp)
        df.at[index,'time_dur'] = (np.max(timestamp) - np.min(timestamp))/60 # minutes

        df.at[index,'lat_start'] = latitude[0]
        df.at[index,'lon_start'] = longitude[0]
        
        df.at[index,'lat_end'] = latitude[-1]
        df.at[index,'lon_end'] = longitude[-1]

        df.at[index,'alt_avg'] = np.mean(altitude)
        df.at[index,'alt_min'] = np.min(altitude)
        df.at[index,'alt_05'] = np.quantile(altitude,0.05)
        df.at[index,'alt_25'] = np.quantile(altitude,0.25)
        df.at[index,'alt_75'] = np.quantile(altitude,0.75)
        df.at[index,'alt_95'] = np.quantile(altitude,0.95)
        df.at[index,'alt_max'] = np.max(altitude)

        df.at[index,'hr_avg'] = np.mean(heart_rate)
        df.at[index,'hr_min'] = np.min(heart_rate)
        df.at[index,'hr_05'] = np.quantile(heart_rate,0.05)
        df.at[index,'hr_25'] = np.quantile(heart_rate,0.25)
        df.at[index,'hr_75'] = np.quantile(heart_rate,0.75)
        df.at[index,'hr_95'] = np.quantile(heart_rate,0.95)
        df.at[index,'hr_max'] = np.max(heart_rate)
        
        #hr_zone = compute_hr_zone(heart_rate)
        hr_zone = compute_hr_zone(heart_rate,time_diff)
        #print(hr_zone)
        
        df.at[index,'hr_outof'] = hr_zone[0]
        df.at[index,'hr_fatburn'] = hr_zone[1]
        df.at[index,'hr_cardio'] = hr_zone[2]
        df.at[index,'hr_peak'] = hr_zone[3]

        if 'speed' in val.keys():
            speed = val['speed'][:data_pts]

            df.at[index,'spd_avg'] = np.mean(speed)
            df.at[index,'spd_min'] = np.min(speed)
            df.at[index,'spd_05'] = np.quantile(speed,0.05)
            df.at[index,'spd_25'] = np.quantile(speed,0.25)
            df.at[index,'spd_75'] = np.quantile(speed,0.75)
            df.at[index,'spd_95'] = np.quantile(speed,0.95)
            df.at[index,'spd_max'] = np.max(speed)
            
        if 'distance' in val.keys():
            df.at[index,'impute'] = 1
        else:
            df.at[index,'impute'] = 0

        index += 1
        
    return df

### Import Json and Create df

In [7]:
# select json file for splitting

file_index = 4

input_filepath = [
            "endomondoHR/",
            "endomondoMeta/",    
            "endomondoHR_proper/",
            "endomondoHR_proper_dist_spd/",
            "endomondoHR_proper_dist_spd_time/", 
            "processed_endomondoHR_proper/",
            "processed_endomondoHR_proper_interpolate/"        
            ]


output_filepath = [
            "endomondoHR_summary.csv",
            "endomondoMeta_summary.csv",    
            "endomondoHR_proper_summary.csv",
            "endomondoHR_proper_dist_spd_summary.csv",
            "endomondoHR_proper_dist_spd_time_summary.csv",    
            "processed_endomondoHR_proper_summary.csv",
            "processed_endomondoHR_proper_interpolate_summary.csv"        
            ]

# init filepaths

in_path = input_path + input_filepath[file_index]
out_path = clean_path + output_filepath[file_index]

print(in_path)
print(out_path)

../../data/1_input/fitrec/endomondoHR_proper_dist_spd_time/
../../data/2_clean/fitrec/endomondoHR_proper_dist_spd_time_summary.csv


In [8]:
start_time = datetime.now()

df_all = pd.DataFrame()

files = os.listdir(in_path)

for file in files:
    
    #print(file)
    path = in_path + file
    
    with open(path, "r") as read_file:
        
        # load each json file
        data = json.load(read_file)
        
        # create summary for json
        df_json = create_df(data)
        
        # combine summary to df_all
        df_all = df_all.append(df_json,ignore_index=True)
        
end_time = datetime.now()
cell_time = end_time - start_time
print('')
print('cell completed in:',cell_time)

0 100 0 1
1 111 8 1
2 120 11 1
3 119 13 1
4 120 16 1
5 116 6 1
6 125 23 2
7 128 16 2
8 131 23 2
9 132 29 2
10 135 23 2
11 137 24 2
12 142 24 2
13 143 13 2
14 143 15 2
15 148 19 2
16 148 34 2
17 148 28 2
18 142 25 2
19 141 12 2
20 143 13 2
21 146 19 2
22 151 22 2
23 149 16 2
24 146 21 2
25 147 16 2
26 157 21 3
27 155 15 3
28 159 9 3
29 152 25 2
30 146 27 2
31 147 12 2
32 154 19 3
33 157 18 3
34 153 13 3
35 143 15 2
36 142 7 2
37 138 15 2
38 135 8 2
39 134 16 2
40 133 12 2
41 133 9 2
42 134 9 2
43 135 15 2
44 143 40 2
45 142 14 2
46 146 18 2
47 151 11 2
48 155 6 3
49 163 10 3
50 161 15 3
51 150 27 2
52 140 26 2
53 139 32 2
54 142 43 2
55 147 37 2
56 158 23 3
57 158 12 3
58 159 10 3
59 166 13 3
60 167 10 3
61 166 8 3
62 164 21 3
63 163 7 3
64 157 25 3
65 147 25 2
66 143 7 2
67 142 8 2
68 143 7 2
69 143 6 2
70 142 8 2
71 139 6 2
72 141 13 2
73 152 17 2
74 160 41 3
75 148 34 2
76 150 31 2
77 154 14 3
78 148 34 2
79 144 49 2
80 149 23 2
81 154 8 3
82 162 10 3
83 162 19 3
84 163 9 3
85 164 21

NameError: name 'stop' is not defined

### Verify Data

In [None]:
print(df_all.shape)
df_all.head()

### Output Data

In [None]:
# output combined df to csv

df_all.to_csv(out_path,index=False)