### Contents

- [EDA](#EDA)

### Header

In [1]:
# import libraries

# maths
import scipy.stats as stats
import numpy as np
import pandas as pd
#from pandas.api.types import is_numeric_dtype

# visual
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

# html
from IPython.display import Image
from IPython.core.display import HTML

# web
import json

# others
from tqdm import tqdm
from datetime import datetime
#import re
import os

In [2]:
# file paths

raw_path = '../../data/0_raw/fitrec/' 
input_path = '../../data/1_input/fitrec/'
clean_path = '../../data/2_clean/fitrec/' 
preprocess_path = '../../data/3_preprocess/fitrec/' 
output_path = '../../data/4_output/fitrec/' 

### Import Data

In [3]:
# import clean data

# file = 'endomondoHR_proper_head_1000.json'
# file = 'endomondoHR_proper_head_10000.json'

# path = input_path + file

# with open(path, "r") as read_file:
#     data = json.load(read_file)

### Functions

In [4]:
# source: https://help.fitbit.com/articles/en_US/Help_article/1565

# peak zone: hr >= 85%
# cardio zone: 70% <= hr < 85%
# fat burn zone: 50% <= hr < 69%
# out of zone: hr < 50%

avg_age = 40

max_hr = 220 - avg_age
peak_hr = int(max_hr * 0.85)
cardio_hr = int(max_hr * 0.70)
fatburn_hr = int(max_hr * 0.50)

print('max_hr',max_hr)
print('peak_hr',peak_hr)
print('cardio_hr',cardio_hr)
print('fatburn_hr',fatburn_hr)

max_hr 180
peak_hr 153
cardio_hr 125
fatburn_hr 90


In [5]:
def compute_hr_zone(heart_rate):
    
    zone = [0,0,0,0]
    
    for hr in heart_rate:
        
        if hr < fatburn_hr:
            zone[0] += 1
        elif hr < cardio_hr:
            zone[1] += 1
        elif hr < peak_hr:
            zone[2] += 1
        else:
            zone[3] += 1
            
    total = sum(zone)
    
    zone[0] /= round(total,3)
    zone[1] /= round(total,3)
    zone[2] /= round(total,3)
    zone[3] /= round(total,3)
                      
    return zone           

In [6]:
def create_df(data):
    
    #cols = ['id','userId','gender','timestamp','sport','speed','heart_rate','latitude','longitude','altitude']
    df = pd.DataFrame()
    
    index = 0
    
    data_pts = 450 # omit last 50 points (exercise cooldown threshold)
    #data_pts = 500 # all points 

    for key,val in data.items():
    
        #print(index)

        df.at[index,'id'] = key
        df.at[index,'userId'] = val['userId']
        df.at[index,'gender'] = val['gender']
        df.at[index,'sport'] = val['sport']

        timestamp = val['timestamp'][:data_pts]
        heart_rate = val['heart_rate'][:data_pts]
        latitude = val['latitude'][:data_pts]
        longitude = val['longitude'][:data_pts]
        altitude = val['altitude'][:data_pts]

        df.at[index,'time_start'] = np.min(timestamp)
        df.at[index,'time_end'] = np.max(timestamp)
        df.at[index,'time_dur'] = (np.max(timestamp) - np.min(timestamp))/60 # minutes

        df.at[index,'lat_start'] = latitude[0]
        df.at[index,'lon_start'] = longitude[0]
        
        df.at[index,'lat_end'] = latitude[-1]
        df.at[index,'lon_end'] = longitude[-1]

        df.at[index,'alt_avg'] = np.mean(altitude)
        df.at[index,'alt_min'] = np.min(altitude)
        df.at[index,'alt_05'] = np.quantile(altitude,0.05)
        df.at[index,'alt_25'] = np.quantile(altitude,0.25)
        df.at[index,'alt_75'] = np.quantile(altitude,0.75)
        df.at[index,'alt_95'] = np.quantile(altitude,0.95)
        df.at[index,'alt_max'] = np.max(altitude)

        df.at[index,'hr_avg'] = np.mean(heart_rate)
        df.at[index,'hr_min'] = np.min(heart_rate)
        df.at[index,'hr_05'] = np.quantile(heart_rate,0.05)
        df.at[index,'hr_25'] = np.quantile(heart_rate,0.25)
        df.at[index,'hr_75'] = np.quantile(heart_rate,0.75)
        df.at[index,'hr_95'] = np.quantile(heart_rate,0.95)
        df.at[index,'hr_max'] = np.max(heart_rate)
        
        hr_zone = compute_hr_zone(heart_rate)
        #print(hr_zone)
        
        df.at[index,'hr_outof'] = hr_zone[0]
        df.at[index,'hr_fatburn'] = hr_zone[1]
        df.at[index,'hr_cardio'] = hr_zone[2]
        df.at[index,'hr_peak'] = hr_zone[3]

        if 'speed' in val.keys():
            speed = val['speed'][:data_pts]

            df.at[index,'spd_avg'] = np.mean(speed)
            df.at[index,'spd_min'] = np.min(speed)
            df.at[index,'spd_05'] = np.quantile(speed,0.05)
            df.at[index,'spd_25'] = np.quantile(speed,0.25)
            df.at[index,'spd_75'] = np.quantile(speed,0.75)
            df.at[index,'spd_95'] = np.quantile(speed,0.95)
            df.at[index,'spd_max'] = np.max(speed)
            
        if 'distance' in val.keys():
            df.at[index,'impute'] = 1
        else:
            df.at[index,'impute'] = 0

        index += 1
        
    return df

### Import Json and Create/Output df

In [7]:
# select json file for splitting

file_index = 3

input_filepath = [
            "endomondoHR/",
            "endomondoMeta/",    
            "endomondoHR_proper/",
            "endomondoHR_proper_dist_spd/",    
            "processed_endomondoHR_proper/",
            "processed_endomondoHR_proper_interpolate/"        
            ]


output_filepath = [
            "endomondoHR_summary.csv",
            "endomondoMeta_summary.csv",    
            "endomondoHR_proper_summary.csv",
            "endomondoHR_proper_dist_spd_summary.csv",    
            "processed_endomondoHR_proper_summary.csv",
            "processed_endomondoHR_proper_interpolate_summary.csv"        
            ]

# init filepaths

in_path = input_path + input_filepath[file_index]
out_path = clean_path + output_filepath[file_index]

print(in_path)
print(out_path)

../../data/1_input/fitrec/endomondoHR_proper_dist_spd/
../../data/2_clean/fitrec/endomondoHR_proper_dist_spd_summary.csv


In [8]:
start_time = datetime.now()

df_all = pd.DataFrame()

files = os.listdir(in_path)

for file in files:
    
    #print(file)
    path = in_path + file
    
    with open(path, "r") as read_file:
        
        # load each json file
        data = json.load(read_file)
        
        # create summary for json
        df_json = create_df(data)
        
        # combine summary to df_all
        df_all = df_all.append(df_json,ignore_index=True)
        
end_time = datetime.now()
cell_time = end_time - start_time
print('')
print('cell completed in:',cell_time)

endomondoHR_proper_dist_spd_0001.json
endomondoHR_proper_dist_spd_0002.json
endomondoHR_proper_dist_spd_0003.json
endomondoHR_proper_dist_spd_0004.json
endomondoHR_proper_dist_spd_0005.json
endomondoHR_proper_dist_spd_0006.json
endomondoHR_proper_dist_spd_0007.json
endomondoHR_proper_dist_spd_0008.json
endomondoHR_proper_dist_spd_0009.json
endomondoHR_proper_dist_spd_0010.json
endomondoHR_proper_dist_spd_0011.json
endomondoHR_proper_dist_spd_0012.json
endomondoHR_proper_dist_spd_0013.json
endomondoHR_proper_dist_spd_0014.json
endomondoHR_proper_dist_spd_0015.json
endomondoHR_proper_dist_spd_0016.json
endomondoHR_proper_dist_spd_0017.json
endomondoHR_proper_dist_spd_0018.json
endomondoHR_proper_dist_spd_0019.json
endomondoHR_proper_dist_spd_0020.json
endomondoHR_proper_dist_spd_0021.json
endomondoHR_proper_dist_spd_0022.json
endomondoHR_proper_dist_spd_0023.json
endomondoHR_proper_dist_spd_0024.json
endomondoHR_proper_dist_spd_0025.json
endomondoHR_proper_dist_spd_0026.json
endomondoHR_

endomondoHR_proper_dist_spd_0216.json
endomondoHR_proper_dist_spd_0217.json
endomondoHR_proper_dist_spd_0218.json
endomondoHR_proper_dist_spd_0219.json
endomondoHR_proper_dist_spd_0220.json
endomondoHR_proper_dist_spd_0221.json
endomondoHR_proper_dist_spd_0222.json
endomondoHR_proper_dist_spd_0223.json
endomondoHR_proper_dist_spd_0224.json
endomondoHR_proper_dist_spd_0225.json
endomondoHR_proper_dist_spd_0226.json
endomondoHR_proper_dist_spd_0227.json
endomondoHR_proper_dist_spd_0228.json
endomondoHR_proper_dist_spd_0229.json
endomondoHR_proper_dist_spd_0230.json
endomondoHR_proper_dist_spd_0231.json
endomondoHR_proper_dist_spd_0232.json
endomondoHR_proper_dist_spd_0233.json
endomondoHR_proper_dist_spd_0234.json
endomondoHR_proper_dist_spd_0235.json
endomondoHR_proper_dist_spd_0236.json
endomondoHR_proper_dist_spd_0237.json
endomondoHR_proper_dist_spd_0238.json
endomondoHR_proper_dist_spd_0239.json
endomondoHR_proper_dist_spd_0240.json
endomondoHR_proper_dist_spd_0241.json
endomondoHR_

endomondoHR_proper_dist_spd_0431.json
endomondoHR_proper_dist_spd_0432.json
endomondoHR_proper_dist_spd_0433.json
endomondoHR_proper_dist_spd_0434.json
endomondoHR_proper_dist_spd_0435.json
endomondoHR_proper_dist_spd_0436.json
endomondoHR_proper_dist_spd_0437.json
endomondoHR_proper_dist_spd_0438.json
endomondoHR_proper_dist_spd_0439.json
endomondoHR_proper_dist_spd_0440.json
endomondoHR_proper_dist_spd_0441.json
endomondoHR_proper_dist_spd_0442.json
endomondoHR_proper_dist_spd_0443.json
endomondoHR_proper_dist_spd_0444.json
endomondoHR_proper_dist_spd_0445.json
endomondoHR_proper_dist_spd_0446.json
endomondoHR_proper_dist_spd_0447.json
endomondoHR_proper_dist_spd_0448.json
endomondoHR_proper_dist_spd_0449.json
endomondoHR_proper_dist_spd_0450.json
endomondoHR_proper_dist_spd_0451.json
endomondoHR_proper_dist_spd_0452.json
endomondoHR_proper_dist_spd_0453.json
endomondoHR_proper_dist_spd_0454.json
endomondoHR_proper_dist_spd_0455.json
endomondoHR_proper_dist_spd_0456.json
endomondoHR_

endomondoHR_proper_dist_spd_0646.json
endomondoHR_proper_dist_spd_0647.json
endomondoHR_proper_dist_spd_0648.json
endomondoHR_proper_dist_spd_0649.json
endomondoHR_proper_dist_spd_0650.json
endomondoHR_proper_dist_spd_0651.json
endomondoHR_proper_dist_spd_0652.json
endomondoHR_proper_dist_spd_0653.json
endomondoHR_proper_dist_spd_0654.json
endomondoHR_proper_dist_spd_0655.json
endomondoHR_proper_dist_spd_0656.json
endomondoHR_proper_dist_spd_0657.json
endomondoHR_proper_dist_spd_0658.json
endomondoHR_proper_dist_spd_0659.json
endomondoHR_proper_dist_spd_0660.json
endomondoHR_proper_dist_spd_0661.json
endomondoHR_proper_dist_spd_0662.json
endomondoHR_proper_dist_spd_0663.json
endomondoHR_proper_dist_spd_0664.json
endomondoHR_proper_dist_spd_0665.json
endomondoHR_proper_dist_spd_0666.json
endomondoHR_proper_dist_spd_0667.json
endomondoHR_proper_dist_spd_0668.json
endomondoHR_proper_dist_spd_0669.json
endomondoHR_proper_dist_spd_0670.json
endomondoHR_proper_dist_spd_0671.json
endomondoHR_

endomondoHR_proper_dist_spd_0861.json
endomondoHR_proper_dist_spd_0862.json
endomondoHR_proper_dist_spd_0863.json
endomondoHR_proper_dist_spd_0864.json
endomondoHR_proper_dist_spd_0865.json
endomondoHR_proper_dist_spd_0866.json
endomondoHR_proper_dist_spd_0867.json
endomondoHR_proper_dist_spd_0868.json
endomondoHR_proper_dist_spd_0869.json
endomondoHR_proper_dist_spd_0870.json
endomondoHR_proper_dist_spd_0871.json
endomondoHR_proper_dist_spd_0872.json
endomondoHR_proper_dist_spd_0873.json
endomondoHR_proper_dist_spd_0874.json
endomondoHR_proper_dist_spd_0875.json
endomondoHR_proper_dist_spd_0876.json
endomondoHR_proper_dist_spd_0877.json
endomondoHR_proper_dist_spd_0878.json
endomondoHR_proper_dist_spd_0879.json
endomondoHR_proper_dist_spd_0880.json
endomondoHR_proper_dist_spd_0881.json
endomondoHR_proper_dist_spd_0882.json
endomondoHR_proper_dist_spd_0883.json
endomondoHR_proper_dist_spd_0884.json
endomondoHR_proper_dist_spd_0885.json
endomondoHR_proper_dist_spd_0886.json
endomondoHR_

endomondoHR_proper_dist_spd_1076.json
endomondoHR_proper_dist_spd_1077.json
endomondoHR_proper_dist_spd_1078.json
endomondoHR_proper_dist_spd_1079.json
endomondoHR_proper_dist_spd_1080.json
endomondoHR_proper_dist_spd_1081.json
endomondoHR_proper_dist_spd_1082.json
endomondoHR_proper_dist_spd_1083.json
endomondoHR_proper_dist_spd_1084.json
endomondoHR_proper_dist_spd_1085.json
endomondoHR_proper_dist_spd_1086.json
endomondoHR_proper_dist_spd_1087.json
endomondoHR_proper_dist_spd_1088.json
endomondoHR_proper_dist_spd_1089.json
endomondoHR_proper_dist_spd_1090.json
endomondoHR_proper_dist_spd_1091.json
endomondoHR_proper_dist_spd_1092.json
endomondoHR_proper_dist_spd_1093.json
endomondoHR_proper_dist_spd_1094.json
endomondoHR_proper_dist_spd_1095.json
endomondoHR_proper_dist_spd_1096.json
endomondoHR_proper_dist_spd_1097.json
endomondoHR_proper_dist_spd_1098.json
endomondoHR_proper_dist_spd_1099.json
endomondoHR_proper_dist_spd_1100.json
endomondoHR_proper_dist_spd_1101.json
endomondoHR_

endomondoHR_proper_dist_spd_1291.json
endomondoHR_proper_dist_spd_1292.json
endomondoHR_proper_dist_spd_1293.json
endomondoHR_proper_dist_spd_1294.json
endomondoHR_proper_dist_spd_1295.json
endomondoHR_proper_dist_spd_1296.json
endomondoHR_proper_dist_spd_1297.json
endomondoHR_proper_dist_spd_1298.json
endomondoHR_proper_dist_spd_1299.json
endomondoHR_proper_dist_spd_1300.json
endomondoHR_proper_dist_spd_1301.json
endomondoHR_proper_dist_spd_1302.json
endomondoHR_proper_dist_spd_1303.json
endomondoHR_proper_dist_spd_1304.json
endomondoHR_proper_dist_spd_1305.json
endomondoHR_proper_dist_spd_1306.json
endomondoHR_proper_dist_spd_1307.json
endomondoHR_proper_dist_spd_1308.json
endomondoHR_proper_dist_spd_1309.json
endomondoHR_proper_dist_spd_1310.json
endomondoHR_proper_dist_spd_1311.json
endomondoHR_proper_dist_spd_1312.json
endomondoHR_proper_dist_spd_1313.json
endomondoHR_proper_dist_spd_1314.json
endomondoHR_proper_dist_spd_1315.json
endomondoHR_proper_dist_spd_1316.json
endomondoHR_

endomondoHR_proper_dist_spd_1506.json
endomondoHR_proper_dist_spd_1507.json
endomondoHR_proper_dist_spd_1508.json
endomondoHR_proper_dist_spd_1509.json
endomondoHR_proper_dist_spd_1510.json
endomondoHR_proper_dist_spd_1511.json
endomondoHR_proper_dist_spd_1512.json
endomondoHR_proper_dist_spd_1513.json
endomondoHR_proper_dist_spd_1514.json
endomondoHR_proper_dist_spd_1515.json
endomondoHR_proper_dist_spd_1516.json
endomondoHR_proper_dist_spd_1517.json
endomondoHR_proper_dist_spd_1518.json
endomondoHR_proper_dist_spd_1519.json
endomondoHR_proper_dist_spd_1520.json
endomondoHR_proper_dist_spd_1521.json
endomondoHR_proper_dist_spd_1522.json
endomondoHR_proper_dist_spd_1523.json
endomondoHR_proper_dist_spd_1524.json
endomondoHR_proper_dist_spd_1525.json
endomondoHR_proper_dist_spd_1526.json
endomondoHR_proper_dist_spd_1527.json
endomondoHR_proper_dist_spd_1528.json
endomondoHR_proper_dist_spd_1529.json
endomondoHR_proper_dist_spd_1530.json
endomondoHR_proper_dist_spd_1531.json
endomondoHR_

### Verify df

In [9]:
print(df_all.shape)
df_all.head()

(167783, 37)


Unnamed: 0,id,userId,gender,sport,time_start,time_end,time_dur,lat_start,lon_start,lat_end,...,hr_cardio,hr_peak,spd_avg,spd_min,spd_05,spd_25,spd_75,spd_95,spd_max,impute
0,1,10921915.0,male,bike,1408899000.0,1408905000.0,112.133333,60.173349,24.64977,60.145997,...,0.464444,0.522222,26.152328,6.8652,13.9491,19.9044,31.7025,41.02704,57.4596,0.0
1,2,10921915.0,male,bike,1408222000.0,1408226000.0,65.15,60.173248,24.649855,60.176624,...,0.591111,0.388889,27.636272,9.0792,15.47802,21.9303,32.4558,43.17102,54.7704,0.0
2,3,10921915.0,male,bike,1407858000.0,1407865000.0,100.833333,60.173262,24.649957,60.152897,...,0.782222,0.16,26.159896,4.0464,14.4144,20.7909,31.05,39.8304,59.2092,0.0
3,4,10921915.0,male,bike,1407432000.0,1407436000.0,65.633333,60.173286,24.649874,60.180084,...,0.637778,0.324444,27.135904,4.3272,15.84306,21.3957,31.8411,42.42096,57.9852,0.0
4,5,10921915.0,male,bike (transport),1406909000.0,1406911000.0,20.333333,60.173293,24.649798,60.181585,...,0.08,0.908889,31.241183,0.0,18.3528,24.47906,37.426941,46.51405,51.635122,1.0


### Output df

In [10]:
# output combined df to csv

df_all.to_csv(out_path,index=False)