### Contents

- [EDA](#EDA)

### Header

In [1]:
# user configuration

In [2]:
# import libraries

# maths
import scipy.stats as stats
import numpy as np
import pandas as pd
#from pandas.api.types import is_numeric_dtype

# visual
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

# html
from IPython.display import Image
from IPython.core.display import HTML

# web
import json

# others
from tqdm import tqdm
import datetime as dt
#import re
import os

In [3]:
# file paths

folder = 'fitrec/'

input_path = '../data/1_input/' + folder
clean_path = '../data/2_clean/' + folder
preprocess_path = '../data/3_preprocess/' + folder
output_path = '../data/4_output/' + folder

### Import Data

In [4]:
# import clean data

# file = 'endomondoHR_proper_head_1000.json'
# file = 'endomondoHR_proper_head_10000.json'

# path = input_path + file

# with open(path, "r") as read_file:
#     data = json.load(read_file)

### Functions

In [5]:
# source: https://help.fitbit.com/articles/en_US/Help_article/1565

# peak zone: hr >= 85%
# cardio zone: 70% <= hr < 85%
# fat burn zone: 50% <= hr < 69%
# out of zone: hr < 50%

avg_age = 40

max_hr = 220 - avg_age
peak_hr = int(max_hr * 0.85)
cardio_hr = int(max_hr * 0.70)
fatburn_hr = int(max_hr * 0.50)

print('max_hr',max_hr)
print('peak_hr',peak_hr)
print('cardio_hr',cardio_hr)
print('fatburn_hr',fatburn_hr)

max_hr 180
peak_hr 153
cardio_hr 125
fatburn_hr 90


In [18]:
def compute_hr_zone(heart_rate):
    
    zone = [0,0,0,0]
    
    for hr in heart_rate:
        
        if hr < fatburn_hr:
            zone[0] += 1
        elif hr < cardio_hr:
            zone[1] += 1
        elif hr < peak_hr:
            zone[2] += 1
        else:
            zone[3] += 1
            
    total = sum(zone)
    
    zone[0] /= round(total,3)
    zone[1] /= round(total,3)
    zone[2] /= round(total,3)
    zone[3] /= round(total,3)
                      
    return zone           

In [19]:
def create_df(data):
    
    #cols = ['id','userId','gender','timestamp','sport','speed','heart_rate','latitude','longitude','altitude']
    df = pd.DataFrame()
    
    index = 0

    for key,val in data.items():
    
        #print(index)

        df.at[index,'id'] = key
        df.at[index,'userId'] = val['userId']
        df.at[index,'gender'] = val['gender']
        df.at[index,'sport'] = val['sport']

        timestamp = val['timestamp']
        heart_rate = val['heart_rate']
        latitude = val['latitude']
        longitude = val['longitude']
        altitude = val['altitude']   

        df.at[index,'time_start'] = np.min(timestamp)
        df.at[index,'time_end'] = np.max(timestamp)
        df.at[index,'time_dur'] = (np.max(timestamp) - np.min(timestamp))/60

        df.at[index,'lat_start'] = latitude[0]
        df.at[index,'lat_end'] = latitude[-1]

        df.at[index,'lon_start'] = longitude[0]
        df.at[index,'lon_end'] = longitude[-1]

        df.at[index,'alt_avg'] = np.mean(altitude)
        df.at[index,'alt_min'] = np.min(altitude)
        df.at[index,'alt_q1'] = np.quantile(altitude,0.25)
        df.at[index,'alt_q3'] = np.quantile(altitude,0.75)
        df.at[index,'alt_max'] = np.max(altitude)

        df.at[index,'hr_avg'] = np.mean(heart_rate)
        df.at[index,'hr_min'] = np.min(heart_rate)
        df.at[index,'hr_q1'] = np.quantile(heart_rate,0.25)
        df.at[index,'hr_q3'] = np.quantile(heart_rate,0.75)
        df.at[index,'hr_max'] = np.max(heart_rate)
        
        hr_zone = compute_hr_zone(heart_rate)
        #print(hr_zone)
        
        df.at[index,'hr_outof'] = hr_zone[0]
        df.at[index,'hr_fatburn'] = hr_zone[1]
        df.at[index,'hr_cardio'] = hr_zone[2]
        df.at[index,'hr_peak'] = hr_zone[3]

        if 'speed' in val.keys():
            speed = val['speed']

            df.at[index,'spd_avg'] = np.mean(speed)
            df.at[index,'spd_min'] = np.min(speed)
            df.at[index,'spd_q1'] = np.quantile(speed,0.25)
            df.at[index,'spd_q3'] = np.quantile(speed,0.75)
            df.at[index,'spd_max'] = np.max(speed)

        index += 1
        
    return df

### Import Json and Create/Output df

In [20]:
# select json file for splitting

#file_index = 0
#file_index = 1
file_index = 2

input_filepath = [
            "endomondoHR/",
            "endomondoMeta/",
            "endomondoHR_proper/",
            #"processed_endomondoHR_proper/",
            #"processed_endomondoHR_proper_interpolate/"        
            ]


output_filepath = [
            "endomondoHR_summary.csv",
            "endomondoMeta_summary.csv",
            "endomondoHR_proper_summary.csv",
            #"processed_endomondoHR_proper_summary.csv",
            #"processed_endomondoHR_proper_interpolate_summary.csv"        
            ]

# init filepaths

in_path = input_path + input_filepath[file_index]
out_path = clean_path + output_filepath[file_index]

print(in_path)
print(out_path)

../data/1_input/fitrec/endomondoHR_proper/
../data/2_clean/fitrec/endomondoHR_proper_summary.csv


In [21]:
df_all = pd.DataFrame()

files = os.listdir(in_path)

for file in files:
    
    print(file)
    path = in_path + file
    
    with open(path, "r") as read_file:
        
        # load each json file
        data = json.load(read_file)
        
        # create summary for json
        df_json = create_df(data)
        
        # combine summary to df_all
        df_all = df_all.append(df_json,ignore_index=True)

endomondoHR_proper_001.json
endomondoHR_proper_002.json
endomondoHR_proper_003.json
endomondoHR_proper_004.json
endomondoHR_proper_005.json
endomondoHR_proper_006.json
endomondoHR_proper_007.json
endomondoHR_proper_008.json
endomondoHR_proper_009.json
endomondoHR_proper_010.json
endomondoHR_proper_011.json
endomondoHR_proper_012.json
endomondoHR_proper_013.json
endomondoHR_proper_014.json
endomondoHR_proper_015.json
endomondoHR_proper_016.json
endomondoHR_proper_017.json
endomondoHR_proper_018.json
endomondoHR_proper_019.json
endomondoHR_proper_020.json
endomondoHR_proper_021.json
endomondoHR_proper_022.json
endomondoHR_proper_023.json
endomondoHR_proper_024.json
endomondoHR_proper_025.json
endomondoHR_proper_026.json
endomondoHR_proper_027.json
endomondoHR_proper_028.json
endomondoHR_proper_029.json
endomondoHR_proper_030.json
endomondoHR_proper_031.json
endomondoHR_proper_032.json
endomondoHR_proper_033.json
endomondoHR_proper_034.json
endomondoHR_proper_035.json
endomondoHR_proper_0

### Verify df

In [22]:
print(df_all.shape)
df_all.head()

(167783, 30)


Unnamed: 0,id,userId,gender,sport,time_start,time_end,time_dur,lat_start,lat_end,lon_start,...,hr_max,hr_outof,hr_fatburn,hr_cardio,hr_peak,spd_avg,spd_min,spd_q1,spd_q3,spd_max
0,1,10921915.0,male,bike,1408899000.0,1408906000.0,126.483333,60.173349,60.173354,24.64977,...,177.0,0.0,0.012,0.466,0.522,26.162158,3.8592,19.8972,31.7313,57.4596
1,2,10921915.0,male,bike,1408222000.0,1408226000.0,74.0,60.173248,60.173344,24.649855,...,174.0,0.0,0.018,0.628,0.354,27.218369,8.082,21.8106,31.9257,54.7704
2,3,10921915.0,male,bike,1407858000.0,1407865000.0,112.483333,60.173262,60.173366,24.649957,...,168.0,0.0,0.06,0.782,0.158,26.050774,0.0,20.5992,31.0365,59.2092
3,4,10921915.0,male,bike,1407432000.0,1407437000.0,75.316667,60.173286,60.173303,24.649874,...,178.0,0.0,0.034,0.644,0.322,26.877838,4.3272,21.4848,31.2651,57.9852
4,5,10921915.0,male,bike (transport),1406909000.0,1406911000.0,22.616667,60.173293,60.183756,24.649798,...,183.0,0.0,0.01,0.072,0.918,,,,,


### Output df

In [23]:
# output combined df to csv

df_all.to_csv(out_path,index=False)