### Contents

- [Header](#Header)
- [Functions](#Functions)


- [Import Json and Create df](#Import-Json-and-Create-df)
- [Verify Data](#Verify-Data)
- [Output Data](#Output-Data)





### Header

In [13]:
# import libraries

# maths
import scipy.stats as stats
import numpy as np
import pandas as pd
#from pandas.api.types import is_numeric_dtype

# visual
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

# html
from IPython.display import Image
from IPython.core.display import HTML

# web
import json

# others
from tqdm import tqdm
from datetime import datetime
#import re
import os

In [14]:
# file paths

raw_path = '../../data/0_raw/fitrec/' 
input_path = '../../data/1_input/fitrec/'
clean_path = '../../data/2_clean/fitrec/' 
preprocess_path = '../../data/3_preprocess/fitrec/' 
output_path = '../../data/4_output/fitrec/' 

### Functions

In [15]:
# source: https://help.fitbit.com/articles/en_US/Help_article/1565

# peak zone: hr >= 85%
# cardio zone: 70% <= hr < 85%
# fat burn zone: 50% <= hr < 69%
# out of zone: hr < 50%

avg_age = 40

max_hr = 220 - avg_age
peak_hr = int(max_hr * 0.85)
cardio_hr = int(max_hr * 0.70)
fatburn_hr = int(max_hr * 0.50)

print('max_hr',max_hr)
print('peak_hr',peak_hr)
print('cardio_hr',cardio_hr)
print('fatburn_hr',fatburn_hr)

max_hr 180
peak_hr 153
cardio_hr 125
fatburn_hr 90


In [16]:
def compute_hr_zone(heart_rate,time_diff):
    
    zone = [0,0,0,0]
    
    for idx,hr in enumerate(heart_rate):       
        
        td = time_diff[idx]
        #print(idx,hr,td)        
        
        if hr < fatburn_hr:
            zone[0] += td
        elif hr < cardio_hr:
            zone[1] += td
        elif hr < peak_hr:
            zone[2] += td
        else:
            zone[3] += td
            
    total = sum(zone)
    
    zone[0] /= total
    zone[1] /= total
    zone[2] /= total
    zone[3] /= total
    
    zone[0] = round(zone[0],4)
    zone[1] = round(zone[1],4)
    zone[2] = round(zone[2],4)
    zone[3] = round(zone[3],4)
                      
    return zone           

In [17]:
# def compute_hr_zone(heart_rate):
    
#     zone = [0,0,0,0]
    
#     for hr in heart_rate:
        
#         if hr < fatburn_hr:
#             zone[0] += 1
#         elif hr < cardio_hr:
#             zone[1] += 1
#         elif hr < peak_hr:
#             zone[2] += 1
#         else:
#             zone[3] += 1
            
#     total = sum(zone)
    
#     zone[0] /= round(total,3)
#     zone[1] /= round(total,3)
#     zone[2] /= round(total,3)
#     zone[3] /= round(total,3)
                      
#     return zone           

In [18]:
def compute_speed_zone(speed,time_diff):
    
    zone = [0,0,0,0]
    
    for idx,spd in enumerate(speed):       
        
        td = time_diff[idx]
        #print(idx,hr,td)        
        
        if spd < 10:
            zone[0] += td

        elif spd < 20:
            zone[1] += td
        elif spd < 30:
            zone[2] += td
        else:
            zone[3] += td
            
    total = sum(zone)
    
    zone[0] /= total
    zone[1] /= total
    zone[2] /= total
    zone[3] /= total
    
    zone[0] = round(zone[0],4)
    zone[1] = round(zone[1],4)
    zone[2] = round(zone[2],4)
    zone[3] = round(zone[3],4)
                      
    return zone           

In [19]:
def compute_spd_hr_matrix(heart_rate,speed,time_diff):
    
    matrix = np.zeros((4, 4))
    hr_idx = -1
    spd_idx = -1
    
    for idx,td in enumerate(time_diff):
        
        spd = speed[idx]
        hr = heart_rate[idx]
        
        if spd < 10:
            spd_idx = 0
        elif spd < 20:
            spd_idx = 1
        elif spd < 30:
            spd_idx = 2
        else:
            spd_idx = 3
            
        if hr < fatburn_hr:
            hr_idx = 0
        elif hr < cardio_hr:
            hr_idx = 1
        elif hr < peak_hr:
            hr_idx = 2
        else:
            hr_idx = 3
            
        matrix[spd_idx,hr_idx] += td
        
    total = np.sum(matrix)
    matrix /= total    
    matrix = np.around(matrix,decimals=4)
        
    return matrix 

In [20]:
def create_df(data):
    
    df = pd.DataFrame()
    
    index = 0
    
    #start = 0 # use first data point
    start = 25 # omit first 25 data points
    
    end = 450 # omit last 50 data points (exercise cooldown threshold)
    #end = 475 # omit last 25 data points (exercise cooldown threshold)
    #end = 500 # use last data point

    for key,val in data.items():

        df.at[index,'id'] = key
        df.at[index,'userId'] = val['userId']
        df.at[index,'gender'] = val['gender']
        df.at[index,'sport'] = val['sport']
        df.at[index,'url'] = val['url']

        timestamp = val['timestamp'][start:end]
        heart_rate = val['heart_rate'][start:end]
        latitude = val['latitude'][start:end]
        longitude = val['longitude'][start:end]
        altitude = val['altitude'][start:end]
        time_diff = val['time_diff'][start:end]

        df.at[index,'time_start'] = np.min(timestamp)
        df.at[index,'time_end'] = np.max(timestamp)
        df.at[index,'time_dur'] = (np.max(timestamp) - np.min(timestamp))/60 # minutes

        df.at[index,'lat_start'] = latitude[0]
        df.at[index,'lon_start'] = longitude[0]
        
        df.at[index,'lat_end'] = latitude[-1]
        df.at[index,'lon_end'] = longitude[-1]

        df.at[index,'alt_avg'] = np.mean(altitude)
        df.at[index,'alt_min'] = np.min(altitude)
        df.at[index,'alt_05'] = np.quantile(altitude,0.05)
        df.at[index,'alt_25'] = np.quantile(altitude,0.25)
        df.at[index,'alt_75'] = np.quantile(altitude,0.75)
        df.at[index,'alt_95'] = np.quantile(altitude,0.95)
        df.at[index,'alt_max'] = np.max(altitude)
        df.at[index,'alt_diff'] = np.max(altitude) - np.min(altitude)

        df.at[index,'hr_avg'] = np.mean(heart_rate)
        df.at[index,'hr_min'] = np.min(heart_rate)
        df.at[index,'hr_05'] = np.quantile(heart_rate,0.05)
        df.at[index,'hr_25'] = np.quantile(heart_rate,0.25)
        df.at[index,'hr_75'] = np.quantile(heart_rate,0.75)
        df.at[index,'hr_95'] = np.quantile(heart_rate,0.95)
        df.at[index,'hr_max'] = np.max(heart_rate)
        
        #hr_zone = compute_hr_zone(heart_rate)
        hr_zone = compute_hr_zone(heart_rate,time_diff)
        #print(hr_zone)
        
        df.at[index,'hr_outof'] = hr_zone[0]
        df.at[index,'hr_fatburn'] = hr_zone[1]
        df.at[index,'hr_cardio'] = hr_zone[2]
        df.at[index,'hr_peak'] = hr_zone[3]

        if 'speed' in val.keys():
            speed = val['speed'][start:end]

            df.at[index,'spd_avg'] = np.mean(speed)
            df.at[index,'spd_min'] = np.min(speed)
            df.at[index,'spd_05'] = np.quantile(speed,0.05)
            df.at[index,'spd_25'] = np.quantile(speed,0.25)
            df.at[index,'spd_75'] = np.quantile(speed,0.75)
            df.at[index,'spd_95'] = np.quantile(speed,0.95)
            df.at[index,'spd_max'] = np.max(speed)
            
        speed_zone = compute_speed_zone(speed,time_diff)
        
        df.at[index,'spd_low'] = speed_zone[0]
        df.at[index,'spd_med'] = speed_zone[1] 
        df.at[index,'spd_high'] = speed_zone[2]
        df.at[index,'spd_vhigh'] = speed_zone[3]
        
#         hr_spd_matrix = compute_spd_hr_matrix(heart_rate,speed,time_diff)
        
#         for i in range(4):
#             for j in range(4):
#                 col = 'spd_hr_{}_{}'.format(i,j)
#                 df.at[index,col] = hr_spd_matrix[i,j]                
            
        if 'distance' in val.keys():
            df.at[index,'impute'] = 1
        else:
            df.at[index,'impute'] = 0

        index += 1
        
    return df

### Import Json and Create df

In [21]:
# select json file for splitting

file_index = 4

input_filepath = [
            "endomondoHR/",
            "endomondoMeta/",    
            "endomondoHR_proper/",
            "endomondoHR_proper_dist_spd/",
            "endomondoHR_proper_dist_spd_time/", 
            "processed_endomondoHR_proper/",
            "processed_endomondoHR_proper_interpolate/"        
            ]


output_filepath = [
            "endomondoHR_summary.csv",
            "endomondoMeta_summary.csv",    
            "endomondoHR_proper_summary.csv",
            "endomondoHR_proper_dist_spd_summary.csv",
            "endomondoHR_proper_dist_spd_time_summary.csv",    
            "processed_endomondoHR_proper_summary.csv",
            "processed_endomondoHR_proper_interpolate_summary.csv"        
            ]

# init filepaths

in_path = input_path + input_filepath[file_index]
out_path = clean_path + output_filepath[file_index]

print(in_path)
print(out_path)

../../data/1_input/fitrec/endomondoHR_proper_dist_spd_time/
../../data/2_clean/fitrec/endomondoHR_proper_dist_spd_time_summary.csv


In [22]:
start_time = datetime.now()

df_all = pd.DataFrame()

files = os.listdir(in_path)

for file in files:
    
    #print(file)
    path = in_path + file
    
    with open(path, "r") as read_file:
        
        # load each json file
        data = json.load(read_file)
        
        # create summary for json
        df_json = create_df(data)
        
        # combine summary to df_all
        df_all = df_all.append(df_json,ignore_index=True)
        
end_time = datetime.now()
cell_time = end_time - start_time
print('')
print('cell completed in:',cell_time)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2657, in get_loc
    return self._engine.get_loc(key)
  File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'time_start'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2813, in _set_value
    series = self._get_item_cache(col)
  File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py", line 3061, in _get_item_cache
    values = self._data.get(item)
  File "C:\ProgramDat

TypeError: can only concatenate str (not "list") to str

### Verify Data

In [None]:
print(df_all.shape)
df_all.head()

### Output Data

In [None]:
# output combined df to csv

df_all.to_csv(out_path,index=False)