### Contents

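- [Header](#Header)
- [Import Data](#Import-Data)
- [Functions](#Functions)
- [Import Json and Create/Output df](#Import-Json-and-Create/Output-df)
- [Verify df](#Verify-df)
- [Output df](#Output-df)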

### Header

In [1]:
# user configuration

In [2]:
# import libraries

# maths
import scipy.stats as stats
import numpy as np
import pandas as pd
#from pandas.api.types import is_numeric_dtype

# visual
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

# html
from IPython.display import Image
from IPython.core.display import HTML

# web
import json

# others
from tqdm import tqdm
import datetime as dt
#import re
import os

In [3]:
# file paths

# dataset sub-folder shared by every stage directory below
folder = 'fitrec/'

# one directory per pipeline stage, each narrowed to the dataset folder
input_path = '../data/1_input/{}'.format(folder)
clean_path = '../data/2_clean/{}'.format(folder)
preprocess_path = '../data/3_preprocess/{}'.format(folder)
output_path = '../data/4_output/{}'.format(folder)

### Import Data

In [4]:
# import clean data

# file = 'endomondoHR_proper_head_1000.json'
# file = 'endomondoHR_proper_head_10000.json'

# path = input_path + file

# with open(path, "r") as read_file:
#     data = json.load(read_file)

### Functions

In [5]:
def _series_stats(values, prefix):
    """Return avg/min/q1/q3/max of ``values`` keyed as '<prefix>_<stat>'.

    The samples are converted to a float array. An empty sequence yields NaN
    for every statistic instead of raising.
    """
    samples = np.asarray(values, dtype=float)

    if samples.size == 0:
        return {prefix + '_' + stat: np.nan
                for stat in ('avg', 'min', 'q1', 'q3', 'max')}

    q1, q3 = np.quantile(samples, [0.25, 0.75])

    return {
        prefix + '_avg': samples.mean(),
        prefix + '_min': samples.min(),
        prefix + '_q1': q1,
        prefix + '_q3': q3,
        prefix + '_max': samples.max(),
    }


def _endpoints(values):
    """Return (first, last) element of ``values``, or (NaN, NaN) when empty."""
    if len(values) == 0:
        return np.nan, np.nan
    return values[0], values[-1]


def create_df(data):
    """Summarise every workout in ``data`` as one row of a DataFrame.

    Parameters
    ----------
    data : dict
        Mapping of workout id -> workout record. Each record must provide
        'userId', 'gender', 'sport' and the sample sequences 'timestamp',
        'heart_rate', 'latitude', 'longitude' and 'altitude'. 'speed' is
        optional.

    Returns
    -------
    pandas.DataFrame
        One row per workout, in the iteration order of ``data``, with the
        columns id, userId, gender, sport, time_start, time_end, time_dur,
        lat_start, lat_end, lon_start, lon_end and avg/min/q1/q3/max columns
        prefixed alt_, hr_ and (when any record has 'speed') spd_.
        Records without 'speed' get NaN in the spd_ columns. An empty
        ``data`` gives an empty DataFrame.

    Raises
    ------
    KeyError
        If a record lacks one of the required keys.
    """
    # rows are collected as plain dicts and turned into a DataFrame once at
    # the end; writing cell by cell into a growing DataFrame is much slower
    records = []

    for key, val in data.items():

        row = {
            'id': key,
            'userId': val['userId'],
            'gender': val['gender'],
            'sport': val['sport'],
        }

        # min/max are used rather than first/last, so the samples do not
        # have to be ordered in time
        timestamp = np.asarray(val['timestamp'], dtype=float)
        if timestamp.size:
            time_start, time_end = timestamp.min(), timestamp.max()
        else:
            time_start = time_end = np.nan

        row['time_start'] = time_start
        row['time_end'] = time_end
        # presumably timestamps are in seconds, making this minutes - TODO confirm
        row['time_dur'] = (time_end - time_start) / 60

        row['lat_start'], row['lat_end'] = _endpoints(val['latitude'])
        row['lon_start'], row['lon_end'] = _endpoints(val['longitude'])

        row.update(_series_stats(val['altitude'], 'alt'))
        row.update(_series_stats(val['heart_rate'], 'hr'))

        # speed is not recorded for every workout
        if 'speed' in val:
            row.update(_series_stats(val['speed'], 'spd'))

        records.append(row)

    # explicit column order: first appearance across all rows
    columns = list(dict.fromkeys(name for row in records for name in row))

    return pd.DataFrame(records, columns=columns)

### Import Json and Create/Output df

In [10]:
# select json file for splitting

# position in the two lists below:
# 0 = endomondoHR, 1 = endomondoMeta, 2 = endomondoHR_proper
file_index = 2

# folders holding the json files, relative to input_path
input_filepath = [
    "endomondoHR/",
    "endomondoMeta/",
    "endomondoHR_proper/",
    # "processed_endomondoHR_proper/",
    # "processed_endomondoHR_proper_interpolate/",
]

# summary csv for each folder above, relative to clean_path
output_filepath = [
    "endomondoHR_summary.csv",
    "endomondoMeta_summary.csv",
    "endomondoHR_proper_summary.csv",
    # "processed_endomondoHR_proper_summary.csv",
    # "processed_endomondoHR_proper_interpolate_summary.csv",
]

# init filepaths

in_path = "{}{}".format(input_path, input_filepath[file_index])
out_path = "{}{}".format(clean_path, output_filepath[file_index])

print(in_path, out_path, sep="\n")

../data/1_input/fitrec/endomondoHR_proper/
../data/2_clean/fitrec/endomondoHR_proper_summary.csv


In [7]:
# Summarise every json file found in in_path and stack the per-file summaries
# into df_all.

# os.listdir returns names in arbitrary order and may contain non-json
# entries, so keep only *.json and sort for a reproducible row order
files = sorted(name for name in os.listdir(in_path) if name.endswith('.json'))

summaries = []

for file in files:

    print(file)
    path = in_path + file

    # load each json file
    with open(path, "r") as read_file:
        data = json.load(read_file)

    # create summary for json (the file is already closed at this point)
    df_json = create_df(data)
    summaries.append(df_json)

# combine all summaries with a single concat: DataFrame.append was removed in
# pandas 2.0 and copied the accumulated frame on every call.
# pd.concat raises on an empty list, so fall back to an empty frame.
df_all = pd.concat(summaries, ignore_index=True) if summaries else pd.DataFrame()

endomondoHR_proper_001.json
endomondoHR_proper_002.json
endomondoHR_proper_003.json
endomondoHR_proper_004.json
endomondoHR_proper_005.json
endomondoHR_proper_006.json
endomondoHR_proper_007.json
endomondoHR_proper_008.json
endomondoHR_proper_009.json
endomondoHR_proper_010.json
endomondoHR_proper_011.json
endomondoHR_proper_012.json
endomondoHR_proper_013.json
endomondoHR_proper_014.json
endomondoHR_proper_015.json
endomondoHR_proper_016.json
endomondoHR_proper_017.json
endomondoHR_proper_018.json
endomondoHR_proper_019.json
endomondoHR_proper_020.json
endomondoHR_proper_021.json
endomondoHR_proper_022.json
endomondoHR_proper_023.json
endomondoHR_proper_024.json
endomondoHR_proper_025.json
endomondoHR_proper_026.json
endomondoHR_proper_027.json
endomondoHR_proper_028.json
endomondoHR_proper_029.json
endomondoHR_proper_030.json
endomondoHR_proper_031.json
endomondoHR_proper_032.json
endomondoHR_proper_033.json
endomondoHR_proper_034.json
endomondoHR_proper_035.json
endomondoHR_proper_0

### Verify df

In [8]:
# show the combined summary to check it visually before it is written out
df_all

Unnamed: 0,id,userId,gender,sport,time_start,time_end,time_dur,lat_start,lat_end,lon_start,...,hr_avg,hr_min,hr_q1,hr_q3,hr_max,spd_avg,spd_min,spd_q1,spd_q3,spd_max
0,1,10921915.0,male,bike,1.408899e+09,1.408906e+09,126.483333,60.173349,60.173354,24.649770,...,152.650,100.0,146.00,160.00,177.0,26.162158,3.8592,19.8972,31.7313,57.4596
1,2,10921915.0,male,bike,1.408222e+09,1.408226e+09,74.000000,60.173248,60.173344,24.649855,...,147.710,100.0,140.00,156.00,174.0,27.218369,8.0820,21.8106,31.9257,54.7704
2,3,10921915.0,male,bike,1.407858e+09,1.407865e+09,112.483333,60.173262,60.173366,24.649957,...,140.554,99.0,133.00,149.00,168.0,26.050774,0.0000,20.5992,31.0365,59.2092
3,4,10921915.0,male,bike,1.407432e+09,1.407437e+09,75.316667,60.173286,60.173303,24.649874,...,147.020,99.0,140.00,155.00,178.0,26.877838,4.3272,21.4848,31.2651,57.9852
4,5,10921915.0,male,bike (transport),1.406909e+09,1.406911e+09,22.616667,60.173293,60.183756,24.649798,...,167.154,110.0,164.00,174.00,183.0,,,,,
5,6,10921915.0,male,bike (transport),1.406665e+09,1.406667e+09,33.666667,60.183585,60.173346,24.817700,...,148.120,106.0,138.00,159.00,179.0,,,,,
6,7,10921915.0,male,bike (transport),1.406647e+09,1.406649e+09,23.750000,60.173279,60.183718,24.649817,...,166.084,118.0,162.00,174.00,185.0,29.592281,7.2180,23.7726,35.2998,50.9724
7,8,10921915.0,male,bike (transport),1.406581e+09,1.406583e+09,33.216667,60.183662,60.173386,24.817467,...,144.794,112.0,134.00,155.00,179.0,21.673174,6.8040,16.2369,26.0046,49.1436
8,9,10921915.0,male,bike,1.405282e+09,1.405295e+09,216.916667,60.173318,60.173332,24.649676,...,137.804,90.0,129.00,148.00,184.0,,,,,
9,10,10921915.0,male,bike,1.405101e+09,1.405105e+09,73.700000,60.173246,60.173441,24.650183,...,145.914,98.0,137.00,156.00,178.0,27.497556,11.6820,21.7278,32.4873,57.7404


### Output df

In [12]:
# output combined df to csv

# to_csv does not create missing parent folders, so make sure the target
# directory exists first (skipped when out_path has no directory part)
out_dir = os.path.dirname(out_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

df_all.to_csv(out_path,index=False)