### Contents

- [Configurations and Libraries](#Header)
- [Functions](#Functions)

### Header

In [1]:
# import libraries

# maths
import scipy.stats as stats
import numpy as np
import pandas as pd
from haversine import haversine, Unit

# others
from tqdm import tqdm
import time
import datetime as dt
import re
import os

# web
import json

In [2]:
# file paths (relative paths); every value ends with '/' because file names
# are appended to it by plain string concatenation

# folder of the large source files; joined with input_filepath[...] to form in_path
raw_path = '../../data/0_raw/fitrec/' 
# folder that receives the split files; joined with output_filepath[...] to form out_path
input_path = '../../data/1_input/fitrec/'
# the next three folders are not referenced in the visible part of this notebook
clean_path = '../../data/2_clean/fitrec/' 
preprocess_path = '../../data/3_preprocess/fitrec/' 
output_path = '../../data/4_output/fitrec/'

# folder holding sports.xlsx (the per-sport speed_max table)
sports_path = '../../data/1_input/sports/'

### Functions

In [3]:
def output_json(data, file_no, out_prefix=None):
    """Write one batch of records to a numbered JSON file.

    The file name is the prefix followed by the file number zero-padded to
    four digits and '.json', e.g. 'xxx_0123.json' for file_no=123.

    Parameters
    ----------
    data : JSON-serialisable object
        The batch to write (the split loop passes a dict of records).
    file_no : int
        Sequence number of the output file.
    out_prefix : str, optional
        Path prefix (folder + file stem). When omitted, the notebook-level
        `out_path` is used, so existing two-argument calls behave as before.

    Returns
    -------
    None
    """
    # fall back to the notebook-level prefix only when no explicit one is given
    prefix = out_path if out_prefix is None else out_prefix
    out_path_2 = prefix + str(file_no).zfill(4) + '.json'

    # create the target folder on first use instead of failing with FileNotFoundError
    out_dir = os.path.dirname(out_path_2)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_path_2, 'w') as f:
        json.dump(data, f)

In [4]:
def compute_distance(record, idx=None):
    """Fill record['distance'] with the distance between consecutive GPS points.

    For every position i >= 1 the haversine distance between point i and
    point i-1 is stored in record['distance'][i]; position 0 is left untouched.

    Parameters
    ----------
    record : dict
        Must hold the sequences 'latitude', 'longitude' and a pre-allocated,
        writable 'distance' sequence. 'distance' is modified in place.
    idx : optional
        Record number supplied by the caller; not used by the computation.

    Returns
    -------
    dict
        The same `record` object that was passed in.
    """
    lats = record['latitude']
    lons = record['longitude']
    dists = record['distance']

    # size the loop from the data instead of a fixed 500 so that shorter
    # records no longer raise IndexError; 500-point records behave as before
    n_points = min(len(lats), len(lons), len(dists))

    for i in range(1, n_points):

        cur_loc = (lats[i], lons[i])
        prev_loc = (lats[i - 1], lons[i - 1])

        # haversine() is called with its default unit (kilometres per the
        # package documentation); * 1000 converts the result to metres
        dists[i] = haversine(cur_loc, prev_loc) * 1000

    return record

In [5]:
def compute_time_diff(record, idx=None):
    """Fill record['time_diff'] with the gap between consecutive timestamps.

    For every position i >= 1, record['time_diff'][i] is set to
    timestamp[i] - timestamp[i-1]; position 0 is left untouched. The result
    is in whatever unit the timestamps use (the original comment says seconds).

    Parameters
    ----------
    record : dict
        Must hold the sequence 'timestamp' and a pre-allocated, writable
        'time_diff' sequence. 'time_diff' is modified in place.
    idx : optional
        Record number supplied by the caller; not used by the computation.

    Returns
    -------
    dict
        The same `record` object that was passed in.
    """
    stamps = record['timestamp']
    diffs = record['time_diff']

    # size the loop from the data instead of a fixed 500 so that shorter
    # records no longer raise IndexError; 500-point records behave as before
    n_points = min(len(stamps), len(diffs))

    for i in range(1, n_points):
        diffs[i] = stamps[i] - stamps[i - 1]

    return record

In [6]:
def compute_speed(record, idx=None, speed_limits=None):
    """Fill record['speed'] from record['distance'] and record['time_diff'].

    For every position i >= 1:

    * time_diff > 0: speed = distance / time_diff * 3.6 (metre/second to
      km/h, given distances in metres and time gaps in seconds). A speed
      above the limit for the record's sport is treated as invalid and
      stored as 0.0.
    * time_diff <= 0 (duplicate or out-of-order timestamps): the previous
      speed value is carried forward.

    Position 0 is left untouched.

    Parameters
    ----------
    record : dict
        Must hold 'sport' (str), the sequences 'distance' and 'time_diff',
        and a pre-allocated, writable 'speed' sequence (modified in place).
    idx : optional
        Record number supplied by the caller; not used by the computation.
    speed_limits : mapping, optional
        Maps a sport name (without trailing whitespace) to its maximum
        plausible speed. When omitted, the notebook-level `sport_dict` is
        used, so existing two-argument calls behave as before.

    Returns
    -------
    dict
        The same `record` object that was passed in.

    Raises
    ------
    KeyError
        If a speed has to be validated and the sport is not in the table.
    """
    # fall back to the notebook-level table only when no explicit one is given
    limits = sport_dict if speed_limits is None else speed_limits

    time_diffs = record['time_diff']
    dists = record['distance']
    speeds = record['speed']

    # size the loop from the data instead of a fixed 500 so that shorter
    # records no longer raise IndexError; 500-point records behave as before
    n_points = min(len(time_diffs), len(dists), len(speeds))

    for i in range(1, n_points):

        time_diff = time_diffs[i]

        if time_diff > 0:

            # metre/second -> km/h
            speed = dists[i] / time_diff * 3.6

            # sport names may carry trailing whitespace; the table keys do not
            speed_max = limits[record['sport'].rstrip()]

            # faster than the sport's limit: store 0.0 instead of the computed value
            speeds[i] = 0.0 if speed > speed_max else speed

        else:
            # time_diff == 0 or time_diff < 0: no usable interval, reuse the previous speed
            speeds[i] = speeds[i - 1]

    return record

### Import Data

In [7]:
# import sports.xlsx and create df
# (the next cell reads the 'sport' and 'speed_max' columns of this frame)

path = sports_path + 'sports.xlsx'
df = pd.read_excel(path)

In [8]:
# lookup table: sport name (trailing whitespace stripped) -> speed_max.
# compute_speed() compares each computed km/h speed against this limit and
# stores 0.0 when it is exceeded, so speed_max is presumably in km/h (TODO confirm)
sport_dict = dict(zip(df['sport'].str.rstrip(), df['speed_max']))
# bare expression so the notebook displays the resulting table
sport_dict

{'aerobics': 45,
 'badminton': 45,
 'basketball': 45,
 'bike': 244,
 'bike (transport)': 244,
 'indoor cycling': 244,
 'mountain bike': 244,
 'circuit training': 45,
 'climbing': 45,
 'core stability training': 45,
 'elliptical': 45,
 'golf': 45,
 'gymnastics': 45,
 'hiking': 45,
 'horseback riding': 71,
 'kayaking': 32,
 'martial arts': 45,
 'orienteering': 45,
 'rowing': 23,
 'rugby': 45,
 'run': 45,
 'treadmill running': 45,
 'sailing': 121,
 'skate': 55,
 'cross-country skiing': 255,
 'downhill skiing': 255,
 'roller skiing': 50,
 'snowboarding': 203,
 'snowshoeing': 45,
 'soccer': 45,
 'squash': 45,
 'stair climing': 45,
 'kite surfing': 99,
 'windsurfing': 99,
 'swimming': 9,
 'table tennis': 45,
 'tennis': 45,
 'fitness walking': 45,
 'treadmill walking': 45,
 'walk': 45,
 'weight training': 45,
 'pilates': 45,
 'yoga': 45}

### Import, Split and Output File

In [9]:
# select json file for splitting
# file_index picks the matching entries of input_filepath and output_filepath below

file_index = 4

# source file names, looked up inside raw_path
input_filepath = [
            "endomondoHR.json",
            "endomondoMeta.json",
            "endomondoHR_proper.json", # same input file
            "endomondoHR_proper.json", # same input file
            "endomondoHR_proper.json", # same input file
            "processed_endomondoHR_proper.npy",
            "processed_endomondoHR_proper_interpolate.npy"        
            ]


# output file-name prefixes, placed inside input_path; output_json() appends the
# zero-padded file number and '.json' to the selected prefix
output_filepath = [
            "endomondoHR/endomondoHR_",
            "endomondoMeta/endomondoMeta_",    
            "endomondoHR_proper/endomondoHR_proper_",                            # input file is endomondoHR_proper.json
            "endomondoHR_proper_dist_spd/endomondoHR_proper_dist_spd_",          # input file is endomondoHR_proper.json
            "endomondoHR_proper_dist_spd_time/endomondoHR_proper_dist_spd_time_", # input file is endomondoHR_proper.json   
            "processed_endomondoHR_proper_",
            "processed_endomondoHR_proper_interpolate_"        
            ]

# init filepaths
# in_path: file to read; its extension ('.json' or '.npy') decides which of the following cells does any work
# out_path: prefix of the files to write; read as a global by output_json()

in_path = raw_path + input_filepath[file_index]
out_path = input_path + output_filepath[file_index]

print(in_path)
print(out_path)

../../data/0_raw/fitrec/endomondoHR_proper.json
../../data/1_input/fitrec/endomondoHR_proper_dist_spd_time/endomondoHR_proper_dist_spd_time_


In [10]:
# import large json file and split into smaller json files
# (json_per_file records each; the last file holds the remainder)

# number of records per output file (1000 is the alternative setting used before)
json_per_file = 100

if '.json' in in_path:

    data = {}       # current batch: record number -> record
    index = 1       # 1-based number of the record being processed
    file_no = 1     # number of the next output file

    with open(in_path) as f:

        for line in f:

            # each line holds one record written as a Python dict literal.
            # NOTE(security): eval() executes arbitrary code found in the file;
            # only run this on a trusted copy of the dataset (ast.literal_eval
            # is the safe alternative if the lines contain plain literals only).
            data[index] = eval(line)

            # distance and time_diff are always (re)computed; the buffers are
            # pre-allocated with 500 slots, one per point of a record
            data[index]['distance'] = [0] * 500
            data[index]['time_diff'] = [0] * 500

            data[index] = compute_distance(data[index], index)
            data[index] = compute_time_diff(data[index], index)

            # derive speed only when the record does not already provide it
            if 'speed' not in data[index].keys():
                data[index]['speed'] = [0] * 500
                data[index] = compute_speed(data[index], index)

            # each file will contain json_per_file records
            if index % json_per_file == 0:
                output_json(data, file_no)

                data = {}
                file_no += 1

            index += 1

        # last file contains fewer than json_per_file records.
        # Test the batch itself, not `index`: index has already moved one past
        # the last record, so the old modulo check dropped the final batch when
        # (records + 1) was a multiple of json_per_file and wrote an empty file
        # when the record count was an exact multiple.
        if data:
            output_json(data, file_no)

In [11]:
if '.npy' in in_path:
    
    #data = np.load(in_path,mmap_mode='r')
    #data = np.memmap(in_path,mode='r+')
    
    #data = np.memmap(in_path,mode='r+')
    data = np.memmap(in_path,mode='r+',dtype='uint8')