### Contents

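- [Configurations and Libraries](#Configurations-and-Libraries)
- [Functions](#Functions)
- [Import Data](#Import-Data)
- [Output Data](#Output-Data)
- [Import, Split and Output File](#Import,-Split-and-Output-File)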

### Configurations and Libraries

In [1]:
# import libraries

# maths
import scipy.stats as stats
import numpy as np
import pandas as pd
from haversine import haversine, Unit

# others
from tqdm import tqdm
import time
import datetime as dt
import re
import os

# web
import json

In [2]:
# data folders, one per pipeline stage (raw -> input -> clean -> preprocess -> output)

(
    raw_path,
    input_path,
    clean_path,
    preprocess_path,
    output_path,
) = (
    '../../data/0_raw/fitrec/',
    '../../data/1_input/fitrec/',
    '../../data/2_clean/fitrec/',
    '../../data/3_preprocess/fitrec/',
    '../../data/4_output/fitrec/',
)

### Functions

In [3]:
# write one chunk of records to a numbered json file, e.g. xxx_0123.json

def output_json(data, file_no, out_prefix=None):
    """Dump ``data`` as JSON to ``<out_prefix><file_no>.json``.

    The file number is left-padded with zeros to 4 characters, so chunk 7
    is written to ``<out_prefix>0007.json``. The path is printed once the
    file has been opened for writing.

    Parameters
    ----------
    data : JSON-serialisable object
        Content passed to ``json.dump``.
    file_no : int or str
        Sequence number used in the file name.
    out_prefix : str, optional
        Path prefix of the output file. When omitted, the notebook-level
        global ``out_path`` is used (the original behaviour).
    """
    if out_prefix is None:
        # fall back to the notebook-level prefix set in the file-selection cell
        out_prefix = out_path

    out_path_2 = out_prefix + str(file_no).zfill(4) + '.json'

    # the target folder may not exist yet; open() would fail without it
    out_dir = os.path.dirname(out_path_2)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_path_2, 'w') as f:
        print(out_path_2)
        json.dump(data, f)

In [4]:
def compute_distance(record, idx):
    """Fill ``record['distance']`` with the distance between consecutive GPS points.

    For every sample ``i >= 1`` the haversine distance between point ``i``
    and point ``i-1`` is stored, in metres, at ``record['distance'][i]``.
    Index 0 has no predecessor and is left untouched.

    The number of samples is taken from the record itself (the shorter of
    ``latitude`` and ``longitude``) rather than assumed to be 500, so shorter
    or longer workouts are handled. ``record['distance']`` is created or
    extended with zeros when it is missing or too short.

    Parameters
    ----------
    record : dict
        Workout with equally indexed ``'latitude'`` and ``'longitude'`` lists.
    idx : any
        Not used by this function.

    Returns
    -------
    dict
        The same ``record`` object, modified in place.
    """
    latitudes = record['latitude']
    longitudes = record['longitude']
    n_points = min(len(latitudes), len(longitudes))

    # make sure there is a slot for every sample we are about to write
    distances = record.setdefault('distance', [])
    if len(distances) < n_points:
        distances.extend([0] * (n_points - len(distances)))

    for i in range(1, n_points):

        cur_loc = (latitudes[i], longitudes[i])
        prev_loc = (latitudes[i-1], longitudes[i-1])

        # haversine() is called with its default unit; * 1000 converts to metre
        distances[i] = haversine(cur_loc, prev_loc) * 1000

    return record

In [5]:
def compute_speed(record, idx, max_speed_kmh=244):
    """Fill ``record['time_diff']`` and ``record['speed']`` from timestamps and distances.

    For every sample ``i >= 1``:

    * ``time_diff[i]`` is ``timestamp[i] - timestamp[i-1]``.
    * If ``time_diff[i] > 0`` the speed is ``distance[i] / time_diff[i]``
      converted from metre/second to km/h. Speeds above ``max_speed_kmh``
      are reported on stdout and stored as ``0.0``.
    * If ``time_diff[i] <= 0`` (duplicate or out-of-order timestamp) the
      previous sample's speed is carried forward.

    Index 0 has no predecessor and is left untouched. The number of samples
    is taken from the record (the shorter of ``timestamp`` and ``distance``)
    rather than assumed to be 500; ``time_diff`` and ``speed`` are created or
    extended with zeros when missing or too short.

    Parameters
    ----------
    record : dict
        Workout with ``'timestamp'`` and ``'distance'`` lists.
    idx : any
        Identifier of the record, only used in the error message.
    max_speed_kmh : number, optional
        Plausibility cap in km/h. Defaults to 244.
        source: https://en.wikipedia.org/wiki/List_of_cycling_records

    Returns
    -------
    dict
        The same ``record`` object, modified in place.
    """
    timestamps = record['timestamp']
    distances = record['distance']
    n_points = min(len(timestamps), len(distances))

    # make sure there is a slot for every sample we are about to write
    for key in ('time_diff', 'speed'):
        series = record.setdefault(key, [])
        if len(series) < n_points:
            series.extend([0] * (n_points - len(series)))

    time_diffs = record['time_diff']
    speeds = record['speed']

    for i in range(1, n_points):

        # timestamp delta (original comment: seconds)
        time_diff = timestamps[i] - timestamps[i-1]
        time_diffs[i] = time_diff

        if time_diff <= 0:
            # no usable interval: reuse the previous speed
            speeds[i] = speeds[i-1]
            continue

        # metre/seconds
        speed = distances[i] / time_diff
        # km/h
        speed = speed * (60*60/1000)

        if speed > max_speed_kmh:
            print('error: world record for {}:{}'.format(idx,i))
            print(time_diff, distances[i], speed)
            speed = 0.0

        speeds[i] = speed

    return record

### Import Data

In [6]:
# sample code from https://sites.google.com/eng.ucsd.edu/fitrec-project/home

# path = Path("data/")
# out_path = str(path / "processed_endomondoHR_proper.npy")
# data = np.load(out_path)[0]

In [7]:
# sample code from https://sites.google.com/eng.ucsd.edu/fitrec-project/home

# data = []
# #with gzip.open('endomondoHR.json.gz') as f:
# with open('endomondoHR_proper.json') as f:
#     for l in f:
#         data.append(eval(l))

In [8]:
# old method

#file = "processed_endomondoHR_proper.npy"
# file = "processed_endomondoHR_proper_interpolate.npy"

# path = input_path + folder + file

#processed_endomondoHR_proper = np.load(path,mmap_mode='r')[0]
#data = np.memmap(path,mode='r+')

In [9]:
# old method

# file = "endomondoHR_proper.json"
# path = input_path + file

# num_workout = 300000
# data = {}
# index = 1

# with open(path) as f:    

#     for l in tqdm(f):
        
#         if index > num_workout:
#             break     

#         data[index] = eval(l)        
#         index += 1

### Output Data

In [10]:
# old method

# file = "endomondoHR_proper_sample.json"
# path = clean_path + folder + file

# with open(path, 'w') as f:
#     json.dump(data, f)

### Import, Split and Output File

In [11]:
# pick the raw file to split: file_index is the position in the two parallel lists below

file_index = 3

# raw file names (positions 2 and 3 intentionally name the same file)
input_filepath = [
    "endomondoHR.json",
    "endomondoMeta.json",
    "endomondoHR_proper.json",
    "endomondoHR_proper.json",
    "processed_endomondoHR_proper.npy",
    "processed_endomondoHR_proper_interpolate.npy",
]

# output prefix for the raw file at the same position
output_filepath = [
    "endomondoHR/endomondoHR_",
    "endomondoMeta/endomondoMeta_",
    # the next two prefixes both take endomondoHR_proper.json as input
    "endomondoHR_proper/endomondoHR_proper_",
    "endomondoHR_proper_dist_spd/endomondoHR_proper_dist_spd_",
    "processed_endomondoHR_proper_",
    "processed_endomondoHR_proper_interpolate_",
]

# build the full paths for the selected entry
in_path = ''.join([raw_path, input_filepath[file_index]])
out_path = ''.join([input_path, output_filepath[file_index]])

print(in_path, out_path, sep='\n')

../../data/0_raw/fitrec/endomondoHR_proper.json
../../data/1_input/fitrec/endomondoHR_proper_dist_spd/endomondoHR_proper_dist_spd_


In [12]:
# import large json file and split into smaller json files (json_per_file records each)

json_per_file = 100
#json_per_file = 1000

# length of the zero-filled distance / time_diff / speed lists given to each workout
points_per_workout = 500

if '.json' in in_path:

    data = {}       # records collected for the chunk currently being built
    index = 1       # 1-based running number of the record, also its key in the chunk
    file_no = 1     # number of the next output file

    with open(in_path) as f:

        for line in f:

            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue

            # NOTE(security): eval() executes whatever the line contains, so only
            # run this on a trusted download. Presumably eval is used because the
            # lines are python dict literals rather than strict json - confirm;
            # if so, ast.literal_eval would be a safer replacement.
            record = eval(line)

            # derive distance / speed only when the record does not carry a speed series
            if 'speed' not in record:

                record['distance'] = [0] * points_per_workout
                record['time_diff'] = [0] * points_per_workout
                record['speed'] = [0] * points_per_workout

                record = compute_distance(record, index)
                record = compute_speed(record, index)

            data[index] = record

            # each file will contain n jsons
            if index % json_per_file == 0:
                output_json(data, file_no)

                data = {}
                file_no += 1

            index += 1

    # last file contains less than n jsons: write whatever is still pending.
    # Checking the chunk itself (not index, which already points one past the last
    # record) avoids dropping the final chunk or writing an empty file.
    if data:
        output_json(data, file_no)

../../data/1_input/fitrec/endomondoHR_proper_dist_spd/endomondoHR_proper_dist_spd_0001.json
../../data/1_input/fitrec/endomondoHR_proper_dist_spd/endomondoHR_proper_dist_spd_0002.json
../../data/1_input/fitrec/endomondoHR_proper_dist_spd/endomondoHR_proper_dist_spd_0003.json
error: world record for 329:269
6 901.833923563926 541.1003541383557


NameError: name 'stop' is not defined

In [None]:
# .npy inputs: map the selected file into memory as a flat array of bytes
if '.npy' in in_path:
    
    #data = np.load(in_path,mmap_mode='r')
    #data = np.memmap(in_path,mode='r+')
    
    #data = np.memmap(in_path,mode='r+')
    # mode='r+' opens the existing file for reading and writing.
    # NOTE(review): np.memmap with dtype='uint8' exposes the file byte by byte,
    # which presumably includes the .npy header and does not decode the stored
    # array - confirm this is intended (the np.load variant above is commented out).
    data = np.memmap(in_path,mode='r+',dtype='uint8')