In [9]:
import pandas as pd
import numpy as np
import xgboost as xgb
import os
pd.set_option('mode.chained_assignment', None)

In [6]:
# Constants
data_dir = 'geolife-data/Prepared'
output_file = 'processed_data.csv'
files_to_read = 15
earth_radius = 6367

In [7]:
def haversine(lat1, lon1, lat2, lon2, earth_radius=6371):
    
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    a = np.sin(np.abs(lat2-lat1)/2.0)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(np.abs(lon2-lon1)/2.0)**2
    c = np.arctan2(np.sqrt(a), np.sqrt(1-a))
    
    return earth_radius * 2 * c

In [14]:
def bearing(lat1, lon1, lat2, lon2, earth_radius=6371):
    
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    y = np.sin(np.abs(lon2-lon1)) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(np.abs(lon2-lon1))
    
    return np.degrees(np.arctan2(y,x))

In [10]:
filelist = os.listdir(data_dir)
files_to_read = (len(filelist) if files_to_read>len(filelist) else files_to_read)

data_raw = pd.DataFrame()
for file in filelist[:files_to_read]:
    if os.path.isdir(data_dir + "/" + file):
        continue
    data_raw_temp = traj_df = pd.read_csv(data_dir + "/" + file)
    data_raw = pd.concat([data_raw, data_raw_temp])


In [11]:
data_raw = data_raw.rename(columns={"height": "altitude"})\
    .drop(columns = ['days_total', 'date', 'time'])\
    .dropna()

data_raw['record_dt'] = data_raw['record_dt'].astype("datetime64")

used_classes = ['car', 'walk', 'bike', 'train']

data_raw = data_raw.drop(data_raw[~data_raw['trans_mode'].isin(used_classes)].index)

In [12]:
users = data_raw['user'].unique()

In [18]:
data = pd.DataFrame()
for user in users[:files_to_read]:
    print("User: " + str(user))
    data_user = data_raw[data_raw.user == user]
    trips = data_user['trans_trip'].unique()
    for trip in trips[:]:
        
        # data_traj consist of all trajectories of one singel trip
        
        data_traj = data_user[data_user.trans_trip == trip]
        data_traj = data_traj.sort_values(by=['record_dt'])
        
        # Distance to next entry in meters
        data_traj['dist'] = haversine(data_traj.latitude, data_traj.longitude, data_traj.latitude.shift(-1), data_traj.longitude.shift(-1)) * 1000
        
        data_traj['bearing'] = bearing(data_traj.latitude, data_traj.longitude, data_traj.latitude.shift(-1), data_traj.longitude.shift(-1))
        data_traj['brngChng'] = 180.0 - np.abs(np.abs(data_traj['bearing'] - data_traj['bearing'].shift(-1)) - 180.0)
        
        # Time difference to next entry in seconds
        data_traj['time_delta'] = (data_traj.record_dt.shift(-1) - data_traj.record_dt).astype("int64") / (1000000000.0)
        
        # Velocity until next entry in m/s
        data_traj['velocity'] = data_traj['dist'] / (data_traj['time_delta'])
        
        # Acceleration in m/(sˆ2)
        data_traj['acceleration'] = (data_traj['velocity'].shift(-1) - data_traj['velocity']) / (data_traj['time_delta'])
        
        
        data = pd.concat([data, data_traj])
 

User: 106
User: 138
User: 163
User: 82
User: 64
User: 20
User: 89
User: 112
User: 96
User: 128
User: 92
User: 102
User: 86


In [20]:
data.describe()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,altitude,user,trans_trip,dist,bearing,brngChng,time_delta,velocity,acceleration
count,410600.0,410600.0,410600.0,410600.0,410600.0,410600.0,409565.0,409565.0,408534.0,410600.0,409327.0,408060.0
mean,1451.350543,39.841724,109.730593,472.641111,115.58266,509.783989,36.645141,89.192027,18.504814,-23249360.0,inf,
std,2885.773355,4.194974,39.963554,1010.275983,36.94495,414.203122,509.475885,54.926855,28.79424,462490600.0,,
min,0.0,22.260945,-122.331533,-22139.1,20.0,0.0,0.0,0.0,0.0,-9223372000.0,0.0,-inf
25%,195.0,39.967375,116.273795,118.1,128.0,169.0,2.317364,38.615309,1.676207,2.0,1.164503,-0.1787937
50%,515.0,39.986057,116.327137,160.8,128.0,457.0,5.380679,89.999991,7.163084,2.0,2.561303,0.001972002
75%,1423.0,40.07533,116.37314,262.5,128.0,722.0,35.318324,139.293976,21.794372,6.0,8.411582,0.1870326
max,25578.0,51.486608,126.993377,28969.8,163.0,3151.0,179535.150773,180.0,180.0,46343.0,inf,inf
