# Dataset - Feature Extraction

In this notebook, the dataset used to train the machine learning model is created

In [27]:
import pandas as pd
import numpy as np
import datetime as dt
import os
import time

pd.set_option('mode.chained_assignment', None)

In [28]:
# Constants
data_dir = 'geolife-data/Prepared'
output_file = 'processed_data.csv'
files_to_read = 5
earth_radius = 6367

## Functions
Functions are defined that are used for feature extractrion

In [29]:
def haversine(lat1, lon1, lat2, lon2, earth_radius=6371):
    
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    a = np.sin(np.abs(lat2-lat1)/2.0)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(np.abs(lon2-lon1)/2.0)**2
    c = np.arctan2(np.sqrt(a), np.sqrt(1-a))
    
    return earth_radius * 2 * c

In [30]:
def bearing(lat1, lon1, lat2, lon2, earth_radius=6371):
    
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    y = np.sin(np.abs(lon2-lon1)) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(np.abs(lon2-lon1))
    
    return np.degrees(np.arctan2(y,x))

## Load files

In [31]:
filelist = os.listdir(data_dir)
files_to_read = (len(filelist) if files_to_read>len(filelist) else files_to_read)

data_raw = pd.DataFrame()
for file in filelist[:files_to_read]:
    if os.path.isdir(data_dir + "/" + file):
        continue
    data_raw_temp = traj_df = pd.read_csv(data_dir + "/" + file)
    data_raw = pd.concat([data_raw, data_raw_temp])


In [32]:
data_raw['trans_mode'].unique()

array(['car', 'walk', nan, 'bike', 'taxi', 'bus', 'subway', 'airplane',
       'train', 'run'], dtype=object)

## Prepare Data
Colums are renamed and unused colums and rows are dropped

In [33]:
data_raw = data_raw.rename(columns={"height": "altitude"})\
    .drop(columns = ['days_total', 'date', 'time'])\
    .dropna()

In [36]:
data_raw['record_dt'] = data_raw['record_dt'].astype("datetime64")

data_raw.loc[data_raw["trans_mode"].str.contains("taxi"), 'trans_mode'] = "car"

In [37]:
data_raw['trans_mode'].unique()

array(['car', 'walk', 'bike', 'bus', 'subway', 'airplane', 'train', 'run'],
      dtype=object)

In [39]:
used_classes = ['car', 'walk', 'bike', 'train']

data_raw = data_raw.drop(data_raw[~data_raw['trans_mode'].isin(used_classes)].index)

In [40]:
data_raw

Unnamed: 0.1,Unnamed: 0,latitude,longitude,altitude,record_dt,user,trans_trip,trans_mode
0,0,40.013683,116.473467,154.199475,2007-10-08 01:56:45,106,0.0,car
1,1,40.012967,116.476683,154.199475,2007-10-08 01:56:58,106,0.0,car
2,2,40.011567,116.479800,147.637795,2007-10-08 01:57:13,106,0.0,car
3,3,40.009733,116.482167,141.076115,2007-10-08 01:57:26,106,0.0,car
4,4,40.000600,116.490833,147.637795,2007-10-08 01:58:37,106,0.0,car
...,...,...,...,...,...,...,...,...
58715,1316,39.982584,116.306947,553.000000,2008-08-23 13:54:38,64,42.0,bike
58716,1317,39.982585,116.306946,554.000000,2008-08-23 13:54:40,64,42.0,bike
58717,1318,39.982585,116.306946,554.000000,2008-08-23 13:54:42,64,42.0,bike
58718,1319,39.982575,116.306920,555.000000,2008-08-23 13:54:44,64,42.0,bike


## Process data

In [41]:
users = data_raw['user'].unique()

In [42]:
data = pd.DataFrame()
for user in users[:files_to_read]:
    print("User: " + str(user))
    data_user = data_raw[data_raw.user == user]
    trips = data_user['trans_trip'].unique()
    for trip in trips[:]:
        
        # data_traj consist of all trajectories of one singel trip
        
        data_traj = data_user[data_user.trans_trip == trip]
        data_traj = data_traj.sort_values(by=['record_dt'])
        
        # Distance to next entry in meters
        data_traj['dist'] = haversine(data_traj.latitude, data_traj.longitude, data_traj.latitude.shift(-1), data_traj.longitude.shift(-1)) * 1000
        
        data_traj['bearing'] = bearing(data_traj.latitude, data_traj.longitude, data_traj.latitude.shift(-1), data_traj.longitude.shift(-1))
        data_traj['brngChng'] = 180.0 - np.abs(np.abs(data_traj['bearing'] - data_traj['bearing'].shift(-1)) - 180.0)
        
        # Time difference to next entry in seconds
        data_traj['time_delta'] = (data_traj.record_dt.shift(-1) - data_traj.record_dt).astype("int64") / (1000000000.0)
        
        # Velocity until next entry in m/s
        data_traj['velocity'] = data_traj['dist'] / (data_traj['time_delta'])
        
        # Acceleration in m/(sˆ2)
        data_traj['acceleration'] = (data_traj['velocity'].shift(-1) - data_traj['velocity']) / (data_traj['time_delta'])
        
        
        # Features 
        
        temp =  pd.DataFrame()
        
        ## Labels, not used for ml model
        temp['user'] = data_traj['user']
        temp['trans_trip'] = data_traj['trans_trip']
    
        
        ## The velocities of the three segments
        for i in range(3):
            temp['v' + str(i)] = data_traj.velocity.shift(-i)
        
        ## The acceletations of the three segments
        for i in range(3):
            temp['a' + str(i)] = data_traj.acceleration.shift(-i)
        
        ## The bearing Change of the three segments
        for i in range(3):
            temp['brCh' + str(i)] = data_traj['brngChng'].shift(-i)
            

        temp['max(a)'] = temp[['a0', 'a1', 'a2']].values.max(1)
        temp['min(a)'] = temp[['a0', 'a1', 'a2']].values.min(1)
        temp['range(a)'] = temp['max(a)'] - temp['min(a)']
        temp['sum(a)'] = temp['a0'] + temp['a2'] + temp['a2']
        temp['avg(a)'] = temp['sum(a)'] / 3
        temp['var(a)'] = ((temp['a0'] - temp['avg(a)'])**2\
                        + (temp['a1'] - temp['avg(a)'])**2\
                        + (temp['a2'] - temp['avg(a)']**2))/3
        

        
        temp['max(v)'] = temp[['v0', 'v1', 'v2']].values.max(1)
        temp['min(v)'] = temp[['v0', 'v1', 'v2']].values.min(1)
        temp['range(v)'] = temp['max(v)'] - temp['min(v)']
        temp['sum(v)'] = temp['v0'] + temp['v1'] + temp['v2']
        temp['avg(v)'] = temp['sum(v)'] / 3
        temp['var(v)'] = ((temp['v0'] - temp['avg(v)'])**2\
                        + (temp['v1'] - temp['avg(v)'])**2\
                        + (temp['v2'] - temp['avg(v)']**2))/3
        

        
        temp['max(brCh)'] = temp[['brCh0', 'brCh1', 'brCh2']].values.max(1)
        temp['min(brCh)'] = temp[['brCh0', 'brCh1', 'brCh2']].values.min(1)
        temp['range(brCh)'] = temp['max(brCh)'] - temp['min(brCh)']
        temp['sum(brCh)'] = temp['brCh0'] + temp['brCh1'] + temp['brCh2']  
        temp['avg(brCh)'] = temp['sum(brCh)']/3
        temp['var(brCh)'] = ((temp['brCh0'] - temp['avg(brCh)'])**2\
                           + (temp['brCh1'] - temp['avg(brCh)'])**2\
                           + (temp['brCh2'] - temp['avg(brCh)']**2))/3

        temp['class'] = data_traj['trans_mode']
        temp = temp.dropna()
        
        data = pd.concat([data,temp])
 

User: 106
User: 138
User: 163
User: 82
User: 64


## Preprocessing

In [43]:
final = data

In [44]:
final

Unnamed: 0,user,trans_trip,v0,v1,v2,a0,a1,a2,brCh0,brCh1,...,sum(v),avg(v),var(v),max(brCh),min(brCh),range(brCh),sum(brCh),avg(brCh),var(brCh),class
0,106,0.0,21.946037,20.514227,22.052250,-0.110139,0.102535,-0.336079,14.171644,14.933392,...,64.512514,21.504171,-146.400635,14.933392,8.662090,6.271302,37.767126,12.589042,-47.273760,car
1,106,0.0,20.514227,22.052250,17.683225,0.102535,-0.336079,0.016664,14.933392,8.662090,...,60.249703,20.083234,-127.196765,29.660693,8.662090,20.998603,53.256175,17.752058,-64.967494,car
2,106,0.0,22.052250,17.683225,18.866391,-0.336079,0.016664,-0.232531,8.662090,29.660693,...,58.601866,19.533955,-117.647337,35.435421,8.662090,26.773331,73.758205,24.586068,-96.571477,car
3,106,0.0,17.683225,18.866391,14.913357,0.016664,-0.232531,-0.256793,29.660693,35.435421,...,51.462973,17.154324,-92.048858,48.211271,29.660693,18.550578,113.307386,37.769129,-435.700965,car
4,106,0.0,18.866391,14.913357,13.629393,-0.232531,-0.256793,0.484412,35.435421,48.211271,...,47.409141,15.803047,-75.310424,52.911282,35.435421,17.475861,136.557975,45.519325,-636.721994,car
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58711,64,42.0,0.166792,0.281233,0.055597,0.057220,-0.112818,-0.006498,8.712678,8.712678,...,0.503622,0.167874,0.013422,90.000000,8.712678,81.287322,107.425355,35.808452,92.038905,bike
58712,64,42.0,0.281233,0.055597,0.042601,-0.112818,-0.006498,0.013721,8.712678,90.000000,...,0.379431,0.126477,0.018526,90.000000,8.712678,81.287322,151.251905,50.417302,272.233518,bike
58713,64,42.0,0.055597,0.042601,0.070042,-0.006498,0.013721,-0.035021,90.000000,52.539228,...,0.168241,0.056080,0.022360,90.000000,37.460772,52.539228,180.000000,60.000000,-868.958697,bike
58714,64,42.0,0.042601,0.070042,0.000000,0.013721,-0.035021,0.619666,52.539228,37.460772,...,0.112643,0.037548,-0.000109,116.654432,37.460772,79.193660,206.654432,68.884811,-1124.604808,bike


In [45]:
final = final.drop(final[(final['class'] == 'walk') & ((final['max(v)'] > 7) | (final['max(a)'] >= 3))].index)
final = final.drop(final[(final['class'] == 'bike') & ((final['max(v)'] > 12) | (final['max(a)'] >= 3))].index)
final = final.drop(final[(final['class'] == 'car') & ((final['max(v)'] > 50) | (final['max(a)'] >= 10))].index)
final = final.drop(final[(final['class'] == 'train') & ((final['max(v)'] > 34) | (final['max(a)'] >= 3))].index)

In [46]:
final.to_csv(output_file)
print(str(len(final)) + " trajectory segments are saved for classification.")

98817 trajectory segments are saved for classification.


In [47]:
final['class'].unique()

array(['car', 'walk', 'bike', 'train'], dtype=object)

In [48]:
final.groupby(by= "class").count()

Unnamed: 0_level_0,user,trans_trip,v0,v1,v2,a0,a1,a2,brCh0,brCh1,...,range(v),sum(v),avg(v),var(v),max(brCh),min(brCh),range(brCh),sum(brCh),avg(brCh),var(brCh)
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bike,12079,12079,12079,12079,12079,12079,12079,12079,12079,12079,...,12079,12079,12079,12079,12079,12079,12079,12079,12079,12079
car,31364,31364,31364,31364,31364,31364,31364,31364,31364,31364,...,31364,31364,31364,31364,31364,31364,31364,31364,31364,31364
train,6967,6967,6967,6967,6967,6967,6967,6967,6967,6967,...,6967,6967,6967,6967,6967,6967,6967,6967,6967,6967
walk,48407,48407,48407,48407,48407,48407,48407,48407,48407,48407,...,48407,48407,48407,48407,48407,48407,48407,48407,48407,48407
