# Dataset - Feature Extraction

In this notebook, the dataset used to train the machine learning model is created

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import os
import time

pd.set_option('mode.chained_assignment', None)

In [2]:
# Constants
data_dir = 'geolife-data/Prepared'
output_file = 'processed_data.csv'
files_to_read = 15
earth_radius = 6367

## Functions
Functions are defined that are used for feature extractrion

In [3]:
def haversine(lat1, lon1, lat2, lon2, earth_radius=6371):
    
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    a = np.sin(np.abs(lat2-lat1)/2.0)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(np.abs(lon2-lon1)/2.0)**2
    c = np.arctan2(np.sqrt(a), np.sqrt(1-a))
    
    return earth_radius * 2 * c

In [4]:
def bearing(lat1, lon1, lat2, lon2, earth_radius=6371):
    
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    y = np.sin(np.abs(lon2-lon1)) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(np.abs(lon2-lon1))
    
    return np.degrees(np.arctan2(y,x))

## Load files

In [5]:
filelist = os.listdir(data_dir)
files_to_read = (len(filelist) if files_to_read>len(filelist) else files_to_read)

data_raw = pd.DataFrame()
for file in filelist[:files_to_read]:
    if os.path.isdir(data_dir + "/" + file):
        continue
    data_raw_temp = traj_df = pd.read_csv(data_dir + "/" + file)
    data_raw = pd.concat([data_raw, data_raw_temp])


## Prepare Data
Colums are renamed and unused colums and rows are dropped

In [6]:
data_raw = data_raw.rename(columns={"height": "altitude"})\
    .drop(columns = ['days_total', 'date', 'time'])\
    .dropna()

In [7]:
data_raw['record_dt'] = data_raw['record_dt'].astype("datetime64")

In [8]:
used_classes = ['car', 'walk', 'bike', 'train']

data_raw = data_raw.drop(data_raw[~data_raw['trans_mode'].isin(used_classes)].index)

In [9]:
data_raw

Unnamed: 0.1,Unnamed: 0,latitude,longitude,altitude,record_dt,user,trans_trip,trans_mode
24,24,40.033650,116.520433,164.041995,2007-10-08 02:08:31,106,0.0,car
25,25,40.031683,116.518900,167.322835,2007-10-08 02:08:55,106,0.0,car
26,26,40.031650,116.517667,164.041995,2007-10-08 02:09:08,106,0.0,car
27,27,40.031850,116.516633,160.761155,2007-10-08 02:09:23,106,0.0,car
28,28,40.031800,116.515667,157.480315,2007-10-08 02:09:34,106,0.0,car
...,...,...,...,...,...,...,...,...
55,55,39.982433,116.190717,518.372703,2007-04-30 01:17:04,86,0.0,car
56,56,39.982083,116.190650,541.338583,2007-04-30 01:17:11,86,0.0,car
57,57,39.981783,116.191033,544.619423,2007-04-30 01:17:17,86,0.0,car
58,58,39.981700,116.191317,544.619423,2007-04-30 01:17:21,86,0.0,car


## Process data

In [10]:
users = data_raw['user'].unique()

In [11]:
data = pd.DataFrame()
for user in users[:files_to_read]:
    print("User: " + str(user))
    data_user = data_raw[data_raw.user == user]
    trips = data_user['trans_trip'].unique()
    for trip in trips[:]:
        
        # data_traj consist of all trajectories of one singel trip
        
        data_traj = data_user[data_user.trans_trip == trip]
        data_traj = data_traj.sort_values(by=['record_dt'])
        
        # Distance to next entry in meters
        data_traj['dist'] = haversine(data_traj.latitude, data_traj.longitude, data_traj.latitude.shift(-1), data_traj.longitude.shift(-1)) * 1000
        
        data_traj['bearing'] = bearing(data_traj.latitude, data_traj.longitude, data_traj.latitude.shift(-1), data_traj.longitude.shift(-1))
        
        # Time difference to next entry in seconds
        data_traj['time_delta'] = (data_traj.record_dt.shift(-1) - data_traj.record_dt).astype("int64") / (1000000000.0)
        
        # Velocity until next entry in m/s
        data_traj['velocity'] = data_traj['dist'] / (data_traj['time_delta'])
        
        # Acceleration in m/(sˆ2)
        data_traj['acceleration'] = (data_traj['velocity'].shift(-1) - data_traj['velocity']) / (data_traj['time_delta'])
        
        
        # Features 
        
        temp =  pd.DataFrame()
        temp['user'] = data_traj['user']
        temp['trans_trip'] = data_traj['trans_trip']
    

        for i in range(3):
            temp['v' + str(i)] = data_traj.velocity.shift(-i)

        for i in range(3):
            temp['a' + str(i)] = data_traj.acceleration.shift(-i)
            

        temp['max(a)'] = temp[['a0', 'a1', 'a2']].values.max(1)
        temp['min(a)'] = temp[['a0', 'a1', 'a2']].values.min(1)
        temp['range(a)'] = temp['max(a)'] - temp['min(a)']
        temp['avg(a)'] = (temp.a0 + temp.a1 + temp.a2) / 3
        temp['var(a)'] =   ((temp.a0 - temp['avg(a)'])**2\
                         + (temp.a1 - temp['avg(a)'])**2\
                         + (temp.a2 - temp['avg(a)']**2))/3
        

        
        temp['max(v)'] = temp[['v0', 'v1', 'v2']].values.max(1)
        temp['min(v)'] = temp[['v0', 'v1', 'v2']].values.min(1)
        temp['avg(v)'] = (temp.v0 + temp.v1 + temp.v2) / 3
        temp['var(v)'] =   ((temp.v0 - temp['avg(v)'])**2\
                 + (temp.v1 - temp['avg(v)'])**2\
                 + (temp.v2 - temp['avg(v)']**2))/3
        temp['range(v)'] = temp['max(v)'] - temp['min(v)']
        
        temp['brngChng'] = 180.0 - np.abs(np.abs(data_traj['bearing'] - data_traj['bearing'].shift(-1)) - 180.0)

        
        temp['class'] = data_traj['trans_mode']
        temp = temp.dropna()
        
        data = pd.concat([data,temp])
 

User: 106
User: 138
User: 163
User: 82
User: 64
User: 20
User: 89
User: 112
User: 96
User: 128
User: 92
User: 102
User: 86


## Preprocessing

In [12]:
final = data

In [13]:
final

Unnamed: 0,user,trans_trip,v0,v1,v2,a0,a1,a2,max(a),min(a),range(a),avg(a),var(a),max(v),min(v),avg(v),var(v),range(v),brngChng,class
24,106,0.0,10.611919,8.082485,6.049726,-0.105393,-0.156366,0.096625,0.096625,-0.156366,0.252991,-0.055045,0.035465,10.611919,6.049726,8.248043,-18.788392,4.562193,57.142437,car
25,106,0.0,8.082485,6.049726,7.499100,-0.156366,0.096625,0.058772,0.096625,-0.156366,0.252991,-0.000323,0.030840,8.082485,6.049726,7.210437,-14.127861,2.032759,16.207402,car
26,106,0.0,6.049726,7.499100,8.145592,0.096625,0.058772,-0.054668,0.096625,-0.054668,0.151293,0.033576,-0.017062,8.145592,6.049726,7.231473,-14.226818,2.095866,18.050526,car
27,106,0.0,7.499100,8.145592,7.762913,0.058772,-0.054668,-0.492269,0.058772,-0.492269,0.551041,-0.162722,-0.152671,8.145592,7.499100,7.802535,-17.635625,0.646492,19.091665,car
28,106,0.0,8.145592,7.762913,4.317031,-0.054668,-0.492269,0.300958,0.300958,-0.492269,0.793226,-0.081993,0.154436,8.145592,4.317031,6.741845,-12.707455,3.828561,31.979505,car
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51,86,0.0,8.057947,7.970676,8.545079,-0.009697,0.063823,0.047273,0.063823,-0.009697,0.073519,0.033800,0.016308,8.545079,7.970676,8.191234,-19.494941,0.574403,1.333434,car
52,86,0.0,7.970676,8.545079,9.443263,0.063823,0.047273,-0.637435,0.063823,-0.637435,0.701258,-0.175447,-0.187121,9.443263,7.970676,8.653006,-21.651342,1.472587,10.546353,car
53,86,0.0,8.545079,9.443263,5.618650,0.047273,-0.637435,0.308895,0.308895,-0.637435,0.946330,-0.093756,0.205194,9.443263,5.618650,7.868997,-17.789024,3.824613,17.687351,car
54,86,0.0,9.443263,5.618650,7.780912,-0.637435,0.308895,-0.219396,0.308895,-0.637435,0.946330,-0.182645,0.065230,9.443263,5.618650,7.614275,-14.289520,3.824613,50.140135,car


In [14]:
final = final.drop(final[(final['class'] == 'walk') & ((final['max(v)'] > 7) | (final['max(a)'] >= 3))].index)
final = final.drop(final[(final['class'] == 'bike') & ((final['max(v)'] > 12) | (final['max(a)'] >= 3))].index)
final = final.drop(final[(final['class'] == 'car') & ((final['max(v)'] > 50) | (final['max(a)'] >= 10))].index)
final = final.drop(final[(final['class'] == 'train') & ((final['max(v)'] > 34) | (final['max(a)'] >= 3))].index)

In [16]:
final.to_csv(output_file)
print(str(len(final)) + " trajectory segments are saved for classification.")

343323 trajectory segments are saved for classification.
