# Dataset - Feature Extraction

In this notebook, the dataset used to train the machine learning model is created

In [104]:
import pandas as pd
import numpy as np
import datetime as dt
import os
import time

pd.set_option('mode.chained_assignment', None)

In [105]:
# Constants
data_dir = 'geolife-data/Prepared'
output_file = 'processed_data.csv'
files_to_read = 100
earth_radius = 6367

## Functions
Functions are defined that are used for feature extractrion

In [106]:
def haversine(lat1, lon1, lat2, lon2, earth_radius=6371):
    
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    a = np.sin(np.abs(lat2-lat1)/2.0)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin(np.abs(lon2-lon1)/2.0)**2
    c = np.arctan2(np.sqrt(a), np.sqrt(1-a))
    
    return earth_radius * 2 * c

In [107]:
def bearing(lat1, lon1, lat2, lon2, earth_radius=6371):
    
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    y = np.sin(np.abs(lon2-lon1)) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(np.abs(lon2-lon1))
    
    return np.degrees(np.arctan2(y,x))

## Load files

In [108]:
filelist = os.listdir(data_dir)
files_to_read = (len(filelist) if files_to_read>len(filelist) else files_to_read)

data_raw = pd.DataFrame()
for file in filelist[:files_to_read]:
    if os.path.isdir(data_dir + "/" + file):
        continue
    data_raw_temp = traj_df = pd.read_csv(data_dir + "/" + file)
    data_raw = pd.concat([data_raw, data_raw_temp])


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


In [109]:
data_raw['trans_mode'].unique()

array(['car', 'walk', nan, 'bike', 'taxi', 'bus', 'subway', 'airplane',
       'train', 'run', 'boat', 'motorcycle'], dtype=object)

## Prepare Data
Colums are renamed and unused colums and rows are dropped

In [110]:
backup = data_raw
data_raw.count()

Unnamed: 0    6807961
date          6807961
days_total    6807961
height        6807961
latitude      6807961
longitude     6807961
record_dt     6807961
time          6807961
trans_mode    5440616
trans_trip    5440616
user          6807961
dtype: int64

In [117]:
data_raw = backup

In [118]:
data_raw = data_raw.rename(columns={"height": "altitude"})\
    .drop(columns = ['days_total', 'date', 'time'])\
    .dropna()

In [120]:
data_raw

Unnamed: 0.1,Unnamed: 0,altitude,latitude,longitude,record_dt,trans_mode,trans_trip,user
0,0,154.199475,40.013683,116.473467,2007-10-08 01:56:45,car,0.0,106.0
1,1,154.199475,40.012967,116.476683,2007-10-08 01:56:58,car,0.0,106.0
2,2,147.637795,40.011567,116.479800,2007-10-08 01:57:13,car,0.0,106.0
3,3,141.076115,40.009733,116.482167,2007-10-08 01:57:26,car,0.0,106.0
4,4,147.637795,40.000600,116.490833,2007-10-08 01:58:37,car,0.0,106.0
...,...,...,...,...,...,...,...,...
1310,20,157.480315,39.975483,116.329883,2007-10-12 00:59:57,bike,2.0,56.0
1311,21,157.480315,39.975400,116.330300,2007-10-12 01:00:12,bike,2.0,56.0
1312,22,104.986877,39.975417,116.330533,2007-10-12 01:00:25,bike,2.0,56.0
1313,23,157.480315,39.975533,116.330733,2007-10-12 01:00:53,bike,2.0,56.0


In [122]:
data_raw['record_dt'] = data_raw['record_dt'].astype("datetime64")

data_raw.loc[data_raw["trans_mode"].str.contains("taxi"), 'trans_mode'] = "car"

In [123]:
data_raw['trans_mode'].unique()

array(['car', 'walk', 'bike', 'bus', 'subway', 'airplane', 'train', 'run',
       'boat', 'motorcycle'], dtype=object)

In [114]:
used_classes = ['car', 'walk']

data_raw = data_raw.drop(data_raw[~data_raw['trans_mode'].isin(used_classes)].index)

In [128]:
data = pd.DataFrame()
data = pd.concat([data_raw[data_raw["trans_mode"] == "car"], data_raw[data_raw["trans_mode"] == "walk"]])
#data_raw[data_raw["trans_mode"] == "car"]

In [130]:
data.groupby(by="trans_mode").count()

Unnamed: 0_level_0,Unnamed: 0,altitude,latitude,longitude,record_dt,trans_trip,user
trans_mode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
car,754343,754343,754343,754343,754343,754343,754343
walk,1582693,1582693,1582693,1582693,1582693,1582693,1582693


In [132]:
data_raw = data

## Process data

In [133]:
users = data_raw['user'].unique()

In [138]:
data = pd.DataFrame()
for user in users[:files_to_read]:
    print("User: " + str(user))
    data_user = data_raw[data_raw.user == user]
    trips = data_user['trans_trip'].unique()
    for trip in trips[:]:
        
        # data_traj consist of all trajectories of one singel trip
        
        data_traj = data_user[data_user.trans_trip == trip]
        data_traj = data_traj.sort_values(by=['record_dt'])
        
        # Distance to next entry in meters
        data_traj['dist'] = haversine(data_traj.latitude, data_traj.longitude, data_traj.latitude.shift(-1), data_traj.longitude.shift(-1)) * 1000
        
        data_traj['bearing'] = bearing(data_traj.latitude, data_traj.longitude, data_traj.latitude.shift(-1), data_traj.longitude.shift(-1))
        data_traj['brngChng'] = 180.0 - np.abs(np.abs(data_traj['bearing'] - data_traj['bearing'].shift(-1)) - 180.0)
        
        # Time difference to next entry in seconds
        data_traj['time_delta'] = (data_traj.record_dt.shift(-1) - data_traj.record_dt).astype("int64") / (1000000000.0)
        
        # Velocity until next entry in m/s
        data_traj['velocity'] = data_traj['dist'] / (data_traj['time_delta'])
        
        # Acceleration in m/(sˆ2)
        data_traj['acceleration'] = (data_traj['velocity'].shift(-1) - data_traj['velocity']) / (data_traj['time_delta'])
        
        
        # Features 
        
        temp =  pd.DataFrame()
        
        ## Labels, not used for ml model
        temp['dist'] = data_traj['dist']
        temp['user'] = data_traj['user']
        temp['trans_trip'] = data_traj['trans_trip']
    
        
        ## The velocities of the three segments
        for i in range(3):
            temp['v' + str(i)] = data_traj.velocity.shift(-i)
        
        ## The acceletations of the three segments
        for i in range(3):
            temp['a' + str(i)] = data_traj.acceleration.shift(-i)
        
        ## The bearing Change of the three segments
        for i in range(3):
            temp['brCh' + str(i)] = data_traj['brngChng'].shift(-i)
            

        temp['max(a)'] = temp[['a0', 'a1', 'a2']].values.max(1)
        temp['min(a)'] = temp[['a0', 'a1', 'a2']].values.min(1)
        temp['range(a)'] = temp['max(a)'] - temp['min(a)']
        temp['sum(a)'] = temp['a0'] + temp['a2'] + temp['a2']
        temp['avg(a)'] = temp['sum(a)'] / 3
        temp['var(a)'] = ((temp['a0'] - temp['avg(a)'])**2\
                        + (temp['a1'] - temp['avg(a)'])**2\
                        + (temp['a2'] - temp['avg(a)']**2))/3
        

        
        temp['max(v)'] = temp[['v0', 'v1', 'v2']].values.max(1)
        temp['min(v)'] = temp[['v0', 'v1', 'v2']].values.min(1)
        temp['range(v)'] = temp['max(v)'] - temp['min(v)']
        temp['sum(v)'] = temp['v0'] + temp['v1'] + temp['v2']
        temp['avg(v)'] = temp['sum(v)'] / 3
        temp['var(v)'] = ((temp['v0'] - temp['avg(v)'])**2\
                        + (temp['v1'] - temp['avg(v)'])**2\
                        + (temp['v2'] - temp['avg(v)']**2))/3
        

        
        temp['max(brCh)'] = temp[['brCh0', 'brCh1', 'brCh2']].values.max(1)
        temp['min(brCh)'] = temp[['brCh0', 'brCh1', 'brCh2']].values.min(1)
        temp['range(brCh)'] = temp['max(brCh)'] - temp['min(brCh)']
        temp['sum(brCh)'] = temp['brCh0'] + temp['brCh1'] + temp['brCh2']  
        temp['avg(brCh)'] = temp['sum(brCh)']/3
        temp['var(brCh)'] = ((temp['brCh0'] - temp['avg(brCh)'])**2\
                           + (temp['brCh1'] - temp['avg(brCh)'])**2\
                           + (temp['brCh2'] - temp['avg(brCh)']**2))/3

        temp['class'] = data_traj['trans_mode']
        temp = temp.dropna()
        
        data = pd.concat([data,temp])
 

User: 106.0
User: 163.0
User: 82.0
User: 89.0
User: 112.0
User: 128.0
User: 102.0
User: 86.0
User: 167.0
User: 21.0
User: 126.0
User: 65.0
User: 139.0
User: 108.0
User: 153.0
User: 75.0
User: 129.0
User: 10.0
User: 179.0
User: 98.0
User: 175.0
User: 53.0
User: 154.0
User: 125.0
User: 58.0
User: 80.0
User: 161.0
User: 62.0
User: 84.0
User: 144.0
User: 69.0
User: 76.0
User: 114.0
User: 52.0
User: 111.0
User: 174.0
User: 105.0
User: 81.0
User: 67.0
User: 78.0
User: 101.0
User: 85.0
User: 115.0
User: 68.0
User: 56.0
User: 138.0
User: 64.0
User: 20.0
User: 96.0
User: 92.0
User: 60.0
User: 107.0
User: 97.0
User: 136.0
User: 117.0
User: 87.0
User: 147.0
User: 110.0
User: 104.0
User: 73.0
User: 141.0
User: 91.0
User: 170.0


## Preprocessing

In [139]:
final = data

In [140]:
final.describe()

Unnamed: 0,dist,user,trans_trip,v0,v1,v2,a0,a1,a2,brCh0,...,range(v),sum(v),avg(v),var(v),max(brCh),min(brCh),range(brCh),sum(brCh),avg(brCh),var(brCh)
count,1948736.0,1948736.0,1948736.0,1948736.0,1948736.0,1948736.0,1948736.0,1948736.0,1948736.0,1948736.0,...,1948736.0,1948736.0,1948736.0,1948736.0,1948736.0,1948736.0,1948736.0,1948736.0,1948736.0,1948736.0
mean,19.31598,108.4805,396.3026,4.572292,4.564142,4.561146,-0.01463382,-0.01265045,-0.01231927,22.27616,...,1.408769,13.69758,4.56586,-12.8694,38.84057,7.81275,31.02782,66.79826,22.26609,-60.43959
std,810.3477,43.57164,401.8653,8.013037,7.706786,7.710662,3.821264,3.781633,3.784746,31.94544,...,7.554105,20.83052,6.943507,2874.187,41.39993,12.76854,36.71814,71.75314,23.91771,664.2624
min,0.0,10.0,0.0,0.0,0.0,0.0,-4141.439,-4141.439,-4141.439,0.0,...,0.0,0.0,0.0,-1741.959,0.0,0.0,0.0,0.0,0.0,-10740.0
25%,1.538046,84.0,98.0,0.8009791,0.8006814,0.800162,-0.1725285,-0.1723435,-0.172303,2.179001,...,0.3506354,2.727518,0.9091726,-5.702955,7.58203,0.6279653,4.796508,13.50904,4.503013,-66.77601
50%,3.297438,115.0,302.0,1.494983,1.494151,1.49356,0.0,0.0,0.0,9.695908,...,0.7327886,4.541849,1.51395,-0.1360334,24.28348,3.232672,17.07148,43.32537,14.44179,-1.202815
75%,12.49331,153.0,568.0,5.318063,5.302699,5.291673,0.1692344,0.1690851,0.1689213,27.76992,...,1.487206,15.86996,5.289985,0.0531568,55.22885,9.68112,42.59594,94.18401,31.39467,4.035413
max,1077057.0,179.0,3151.0,4142.182,4142.182,4142.182,2070.374,2070.374,2070.374,180.0,...,4141.439,4145.1,1381.7,2540198.0,180.0,180.0,180.0,540.0,180.0,4800.0


In [141]:
final = final.drop(final[(final['class'] == 'walk') & ((final['max(v)'] > 7) | (final['max(a)'] >= 3))].index)
final = final.drop(final[(final['class'] == 'bike') & ((final['max(v)'] > 12) | (final['max(a)'] >= 3))].index)
final = final.drop(final[(final['class'] == 'car') & ((final['max(v)'] > 50) | (final['max(a)'] >= 10))].index)
final = final.drop(final[(final['class'] == 'train') & ((final['max(v)'] > 34) | (final['max(a)'] >= 3))].index)

In [142]:
final.to_csv(output_file)
print(str(len(final)) + " trajectory segments are saved for classification.")

1654779 trajectory segments are saved for classification.


In [143]:
final['class'].unique()

array(['car', 'walk'], dtype=object)

In [144]:
final.groupby(by= "class").count()

Unnamed: 0_level_0,dist,user,trans_trip,v0,v1,v2,a0,a1,a2,brCh0,...,range(v),sum(v),avg(v),var(v),max(brCh),min(brCh),range(brCh),sum(brCh),avg(brCh),var(brCh)
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
car,608494,608494,608494,608494,608494,608494,608494,608494,608494,608494,...,608494,608494,608494,608494,608494,608494,608494,608494,608494,608494
walk,1046285,1046285,1046285,1046285,1046285,1046285,1046285,1046285,1046285,1046285,...,1046285,1046285,1046285,1046285,1046285,1046285,1046285,1046285,1046285,1046285
