In [1]:
import sys
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

pd.options.mode.chained_assignment = None

# Read data
t1 = time.time()
points = pd.read_csv('inc_points_2.csv')
tracks = pd.read_csv('rich_tracks_2.csv')
os = pd.read_csv('os.csv')
t2 = time.time()
print('Loading finished. Elapsed time: ', t2 - t1)

Loading finished. Elapsed time:  5.805583953857422


In [2]:
# Clean data, fix different dimensions of latitude and longitude
t1 = time.time()
points.drop(['PointDate'], axis=1, inplace=True)
points = points.round({'Latitude': 6, 'Longitude': 6})
needed_cols = ['IncomingTrackId', 'TrackOrigin']
merged = pd.merge(points, tracks[needed_cols])
t2 = time.time()
print('Cleaing finished. Elapsed time: ', t2 - t1)

Cleaing finished. Elapsed time:  0.6269469261169434


In [3]:
# Feature engineering, generating train, test dataframes
t1 = time.time()

# Feature vector of each track would contain N points (latitude, logitude), 
# where N=num_points. This hyperparameter greatly affects accuracy.
num_points = 12

cols = ['MeanSpeed',
        'MaxSpeed',
        'MeanAcceleration',
        'MaxAcceleration',
        'MeanDeceleration',
        'MaxDeceleration',
        'MeanAccelerationX',
        'MaxAccelerationX',
        'MeanAccelerationY',
        'MaxAccelerationY',
        'MeanAccelerationZ',
        'MaxAccelerationZ',
        'MeanHeight',
        'StartTimestamp',
        'EndTimestamp',
        'TrackOrigin']

num_feat_cols = len(cols)

for i in range(num_points):
    cols.append('Point' + str(i) + 'Latitude')
    cols.append('Point' + str(i) + 'Longitude')

train = pd.DataFrame(columns=cols)

# Create feature vector for every unique track 
for i, id in enumerate(merged.IncomingTrackId.unique()):
    track = merged.loc[merged['IncomingTrackId']==id]

    if num_points == 1:
        train = train.append(track.iloc[0])
    else:
        step = math.ceil(len(track)/(num_points))
        j = 0

        if len(track) < num_points:
            print('len(track) < num_points')

        # Adding features
        train = train.append({'MeanSpeed':track['Speed'].mean()}, ignore_index=True)
        train['MaxSpeed'].iloc[i] = track['Speed'].max()
        train['MeanAcceleration'].iloc[i] = track['Acceleration'].mean()
        train['MaxAcceleration'].iloc[i] = track['Acceleration'].max()
        train['MeanDeceleration'].iloc[i] = track['Deceleration'].mean()
        train['MaxDeceleration'].iloc[i] = track['Deceleration'].max()
        train['MeanAccelerationX'].iloc[i] = track['AccelerationXOriginal'].mean()
        train['MaxAccelerationX'].iloc[i] = track['AccelerationXOriginal'].max()
        train['MeanAccelerationY'].iloc[i] = track['AccelerationYOriginal'].mean()
        train['MaxAccelerationY'].iloc[i] = track['AccelerationYOriginal'].max()
        train['MeanAccelerationZ'].iloc[i] = track['AccelerationZOriginal'].mean()
        train['MaxAccelerationZ'].iloc[i] = track['AccelerationZOriginal'].max()
        train['MeanHeight'].iloc[i] = track['Height'].mean()
        train['StartTimestamp'].iloc[i] = track['TickTimestamp'].min()
        train['EndTimestamp'].iloc[i] = track['TickTimestamp'].max()
        train['TrackOrigin'].iloc[i] = track['TrackOrigin'].iloc[0]

        # Adding points
        k = 0
        while j < len(track):
            train.iloc[i, k + num_feat_cols] = track.iloc[j, 2]
            train.iloc[i, k + num_feat_cols + 1] = track.iloc[j, 3]
            j += step
            k += 2

train = train.dropna()
driver_mapping = {'OriginalDriver': 1, 'Passanger': 0, 'Taxi': 0}
train['TrackOrigin'] = train['TrackOrigin'].map(driver_mapping)
target = train['TrackOrigin']
train.drop(['TrackOrigin'], axis=1, inplace=True)
t2 = time.time()    
print('train, test generating finished. Elapsed time: ', t2 - t1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


train, test generating finished. Elapsed time:  50.764097690582275


In [4]:
# Cross-Validation of SVM, kNN, Decision Tree, Random Forest and Naive Bayes models
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

model = SVC()
accuracy = cross_val_score(model, train, target, cv=k_fold, n_jobs=1, scoring='accuracy')
print('SVM accuracy: ', round(np.mean(accuracy)*100, 2))

model = KNeighborsClassifier(n_neighbors = 9)
accuracy = cross_val_score(model, train, target, cv=k_fold, n_jobs=1, scoring='accuracy')
print('KNN accuracy: ', round(np.mean(accuracy)*100, 2))

model = DecisionTreeClassifier()
accuracy = cross_val_score(model, train, target, cv=k_fold, n_jobs=1, scoring='accuracy')
print('Decision Tree accuracy: ', round(np.mean(accuracy)*100, 2))

model = RandomForestClassifier(n_estimators=14)
accuracy = cross_val_score(model, train, target, cv=k_fold, n_jobs=1, scoring='accuracy')
print('Random Forest accuracy: ', round(np.mean(accuracy)*100, 2))

model = GaussianNB()
accuracy = cross_val_score(model, train, target, cv=k_fold, n_jobs=1, scoring='accuracy')
print('Naive Bayes accuracy: ', round(np.mean(accuracy)*100, 2))

SVM accuracy:  77.23
KNN accuracy:  98.91
Decision Tree accuracy:  99.46
Random Forest accuracy:  99.18
Naive Bayes accuracy:  86.16
