In [1]:
import sys
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

pd.options.mode.chained_assignment = None

# Read data
t1 = time.time()
points = pd.read_csv('inc_points_2.csv')
tracks = pd.read_csv('rich_tracks_2.csv')
os = pd.read_csv('os.csv')
t2 = time.time()
print('Loading finished. Elapsed time: ', t2 - t1)

Loading finished. Elapsed time:  5.861717462539673


In [2]:
# Clean data, fix different (iOS/Android) dimensions of latitude and longitude 
t1 = time.time()
points.drop(['PointDate'], axis=1, inplace=True)
points = points.round({'Latitude': 6, 'Longitude': 6})
needed_cols = ['IncomingTrackId', 'TrackOrigin']
merged = pd.merge(points, tracks[needed_cols])
t2 = time.time()
print('Cleaing finished. Elapsed time: ', t2 - t1)

Cleaing finished. Elapsed time:  0.5926382541656494


In [3]:
# Feature engineering, generating train, test dataframes
t1 = time.time()

# Feature vector of each track would contain N points (latitude, logitude), 
# where N=num_points. This hyperparameter greatly affects accuracy.
num_points = 12

features = ['MeanSpeed',
            'MaxSpeed',
            'MeanAcceleration',
            'MaxAcceleration',
            'MeanDeceleration',
            'MaxDeceleration',
            'MeanAccelerationX',
            'MaxAccelerationX',
            'MeanAccelerationY',
            'MaxAccelerationY',
            'MeanAccelerationZ',
            'MaxAccelerationZ',
            'MeanHeight',
            'StartTimestamp',
            'EndTimestamp',
            'TrackOrigin']

num_feat_cols = len(features)

for i in range(num_points):
    features.append('Point' + str(i) + 'Latitude')
    features.append('Point' + str(i) + 'Longitude')

feat_df = pd.DataFrame(columns=features)

# Create feature vector for every unique track 
for i, id in enumerate(merged.IncomingTrackId.unique()):
    track = merged.loc[merged['IncomingTrackId']==id]

    if num_points == 1:
        feat_df = train.append(track.iloc[0])
    else:
        step = math.ceil(len(track)/(num_points))
        j = 0

        if len(track) < num_points:
            print('len(track) < num_points')

        # Adding features
        feat_df = feat_df.append({'MeanSpeed':track['Speed'].mean()}, ignore_index=True)
        feat_df['MaxSpeed'].iloc[i] = track['Speed'].max()
        feat_df['MeanAcceleration'].iloc[i] = track['Acceleration'].mean()
        feat_df['MaxAcceleration'].iloc[i] = track['Acceleration'].max()
        feat_df['MeanDeceleration'].iloc[i] = track['Deceleration'].mean()
        feat_df['MaxDeceleration'].iloc[i] = track['Deceleration'].max()
        feat_df['MeanAccelerationX'].iloc[i] = track['AccelerationXOriginal'].mean()
        feat_df['MaxAccelerationX'].iloc[i] = track['AccelerationXOriginal'].max()
        feat_df['MeanAccelerationY'].iloc[i] = track['AccelerationYOriginal'].mean()
        feat_df['MaxAccelerationY'].iloc[i] = track['AccelerationYOriginal'].max()
        feat_df['MeanAccelerationZ'].iloc[i] = track['AccelerationZOriginal'].mean()
        feat_df['MaxAccelerationZ'].iloc[i] = track['AccelerationZOriginal'].max()
        feat_df['MeanHeight'].iloc[i] = track['Height'].mean()
        feat_df['StartTimestamp'].iloc[i] = track['TickTimestamp'].min()
        feat_df['EndTimestamp'].iloc[i] = track['TickTimestamp'].max()
        feat_df['TrackOrigin'].iloc[i] = track['TrackOrigin'].iloc[0]

        # Adding points
        k = 0
        while j < len(track):
            feat_df.iloc[i, k + num_feat_cols] = track.iloc[j, 2]
            feat_df.iloc[i, k + num_feat_cols + 1] = track.iloc[j, 3]
            j += step
            k += 2

feat_df = feat_df.dropna()
driver_mapping = {'OriginalDriver': 1, 'Passanger': 0, 'Taxi': 0}
feat_df['TrackOrigin'] = feat_df['TrackOrigin'].map(driver_mapping)
feat_df = feat_df.sample(frac=1).reset_index(drop=True)

train_part = math.floor(len(feat_df) * 0.7)  # number of rows for train dataframe
train = feat_df.iloc[:train_part, :]
test = feat_df.iloc[train_part:, :]
train_target = train['TrackOrigin']
test_target = test['TrackOrigin']
train.drop(['TrackOrigin'], axis=1, inplace=True)
test.drop(['TrackOrigin'], axis=1, inplace=True)
t2 = time.time()    
print('train, test generating finished. Elapsed time: ', t2 - t1)

train, test generating finished. Elapsed time:  50.80956697463989


In [4]:
# Cross-Validation of SVM, kNN, Decision Tree, Random Forest and Naive Bayes models
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

model = SVC()
accuracy = cross_val_score(model, train, train_target, cv=k_fold, n_jobs=1, scoring='accuracy')
print('SVM accuracy: ', round(np.mean(accuracy)*100, 2))

model = KNeighborsClassifier(n_neighbors = 9)
accuracy = cross_val_score(model, train, train_target, cv=k_fold, n_jobs=1, scoring='accuracy')
print('KNN accuracy: ', round(np.mean(accuracy)*100, 2))

model = DecisionTreeClassifier()
accuracy = cross_val_score(model, train, train_target, cv=k_fold, n_jobs=1, scoring='accuracy')
print('Decision Tree accuracy: ', round(np.mean(accuracy)*100, 2))

model = RandomForestClassifier(n_estimators=14)
accuracy = cross_val_score(model, train, train_target, cv=k_fold, n_jobs=1, scoring='accuracy')
print('Random Forest accuracy: ', round(np.mean(accuracy)*100, 2))

model = GaussianNB()
accuracy = cross_val_score(model, train, train_target, cv=k_fold, n_jobs=1, scoring='accuracy')
print('Naive Bayes accuracy: ', round(np.mean(accuracy)*100, 2))

SVM accuracy:  76.57
KNN accuracy:  98.95
Decision Tree accuracy:  99.28
Random Forest accuracy:  99.22
Naive Bayes accuracy:  86.78


In [5]:
# Testing
model = RandomForestClassifier(n_estimators=14)
model.fit(train, train_target)
prediction = model.predict(test)
check = pd.DataFrame({"TrackOrigin": test_target, "Prediction": prediction})
print(check.head(20))

      TrackOrigin  Prediction
1801            1           1
1802            0           0
1803            1           1
1804            1           1
1805            1           1
1806            1           1
1807            0           0
1808            1           1
1809            1           1
1810            0           0
1811            1           1
1812            0           0
1813            1           1
1814            1           1
1815            1           1
1816            0           0
1817            0           0
1818            1           1
1819            1           1
1820            1           1
