In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from math import radians, cos, sin, asin, sqrt

%matplotlib inline

In [2]:
dataset = pd.read_csv("train.csv")

In [3]:
dataset.shape

(17176, 14)

In [4]:
dataset.head()

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
0,189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,correct
1,189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,correct
2,189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,correct
3,189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,correct
4,189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,correct


# Feature engineering 1

In [5]:
for idx in range(0,len(dataset.index)):
    #Set strings to to integer value
    if dataset.at[idx, 'label'] == 'correct':
        dataset.at[idx, 'label'] = '1'
        
    if dataset.at[idx, 'label'] == 'incorrect':
        dataset.at[idx, 'label'] = '0'
        
    #Round lond decimal point values
    val = round(dataset.at[idx, 'meter_waiting_fare'])
    dataset.at[idx, 'meter_waiting_fare'] = val

In [6]:
dataset.head()

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
0,189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,1
1,189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,1
2,189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,1
3,189127273,10.5,598.0,271.0,16.0,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,1
4,189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,1


# Feature engineering 2

In [7]:
# Distance calculation from latitude and longiture information

def single_pt_haversine(lat, lng, degrees=True):
    """
    'Single-point' Haversine: Calculates the great circle distance
    between a point on Earth and the (0, 0) lat-long coordinate
    """
    r = 6371 # Earth's radius (km). Have r = 3956 if you want miles

    # Convert decimal degrees to radians
    if degrees:
        lat, lng = map(radians, [lat, lng])

    # 'Single-point' Haversine formula
    a = sin(lat/2)**2 + cos(lat) * sin(lng/2)**2
    d = 2 * r * asin(sqrt(a)) 

    return d

In [8]:
# Get travel distance, round that and include as a new feature

dataset['harvesine_distance_pick'] = [single_pt_haversine(x, y) for x, y in zip(dataset.pick_lat, dataset.pick_lon)]
dataset['harvesine_distance_drop'] = [single_pt_haversine(x, y) for x, y in zip(dataset.drop_lat, dataset.drop_lon)]
dataset['distance'] = [ round(abs(x-y),1) for x, y in zip(dataset.harvesine_distance_drop, dataset.harvesine_distance_pick)]

In [9]:
dataset.head()

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label,harvesine_distance_pick,harvesine_distance_drop,distance
0,189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,1,8892.526828,8890.305978,2.2
1,189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,1,8892.482891,8891.875929,0.6
2,189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,1,8888.861343,8894.391719,5.5
3,189127273,10.5,598.0,271.0,16.0,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,1,8891.595506,8892.438479,0.8
4,189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,1,8888.382988,8895.763119,7.4


In [10]:
# Drop NaN rows

X_1 = dataset.dropna()
X_1 = X_1.reset_index(drop=True)
X_1.head()

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label,harvesine_distance_pick,harvesine_distance_drop,distance
0,189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,1,8892.526828,8890.305978,2.2
1,189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,1,8892.482891,8891.875929,0.6
2,189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,1,8888.861343,8894.391719,5.5
3,189127273,10.5,598.0,271.0,16.0,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,1,8891.595506,8892.438479,0.8
4,189129552,10.5,3407.0,182.0,0.0,112.0,11/1/2019 5:38,11/1/2019 6:35,7.13402,79.8969,6.91865,79.8649,1065.02,1,8892.91718,8888.863814,4.1


In [11]:
X = X_1.drop(X_1.columns[0], axis=1) #Trip ID
X = X.drop(X.columns[5], axis=1) # Pickup time
X = X.drop(X.columns[5], axis=1) # Drop time
X = X.drop(X.columns[5], axis=1) # Pick Lat
X = X.drop(X.columns[5], axis=1) # Pick Lon
X = X.drop(X.columns[5], axis=1) # Drop Lat
X = X.drop(X.columns[5], axis=1) # Drop Lon
X = X.drop(X.columns[6], axis=1) # Label
X = X.drop(X.columns[6], axis=1) # Pick Havesine
X = X.drop(X.columns[6], axis=1) # Drop Havesine

y = X_1[X_1.columns[13]] #Label

In [12]:
X.head()

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,distance
0,10.5,834.0,56.0,0.0,64.0,270.32,2.2
1,10.5,791.0,47.0,0.0,134.0,197.85,0.6
2,10.5,1087.0,80.0,0.0,61.0,301.64,5.5
3,10.5,598.0,271.0,16.0,68.0,82.3,0.8
4,10.5,3407.0,182.0,0.0,112.0,1065.02,4.1


In [13]:
accuracy_model = []
# evaluate a model using k-fold cross-validation
def evaluate_model(dataX, dataY, n_folds=5):
	scores, histories = list(), list()
	# prepare cross validation
	kfold = KFold(n_folds, shuffle=True, random_state=1)
	# enumerate splits
	for train_ix, test_ix in kfold.split(dataX):
		# define model
		model = GaussianNB()
		# select rows for train and test
		X_train, X_test = dataX.iloc[train_ix], dataX.iloc[test_ix]
		y_train, y_test = dataY[train_ix], dataY[test_ix]
		# fit model
		model.fit(X_train, y_train)
		# evaluate model
		print(accuracy_score(y_test, model.predict(X_test), normalize=True)*100)
	return model

# Use 5-Fold Cross Validation

In [14]:
k_fold_gnb_model = evaluate_model(X, y)

91.3671184443135
90.95462581025339
89.95285798467884
91.18773946360153
90.6572354848217


In [15]:
# Compare above 5-Fold cross validation results with based test-train set splitting method

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [16]:
y_pred = gnb_model.predict(X_test)

In [17]:
accuracy_score(y_test, y_pred)

0.9149479473580829

# Prediction with test data

In [18]:
dataset_test = pd.read_csv("test.csv")
dataset_test.head()

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
0,213284604,10.5,924,42,2.4486,148,2/1/2020 0:38,2/1/2020 0:53,6.83454,79.875,6.7749,79.884,289.27
1,213286352,10.5,4249,20,0.0,91,2/1/2020 1:02,2/1/2020 2:13,6.91168,79.8723,6.55091,79.9706,1912.7
2,213293973,10.5,1552,255,2.6588,23,2/1/2020 5:02,2/1/2020 5:28,6.92145,79.8478,6.90539,79.8989,394.0
3,213294622,10.5,462,16,0.0,198,2/1/2020 5:30,2/1/2020 5:38,6.77433,79.9416,6.80401,79.9407,154.32
4,213298687,10.5,814,392,12.3692,69,2/1/2020 7:00,2/1/2020 7:14,6.97968,79.913,6.98875,79.8914,147.47


In [19]:
# Apply similar feature engineering methods as train set to test set

dataset_test['harvesine_distance_pick'] = [single_pt_haversine(x, y) for x, y in zip(dataset_test.pick_lat, dataset_test.pick_lon)]
dataset_test['harvesine_distance_drop'] = [single_pt_haversine(x, y) for x, y in zip(dataset_test.drop_lat, dataset_test.drop_lon)]
dataset_test['distance'] = [ round(abs(x-y),1) for x, y in zip(dataset_test.harvesine_distance_drop, dataset_test.harvesine_distance_pick)]
test_1 = dataset_test.dropna()
test_1 = test_1.reset_index(drop=True)
test_1.head()
test = test_1.drop(test_1.columns[0], axis=1) #Trip ID
test = test.drop(test.columns[5], axis=1) # Pickup time
test = test.drop(test.columns[5], axis=1) # Drop time
test = test.drop(test.columns[5], axis=1) # Pick Lat
test = test.drop(test.columns[5], axis=1) # Pick Lon
test = test.drop(test.columns[5], axis=1) # Drop Lat
test = test.drop(test.columns[5], axis=1) # Drop Lon
test = test.drop(test.columns[6], axis=1) # Pick Havesine
test = test.drop(test.columns[6], axis=1) # Drop Havesine

test_id = test_1[test_1.columns[0]] #Trip ID

In [20]:
test.head()

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,distance
0,10.5,924,42,2.4486,148,289.27,0.9
1,10.5,4249,20,0.0,91,1912.7,10.0
2,10.5,1552,255,2.6588,23,394.0,5.6
3,10.5,462,16,0.0,198,154.32,0.0
4,10.5,814,392,12.3692,69,147.47,2.4


In [21]:
# Get predicted labels from the selected model. Selected K-fold cross validated model since it improves on overfitting.

y_pred = k_fold_gnb_model.predict(test)

In [22]:
df = pd.DataFrame()

In [23]:
df['tripid'] = test_id
df['prediction'] = y_pred

In [24]:
df.head()

Unnamed: 0,tripid,prediction
0,213284604,1
1,213286352,0
2,213293973,1
3,213294622,1
4,213298687,1


# Write results to excel

In [25]:
writer = pd.ExcelWriter('report_gnb.xlsx')
df.to_excel(writer)
writer.save()