In [None]:
# import all necessary libraries
import math
import numpy as np
from numpy import mean
from numpy import std
from sklearn.ensemble import VotingClassifier 
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
import pandas as pd

def convertRadian(dr):
    """Convert an angle given in degrees to radians.

    ``dr`` may be anything that supports ``*`` and ``/`` with a float
    (a plain number, a NumPy array, a pandas Series, ...); the result has
    the same kind of type as the input.
    """
    scaled = dr * math.pi
    return scaled / 180

# ---- Training data: load, clean, and derive the trip-distance feature ----
DataTrain = pd.read_csv('train.csv')
# Encode the target as 1/0. Any label other than 'correct'/'incorrect' maps to
# NaN and is therefore removed together with the other incomplete rows below.
DataTrain['label'] = DataTrain['label'].map({'correct': 1, 'incorrect': 0})
DataTrain = DataTrain.dropna().drop(['pickup_time', 'drop_time'], axis=1)

# Convert all four coordinate columns from degrees to radians in one go.
DataTrain[['pick_lat', 'pick_lon', 'drop_lat', 'drop_lon']] = convertRadian(
    DataTrain[['pick_lat', 'pick_lon', 'drop_lat', 'drop_lon']]
)

# Haversine great-circle distance between pickup and drop-off points.
dlat = DataTrain['drop_lat'] - DataTrain['pick_lat']
dlon = DataTrain['drop_lon'] - DataTrain['pick_lon']
# Step 1: the haversine term "a".
DataTrain['dist'] = (
    np.sin(dlat/2)**2
    + np.cos(DataTrain['pick_lat'])*np.cos(DataTrain['drop_lat'])*np.sin(dlon/2)**2
)
# Step 2: central angle c = 2 * atan2(sqrt(a), sqrt(1 - a)).
DataTrain['dist'] = 2*np.arctan2(DataTrain['dist']**0.5, (1-DataTrain['dist'])**0.5)
# Step 3: scale the angle by the sphere radius used by this script.
R = 6373.0
DataTrain['dist'] = R*DataTrain['dist']

# The identifier and raw coordinates are not used as features.
DataTrain = DataTrain.drop(
    ['tripid', 'pick_lat', 'pick_lon', 'drop_lat', 'drop_lon'], axis=1
)
y = DataTrain['label']
# Feature matrix (the original author's note: 16968 rows x 7 columns).
X = DataTrain.drop(['label'], axis=1)

# Hold out 30% of the rows for evaluation. random_state pins the shuffle so the
# split -- and every F1 score computed on X_test/y_test -- is reproducible from
# run to run (without it each run evaluates on a different random subset).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Gradient-boosted trees, evaluated on the held-out split.
GradientBoost = GradientBoostingClassifier(
    n_estimators=600,
    learning_rate=0.5,
    max_features=2,
    max_depth=2,
    random_state=0,
)
GradientBoost.fit(X_train, y_train)
y_pred = GradientBoost.predict(X_test)
# Compute the F1 score once and reuse it for both the stored value and the report.
GradientBoost_score = f1_score(y_test, y_pred)
print("Gradient Boost F1 score = {}".format(GradientBoost_score))

# Gaussian naive Bayes wrapped in sigmoid probability calibration
# (4-fold internal cross-validation), evaluated on the held-out split.
CalibratedCV = CalibratedClassifierCV(
    GaussianNB(),
    cv=4,
    method='sigmoid',
)
CalibratedCV.fit(X_train, y_train)
y_pred = CalibratedCV.predict(X_test)

# F1 on the held-out rows; the stored value is what gets reported.
CalibratedCV_score = f1_score(y_test, y_pred)
print("CalibratedCV F1 score = {}".format(CalibratedCV_score))

# XGBoost classifier, evaluated on the held-out split.
XGB = XGBClassifier(
    n_estimators=1200,
    gamma=0.6,
    learning_rate=0.01,
)
XGB.fit(X_train, y_train)
y_pred = XGB.predict(X_test)
# F1 on the held-out rows; the stored value is what gets reported.
XGB_score = f1_score(y_test, y_pred)
print("XGboost F1 score = {}".format(XGB_score))


# Ensemble members as (name, estimator) pairs, shared by both voting classifiers.
estimator = [
    ('GradientBoost', GradientBoost),
    ('CalibratedCV', CalibratedCV),
    ('XGB', XGB),
]

# Voting classifier with hard (majority-label) voting.
vot_hard = VotingClassifier(estimators=estimator, voting='hard')
vot_hard.fit(X_train, y_train)
y_pred = vot_hard.predict(X_test)

# Score the ensemble with F1 on the held-out split.
hard_score = f1_score(y_test, y_pred)
print("Hard voting F1 score = {}".format(hard_score))



# Voting classifier with soft voting (voting='soft').
vot_soft = VotingClassifier(estimators=estimator, voting='soft')
vot_soft.fit(X_train, y_train)
y_pred = vot_soft.predict(X_test)

# Accuracy is computed and stored but not reported; F1 is the reported metric.
score = accuracy_score(y_test, y_pred)
soft_score = f1_score(y_test, y_pred)

# Soft-voting result first, then a side-by-side summary of every model.
print("Soft voting score = {}".format(soft_score))
print("GradientBoost score = {}".format(GradientBoost_score))
print("CalibratedCV score = {}".format(CalibratedCV_score))
print("XGB score = {}".format(XGB_score))
print("Hard voting score = {}".format(hard_score))
print("Soft voting score = {}".format(soft_score))

# ---- Test data: same cleaning and distance feature as the training data ----
# Rows with any missing value are removed, then the timestamp columns.
DataTest = pd.read_csv('test.csv').dropna()
DataTest = DataTest.drop(['pickup_time', 'drop_time'], axis=1)

# Convert all four coordinate columns from degrees to radians in one go.
DataTest[['pick_lat', 'pick_lon', 'drop_lat', 'drop_lon']] = convertRadian(
    DataTest[['pick_lat', 'pick_lon', 'drop_lat', 'drop_lon']]
)

# Haversine great-circle distance between pickup and drop-off points.
dlat = DataTest['drop_lat'] - DataTest['pick_lat']
dlon = DataTest['drop_lon'] - DataTest['pick_lon']
# Step 1: the haversine term "a".
DataTest['dist'] = (
    np.sin(dlat/2)**2
    + np.cos(DataTest['pick_lat'])*np.cos(DataTest['drop_lat'])*np.sin(dlon/2)**2
)
# Step 2: central angle c = 2 * atan2(sqrt(a), sqrt(1 - a)).
DataTest['dist'] = 2*np.arctan2(DataTest['dist']**0.5, (1-DataTest['dist'])**0.5)
# Step 3: scale the angle by the sphere radius used by this script.
R = 6373.0
DataTest['dist'] = R*DataTest['dist']

# The identifier and raw coordinates are not used as features.
DataTest = DataTest.drop(
    ['tripid', 'pick_lat', 'pick_lon', 'drop_lat', 'drop_lon'], axis=1
)
X1 = DataTest

# Predict with the soft-voting ensemble and write the results to final.csv.
yhat = vot_soft.predict(X1)

# Re-read the raw test file and strip every column listed below; whatever
# columns remain are written out next to the prediction.
fileData_test = pd.read_csv('test.csv')
fileData_test = fileData_test.drop(
    ['meter_waiting_till_pickup', 'pickup_time', 'drop_time',
     'pick_lat', 'pick_lon', 'drop_lat', 'drop_lon'],
    axis=1,
)
fileData_new = fileData_test.drop(
    ['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare', 'fare'],
    axis=1,
)

# X1 was built from test.csv *after* dropna(), so it can contain fewer rows
# than this fresh, unfiltered read. Assigning the bare array would then fail
# with a length mismatch. Attaching the predictions by X1's index labels keeps
# every prediction on its own row; rows that were dropped before prediction
# get a missing value (which makes the column float in that case). When no
# rows were dropped the result is identical to a plain positional assignment.
fileData_new['prediction'] = pd.Series(yhat, index=X1.index)

# Show the predicted class balance (a bare expression would display nothing here).
print(fileData_new['prediction'].value_counts())
fileData_new.to_csv("final.csv", index=False)

Gradient Boost F1 score = 0.9718518518518517
CalibratedCV F1 score = 0.9566553950199815
XGboost F1 score = 0.9748129413004532
