# **Part D - XGBoost**

In [2]:
#import required libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline

from collections import Counter
from datetime import datetime
import math 
from google.colab import files
import io
import datetime as dt
import re
import pandas_profiling
import pandas_profiling as pp
from math import sqrt

plt.style.use('seaborn-whitegrid')

import warnings
warnings.filterwarnings('ignore')

## **Loading training and test data**

In [4]:
uploaded = files.upload()

Saving nyc_sub_data.csv to nyc_sub_data.csv


In [5]:
#reading to a dataframe
train = pd.read_csv(io.BytesIO(uploaded['nyc_sub_data.csv']))
train.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,0,2010-11-16 19:40:00.00000086,5.7,2010-11-16 19:40:00 UTC,-74.000552,40.727492,-74.005037,40.719123,1
1,1,2012-12-08 23:27:00.000000232,6.0,2012-12-08 23:27:00 UTC,-74.003502,40.738745,-73.993317,40.752022,3
2,2,2013-04-18 20:48:24.0000004,7.0,2013-04-18 20:48:24 UTC,-73.995001,40.760205,-73.986058,40.778839,1
3,3,2013-06-18 13:18:00.00000092,5.5,2013-06-18 13:18:00 UTC,-73.978285,40.737182,-73.970902,40.74724,2
4,4,2010-09-19 20:54:57.0000003,4.5,2010-09-19 20:54:57 UTC,-73.983178,40.767873,-73.979043,40.776613,4


In [6]:
#dropping null values
train = train.dropna(how='any', axis=0)

In [9]:
#uploading test data
uploaded = files.upload()
test = pd.read_csv(io.BytesIO(uploaded['test.csv']))
test.head()

Saving test.csv to test.csv


Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [13]:
# this function will also be used with the test set below
def select_within_test_boundary(df, BB):
    return (df.pickup_longitude >= BB[0]) & (df.pickup_longitude <= BB[1]) & \
           (df.pickup_latitude >= BB[2]) & (df.pickup_latitude <= BB[3]) & \
           (df.dropoff_longitude >= BB[0]) & (df.dropoff_longitude <= BB[1]) & \
           (df.dropoff_latitude >= BB[2]) & (df.dropoff_latitude <= BB[3])

BB = (-74.5, -72.8, 40.5, 41.8)
print('Old size: %d' % len(train))
train = train[select_within_test_boundary(train, BB)]
print('New size: %d' % len(train))

Old size: 100000
New size: 97889


## **Feature engineering**

In [14]:
def prepare_time_features(df):
    df['pickup_datetime'] = df['pickup_datetime'].str.slice(0, 16)
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], utc=True, format='%Y-%m-%d %H:%M')
    df['hour_of_day'] = df.pickup_datetime.dt.hour
    df['month'] = df.pickup_datetime.dt.month
    df["year"] = df.pickup_datetime.dt.year
    df["weekday"] = df.pickup_datetime.dt.weekday
    
    return df

In [15]:
train = prepare_time_features(train)
test = prepare_time_features(test)

### Function to get haversine distance

In [20]:
#function to calculate haversine distance
def distance(lat1, lon1, lat2, lon2):
    p = 0.017453292519943295 # Pi/180
    a = 0.5 - np.cos((lat2 - lat1) * p)/2 + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2
    return 0.6213712 * 12742 * np.arcsin(np.sqrt(a))

In [21]:
train['distance_miles'] = distance(train.pickup_latitude, train.pickup_longitude, \
                                      train.dropoff_latitude, train.dropoff_longitude)
test['distance_miles'] = distance(test.pickup_latitude, test.pickup_longitude, \
                                      test.dropoff_latitude, test.dropoff_longitude)


Calculating pickup and drop distances

In [22]:
def transform(data):
    # Distances to nearby airports, 
    jfk = (-73.7781, 40.6413)
    ewr = (-74.1745, 40.6895)
    lgr = (-73.8740, 40.7769)

    data['pickup_distance_to_jfk'] = distance(jfk[1], jfk[0],
                                         data['pickup_latitude'], data['pickup_longitude'])
    data['dropoff_distance_to_jfk'] = distance(jfk[1], jfk[0],
                                           data['dropoff_latitude'], data['dropoff_longitude'])
    data['pickup_distance_to_ewr'] = distance(ewr[1], ewr[0], 
                                          data['pickup_latitude'], data['pickup_longitude'])
    data['dropoff_distance_to_ewr'] = distance(ewr[1], ewr[0],
                                           data['dropoff_latitude'], data['dropoff_longitude'])
    data['pickup_distance_to_lgr'] = distance(lgr[1], lgr[0],
                                          data['pickup_latitude'], data['pickup_longitude'])
    data['dropoff_distance_to_lgr'] = distance(lgr[1], lgr[0],
                                           data['dropoff_latitude'], data['dropoff_longitude'])
    
    return data

train = transform(train)
test = transform(test)

In [25]:
train[(train['distance_miles']==0)&(train['fare_amount']==0)]

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour_of_day,month,year,weekday,distance_miles,pickup_distance_to_jfk,dropoff_distance_to_jfk,pickup_distance_to_ewr,dropoff_distance_to_ewr,pickup_distance_to_lgr,dropoff_distance_to_lgr
36089,36089,2015-02-21 11:59:26.0000002,0.0,2015-02-21 11:59:00+00:00,-73.93763,40.758221,-73.93763,40.758221,1,11,2,2015,5,0.0,11.622928,11.622928,13.281059,13.281059,3.571049,3.571049


In [26]:
#dropping an instance of train where distance is zero and fare amount is zero
train = train.drop(index= train[(train['distance_miles']==0)&(train['fare_amount']==0)].index, axis=0)

In [28]:
#dropping an instance of train where the fare amount is 0
train = train.drop(index= train[train['fare_amount']==0].index, axis=0)

In [29]:
train[train['fare_amount'] < 2.5].shape

(9, 20)

In [30]:
#dropping the rows where the fare is less that 2.5
print("old size: %d" % len(train))
train = train.drop(index= train[train['fare_amount'] < 2.5].index, axis=0)
print("New size: %d" % len(train))

old size: 97887
New size: 97878


In [31]:
#as there are no records for passenger_count > 7, we can ignore this case
train[train.passenger_count >= 7]

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour_of_day,month,year,weekday,distance_miles,pickup_distance_to_jfk,dropoff_distance_to_jfk,pickup_distance_to_ewr,dropoff_distance_to_ewr,pickup_distance_to_lgr,dropoff_distance_to_lgr


## **Test train split**

In [35]:
# create copy of the data set
df_train = train.drop(columns= ['key','pickup_datetime'], axis= 1).copy()
df_test = test.drop(columns= ['key','pickup_datetime'], axis= 1).copy()
print(df_train.shape)
print(df_test.shape)

(97878, 18)
(9914, 16)


In [36]:
#test train split
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(df_train.drop('fare_amount', axis=1),
                                                    df_train['fare_amount'], test_size=0.2, random_state = 42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(78302, 17)
(19576, 17)
(78302,)
(19576,)


## **XGBoost model**

In [37]:
import xgboost as xgb

In [38]:
#defining parameters for the XGBoost model
params = {
   
    'max_depth': 7,
    'gamma' :0,
    'eta':.03, 
    'subsample': 1,
    'colsample_bytree': 0.9, 
    'objective':'reg:linear',
    'eval_metric':'rmse',
    'silent': 0
}

In [39]:
def XGBmodel(X_train,X_test,y_train,y_test,params):
    matrix_train = xgb.DMatrix(X_train,label=y_train)
    matrix_test = xgb.DMatrix(X_test,label=y_test)
    model=xgb.train(params=params,
                    dtrain=matrix_train,num_boost_round=5000, 
                    early_stopping_rounds=10,evals=[(matrix_test,'test')])
    return model

model = XGBmodel(X_train,X_test,y_train,y_test,params)

[0]	test-rmse:14.1615
Will train until test-rmse hasn't improved in 10 rounds.
[1]	test-rmse:13.7703
[2]	test-rmse:13.3912
[3]	test-rmse:13.0487
[4]	test-rmse:12.694
[5]	test-rmse:12.3495
[6]	test-rmse:12.0159
[7]	test-rmse:11.6944
[8]	test-rmse:11.382
[9]	test-rmse:11.0808
[10]	test-rmse:10.789
[11]	test-rmse:10.5063
[12]	test-rmse:10.2355
[13]	test-rmse:9.97153
[14]	test-rmse:9.73333
[15]	test-rmse:9.48864
[16]	test-rmse:9.25158
[17]	test-rmse:9.02289
[18]	test-rmse:8.80227
[19]	test-rmse:8.58922
[20]	test-rmse:8.38447
[21]	test-rmse:8.18611
[22]	test-rmse:7.99361
[23]	test-rmse:7.81926
[24]	test-rmse:7.64004
[25]	test-rmse:7.47823
[26]	test-rmse:7.31257
[27]	test-rmse:7.15244
[28]	test-rmse:6.9973
[29]	test-rmse:6.84954
[30]	test-rmse:6.70662
[31]	test-rmse:6.57716
[32]	test-rmse:6.44372
[33]	test-rmse:6.31583
[34]	test-rmse:6.19299
[35]	test-rmse:6.07567
[36]	test-rmse:5.96232
[37]	test-rmse:5.85327
[38]	test-rmse:5.74887
[39]	test-rmse:5.65522
[40]	test-rmse:5.5589
[41]	test-rmse: