In [18]:
import zipfile
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
import xgboost as xgb

import pickle

In [2]:
#loading data
taxi_train = pd.read_csv('data/train.csv')
taxi_test = pd.read_csv('data/test.csv')

In [3]:
(taxi_train['trip_duration']/3600).describe()

count    1.458644e+06
mean     2.665256e-01
std      1.454842e+00
min      2.777778e-04
25%      1.102778e-01
50%      1.838889e-01
75%      2.986111e-01
max      9.795228e+02
Name: trip_duration, dtype: float64

In [4]:
def clean_df(input_df):
    
    df = input_df
    
    col_list = ['pickup_longitude', 
        'pickup_latitude', 
        'dropoff_longitude', 
        'dropoff_latitude', 
        'trip_duration',
        'pickup_minute',
        'pickup_hour',
        'pickup_month',
        'pickup_day',
        'pickup_weekday']
    
    #changing datetime column to datetime class
    df['pickup_datetime_hold'] = pd.to_datetime(df['pickup_datetime'])
    
    #adding minute column
    df['pickup_minute'] = df.apply(lambda x: x.pickup_datetime_hold.minute, axis = 1) 

    #adding hour column
    df['pickup_hour'] = df.apply(lambda x: x.pickup_datetime_hold.hour, axis = 1) 

    #adding month column
    df['pickup_month'] = df.apply(lambda x: x.pickup_datetime_hold.month, axis = 1) 

    #adding day of month column
    df['pickup_day'] = df.apply(lambda x: x.pickup_datetime_hold.day, axis = 1) 

    #adding day of week column
    df['pickup_weekday'] = df.apply(lambda x: datetime.weekday(x.pickup_datetime_hold), axis = 1) 
    
    return(df[df.columns.intersection(col_list)])

In [6]:
taxi_train_clean = clean_df(taxi_train)

In [6]:
taxi_test_clean = clean_df(taxi_test)

In [7]:
# hold = pd.concat([taxi_train_clean, taxi_test_clean], axis = 0)
hold = taxi_train_clean

In [8]:
hold.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,pickup_minute,pickup_hour,pickup_month,pickup_day,pickup_weekday
0,-73.982155,40.767937,-73.96463,40.765602,455,24,17,3,14,0
1,-73.980415,40.738564,-73.999481,40.731152,663,43,0,6,12,6
2,-73.979027,40.763939,-74.005333,40.710087,2124,35,11,1,19,1
3,-74.01004,40.719971,-74.012268,40.706718,429,32,19,4,6,2
4,-73.973053,40.793209,-73.972923,40.78252,435,30,13,3,26,5


In [9]:
#preparing dependent and independent variables
X = hold.drop(["trip_duration"], axis=1)
y = hold["trip_duration"]

In [10]:
#Split the data into training, test, and valdiation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1234)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 4321)

In [14]:
#grading mean squared log error
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [15]:
#XGBoost parameters 
params = {
    'booster':            'gbtree',
#     'objective':          'reg:squarederror',
    'objective':          'reg:linear',
    'learning_rate':      0.05,
    'max_depth':          14,
    'subsample':          0.9,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    'silent':             1,
    'feval':              'rmsle'
}

In [16]:
#setting the number of rounds
nrounds = 2000

In [19]:
#Define train and validation sets
dtrain = xgb.DMatrix(X_train, np.log(y_train+1))
dval = xgb.DMatrix(X_val, np.log(y_val+1))

#error tracking
watchlist = [(dval, 'eval'), (dtrain, 'train')]

In [20]:
#Train model
gbm = xgb.train(params,
                dtrain,
                num_boost_round = nrounds,
                evals = watchlist,
                verbose_eval = True
                )

[0]	eval-rmse:5.72218	train-rmse:5.72270
[1]	eval-rmse:5.44011	train-rmse:5.44058
[2]	eval-rmse:5.17119	train-rmse:5.17161
[3]	eval-rmse:4.91612	train-rmse:4.91649
[4]	eval-rmse:4.67422	train-rmse:4.67450
[5]	eval-rmse:4.44413	train-rmse:4.44441
[6]	eval-rmse:4.22667	train-rmse:4.22689
[7]	eval-rmse:4.01913	train-rmse:4.01931
[8]	eval-rmse:3.82327	train-rmse:3.82339
[9]	eval-rmse:3.63680	train-rmse:3.63689
[10]	eval-rmse:3.45963	train-rmse:3.45966
[11]	eval-rmse:3.29149	train-rmse:3.29147
[12]	eval-rmse:3.13198	train-rmse:3.13187
[13]	eval-rmse:2.98064	train-rmse:2.98046
[14]	eval-rmse:2.83831	train-rmse:2.83805
[15]	eval-rmse:2.70163	train-rmse:2.70128
[16]	eval-rmse:2.57254	train-rmse:2.57211
[17]	eval-rmse:2.45061	train-rmse:2.45006
[18]	eval-rmse:2.33506	train-rmse:2.33433
[19]	eval-rmse:2.22594	train-rmse:2.22512
[20]	eval-rmse:2.12070	train-rmse:2.11970
[21]	eval-rmse:2.02304	train-rmse:2.02188
[22]	eval-rmse:1.92906	train-rmse:1.92774
[23]	eval-rmse:1.84157	train-rmse:1.84003
[2

[194]	eval-rmse:0.42233	train-rmse:0.32428
[195]	eval-rmse:0.42229	train-rmse:0.32408
[196]	eval-rmse:0.42196	train-rmse:0.32319
[197]	eval-rmse:0.42192	train-rmse:0.32299
[198]	eval-rmse:0.42173	train-rmse:0.32251
[199]	eval-rmse:0.42160	train-rmse:0.32213
[200]	eval-rmse:0.42156	train-rmse:0.32169
[201]	eval-rmse:0.42155	train-rmse:0.32107
[202]	eval-rmse:0.42139	train-rmse:0.32043
[203]	eval-rmse:0.42133	train-rmse:0.31994
[204]	eval-rmse:0.42126	train-rmse:0.31937
[205]	eval-rmse:0.42123	train-rmse:0.31909
[206]	eval-rmse:0.42120	train-rmse:0.31844
[207]	eval-rmse:0.42113	train-rmse:0.31822
[208]	eval-rmse:0.42079	train-rmse:0.31778
[209]	eval-rmse:0.41980	train-rmse:0.31601
[210]	eval-rmse:0.41979	train-rmse:0.31554
[211]	eval-rmse:0.41967	train-rmse:0.31487
[212]	eval-rmse:0.41947	train-rmse:0.31425
[213]	eval-rmse:0.41862	train-rmse:0.31263
[214]	eval-rmse:0.41851	train-rmse:0.31205
[215]	eval-rmse:0.41848	train-rmse:0.31167
[216]	eval-rmse:0.41838	train-rmse:0.31143
[217]	eval-

[385]	eval-rmse:0.40700	train-rmse:0.24868
[386]	eval-rmse:0.40700	train-rmse:0.24857
[387]	eval-rmse:0.40698	train-rmse:0.24841
[388]	eval-rmse:0.40695	train-rmse:0.24799
[389]	eval-rmse:0.40661	train-rmse:0.24713
[390]	eval-rmse:0.40661	train-rmse:0.24697
[391]	eval-rmse:0.40660	train-rmse:0.24680
[392]	eval-rmse:0.40660	train-rmse:0.24669
[393]	eval-rmse:0.40659	train-rmse:0.24639
[394]	eval-rmse:0.40658	train-rmse:0.24623
[395]	eval-rmse:0.40657	train-rmse:0.24599
[396]	eval-rmse:0.40656	train-rmse:0.24597
[397]	eval-rmse:0.40656	train-rmse:0.24567
[398]	eval-rmse:0.40654	train-rmse:0.24552
[399]	eval-rmse:0.40654	train-rmse:0.24544
[400]	eval-rmse:0.40654	train-rmse:0.24515
[401]	eval-rmse:0.40652	train-rmse:0.24492
[402]	eval-rmse:0.40650	train-rmse:0.24467
[403]	eval-rmse:0.40648	train-rmse:0.24451
[404]	eval-rmse:0.40647	train-rmse:0.24436
[405]	eval-rmse:0.40644	train-rmse:0.24407
[406]	eval-rmse:0.40643	train-rmse:0.24383
[407]	eval-rmse:0.40642	train-rmse:0.24352
[408]	eval-

[576]	eval-rmse:0.40340	train-rmse:0.21436
[577]	eval-rmse:0.40339	train-rmse:0.21421
[578]	eval-rmse:0.40339	train-rmse:0.21409
[579]	eval-rmse:0.40334	train-rmse:0.21390
[580]	eval-rmse:0.40334	train-rmse:0.21374
[581]	eval-rmse:0.40334	train-rmse:0.21365
[582]	eval-rmse:0.40334	train-rmse:0.21349
[583]	eval-rmse:0.40333	train-rmse:0.21331
[584]	eval-rmse:0.40323	train-rmse:0.21301
[585]	eval-rmse:0.40322	train-rmse:0.21300
[586]	eval-rmse:0.40322	train-rmse:0.21284
[587]	eval-rmse:0.40323	train-rmse:0.21276
[588]	eval-rmse:0.40322	train-rmse:0.21267
[589]	eval-rmse:0.40322	train-rmse:0.21258
[590]	eval-rmse:0.40317	train-rmse:0.21235
[591]	eval-rmse:0.40316	train-rmse:0.21199
[592]	eval-rmse:0.40316	train-rmse:0.21193
[593]	eval-rmse:0.40316	train-rmse:0.21180
[594]	eval-rmse:0.40314	train-rmse:0.21169
[595]	eval-rmse:0.40314	train-rmse:0.21161
[596]	eval-rmse:0.40314	train-rmse:0.21143
[597]	eval-rmse:0.40314	train-rmse:0.21139
[598]	eval-rmse:0.40313	train-rmse:0.21114
[599]	eval-

[767]	eval-rmse:0.40175	train-rmse:0.19082
[768]	eval-rmse:0.40175	train-rmse:0.19080
[769]	eval-rmse:0.40175	train-rmse:0.19072
[770]	eval-rmse:0.40175	train-rmse:0.19066
[771]	eval-rmse:0.40172	train-rmse:0.19048
[772]	eval-rmse:0.40171	train-rmse:0.19040
[773]	eval-rmse:0.40171	train-rmse:0.19034
[774]	eval-rmse:0.40170	train-rmse:0.19028
[775]	eval-rmse:0.40169	train-rmse:0.19011
[776]	eval-rmse:0.40169	train-rmse:0.18996
[777]	eval-rmse:0.40169	train-rmse:0.18980
[778]	eval-rmse:0.40168	train-rmse:0.18959
[779]	eval-rmse:0.40168	train-rmse:0.18952
[780]	eval-rmse:0.40168	train-rmse:0.18944
[781]	eval-rmse:0.40167	train-rmse:0.18929
[782]	eval-rmse:0.40168	train-rmse:0.18921
[783]	eval-rmse:0.40168	train-rmse:0.18905
[784]	eval-rmse:0.40168	train-rmse:0.18895
[785]	eval-rmse:0.40168	train-rmse:0.18886
[786]	eval-rmse:0.40168	train-rmse:0.18878
[787]	eval-rmse:0.40164	train-rmse:0.18867
[788]	eval-rmse:0.40164	train-rmse:0.18857
[789]	eval-rmse:0.40164	train-rmse:0.18842
[790]	eval-

[958]	eval-rmse:0.40117	train-rmse:0.17315
[959]	eval-rmse:0.40117	train-rmse:0.17305
[960]	eval-rmse:0.40117	train-rmse:0.17291
[961]	eval-rmse:0.40117	train-rmse:0.17286
[962]	eval-rmse:0.40116	train-rmse:0.17278
[963]	eval-rmse:0.40117	train-rmse:0.17267
[964]	eval-rmse:0.40117	train-rmse:0.17266
[965]	eval-rmse:0.40117	train-rmse:0.17251
[966]	eval-rmse:0.40117	train-rmse:0.17247
[967]	eval-rmse:0.40117	train-rmse:0.17237
[968]	eval-rmse:0.40117	train-rmse:0.17229
[969]	eval-rmse:0.40117	train-rmse:0.17223
[970]	eval-rmse:0.40117	train-rmse:0.17217
[971]	eval-rmse:0.40116	train-rmse:0.17203
[972]	eval-rmse:0.40116	train-rmse:0.17199
[973]	eval-rmse:0.40116	train-rmse:0.17189
[974]	eval-rmse:0.40116	train-rmse:0.17183
[975]	eval-rmse:0.40116	train-rmse:0.17174
[976]	eval-rmse:0.40116	train-rmse:0.17165
[977]	eval-rmse:0.40116	train-rmse:0.17149
[978]	eval-rmse:0.40116	train-rmse:0.17143
[979]	eval-rmse:0.40117	train-rmse:0.17135
[980]	eval-rmse:0.40117	train-rmse:0.17128
[981]	eval-

[1146]	eval-rmse:0.40082	train-rmse:0.15791
[1147]	eval-rmse:0.40083	train-rmse:0.15789
[1148]	eval-rmse:0.40082	train-rmse:0.15780
[1149]	eval-rmse:0.40083	train-rmse:0.15763
[1150]	eval-rmse:0.40083	train-rmse:0.15757
[1151]	eval-rmse:0.40082	train-rmse:0.15750
[1152]	eval-rmse:0.40082	train-rmse:0.15741
[1153]	eval-rmse:0.40082	train-rmse:0.15737
[1154]	eval-rmse:0.40082	train-rmse:0.15722
[1155]	eval-rmse:0.40082	train-rmse:0.15707
[1156]	eval-rmse:0.40082	train-rmse:0.15698
[1157]	eval-rmse:0.40081	train-rmse:0.15694
[1158]	eval-rmse:0.40081	train-rmse:0.15682
[1159]	eval-rmse:0.40081	train-rmse:0.15675
[1160]	eval-rmse:0.40081	train-rmse:0.15666
[1161]	eval-rmse:0.40081	train-rmse:0.15662
[1162]	eval-rmse:0.40081	train-rmse:0.15652
[1163]	eval-rmse:0.40081	train-rmse:0.15646
[1164]	eval-rmse:0.40081	train-rmse:0.15637
[1165]	eval-rmse:0.40080	train-rmse:0.15628
[1166]	eval-rmse:0.40080	train-rmse:0.15611
[1167]	eval-rmse:0.40080	train-rmse:0.15600
[1168]	eval-rmse:0.40080	train-r

[1333]	eval-rmse:0.40068	train-rmse:0.14496
[1334]	eval-rmse:0.40068	train-rmse:0.14491
[1335]	eval-rmse:0.40068	train-rmse:0.14490
[1336]	eval-rmse:0.40068	train-rmse:0.14485
[1337]	eval-rmse:0.40068	train-rmse:0.14484
[1338]	eval-rmse:0.40068	train-rmse:0.14482
[1339]	eval-rmse:0.40068	train-rmse:0.14475
[1340]	eval-rmse:0.40068	train-rmse:0.14471
[1341]	eval-rmse:0.40068	train-rmse:0.14466
[1342]	eval-rmse:0.40068	train-rmse:0.14461
[1343]	eval-rmse:0.40068	train-rmse:0.14457
[1344]	eval-rmse:0.40069	train-rmse:0.14446
[1345]	eval-rmse:0.40069	train-rmse:0.14441
[1346]	eval-rmse:0.40069	train-rmse:0.14438
[1347]	eval-rmse:0.40069	train-rmse:0.14431
[1348]	eval-rmse:0.40069	train-rmse:0.14426
[1349]	eval-rmse:0.40069	train-rmse:0.14425
[1350]	eval-rmse:0.40069	train-rmse:0.14419
[1351]	eval-rmse:0.40068	train-rmse:0.14414
[1352]	eval-rmse:0.40068	train-rmse:0.14401
[1353]	eval-rmse:0.40068	train-rmse:0.14398
[1354]	eval-rmse:0.40068	train-rmse:0.14394
[1355]	eval-rmse:0.40069	train-r

[1520]	eval-rmse:0.40066	train-rmse:0.13415
[1521]	eval-rmse:0.40066	train-rmse:0.13410
[1522]	eval-rmse:0.40066	train-rmse:0.13407
[1523]	eval-rmse:0.40066	train-rmse:0.13402
[1524]	eval-rmse:0.40066	train-rmse:0.13394
[1525]	eval-rmse:0.40066	train-rmse:0.13390
[1526]	eval-rmse:0.40066	train-rmse:0.13385
[1527]	eval-rmse:0.40066	train-rmse:0.13378
[1528]	eval-rmse:0.40066	train-rmse:0.13372
[1529]	eval-rmse:0.40066	train-rmse:0.13367
[1530]	eval-rmse:0.40066	train-rmse:0.13366
[1531]	eval-rmse:0.40066	train-rmse:0.13363
[1532]	eval-rmse:0.40066	train-rmse:0.13359
[1533]	eval-rmse:0.40066	train-rmse:0.13355
[1534]	eval-rmse:0.40066	train-rmse:0.13346
[1535]	eval-rmse:0.40066	train-rmse:0.13336
[1536]	eval-rmse:0.40066	train-rmse:0.13332
[1537]	eval-rmse:0.40066	train-rmse:0.13327
[1538]	eval-rmse:0.40066	train-rmse:0.13320
[1539]	eval-rmse:0.40066	train-rmse:0.13313
[1540]	eval-rmse:0.40065	train-rmse:0.13308
[1541]	eval-rmse:0.40066	train-rmse:0.13301
[1542]	eval-rmse:0.40066	train-r

[1707]	eval-rmse:0.40061	train-rmse:0.12407
[1708]	eval-rmse:0.40061	train-rmse:0.12403
[1709]	eval-rmse:0.40061	train-rmse:0.12395
[1710]	eval-rmse:0.40061	train-rmse:0.12390
[1711]	eval-rmse:0.40061	train-rmse:0.12387
[1712]	eval-rmse:0.40061	train-rmse:0.12385
[1713]	eval-rmse:0.40061	train-rmse:0.12379
[1714]	eval-rmse:0.40061	train-rmse:0.12367
[1715]	eval-rmse:0.40061	train-rmse:0.12359
[1716]	eval-rmse:0.40061	train-rmse:0.12356
[1717]	eval-rmse:0.40061	train-rmse:0.12352
[1718]	eval-rmse:0.40061	train-rmse:0.12351
[1719]	eval-rmse:0.40061	train-rmse:0.12347
[1720]	eval-rmse:0.40061	train-rmse:0.12343
[1721]	eval-rmse:0.40061	train-rmse:0.12339
[1722]	eval-rmse:0.40060	train-rmse:0.12332
[1723]	eval-rmse:0.40060	train-rmse:0.12327
[1724]	eval-rmse:0.40060	train-rmse:0.12323
[1725]	eval-rmse:0.40060	train-rmse:0.12318
[1726]	eval-rmse:0.40060	train-rmse:0.12303
[1727]	eval-rmse:0.40060	train-rmse:0.12298
[1728]	eval-rmse:0.40060	train-rmse:0.12293
[1729]	eval-rmse:0.40060	train-r

[1894]	eval-rmse:0.40058	train-rmse:0.11479
[1895]	eval-rmse:0.40059	train-rmse:0.11474
[1896]	eval-rmse:0.40059	train-rmse:0.11472
[1897]	eval-rmse:0.40059	train-rmse:0.11470
[1898]	eval-rmse:0.40059	train-rmse:0.11466
[1899]	eval-rmse:0.40059	train-rmse:0.11461
[1900]	eval-rmse:0.40059	train-rmse:0.11460
[1901]	eval-rmse:0.40059	train-rmse:0.11457
[1902]	eval-rmse:0.40059	train-rmse:0.11452
[1903]	eval-rmse:0.40059	train-rmse:0.11448
[1904]	eval-rmse:0.40058	train-rmse:0.11443
[1905]	eval-rmse:0.40058	train-rmse:0.11441
[1906]	eval-rmse:0.40058	train-rmse:0.11433
[1907]	eval-rmse:0.40058	train-rmse:0.11429
[1908]	eval-rmse:0.40059	train-rmse:0.11422
[1909]	eval-rmse:0.40059	train-rmse:0.11418
[1910]	eval-rmse:0.40059	train-rmse:0.11416
[1911]	eval-rmse:0.40059	train-rmse:0.11411
[1912]	eval-rmse:0.40058	train-rmse:0.11406
[1913]	eval-rmse:0.40058	train-rmse:0.11399
[1914]	eval-rmse:0.40058	train-rmse:0.11396
[1915]	eval-rmse:0.40058	train-rmse:0.11393
[1916]	eval-rmse:0.40058	train-r

In [21]:
#Test predictions
pred = np.exp(gbm.predict(xgb.DMatrix(X_test))) - 1

In [22]:
#mean absolute error
mae = (abs(pred - y_test)).mean()
mae

294.10121522035763

In [56]:
#mean squared error
mse = ((pred - y_test)**2).mean()
mse

58474109.64396217

In [57]:
#feature scores
feature_scores = gbm.get_fscore()
feature_scores

{'dropoff_longitude': 1050549,
 'dropoff_latitude': 1113878,
 'pickup_latitude': 1149541,
 'pickup_minute': 677938,
 'pickup_month': 306068,
 'pickup_day': 530039,
 'pickup_longitude': 1245797,
 'pickup_weekday': 290523,
 'pickup_hour': 515341}

In [58]:
#scale features
summ = 0
for key in feature_scores:
    summ = summ + feature_scores[key]

for key in feature_scores:
    feature_scores[key] = feature_scores[key] / summ

feature_scores

{'dropoff_longitude': 0.1527033112324799,
 'dropoff_latitude': 0.16190854392228468,
 'pickup_latitude': 0.16709236513241763,
 'pickup_minute': 0.09854216929464972,
 'pickup_month': 0.044488735948825485,
 'pickup_day': 0.07704420296659406,
 'pickup_longitude': 0.18108372576956408,
 'pickup_weekday': 0.04222918120829563,
 'pickup_hour': 0.07490776452488883}

In [59]:
#save the model to be used
filename = "xgb_model.sav"
pickle.dump(gbm, open(filename, 'wb'))