In [21]:
import zipfile
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw train / test CSV files into DataFrames
TRAIN_CSV = 'data/train.csv'
TEST_CSV = 'data/test.csv'

taxi_train = pd.read_csv(TRAIN_CSV)
taxi_test = pd.read_csv(TEST_CSV)

In [15]:
# Summary statistics of trip_duration divided by 3600 (presumably seconds -> hours).
# In the recorded output the max is ~980 against a 75th percentile of ~0.30,
# so the column contains extreme outliers.
taxi_train['trip_duration'].div(3600).describe()

count    1.458644e+06
mean     2.665256e-01
std      1.454842e+00
min      2.777778e-04
25%      1.102778e-01
50%      1.838889e-01
75%      2.986111e-01
max      9.795228e+02
Name: trip_duration, dtype: float64

In [29]:
# taxi_train.sort_values(by='trip_duration', ascending=False)

In [3]:
def clean_df(x, cols):
    """Derive pickup-time features and keep only the requested columns.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a ``pickup_datetime`` column that ``pd.to_datetime`` can
        parse. It is not modified.
    cols : list-like of str
        Column names to keep. Names that are absent from the frame (for
        example ``trip_duration`` in a test set) are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns of ``x``, plus the derived
        ``pickup_hour``, ``pickup_month``, ``pickup_day`` and
        ``pickup_weekday`` columns, restricted to those named in ``cols``.
    """
    # Parse once and reuse; the vectorized .dt accessor replaces four
    # row-by-row apply() passes over the whole frame.
    pickup = pd.to_datetime(x['pickup_datetime'])

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    df = x.assign(
        pickup_datetime_hold=pickup,
        pickup_hour=pickup.dt.hour,
        pickup_month=pickup.dt.month,
        pickup_day=pickup.dt.day,
        pickup_weekday=pickup.dt.weekday,  # Monday == 0 ... Sunday == 6
    )

    return df[df.columns.intersection(cols)]

In [4]:
# Columns kept by clean_df: coordinates, the target, and the derived pickup-time features
col_list = [
    # pickup / dropoff coordinates
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    # target
    'trip_duration',
    # derived from pickup_datetime
    'pickup_hour', 'pickup_month', 'pickup_day', 'pickup_weekday',
]

In [5]:
# Feature-engineered training frame restricted to col_list
taxi_train_clean = clean_df(x=taxi_train, cols=col_list)

In [6]:
# Same cleaning applied to the test frame
taxi_test_clean = clean_df(x=taxi_test, cols=col_list)

In [14]:
# hold = pd.concat([taxi_train_clean, taxi_test_clean], axis = 0)
# `hold` is bound to the cleaned training frame itself (an alias, not a copy);
# the concat with the test frame above is disabled.
hold = taxi_train_clean

In [15]:
# Preview the first five rows of the modelling frame
hold.head(n=5)

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,pickup_hour,pickup_month,pickup_day,pickup_weekday
0,-73.982155,40.767937,-73.96463,40.765602,455,17,3,14,0
1,-73.980415,40.738564,-73.999481,40.731152,663,0,6,12,6
2,-73.979027,40.763939,-74.005333,40.710087,2124,11,1,19,1
3,-74.01004,40.719971,-74.012268,40.706718,429,19,4,6,2
4,-73.973053,40.793209,-73.972923,40.78252,435,13,3,26,5


In [17]:
# Separate the feature matrix from the target column
target_col = "trip_duration"
X = hold.drop(columns=[target_col])
y = hold[target_col]

In [22]:
# Split the data into training, test, and validation sets:
# 30% of the rows are held out for test, then 25% of the remainder for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1234
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=4321
)

In [23]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between two sets of values.

    Computes ``sqrt(mean((log(1 + y_pred) - log(1 + y_true)) ** 2))``.

    Implemented with NumPy only: ``sklearn.metrics`` has no function called
    ``meansquaredlogerror`` (its name is ``mean_squared_log_error``), so that
    import fails on a fresh kernel.

    Parameters
    ----------
    y_true, y_pred : array-like
        Non-negative values of identical shape.

    Returns
    -------
    float
        The RMSLE.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, or contain negative values
        (the logarithmic error is undefined there).
    """
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)

    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            "{} and {}".format(true_arr.shape, pred_arr.shape)
        )
    if true_arr.size == 0:
        raise ValueError("rmsle requires at least one sample")
    if (true_arr < 0).any() or (pred_arr < 0).any():
        raise ValueError("rmsle cannot be used when values are negative")

    # log1p(v) == log(1 + v), but stays accurate for values close to zero
    log_diff = np.log1p(pred_arr) - np.log1p(true_arr)
    return float(np.sqrt(np.mean(np.square(log_diff))))

In [24]:
#XGBoost parameters
# No 'feval' entry: feval is a callable argument of xgb.train, not a booster
# parameter, so a string placed here is never used as a metric (the training
# log reports rmse only). The labels are log(duration + 1), so the default
# rmse on them is already the RMSLE of the raw durations.
params = {
    'booster':            'gbtree',
    # 'reg:linear' is the deprecated alias of this objective
    'objective':          'reg:squarederror',
    'learning_rate':      0.05,
    'max_depth':          14,
    'subsample':          0.9,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    # replaces the deprecated 'silent': 1 (0 == no library messages)
    'verbosity':          0,
}

In [25]:
# Number of boosting rounds handed to xgb.train as num_boost_round
nrounds = 2_000

In [26]:
# Wrap the train / validation splits as DMatrix objects; labels are log(duration + 1)
dtrain = xgb.DMatrix(X_train, label=np.log(y_train + 1))
dval = xgb.DMatrix(X_val, label=np.log(y_val + 1))

# (DMatrix, name) pairs whose error is reported during training
watchlist = [(dval, 'eval'), (dtrain, 'train')]

In [27]:
# Fit the booster, printing the watchlist metrics after every round.
# NOTE(review): no early stopping is configured. In the recorded log eval-rmse
# only moves from 0.40083 (round 958) to 0.39960 (round 1915) while train-rmse
# keeps falling from 0.18598 to 0.12955.
gbm = xgb.train(
    params,
    dtrain,
    num_boost_round=nrounds,
    evals=watchlist,
    verbose_eval=True,
)

[0]	eval-rmse:5.72253	train-rmse:5.72155
[1]	eval-rmse:5.44041	train-rmse:5.43942
[2]	eval-rmse:5.17199	train-rmse:5.17103
[3]	eval-rmse:4.91840	train-rmse:4.91743
[4]	eval-rmse:4.67589	train-rmse:4.67502
[5]	eval-rmse:4.44603	train-rmse:4.44515
[6]	eval-rmse:4.22907	train-rmse:4.22820
[7]	eval-rmse:4.02170	train-rmse:4.02087
[8]	eval-rmse:3.82576	train-rmse:3.82494
[9]	eval-rmse:3.63891	train-rmse:3.63806
[10]	eval-rmse:3.46157	train-rmse:3.46074
[11]	eval-rmse:3.29469	train-rmse:3.29383
[12]	eval-rmse:3.13657	train-rmse:3.13571
[13]	eval-rmse:2.98500	train-rmse:2.98411
[14]	eval-rmse:2.84269	train-rmse:2.84174
[15]	eval-rmse:2.70676	train-rmse:2.70578
[16]	eval-rmse:2.57906	train-rmse:2.57804
[17]	eval-rmse:2.45804	train-rmse:2.45697
[18]	eval-rmse:2.34337	train-rmse:2.34224
[19]	eval-rmse:2.23308	train-rmse:2.23190
[20]	eval-rmse:2.12850	train-rmse:2.12725
[21]	eval-rmse:2.03110	train-rmse:2.02975
[22]	eval-rmse:1.93843	train-rmse:1.93699
[23]	eval-rmse:1.84949	train-rmse:1.84791
[2

[194]	eval-rmse:0.43101	train-rmse:0.34547
[195]	eval-rmse:0.43089	train-rmse:0.34490
[196]	eval-rmse:0.43088	train-rmse:0.34435
[197]	eval-rmse:0.43086	train-rmse:0.34390
[198]	eval-rmse:0.43063	train-rmse:0.34340
[199]	eval-rmse:0.43061	train-rmse:0.34312
[200]	eval-rmse:0.43056	train-rmse:0.34297
[201]	eval-rmse:0.43029	train-rmse:0.34221
[202]	eval-rmse:0.43003	train-rmse:0.34154
[203]	eval-rmse:0.43001	train-rmse:0.34145
[204]	eval-rmse:0.42998	train-rmse:0.34097
[205]	eval-rmse:0.42996	train-rmse:0.34069
[206]	eval-rmse:0.42991	train-rmse:0.34019
[207]	eval-rmse:0.42947	train-rmse:0.33891
[208]	eval-rmse:0.42789	train-rmse:0.33677
[209]	eval-rmse:0.42774	train-rmse:0.33642
[210]	eval-rmse:0.42771	train-rmse:0.33599
[211]	eval-rmse:0.42768	train-rmse:0.33562
[212]	eval-rmse:0.42755	train-rmse:0.33505
[213]	eval-rmse:0.42644	train-rmse:0.33323
[214]	eval-rmse:0.42632	train-rmse:0.33282
[215]	eval-rmse:0.42628	train-rmse:0.33244
[216]	eval-rmse:0.42625	train-rmse:0.33212
[217]	eval-

[385]	eval-rmse:0.41057	train-rmse:0.26701
[386]	eval-rmse:0.41056	train-rmse:0.26687
[387]	eval-rmse:0.41053	train-rmse:0.26667
[388]	eval-rmse:0.41051	train-rmse:0.26649
[389]	eval-rmse:0.41022	train-rmse:0.26602
[390]	eval-rmse:0.41022	train-rmse:0.26591
[391]	eval-rmse:0.41018	train-rmse:0.26559
[392]	eval-rmse:0.41017	train-rmse:0.26537
[393]	eval-rmse:0.41015	train-rmse:0.26518
[394]	eval-rmse:0.41014	train-rmse:0.26499
[395]	eval-rmse:0.41008	train-rmse:0.26462
[396]	eval-rmse:0.41008	train-rmse:0.26445
[397]	eval-rmse:0.41005	train-rmse:0.26416
[398]	eval-rmse:0.41000	train-rmse:0.26395
[399]	eval-rmse:0.41000	train-rmse:0.26366
[400]	eval-rmse:0.41000	train-rmse:0.26349
[401]	eval-rmse:0.40997	train-rmse:0.26335
[402]	eval-rmse:0.40994	train-rmse:0.26309
[403]	eval-rmse:0.40993	train-rmse:0.26293
[404]	eval-rmse:0.40993	train-rmse:0.26279
[405]	eval-rmse:0.40992	train-rmse:0.26249
[406]	eval-rmse:0.40989	train-rmse:0.26224
[407]	eval-rmse:0.40984	train-rmse:0.26196
[408]	eval-

[576]	eval-rmse:0.40462	train-rmse:0.22932
[577]	eval-rmse:0.40462	train-rmse:0.22929
[578]	eval-rmse:0.40462	train-rmse:0.22917
[579]	eval-rmse:0.40462	train-rmse:0.22898
[580]	eval-rmse:0.40462	train-rmse:0.22885
[581]	eval-rmse:0.40462	train-rmse:0.22864
[582]	eval-rmse:0.40462	train-rmse:0.22855
[583]	eval-rmse:0.40461	train-rmse:0.22843
[584]	eval-rmse:0.40461	train-rmse:0.22826
[585]	eval-rmse:0.40461	train-rmse:0.22822
[586]	eval-rmse:0.40460	train-rmse:0.22806
[587]	eval-rmse:0.40460	train-rmse:0.22795
[588]	eval-rmse:0.40461	train-rmse:0.22769
[589]	eval-rmse:0.40460	train-rmse:0.22755
[590]	eval-rmse:0.40460	train-rmse:0.22747
[591]	eval-rmse:0.40459	train-rmse:0.22736
[592]	eval-rmse:0.40459	train-rmse:0.22723
[593]	eval-rmse:0.40458	train-rmse:0.22704
[594]	eval-rmse:0.40448	train-rmse:0.22668
[595]	eval-rmse:0.40449	train-rmse:0.22658
[596]	eval-rmse:0.40449	train-rmse:0.22641
[597]	eval-rmse:0.40449	train-rmse:0.22633
[598]	eval-rmse:0.40448	train-rmse:0.22620
[599]	eval-

[767]	eval-rmse:0.40241	train-rmse:0.20475
[768]	eval-rmse:0.40240	train-rmse:0.20465
[769]	eval-rmse:0.40231	train-rmse:0.20438
[770]	eval-rmse:0.40231	train-rmse:0.20430
[771]	eval-rmse:0.40231	train-rmse:0.20417
[772]	eval-rmse:0.40230	train-rmse:0.20408
[773]	eval-rmse:0.40229	train-rmse:0.20399
[774]	eval-rmse:0.40229	train-rmse:0.20389
[775]	eval-rmse:0.40212	train-rmse:0.20350
[776]	eval-rmse:0.40187	train-rmse:0.20290
[777]	eval-rmse:0.40186	train-rmse:0.20272
[778]	eval-rmse:0.40186	train-rmse:0.20265
[779]	eval-rmse:0.40175	train-rmse:0.20241
[780]	eval-rmse:0.40175	train-rmse:0.20234
[781]	eval-rmse:0.40175	train-rmse:0.20226
[782]	eval-rmse:0.40175	train-rmse:0.20217
[783]	eval-rmse:0.40175	train-rmse:0.20203
[784]	eval-rmse:0.40174	train-rmse:0.20188
[785]	eval-rmse:0.40174	train-rmse:0.20181
[786]	eval-rmse:0.40174	train-rmse:0.20176
[787]	eval-rmse:0.40173	train-rmse:0.20165
[788]	eval-rmse:0.40173	train-rmse:0.20153
[789]	eval-rmse:0.40173	train-rmse:0.20146
[790]	eval-

[958]	eval-rmse:0.40083	train-rmse:0.18598
[959]	eval-rmse:0.40083	train-rmse:0.18587
[960]	eval-rmse:0.40080	train-rmse:0.18571
[961]	eval-rmse:0.40081	train-rmse:0.18559
[962]	eval-rmse:0.40080	train-rmse:0.18552
[963]	eval-rmse:0.40080	train-rmse:0.18547
[964]	eval-rmse:0.40081	train-rmse:0.18543
[965]	eval-rmse:0.40080	train-rmse:0.18537
[966]	eval-rmse:0.40080	train-rmse:0.18531
[967]	eval-rmse:0.40080	train-rmse:0.18526
[968]	eval-rmse:0.40080	train-rmse:0.18522
[969]	eval-rmse:0.40080	train-rmse:0.18517
[970]	eval-rmse:0.40080	train-rmse:0.18510
[971]	eval-rmse:0.40080	train-rmse:0.18507
[972]	eval-rmse:0.40080	train-rmse:0.18500
[973]	eval-rmse:0.40080	train-rmse:0.18491
[974]	eval-rmse:0.40080	train-rmse:0.18484
[975]	eval-rmse:0.40080	train-rmse:0.18475
[976]	eval-rmse:0.40080	train-rmse:0.18472
[977]	eval-rmse:0.40081	train-rmse:0.18464
[978]	eval-rmse:0.40081	train-rmse:0.18457
[979]	eval-rmse:0.40080	train-rmse:0.18448
[980]	eval-rmse:0.40080	train-rmse:0.18442
[981]	eval-

[1146]	eval-rmse:0.40052	train-rmse:0.17240
[1147]	eval-rmse:0.40051	train-rmse:0.17228
[1148]	eval-rmse:0.40052	train-rmse:0.17221
[1149]	eval-rmse:0.40051	train-rmse:0.17218
[1150]	eval-rmse:0.40048	train-rmse:0.17208
[1151]	eval-rmse:0.40045	train-rmse:0.17198
[1152]	eval-rmse:0.40045	train-rmse:0.17183
[1153]	eval-rmse:0.40045	train-rmse:0.17178
[1154]	eval-rmse:0.40045	train-rmse:0.17170
[1155]	eval-rmse:0.40046	train-rmse:0.17162
[1156]	eval-rmse:0.40046	train-rmse:0.17158
[1157]	eval-rmse:0.40046	train-rmse:0.17153
[1158]	eval-rmse:0.40045	train-rmse:0.17139
[1159]	eval-rmse:0.40045	train-rmse:0.17131
[1160]	eval-rmse:0.40045	train-rmse:0.17130
[1161]	eval-rmse:0.40042	train-rmse:0.17120
[1162]	eval-rmse:0.40042	train-rmse:0.17117
[1163]	eval-rmse:0.40042	train-rmse:0.17111
[1164]	eval-rmse:0.40042	train-rmse:0.17102
[1165]	eval-rmse:0.40040	train-rmse:0.17094
[1166]	eval-rmse:0.40039	train-rmse:0.17088
[1167]	eval-rmse:0.40039	train-rmse:0.17086
[1168]	eval-rmse:0.40039	train-r

[1333]	eval-rmse:0.40006	train-rmse:0.16011
[1334]	eval-rmse:0.40006	train-rmse:0.16005
[1335]	eval-rmse:0.40006	train-rmse:0.15999
[1336]	eval-rmse:0.40007	train-rmse:0.15997
[1337]	eval-rmse:0.40006	train-rmse:0.15993
[1338]	eval-rmse:0.40006	train-rmse:0.15986
[1339]	eval-rmse:0.40007	train-rmse:0.15982
[1340]	eval-rmse:0.40007	train-rmse:0.15974
[1341]	eval-rmse:0.40007	train-rmse:0.15963
[1342]	eval-rmse:0.40007	train-rmse:0.15957
[1343]	eval-rmse:0.40007	train-rmse:0.15947
[1344]	eval-rmse:0.40007	train-rmse:0.15941
[1345]	eval-rmse:0.40007	train-rmse:0.15939
[1346]	eval-rmse:0.40008	train-rmse:0.15930
[1347]	eval-rmse:0.40008	train-rmse:0.15925
[1348]	eval-rmse:0.40008	train-rmse:0.15920
[1349]	eval-rmse:0.40008	train-rmse:0.15914
[1350]	eval-rmse:0.40008	train-rmse:0.15906
[1351]	eval-rmse:0.40007	train-rmse:0.15901
[1352]	eval-rmse:0.40007	train-rmse:0.15891
[1353]	eval-rmse:0.40007	train-rmse:0.15886
[1354]	eval-rmse:0.40007	train-rmse:0.15876
[1355]	eval-rmse:0.40007	train-r

[1520]	eval-rmse:0.39975	train-rmse:0.14910
[1521]	eval-rmse:0.39975	train-rmse:0.14905
[1522]	eval-rmse:0.39975	train-rmse:0.14900
[1523]	eval-rmse:0.39975	train-rmse:0.14899
[1524]	eval-rmse:0.39975	train-rmse:0.14892
[1525]	eval-rmse:0.39973	train-rmse:0.14885
[1526]	eval-rmse:0.39973	train-rmse:0.14867
[1527]	eval-rmse:0.39973	train-rmse:0.14860
[1528]	eval-rmse:0.39973	train-rmse:0.14858
[1529]	eval-rmse:0.39973	train-rmse:0.14856
[1530]	eval-rmse:0.39973	train-rmse:0.14852
[1531]	eval-rmse:0.39973	train-rmse:0.14851
[1532]	eval-rmse:0.39973	train-rmse:0.14846
[1533]	eval-rmse:0.39973	train-rmse:0.14836
[1534]	eval-rmse:0.39974	train-rmse:0.14832
[1535]	eval-rmse:0.39973	train-rmse:0.14830
[1536]	eval-rmse:0.39974	train-rmse:0.14819
[1537]	eval-rmse:0.39974	train-rmse:0.14814
[1538]	eval-rmse:0.39974	train-rmse:0.14807
[1539]	eval-rmse:0.39974	train-rmse:0.14802
[1540]	eval-rmse:0.39972	train-rmse:0.14791
[1541]	eval-rmse:0.39972	train-rmse:0.14787
[1542]	eval-rmse:0.39972	train-r

[1707]	eval-rmse:0.39971	train-rmse:0.13926
[1708]	eval-rmse:0.39971	train-rmse:0.13923
[1709]	eval-rmse:0.39971	train-rmse:0.13917
[1710]	eval-rmse:0.39971	train-rmse:0.13914
[1711]	eval-rmse:0.39971	train-rmse:0.13910
[1712]	eval-rmse:0.39971	train-rmse:0.13903
[1713]	eval-rmse:0.39971	train-rmse:0.13900
[1714]	eval-rmse:0.39971	train-rmse:0.13895
[1715]	eval-rmse:0.39972	train-rmse:0.13891
[1716]	eval-rmse:0.39972	train-rmse:0.13887
[1717]	eval-rmse:0.39972	train-rmse:0.13879
[1718]	eval-rmse:0.39971	train-rmse:0.13866
[1719]	eval-rmse:0.39971	train-rmse:0.13862
[1720]	eval-rmse:0.39972	train-rmse:0.13856
[1721]	eval-rmse:0.39971	train-rmse:0.13853
[1722]	eval-rmse:0.39972	train-rmse:0.13848
[1723]	eval-rmse:0.39972	train-rmse:0.13845
[1724]	eval-rmse:0.39972	train-rmse:0.13844
[1725]	eval-rmse:0.39972	train-rmse:0.13842
[1726]	eval-rmse:0.39972	train-rmse:0.13834
[1727]	eval-rmse:0.39971	train-rmse:0.13827
[1728]	eval-rmse:0.39971	train-rmse:0.13821
[1729]	eval-rmse:0.39971	train-r

[1894]	eval-rmse:0.39966	train-rmse:0.13053
[1895]	eval-rmse:0.39966	train-rmse:0.13050
[1896]	eval-rmse:0.39966	train-rmse:0.13045
[1897]	eval-rmse:0.39966	train-rmse:0.13039
[1898]	eval-rmse:0.39964	train-rmse:0.13032
[1899]	eval-rmse:0.39965	train-rmse:0.13028
[1900]	eval-rmse:0.39965	train-rmse:0.13023
[1901]	eval-rmse:0.39964	train-rmse:0.13019
[1902]	eval-rmse:0.39964	train-rmse:0.13011
[1903]	eval-rmse:0.39965	train-rmse:0.13010
[1904]	eval-rmse:0.39963	train-rmse:0.13003
[1905]	eval-rmse:0.39963	train-rmse:0.12997
[1906]	eval-rmse:0.39963	train-rmse:0.12993
[1907]	eval-rmse:0.39961	train-rmse:0.12983
[1908]	eval-rmse:0.39961	train-rmse:0.12980
[1909]	eval-rmse:0.39961	train-rmse:0.12978
[1910]	eval-rmse:0.39961	train-rmse:0.12975
[1911]	eval-rmse:0.39961	train-rmse:0.12971
[1912]	eval-rmse:0.39961	train-rmse:0.12967
[1913]	eval-rmse:0.39961	train-rmse:0.12962
[1914]	eval-rmse:0.39961	train-rmse:0.12959
[1915]	eval-rmse:0.39960	train-rmse:0.12955
[1916]	eval-rmse:0.39960	train-r

In [28]:
# Predict on the held-out test split and undo the log(y + 1) label transform
dtest = xgb.DMatrix(X_test)
pred = np.exp(gbm.predict(dtest)) - 1

In [29]:
# Mean absolute error between predictions and y_test, as a rough error estimate
abs_errors = (pred - y_test).abs()
mae = abs_errors.mean()
mae

296.9669895181349

In [30]:
# Feature importance as reported by the booster ('weight' importance,
# which is what get_fscore() returns)
feature_scores = gbm.get_score(importance_type='weight')
feature_scores

{'dropoff_longitude': 1123467,
 'dropoff_latitude': 1214920,
 'pickup_latitude': 1192711,
 'pickup_hour': 580921,
 'pickup_day': 550885,
 'pickup_longitude': 1243541,
 'pickup_month': 364392,
 'pickup_weekday': 346368}

In [31]:
# Raw scores are hard to compare, so rescale each one to its share of the total
summ = sum(feature_scores.values())

# Only values are replaced (no keys added or removed), so updating while iterating is safe
for key, score in feature_scores.items():
    feature_scores[key] = score / summ

feature_scores

{'dropoff_longitude': 0.1697796879498217,
 'dropoff_latitude': 0.1836001756028414,
 'pickup_latitude': 0.18024392473861697,
 'pickup_hour': 0.08778948211518307,
 'pickup_day': 0.08325040557153662,
 'pickup_longitude': 0.18792541563998696,
 'pickup_month': 0.05506735849954777,
 'pickup_weekday': 0.05234354988246548}

In [39]:
import pickle

# Persist the trained booster to disk.
# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
filename = "xgb_model.sav"

# The context manager closes the file handle even if pickling fails;
# a bare open() passed straight to pickle.dump is never closed explicitly.
with open(filename, 'wb') as model_file:
    pickle.dump(gbm, model_file)