In [1]:
import zipfile
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
import xgboost as xgb

import pickle

In [2]:
#read the raw training and test tables from the data directory
taxi_train, taxi_test = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
#summary of trip_duration divided by 3600 (hours, assuming the raw column is in seconds)
taxi_train['trip_duration'].div(3600).describe()

count    1.458644e+06
mean     2.665256e-01
std      1.454842e+00
min      2.777778e-04
25%      1.102778e-01
50%      1.838889e-01
75%      2.986111e-01
max      9.795228e+02
Name: trip_duration, dtype: float64

In [3]:
def clean_df(input_df):
    """Return the model-ready columns of a raw taxi-trip frame.

    Parses ``pickup_datetime`` and derives the pickup minute, hour, month,
    day of month and weekday (Monday=0) from it, then keeps only the
    coordinate columns, the derived time columns and ``trip_duration``.
    Columns from that list that are absent from the input (for example
    ``trip_duration``) are skipped rather than raising.

    The input frame is left untouched; a new DataFrame is returned.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain a ``pickup_datetime`` column that ``pd.to_datetime``
        can parse.

    Returns
    -------
    pandas.DataFrame
        The columns named in ``col_list`` that exist once the time
        features have been added.
    """
    col_list = ['pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
        'trip_duration',
        'pickup_minute',
        'pickup_hour',
        'pickup_month',
        'pickup_day',
        'pickup_weekday']

    #parse once and read the components through the vectorised .dt accessor
    #instead of running a Python lambda on every row
    pickup = pd.to_datetime(input_df['pickup_datetime'])

    #assign() builds a new frame, so the caller's DataFrame is not mutated
    df = input_df.assign(
        pickup_minute=pickup.dt.minute,
        pickup_hour=pickup.dt.hour,
        pickup_month=pickup.dt.month,
        pickup_day=pickup.dt.day,
        pickup_weekday=pickup.dt.weekday,
    )

    return df[df.columns.intersection(col_list)]

In [4]:
#build the model-ready feature table from the raw training data
taxi_train_clean = clean_df(input_df=taxi_train)

In [5]:
#apply the same cleaning to the raw test data
taxi_test_clean = clean_df(input_df=taxi_test)

In [6]:
#only the cleaned training table is modelled below; an earlier variant
#stacked the cleaned train and test tables with pd.concat(..., axis = 0)
hold = taxi_train_clean

In [8]:
#preview the first rows of the modelling table
hold.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,pickup_minute,pickup_hour,pickup_month,pickup_day,pickup_weekday
0,-73.982155,40.767937,-73.96463,40.765602,455,24,17,3,14,0
1,-73.980415,40.738564,-73.999481,40.731152,663,43,0,6,12,6
2,-73.979027,40.763939,-74.005333,40.710087,2124,35,11,1,19,1
3,-74.01004,40.719971,-74.012268,40.706718,429,32,19,4,6,2
4,-73.973053,40.793209,-73.972923,40.78252,435,30,13,3,26,5


In [7]:
#separate the target (trip_duration) from the feature columns
y = hold["trip_duration"]
X = hold.drop(columns=["trip_duration"])

In [8]:
#Split the data into training, test, and validation sets:
#20% is held out for testing, then 20% of the remainder for validation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=4321
)

In [14]:
#grading: root mean squared log error
def rmsle(y_true, y_pred):
    """Return the square root of sklearn's mean squared log error.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [15]:
#XGBoost parameters
params = {
    'booster':            'gbtree',
    #'reg:squarederror' is the current name of the deprecated 'reg:linear' alias
    'objective':          'reg:squarederror',
    'learning_rate':      0.05,
    'max_depth':          14,
    'subsample':          0.9,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    #'verbosity': 0 replaces the deprecated 'silent': 1
    'verbosity':          0,
    #the labels are log(y + 1), so RMSE on that scale is the RMSLE of the
    #durations; the former 'feval': 'rmsle' entry was not a booster parameter
    #and a string cannot act as a custom metric, so it had no effect
    'eval_metric':        'rmse'
}

In [16]:
#number of boosting rounds, passed to xgb.train as num_boost_round
nrounds = 2500

In [19]:
#Define train and validation sets; the target is trained on the log(y + 1) scale
dtrain = xgb.DMatrix(X_train, label=np.log(y_train + 1))
dval = xgb.DMatrix(X_val, label=np.log(y_val + 1))

#error tracking: both sets are handed to xgb.train as evaluation sets
watchlist = [(dval, 'eval'), (dtrain, 'train')]

In [1]:
#Train model; verbose_eval=True reports the watchlist metrics while boosting
gbm = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=nrounds,
    evals=watchlist,
    verbose_eval=True,
)

In [21]:
#Test predictions
#the model was fit on log(y + 1) targets, so exp(...) - 1 maps the
#predictions back to the original trip_duration scale
pred = np.exp(gbm.predict(xgb.DMatrix(X_test))) - 1

In [22]:
#mean absolute error on the original trip_duration scale
mae = (pred - y_test).abs().mean()
mae

294.10121522035763

In [56]:
#mean squared error on the original trip_duration scale
mse = (pred - y_test).pow(2).mean()
mse

58474109.64396217

In [57]:
#feature scores
#get_fscore() returns a dict keyed by feature name; presumably the values are
#split counts ("weight" importance) - confirm against the xgboost docs
feature_scores = gbm.get_fscore()
feature_scores

{'dropoff_longitude': 1050549,
 'dropoff_latitude': 1113878,
 'pickup_latitude': 1149541,
 'pickup_minute': 677938,
 'pickup_month': 306068,
 'pickup_day': 530039,
 'pickup_longitude': 1245797,
 'pickup_weekday': 290523,
 'pickup_hour': 515341}

In [58]:
#scale feature scores so that they sum to 1
feat_sum = sum(feature_scores.values())

#the comprehension is fully built before update() runs, so the dict is
#rewritten in place with every score divided by the same total
feature_scores.update(
    {name: score / feat_sum for name, score in feature_scores.items()}
)

feature_scores

{'dropoff_longitude': 0.1527033112324799,
 'dropoff_latitude': 0.16190854392228468,
 'pickup_latitude': 0.16709236513241763,
 'pickup_minute': 0.09854216929464972,
 'pickup_month': 0.044488735948825485,
 'pickup_day': 0.07704420296659406,
 'pickup_longitude': 0.18108372576956408,
 'pickup_weekday': 0.04222918120829563,
 'pickup_hour': 0.07490776452488883}

In [59]:
#save the model to be used
filename = "xgb_model.sav"
#the context manager closes the file handle even if pickling fails;
#the original open(...) call left the handle to be closed by the garbage collector
#NOTE(review): a pickled Booster may not load under a different xgboost version -
#consider gbm.save_model() if the file has to outlive this environment
with open(filename, 'wb') as model_file:
    pickle.dump(gbm, model_file)