### Load libraries

In [3]:
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt
pd.options.mode.chained_assignment = None
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import cross_val_score, train_test_split
import time
import math

### Loading data

In [4]:
dfTest = pd.read_csv("Data/test.csv")
dfTrain = pd.read_csv("Data/train.csv")

### Define functions needed

In [5]:
#Adding distance in km
# credit to: https://stackoverflow.com/questions/15736995/how-can-i-quickly-estimate-the-distance-between-two-latitude-longitude-points
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    km = 6367 * c
    return km

# A function to calculate Root Mean Squared Logarithmic Error (RMSLE)
# credit: https://www.kaggle.com/marknagelberg/rmsle-function
def rmsle(y, y0, log = True):
    assert len(y) == len(y0)
    if log:
        return np.sqrt(np.mean(np.power(np.log(y+1)-np.log(y0+1), 2)))
    else:
        return np.sqrt(np.mean(np.power(y-y0, 2)))
    

### Feature engineering

In [None]:
#Concat data for featuring engineering
features = dfTest.columns
label = "trip_duration"
data = pd.concat([dfTrain[features], dfTest], keys=['train','test'])

#Add datetime infos in diff columns
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])
data['pickup_year'] = data['pickup_datetime'].dt.year
data['pickup_month'] = data['pickup_datetime'].dt.month
data['pickup_weekday'] = data['pickup_datetime'].dt.weekday
data['pickup_day'] = data['pickup_datetime'].dt.day
data['pickup_hour'] = data['pickup_datetime'].dt.hour
data['pickup_minute'] = data['pickup_datetime'].dt.minute

data.drop('pickup_datetime', axis=1, inplace=True)
data.drop('pickup_year', axis=1, inplace=True)

#Change storeflag in continue
data_dict = {'Y':1, 'N':0}
data_tf = data['store_and_fwd_flag'].map(data_dict)
data['store_and_fwd_flag'].update(data_tf)

#Add distance (takes a while)
data['distance'] = data.apply(lambda row: haversine(row['pickup_latitude'], row['pickup_longitude'], row['dropoff_latitude'], row['dropoff_longitude']), axis=1)

#Drop tails
train_set = data.loc['train']
test_set = data.loc['test']
data_temp = train_set
data_temp[label] = dfTrain[label]
data_temp = data_temp[data_temp[label] < 1800000]
target = data_temp[label]
target_log = np.log(target)
train_set = data_temp
data = train_set

#Drop id and labels
data = train_set.drop(['id', label], axis=1).astype(float)

### Train model

In [31]:
# set n_estimator=5000  to increase score
model = XGBRegressor(n_estimators=5000, max_depth=5,learning_rate=0.1, min_child_weight=1)

X_train, X_test, Y_train, Y_test = train_test_split(data, target_log, train_size=0.85, random_state=1234)

print("X_train:", X_train.shape, " Y_train:", Y_train.shape," X_test:", X_test.shape, " Y_test:", Y_test.shape)

start = time.time()
early_stopping_rounds = 50
model.fit(
    X_train, Y_train, eval_set = [(X_test, Y_test)],
    eval_metric="rmse", early_stopping_rounds=early_stopping_rounds,
    verbose=early_stopping_rounds
)
end = time.time() - start
print(end)

X_train: (1239844, 13)  Y_train: (1239844,)  X_test: (218796, 13)  Y_test: (218796,)
[0]	validation_0-rmse:5.42105
Will train until validation_0-rmse hasn't improved in 50 rounds.
[50]	validation_0-rmse:0.488814
[100]	validation_0-rmse:0.458033
[150]	validation_0-rmse:0.445328
[200]	validation_0-rmse:0.436578
[250]	validation_0-rmse:0.430399
[300]	validation_0-rmse:0.42656
[350]	validation_0-rmse:0.42254
[400]	validation_0-rmse:0.41947
[450]	validation_0-rmse:0.417358
[500]	validation_0-rmse:0.415543
[550]	validation_0-rmse:0.414311
[600]	validation_0-rmse:0.413014
[650]	validation_0-rmse:0.411849
[700]	validation_0-rmse:0.411114
[750]	validation_0-rmse:0.410466
[800]	validation_0-rmse:0.409847
[850]	validation_0-rmse:0.408962
[900]	validation_0-rmse:0.408127
[950]	validation_0-rmse:0.407167
[1000]	validation_0-rmse:0.406649
[1050]	validation_0-rmse:0.406001
[1100]	validation_0-rmse:0.405298
[1150]	validation_0-rmse:0.404763
[1200]	validation_0-rmse:0.404239
[1250]	validation_0-rmse:0.

### Predict

In [32]:
Y_pred = model.predict(X_test)
score = rmsle(Y_test, Y_pred)
scoreBis = rmsle(Y_test, Y_pred, log=False)
print("RMSLE score:", score, " RMSLE without-log:", scoreBis)

RMSLE score: 0.0590733782667  RMSLE without-log: 0.399306653321


### Submission

In [33]:
dataSub = test_set.drop('id', axis=1).astype(float)
Y_eval_log = model.predict(dataSub)
Y_eval = np.exp(Y_eval_log.ravel())
submit_file = pd.DataFrame({'id': test_set['id'], 'trip_duration': Y_eval})
submit_file.to_csv('Data/submission.csv',index=False)