In [1]:
import pandas as pd
import numpy as np
import csv
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import time
import xgboost as xgb
import scipy
from geopy import distance
import geopy
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from sklearn import preprocessing


%matplotlib inline
sns.set()
plt.rcParams['figure.figsize'] = [16, 10]

#### Read Data

In [2]:
NYC_DEGREE_KM = 111.05938787411571

def calculate_city_block_distance(df_data):
    delta_lat = np.absolute(df_data.pickup_latitude - df_data.dropoff_latitude) * NYC_DEGREE_KM    
    delta_lon = np.absolute(df_data.pickup_longitude - df_data.dropoff_longitude) * NYC_DEGREE_KM    
    return delta_lat + delta_lon

In [3]:
PATH_TRAIN_DATASET = './data/train.csv'
PATH_TEST_DATASET = './data/test.csv'
PATH_SAMPLE_SUMBISSION = './data/sample_submission.csv'

In [4]:
try:
    df_train.head()
except:
    df_test = pd.read_csv(PATH_TEST_DATASET, infer_datetime_format=True, parse_dates=['pickup_datetime'],  index_col='id')
    df_train = pd.read_csv(PATH_TRAIN_DATASET, infer_datetime_format=True,parse_dates=['pickup_datetime'], index_col='id')
    df_sample_submission = pd.read_csv(PATH_SAMPLE_SUMBISSION)
    
    df_train.drop('dropoff_datetime', axis=1, inplace=True)
    df_train['pickup_datetime'] = df_train['pickup_datetime'].dt.to_pydatetime()
    df_test['pickup_datetime'] = df_test['pickup_datetime'].dt.to_pydatetime()
    
    df_train['pickup_date'] = df_train['pickup_datetime'].dt.date
    df_train['pickup_hour'] = df_train['pickup_datetime'].dt.hour
    df_train['pickup_weekday'] = df_train['pickup_datetime'].dt.day_name()

    holidays = [day.date() for day in calendar().holidays(start=df_train['pickup_date'].min(), end=df_train['pickup_date'].max())]
    df_train['holiday'] = df_train['pickup_date'].isin(holidays)
    df_train.drop('pickup_date', axis=1, inplace=True)
    
    df_train['distance'] = calculate_city_block_distance(df_train)
    df_train['avg_speed'] = df_train['distance']/(df_train['trip_duration']/3600)
    df_original_train = df_train.copy()
    df_original_test = df_test.copy()
finally:
    df_train = df_original_train.copy()
    df_test = df_original_test.copy()
    pass

In [5]:
Q1 = df_train['trip_duration'].quantile(0.25)
Q3 = df_train['trip_duration'].quantile(0.75)
IQR = Q3 - Q1
df_train = df_train[~((df_train['trip_duration'] < (Q1 - 1.5 * IQR)) |(df_train['trip_duration'] > (Q3 + 1.5 * IQR)))]
df_train['trip_duration'].describe()

count    1.384424e+06
mean     7.317026e+02
std      4.481081e+02
min      1.000000e+00
25%      3.840000e+02
50%      6.320000e+02
75%      9.910000e+02
max      2.092000e+03
Name: trip_duration, dtype: float64

In [6]:
NYC_BOUNDING_BOX = [(40.4774,-74.2589), ( 40.9176, -73.7004)]

filter_lat_long = df_train['pickup_latitude'] < NYC_BOUNDING_BOX[1][0]
filter_lat_long &= df_train['pickup_latitude'] > NYC_BOUNDING_BOX[0][0]
filter_lat_long &= df_train['pickup_longitude'] < NYC_BOUNDING_BOX[1][1]
filter_lat_long &= df_train['pickup_longitude'] > NYC_BOUNDING_BOX[0][1]

filter_lat_long &= df_train['dropoff_latitude'] < NYC_BOUNDING_BOX[1][0]
filter_lat_long &= df_train['dropoff_latitude'] > NYC_BOUNDING_BOX[0][0]
filter_lat_long &= df_train['dropoff_longitude'] < NYC_BOUNDING_BOX[1][1]
filter_lat_long &= df_train['dropoff_longitude'] > NYC_BOUNDING_BOX[0][1]

df_train = df_train[filter_lat_long]

In [7]:
df_train = df_train[df_train['avg_speed'] < 100]
df_train = df_train[df_train['avg_speed'] > 1]
df_train = df_train[df_train['distance'] > .25]

df_train = df_train[df_train['trip_duration'] > 1]
df_train = df_train[df_train['trip_duration'] < 7200]

df_train.drop('avg_speed', axis=1, inplace=True)

In [8]:
df_train = df_train[df_train['passenger_count'] > 0]

In [9]:
df_train = pd.get_dummies(df_train, columns=['vendor_id', 'passenger_count', 
                                    'store_and_fwd_flag', 'pickup_weekday', 'pickup_hour', 'holiday'])
df_train.drop(['pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'], axis=1, inplace=True)

In [14]:
df_train['trip_duration'] = np.log(df_train['trip_duration'] + 1)
df_train['distance'] = np.log(df_train['distance'] + 1)

In [15]:
def kaggle_score(y_true, y_pred):
    y_pred_exp = np.exp(y_pred) - 1
    y_true_exp = np.exp(y_true) - 1
    e_log_square = np.square( np.log(y_pred_exp + 1) - np.log( y_true_exp + 1) )
    score = np.sqrt( (1/len(y_true_exp)) * np.sum(e_log_square) )
    return score


In [16]:
from sklearn.model_selection import train_test_split

df_y_train = df_train['trip_duration']
df_X_train = df_train.drop(columns=['trip_duration'])

X_train, X_test, y_train, y_test = train_test_split(df_X_train,
                                                        df_y_train,
                                                        test_size = 0.3,
                                                        random_state = 3)

## Training the Model

In [18]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
clf = model.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Estimating RMSLE on the test set with LinearRegression')
print(kaggle_score( y_test, y_pred))

Estimating RMSLE on the test set with LinearRegression
0.42544939377656404


In [20]:
from sklearn.linear_model import Lasso
model = Lasso(random_state=3)
clf = model.fit(X_train, y_train)
print('Estimating RMSLE on the test set with Lasso')
print(kaggle_score( y_test, clf.predict(X_test) ) )

Estimating RMSLE on the test set with Lasso
0.6674914761366839


In [21]:
from sklearn.linear_model import Ridge
model = Ridge(random_state=3)
clf = model.fit(X_train, y_train)
print('Estimating RMSLE on the test set with Lasso')
print(kaggle_score( y_test, clf.predict(X_test) ) )

Estimating RMSLE on the test set with Lasso
0.4254494827225051


In [22]:
from sklearn.linear_model import ElasticNet
model = ElasticNet(random_state=3, l1_ratio=0.0000001)
clf = model.fit(X_train, y_train)
print('Estimating RMSLE on the test set with Lasso')
print(kaggle_score( y_test, clf.predict(X_test) ) )

Estimating RMSLE on the test set with Lasso
0.5780364160563632


In [23]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(random_state=3, max_depth=None, min_samples_split=2)
clf = model.fit(X_train, y_train)
print('Estimating RMSLE on the test set with Lasso')
print(kaggle_score( y_test, clf.predict(X_test) ) )


Estimating RMSLE on the test set with Lasso
0.5661372918817497


In [24]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=3)
clf = model.fit(X_train, y_train)
print('Estimating RMSLE on the test set with Lasso')
print(kaggle_score( y_test, clf.predict(X_test) ) )




Estimating RMSLE on the test set with Lasso
0.45528085083412867


In [25]:
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(random_state=3)
clf = model.fit(X_train, y_train)
print('Estimating RMSLE on the test set with Lasso')
print(kaggle_score( y_test, clf.predict(X_test) ) )

Estimating RMSLE on the test set with Lasso
0.40792254444974163


In [26]:
import xgboost as xgb
model = xgb.XGBRegressor(random_state=3)
clf = model.fit(X_train, y_train)
print('Estimating RMSLE on the test set with Lasso')
print(kaggle_score( y_test, clf.predict(X_test) ) )

Estimating RMSLE on the test set with Lasso
0.4080742167350305


In [28]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# greater_is_better= False: we want to minimize the root mean square logarithmic error
scorer = make_scorer(kaggle_score, greater_is_better= False)

model = Ridge(random_state=3)

parameters = {'alpha': [0.1, 5.0, 10.0, 50.0], 'solver': ['auto', 'lsqr', 'sag', 'svd']}

clf = GridSearchCV(model, param_grid= parameters, scoring= scorer, verbose= 10, n_jobs=-1, cv= 3)

grid_fit = clf.fit(X_train, y_train)

print(grid_fit.best_params_)
best_clf_ridge = grid_fit.best_estimator_

print(kaggle_score( y_test, best_clf_ridge.predict(X_test) ) )

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  46 out of  48 | elapsed:  1.5min remaining:    4.0s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  1.6min finished


{'alpha': 50.0, 'solver': 'sag'}
0.4254512103471298


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

model = RandomForestRegressor(random_state=3)
scorer = make_scorer(kaggle_score, greater_is_better= False)

parameters = {'max_depth': [50, 80], 'n_estimators': [150, 250], 'min_samples_leaf': [1, 2, 3],
              'min_samples_split': [2, 3], 'max_features': ['auto']}


clf = GridSearchCV(model, param_grid= parameters, scoring= scorer, verbose= 10, cv=2, n_jobs=1)

grid_fit = clf.fit(X_train, y_train)

print(grid_fit.best_params_)
best_clf_random_forest = grid_fit.best_estimator_

print(kaggle_score( y_test, best_clf_random_forest.predict(X_test) ) )

Fitting 2 folds for each of 24 candidates, totalling 48 fits
[CV] max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=150 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/falkets/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-16-ce1bd4544dc6>", line 14, in <module>
    grid_fit = clf.fit(X_train, y_train)
  File "/home/falkets/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py", line 722, in fit
    self._run_search(evaluate_candidates)
  File "/home/falkets/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py", line 1191, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
  File "/home/falkets/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py", line 711, in evaluate_candidates
    cv.split(X, y, groups)))
  File "/home/falkets/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py", line 917, in __call__
    if self.dispatch_one_batch(iterator):
  File "/home/falkets/anaconda3/

KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

model = GradientBoostingRegressor(random_state=3)
scorer = make_scorer(kaggle_score, greater_is_better= False)



parameters = {
    'max_depth': [3, 5],
    'n_estimators': [100, 200],
    'min_samples_split': [2, 6],
    'learning_rate': [0.1, 1.0]
}

clf = GridSearchCV(model, param_grid= parameters, scoring= scorer, verbose= 10, cv= 2, n_jobs=6)

grid_fit = clf.fit(X_train, y_train)

print(grid_fit.best_params_)
best_clf_gradient_boosting = grid_fit.best_estimator_

print(kaggle_score( y_test, best_clf_gradient_boosting.predict(X_test) ) )

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# greater_is_better= False: we want to minimize the root mean square logarithmic error
scorer = make_scorer(kaggle_score, greater_is_better=False)

model = xgb.XGBRegressor(random_state=3)

parameters = {'max_depth': [5, 8, 10], 'n_estimators': [200, 300],
              'learning_rate': [0.05, 0.1,], 'reg_lambda': [1.0, 5] }

clf = GridSearchCV(model, param_grid= parameters, scoring= scorer, verbose= 10, cv=2, n_jobs=6)

grid_fit = clf.fit(X_train, y_train)

print(grid_fit.best_params_)
best_clf_xgboost = grid_fit.best_estimator_

print(kaggle_score( y_test, best_clf_xgboost.predict(X_test) ) )