In [None]:
best_overall_model = best_models['XGBoost']
state_avg = []
state_std = []
state_score = []
for i in range(10):
    print(f'Runing Iteration {i}')
    random_state = np.random.randint(low=0, high=100)
    X_train, X_test, y_train, y_test = train_test_split(df_X_train,
                                                        df_y_train,
                                                        test_size = 0.3,
                                                        random_state = random_state)
    clf = sklearn.base.clone(best_overall_model['clf'])
    clf.random_state = random_state
    clf.nthread=6
    clf.fit(df_X_train, df_y_train)
    y_predict = clf.predict(X_test)
    
    state_score.append(kaggle_score(y_test, y_predict))
    y_predict = np.exp(y_predict) - 1
    state_avg.append(y_predict.mean())
    state_std.append(y_predict.std())

df_result_free = pd.DataFrame(
    {'state_score': state_score,
     'state_avg': state_avg,
     'state_std': state_std
    })

In [1]:
import pandas as pd
import numpy as np
import csv
import sklearn
from datetime import datetime, timedelta
import time
import xgboost as xgb
import scipy
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from sklearn import preprocessing



PATH_TRAIN_DATASET = './data/train.csv'
PATH_TEST_DATASET = './data/test.csv'
PATH_SAMPLE_SUMBISSION = './data/sample_submission.csv'

NYC_DEGREE_KM = 111.05938787411571
NYC_BOUNDING_BOX = [(40.4774,-74.2589), ( 40.9176, -73.7004)]

def calculate_city_block_distance(df_data):
    delta_lat = np.absolute(df_data.pickup_latitude - df_data.dropoff_latitude) * NYC_DEGREE_KM    
    delta_lon = np.absolute(df_data.pickup_longitude - df_data.dropoff_longitude) * NYC_DEGREE_KM    
    return delta_lat + delta_lon

def kaggle_score(y_true_exp, y_pred_exp):
    y_pred_exp = np.exp(y_pred_exp) - 1
    y_true_exp = np.exp(y_true_exp) - 1
    e_log_square = np.square( np.log(y_pred_exp + 1) - np.log(y_true_exp + 1))
    score = np.sqrt((1/len(y_true_exp)) * np.sum(e_log_square))
    return score

df_test = pd.read_csv(PATH_TEST_DATASET, infer_datetime_format=True, parse_dates=['pickup_datetime'],  index_col='id')
df_train = pd.read_csv(PATH_TRAIN_DATASET, infer_datetime_format=True,parse_dates=['pickup_datetime'], index_col='id')

df_train.drop('dropoff_datetime', axis=1, inplace=True)
df_train['pickup_datetime'] = df_train['pickup_datetime'].dt.to_pydatetime()
df_test['pickup_datetime'] = df_test['pickup_datetime'].dt.to_pydatetime()

Q1 = df_train['trip_duration'].quantile(0.25)
Q3 = df_train['trip_duration'].quantile(0.75)
IQR = Q3 - Q1
df_train = df_train[~((df_train['trip_duration'] < (Q1 - 1.5 * IQR)) |(df_train['trip_duration'] > (Q3 + 1.5 * IQR)))]

df_train = df_train[df_train['trip_duration'] > 1]
df_train = df_train[df_train['trip_duration'] < 7200]

filter_lat_long = df_train['pickup_latitude'] < NYC_BOUNDING_BOX[1][0]
filter_lat_long &= df_train['pickup_latitude'] > NYC_BOUNDING_BOX[0][0]
filter_lat_long &= df_train['pickup_longitude'] < NYC_BOUNDING_BOX[1][1]
filter_lat_long &= df_train['pickup_longitude'] > NYC_BOUNDING_BOX[0][1]

filter_lat_long &= df_train['dropoff_latitude'] < NYC_BOUNDING_BOX[1][0]
filter_lat_long &= df_train['dropoff_latitude'] > NYC_BOUNDING_BOX[0][0]
filter_lat_long &= df_train['dropoff_longitude'] < NYC_BOUNDING_BOX[1][1]
filter_lat_long &= df_train['dropoff_longitude'] > NYC_BOUNDING_BOX[0][1]


df_train['distance'] = calculate_city_block_distance(df_train)
df_train = df_train[df_train['distance'] > .1]
df_train['avg_speed'] = df_train['distance']/(df_train['trip_duration']/3600)
df_train = df_train[df_train['avg_speed'] < 100]
df_train = df_train[df_train['avg_speed'] > 1]

df_train.drop('avg_speed', axis=1, inplace=True)

df_train['pickup_date'] = df_train['pickup_datetime'].dt.date
df_train['pickup_hour'] = df_train['pickup_datetime'].dt.hour
df_train['pickup_weekday'] = df_train['pickup_datetime'].dt.day_name()

holidays = [day.date() for day in calendar().holidays(start=df_train['pickup_date'].min(), end=df_train['pickup_date'].max())]
df_train['holiday'] = df_train['pickup_date'].isin(holidays)
df_train.drop('pickup_date', axis=1, inplace=True)

df_train = df_train[df_train['passenger_count']>0]

df_train = pd.get_dummies(df_train, columns=['vendor_id', 'passenger_count', 
                                    'store_and_fwd_flag', 'pickup_weekday', 'pickup_hour', 'holiday'])

cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
df_train[cols] = df_train[cols].round(3)

df_train.drop(['pickup_datetime'], axis=1, inplace=True)

df_train['trip_duration'] = np.log(df_train['trip_duration'] + 1)
df_train['distance'] = np.log(df_train['distance'] + 1)

from sklearn.model_selection import train_test_split

df_y_train = df_train['trip_duration']
df_X_train = df_train.drop(columns=['trip_duration'])

X_train, X_test, y_train, y_test = train_test_split(df_X_train,
                                                    df_y_train,
                                                    test_size = 0.3,
                                                    random_state = 3)



## Training the Model

In [2]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
clf = model.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Estimating RMSLE on the test set with LinearRegression')
print(kaggle_score( y_test, y_pred))
0.4136468008174538

Estimating RMSLE on the test set with LinearRegression
0.4164895901671564


0.4136468008174538

In [3]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
clf = model.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Estimating RMSLE on the test set with LinearRegression')
print(kaggle_score( y_test, y_pred))

Estimating RMSLE on the test set with LinearRegression
0.4164895901671564


In [4]:
from sklearn.linear_model import Lasso
model = Lasso(random_state=3)
clf = model.fit(X_train, y_train)
print('Estimating RMSLE on the test set with Lasso')
print(kaggle_score( y_test, clf.predict(X_test) ) )

Estimating RMSLE on the test set with Lasso
0.6736416145430251


In [5]:
from sklearn.linear_model import Ridge
model = Ridge(random_state=3)
clf = model.fit(X_train, y_train)
print('Estimating RMSLE on the test set with Lasso')
print(kaggle_score( y_test, clf.predict(X_test) ) )

Estimating RMSLE on the test set with Lasso
0.41649015670802203


In [6]:
from sklearn.linear_model import ElasticNet
model = ElasticNet(random_state=3, l1_ratio=0.0000001)
clf = model.fit(X_train, y_train)
print('Estimating RMSLE on the test set with Lasso')
print(kaggle_score( y_test, clf.predict(X_test) ) )

Estimating RMSLE on the test set with Lasso
0.5824324330743528


In [7]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(random_state=3, max_depth=None, min_samples_split=2)
clf = model.fit(X_train, y_train)
print('Estimating RMSLE on the test set with Lasso')
print(kaggle_score( y_test, clf.predict(X_test) ) )


Estimating RMSLE on the test set with Lasso
0.46676467032269114


In [8]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=3)
clf = model.fit(X_train, y_train)
print('Estimating RMSLE on the test set with Lasso')
print(kaggle_score( y_test, clf.predict(X_test) ) )
0.45528085083412867



Estimating RMSLE on the test set with Lasso
0.34154495163660903


0.45528085083412867

In [9]:
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(random_state=3)
clf = model.fit(X_train, y_train)
print('Estimating RMSLE on the test set with Lasso')
print(kaggle_score( y_test, clf.predict(X_test) ) )
0.40792254444974163

Estimating RMSLE on the test set with Lasso
0.3923103353785279


0.40792254444974163

In [10]:
import xgboost as xgb
model = xgb.XGBRegressor(random_state=3)
clf = model.fit(X_train, y_train)
print('Estimating RMSLE on the test set with Lasso')
print(kaggle_score( y_test, clf.predict(X_test) ) )
0.4080742167350305

Estimating RMSLE on the test set with Lasso
0.39225595593255225


0.4080742167350305

In [11]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# greater_is_better= False: we want to minimize the root mean square logarithmic error
scorer = make_scorer(kaggle_score, greater_is_better= False)

model = Ridge(random_state=3)

parameters = {'alpha': [0.1, 5.0, 10.0, 50.0], 'solver': ['auto', 'lsqr', 'sag', 'svd']}
# parameters = {'alpha': [0.1], 'solver': ['sag']}

clf = GridSearchCV(model, param_grid= parameters, scoring= scorer, verbose= 10, n_jobs=-1, cv= 3)

grid_fit = clf.fit(X_train, y_train)

print(grid_fit.best_params_)
best_clf_ridge = grid_fit.best_estimator_

print(kaggle_score( y_test, best_clf_ridge.predict(X_test) ) )
0.41648924466344467

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   26.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   35.4s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   53.7s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  46 out of  48 | elapsed:  1.6min remaining:    4.1s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  1.6min finished


{'alpha': 0.1, 'solver': 'auto'}
0.416490182506801


0.41648924466344467

In [12]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

model = GradientBoostingRegressor(random_state=3)
scorer = make_scorer(kaggle_score, greater_is_better= False)



parameters = {
    'max_depth': [3, 5],
    'n_estimators': [100, 200],
    'min_samples_split': [2, 6],
    'learning_rate': [0.1, 1.0]
}

# parameters = {'learning_rate': [0.1], 'max_depth': [5], 'min_samples_split': [6], 'n_estimators': [200]}

clf = GridSearchCV(model, param_grid= parameters, scoring= scorer, verbose= 10, cv= 2, n_jobs=6)

grid_fit = clf.fit(X_train, y_train)

print(grid_fit.best_params_)
best_clf_gradient_boosting = grid_fit.best_estimator_

print(kaggle_score( y_test, best_clf_gradient_boosting.predict(X_test) ) )
0.35810574107314747

Fitting 2 folds for each of 16 candidates, totalling 32 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:  4.4min
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:  8.9min
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed: 29.4min
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed: 38.3min
[Parallel(n_jobs=6)]: Done  25 out of  32 | elapsed: 49.0min remaining: 13.7min
[Parallel(n_jobs=6)]: Done  29 out of  32 | elapsed: 58.2min remaining:  6.0min
[Parallel(n_jobs=6)]: Done  32 out of  32 | elapsed: 62.4min finished


{'learning_rate': 1.0, 'max_depth': 5, 'min_samples_split': 6, 'n_estimators': 200}
0.3325317765694744


0.35810574107314747

In [13]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# greater_is_better= False: we want to minimize the root mean square logarithmic error
scorer = make_scorer(kaggle_score, greater_is_better=False)

model = xgb.XGBRegressor(random_state=3)

parameters = {'max_depth': [5, 8, 10], 'n_estimators': [200, 300],
              'learning_rate': [0.05, 0.1,], 'reg_lambda': [1.0, 5] }
# parameters = {'learning_rate': [0.1], 'max_depth': [5], 'n_estimators': [300], 'reg_lambda': [5]}
clf = GridSearchCV(model, param_grid= parameters, scoring= scorer, verbose= 10, cv=2, n_jobs=6)

grid_fit = clf.fit(X_train, y_train)

print(grid_fit.best_params_)
best_clf_xgboost = grid_fit.best_estimator_

print(kaggle_score( y_test, best_clf_xgboost.predict(X_test) ) )
0.34973122433139087

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:  9.2min
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed: 14.1min
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed: 48.8min
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed: 71.8min
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed: 96.0min
[Parallel(n_jobs=6)]: Done  42 out of  48 | elapsed: 129.9min remaining: 18.6min
[Parallel(n_jobs=6)]: Done  48 out of  48 | elapsed: 150.3min finished


{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 300, 'reg_lambda': 5}
0.3140347959375159


0.34973122433139087

In [2]:
df_test = pd.read_csv(PATH_TEST_DATASET, infer_datetime_format=True, parse_dates=['pickup_datetime'],  index_col='id')
# df_test['pickup_datetime'] = df_test['pickup_datetime'].dt.to_pydatetime()
df_test['pickup_date'] = df_test['pickup_datetime'].dt.date
df_test['pickup_hour'] = df_test['pickup_datetime'].dt.hour
df_test['pickup_weekday'] = df_test['pickup_datetime'].dt.day_name()

holidays = [day.date() for day in calendar().holidays(start=df_test['pickup_date'].min(), end=df_test['pickup_date'].max())]
df_test['holiday'] = df_test['pickup_date'].isin(holidays)
df_test.drop('pickup_date', axis=1, inplace=True)
df_test['distance'] = calculate_city_block_distance(df_test)
df_test['distance'] = np.log(df_test['distance'] + 1)

df_test = pd.get_dummies(df_test, columns=['vendor_id', 'passenger_count', 
                                    'store_and_fwd_flag', 'pickup_weekday', 'pickup_hour', 'holiday'])
# df_test.drop(['pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'], axis=1, inplace=True)
df_test.drop(['passenger_count_0', 'passenger_count_9'], axis=1, inplace=True)
df_test.drop(['pickup_datetime'], axis=1, inplace=True)
cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
df_test[cols] = df_test[cols].round(3)


In [8]:
df_X_train.columns

Index(['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'store_and_fwd_flag'],
      dtype='object')

In [13]:
df_train = pd.read_csv(PATH_TRAIN_DATASET, infer_datetime_format=True,parse_dates=['pickup_datetime'], index_col='id')
df_test = pd.read_csv(PATH_TEST_DATASET, infer_datetime_format=True, parse_dates=['pickup_datetime'],  index_col='id')

df_X_train.drop(['pickup_datetime', 'dropoff_datetime'], axis=1, inplace=True)
df_test.drop(['pickup_datetime', 'dropoff_datetime'], axis=1, inplace=True)

df_y_train = df_train['trip_duration']
df_X_train = df_train.drop(columns=['trip_duration'])

from sklearn.linear_model import LinearRegression
model = LinearRegression()
clf = model.fit(df_X_train, df_y_train)
y_predict = clf.predict(df_test)
y_predict = np.exp(y_predict) - 1
df_result = pd.DataFrame(y_predict, columns=['trip_duration'], index=df_test.index.values)
create_txt_file_for_submission(df_result, 'LinearRegression')   

KeyError: "['dropoff_datetime'] not found in axis"

In [4]:
import os
BASE_PATH_KAGGLE_SUBMISISON = './out'

def create_txt_file_for_submission(df_data, file_name):
    
    final_path = os.path.abspath(os.path.join(BASE_PATH_KAGGLE_SUBMISISON,file_name))
    final_path += '.txt'
    df_data.index.name = 'id'
    print(final_path)
    df_data.to_csv(final_path,
                   sep=',',
                   header=True,
                   na_rep=df_data['trip_duration'].quantile(0.5)
                  )
    
final_clf = sklearn.base.clone(best_clf_xgboost)
final_clf.fit(df_X_train, df_y_train)
model_name='xgboost_latlong'
y_predict = final_clf.predict(df_test)
y_predict = np.exp(y_predict) - 1
df_result = pd.DataFrame(y_predict, columns=['trip_duration'], index=df_test.index.values)
create_txt_file_for_submission(df_result, model_name)   

NameError: name 'best_clf_xgboost' is not defined

Final: 0.42976