In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV

Read the data saved after the data cleanup.

In [2]:
# Load the cleaned training set, then separate the features from the target column.
df = pd.read_csv('train_clean.csv')
df_X = df.drop(columns=['fare_amount'])
df_y = df['fare_amount']
df_X.shape

(4837151, 11)

In [3]:
# Drop the reference to the full frame now that it has been split into df_X and df_y.
del df

In [4]:
# Preview the first rows of the feature frame.
df_X.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,pickup_day,pickup_hour,pickup_month,pickup_year,pickup_day_of_week
0,-73.844311,40.721319,-73.84161,40.712278,1,0.640487,15,17,6,2009,0
1,-74.016048,40.711303,-73.979268,40.782004,1,5.250668,5,16,1,2010,1
2,-73.982738,40.76127,-73.991242,40.750562,2,0.863411,18,0,8,2011,3
3,-73.98713,40.733143,-73.991567,40.758092,1,1.739385,21,4,4,2012,5
4,-73.968095,40.768008,-73.956655,40.783762,1,1.242218,9,7,3,2010,1


In [5]:
def rmse(y, y_pred):
    """Return the root mean squared error between `y` and the predictions `y_pred`."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

In [6]:
# Hold out 33% of the rows as the test set; the fixed seed keeps the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    df_X,
    df_y,
    test_size=0.33,
    random_state=42,
)

In [8]:
def gridSearch(clf, parameters):
    """Tune `clf` with a 3-fold grid search over `parameters` and return the best estimator.

    The search is fitted on the notebook-level `train_X` / `train_y` and scores
    candidates with `rmse`. The scorer is created with `greater_is_better=False`
    because a lower RMSE is better. The selected estimator is printed before it
    is returned.
    """
    neg_rmse = make_scorer(rmse, greater_is_better=False)

    search = GridSearchCV(clf, parameters, cv=3, scoring=neg_rmse, n_jobs=-1)
    search.fit(train_X, train_y)

    winner = search.best_estimator_

    # Show which hyperparameters ended up in the selected model.
    print("Best Model obtained after GridSearch :")
    print(winner)

    return winner


In [9]:
def evaluate_model(clf):
    """Fit `clf` on the training split and print its RMSE on the test and train splits.

    Uses the notebook-level `train_X`, `train_y`, `test_X` and `test_y`.
    The estimator is fitted in place and nothing is returned.
    """
    clf.fit(train_X, train_y)

    # The test score is reported first, then the train score.
    test_rmse = rmse(test_y, clf.predict(test_X))
    print(f'RMSE (Test Data) : {test_rmse}')

    train_rmse = rmse(train_y, clf.predict(train_X))
    print(f'RMSE (Train Data) : {train_rmse}')

### Decision Tree Regressor

First I will run the regressor by using the default parameters. Then I will run GridSearchCV to tune the hyperparameters.

In [8]:
from sklearn.tree import DecisionTreeRegressor

# Baseline: a decision tree with default hyperparameters (only the seed is fixed).
regressor = DecisionTreeRegressor(random_state=42)
print('Without Parameter Tuning :')
evaluate_model(regressor)

# Grid over the tree depth and the leaf / split size limits.
parameters = dict(
    max_depth=[10, 20, 50],
    min_samples_leaf=[10, 20, 50],
    min_samples_split=[20, 50, 100],
)

# Keep the tree selected by the grid search and report its test / train RMSE.
regressor = gridSearch(regressor, parameters)
evaluate_model(regressor)

Without Parameter Tuning :
RMSE (Test Data) : 4.7361202406526015
RMSE (Train Data) : 2.931870123872723e-15
Best Model obtained after GridSearch :
DecisionTreeRegressor(criterion='mse', max_depth=20, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=20,
           min_samples_split=100, min_weight_fraction_leaf=0.0,
           presort=False, random_state=42, splitter='best')
RMSE (Test Data) : 3.492933228722207
RMSE (Train Data) : 3.2505450748737843


### Random Forest Regressor

Let's try Random Forest Regressor now.

In [9]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Baseline: a 100-tree forest with otherwise default hyperparameters.
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
print('Without Parameter Tuning :')
evaluate_model(regressor)

# Grid over the tree depth and the minimum number of samples needed to split a node.
parameters = dict(
    max_depth=[20, 50],
    min_samples_split=[5, 10, 15],
)

# Keep the forest selected by the grid search and report its test / train RMSE.
regressor = gridSearch(regressor, parameters)
evaluate_model(regressor)

Without Parameter Tuning :
RMSE (Test Data) : 3.309397942589038
RMSE (Train Data) : 1.2308374489316625
Best Model obtained after GridSearch :
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=15,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)
RMSE (Test Data) : 3.2941817505013753
RMSE (Train Data) : 2.139537877157225


The tuned Random Forest Regressor gives a slight improvement over the tuned Decision Tree (test RMSE of about 3.29 versus 3.49).

### Support Vector Regressor

In [10]:
#Support Vector Regressor
from warnings import catch_warnings, filterwarnings

from sklearn.exceptions import ConvergenceWarning
from sklearn.svm import SVR

regressor = SVR(gamma='scale', max_iter=10000, cache_size=50000)

# `degree` is only used by the 'poly' kernel, so it is searched for that kernel
# alone. Crossing it with the other kernels would refit the same model once per
# degree value without changing the set of distinct candidates.
parameters = [
    {'kernel': ['linear', 'rbf', 'sigmoid'], 'epsilon': [0.1, 0.5, 1.5]},
    {'kernel': ['poly'], 'degree': [1, 2, 3, 4], 'epsilon': [0.1, 0.5, 1.5]},
]

# Ignore the ConvergenceWarning: Solver terminated early (max_iter=10000).
# Consider pre-processing your data with StandardScaler or MinMaxScaler.
# Only that category is silenced, and catch_warnings() puts the previous warning
# filters back when the block exits (even if a fit raises).
# NOTE(review): the features are not scaled here, which is what the suppressed
# warning points at -- worth trying a scaler before trusting these scores.
with catch_warnings():
    filterwarnings('ignore', category=ConvergenceWarning)

    print('Without Parameter Tuning :')
    evaluate_model(regressor)

    #Tune the hyperparameters using Grid Search
    regressor = gridSearch(regressor, parameters)
    evaluate_model(regressor)

Without Parameter Tuning :
RMSE (Test Data) : 19.82496475969781
RMSE (Train Data) : 19.829560778709116
Best Model obtained after GridSearch :
SVR(C=1.0, cache_size=50000, coef0=0.0, degree=4, epsilon=1.5, gamma='scale',
   kernel='poly', max_iter=10000, shrinking=True, tol=0.001, verbose=False)
RMSE (Test Data) : 19.73822191721031
RMSE (Train Data) :  19.74277713553644


### Ada Boost Regressor

In [11]:
from sklearn.ensemble import AdaBoostRegressor

# Baseline AdaBoost: up to 1000 boosting rounds with the default base estimator.
regressor = AdaBoostRegressor(n_estimators=1000, random_state=42)
print('Without Parameter Tuning :')
evaluate_model(regressor)

Without Parameter Tuning :
RMSE (Test Data) : 10.218112425602444
RMSE (Train Data) : 10.222883987432152


Now, I will try using the best DecisionTreeRegressor obtained from GridSearchCV as the base estimator.

In [11]:
# Rebuild the tree selected by the Decision Tree grid search. Only the tuned
# hyperparameters and the seed are passed; everything else keeps its default, so
# the cell does not pin keyword arguments copied from an old estimator repr.
base_reg = DecisionTreeRegressor(max_depth=20, min_samples_leaf=20,
                                 min_samples_split=100, random_state=42)

# The base estimator is the first positional parameter of AdaBoostRegressor, so
# passing it positionally does not depend on the keyword's name.
regressor = AdaBoostRegressor(base_reg, n_estimators=100, random_state=42)
print('After Changing Base Estimator :')
evaluate_model(regressor)

After Changing Base Estimator :
RMSE (Test Data) : 4.575925746791277
RMSE (Train Data) : 3.9494954181721362
