## 3 Grid Search

### 3.0 Preparation

##### Library Import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt

In [2]:
from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import (
    AdaBoostRegressor,
    BaggingRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler

In [3]:
from day_18_challenge_pipeline_classes import ConvertStringDateToYear, ConvertZeroToN, MergeColumns, ColumnSelector
from day_18_challenge_pipeline_classes import compare_predictions

In [4]:
import warnings
# Silence every warning for the rest of the session (added because seaborn is noisy).
# NOTE(review): no category/module filter is given, so warnings from all other
# libraries are hidden as well.
warnings.filterwarnings('ignore') # seaborn shows a lot of ugly warnings, let's suppress this for the analysis part

##### Data Import

In [5]:
# Load the training data and keep only rows that have both a PRICE and a SALEDATE.
df = pd.read_csv('data/dc_housing/DC_Properties_training.csv', index_col=0, low_memory=False)
df = df.loc[~np.isnan(df['PRICE'])]
df = df.loc[df['SALEDATE'].notnull()]

# Features are every column except the target; the target stays a one-column DataFrame.
x_train = df.drop(columns='PRICE')
y_train = df[['PRICE']]

In [None]:
# Load the hold-out set and drop rows whose PRICE is missing.
# NOTE(review): unlike the training set, rows with a missing SALEDATE are not removed here.
df_holdout = pd.read_csv('data/dc_housing/holdout_test_data.csv', index_col=0, low_memory=False)
df_holdout = df_holdout.loc[~np.isnan(df_holdout['PRICE'])]

# Same feature/target split as for the training data.
x_test = df_holdout.drop(columns='PRICE')
y_test = df_holdout[['PRICE']]

In [None]:
# Columns grouped by the kind of preprocessing they receive in the pipeline.
cols_num = [
    'BATHRM', 'HF_BATHRM', 'ROOMS', 'BEDRM', 'FIREPLACES',
    'YEAR', 'EYB', 'GBA', 'LANDAREA',
]
cols_ord = ['GRADE']
cols_cat = ['AC', 'SOURCE']

# Every column used by the model: numeric first, then ordinal, then categorical.
cols_all = [*cols_num, *cols_ord, *cols_cat]

##### Processing Pipeline Definition

In [8]:
# One branch per column group; each branch selects its columns and applies a single transformer.
numeric_branch = make_pipeline(ColumnSelector(cols_num), StandardScaler())
ordinal_branch = make_pipeline(ColumnSelector(cols_ord), OrdinalEncoder())
categorical_branch = make_pipeline(ColumnSelector(cols_cat), OneHotEncoder())

# Full preprocessing: the custom column transformers run first, then the frame is
# narrowed to cols_all, and finally the three branches are combined by a feature union.
processing_pipeline = make_pipeline(
    # TODO: RowNanChecker(my_column) -- to be implemented if time allows
    MergeColumns('GBA', 'LIVING_GBA'),
    ConvertZeroToN('AC'),
    ConvertStringDateToYear('SALEDATE'),
    ColumnSelector(cols_all),  # the union branches select from these columns
    make_union(numeric_branch, ordinal_branch, categorical_branch),
)

### 3.1 Random Forest

In [None]:
# Baseline random forest with default hyper-parameters on top of the shared preprocessing.
pipeline_rf = make_pipeline(processing_pipeline, RandomForestRegressor())
pipeline_rf.fit(x_train, y_train)

In [None]:
# Candidate hyper-parameter values for the random forest grid search.
n_estimators = [100, 500]   # number of trees in the forest
max_features = [5, 10]      # number of features considered at every split
# Maximum tree depth: the two end points of [5, 110], plus None as a third candidate.
max_depth = [int(depth) for depth in np.linspace(5, 110, num=2)] + [None]

# Keys follow the pipeline convention "<step name>__<parameter name>".
grid_rf = {
    'randomforestregressor__n_estimators': n_estimators,
    'randomforestregressor__max_features': max_features,
    'randomforestregressor__max_depth': max_depth,
}

In [None]:
# Cross-validated grid search over grid_rf, scored by negative mean absolute error.
# The search must be constructed here: without it `clf` is undefined on a fresh kernel.
# It wraps pipeline_rf, the random forest pipeline whose step names match the grid_rf keys.
clf = GridSearchCV(pipeline_rf, grid_rf, n_jobs=1, verbose=True, scoring='neg_mean_absolute_error')
clf.fit(x_train, y_train)

# Hold-out predictions of the fitted search, as a named Series.
clf_preds = pd.Series(clf.predict(x_test), name="Grid Search Predicted values")

In [None]:
# Show which result fields the random forest grid search recorded.
clf.cv_results_.keys()

In [None]:
# NOTE: despite its name, `best_params` holds the "mean_test_score" entry of
# cv_results_, not the parameter values themselves.
best_params = clf.cv_results_["mean_test_score"]
best_params[:10]

In [None]:
# Compare predictions on the training data against the mean training price.
# The name `pipeline` is never defined in this notebook; the fitted random forest
# pipeline is `pipeline_rf`.
# NOTE(review): compare_predictions is imported from day_18_challenge_pipeline_classes;
# its third argument is assumed to be the fitted model -- confirm against that module.
pred_train = compare_predictions(x_train, y_train, pipeline_rf, y_train['PRICE'].mean())

### 3.2 Gradient Boosting

In [None]:
# Baseline gradient boosting model with default hyper-parameters on top of the shared preprocessing.
pipeline_gb = make_pipeline(processing_pipeline, GradientBoostingRegressor())
pipeline_gb.fit(x_train, y_train)

In [None]:
# Grid search preparation for GradientBoostingRegressor.
#
# Constructor parameters and defaults, as noted when this notebook was written
# (NOTE(review): some of these, e.g. loss='ls', min_impurity_split and presort,
# may not exist in newer scikit-learn releases -- confirm against the installed version):
#   loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
#   criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
#   min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
#   min_impurity_split=None, init=None, random_state=None, max_features=None,
#   alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False,
#   presort='auto', validation_fraction=0.1, n_iter_no_change=None, tol=0.0001

# Candidate values for the parameters that are actually searched.
learning_rate = [0.01, 0.04, 0.1]
n_estimators = [100, 500, 1000]
min_samples_split = [2, 3, 4]
min_samples_leaf = [1, 2]
max_depth = [3, 5, 6]

# Keys follow the pipeline convention "<step name>__<parameter name>".
grid_gb = {
    'gradientboostingregressor__learning_rate': learning_rate,
    'gradientboostingregressor__n_estimators': n_estimators,
    'gradientboostingregressor__min_samples_split': min_samples_split,
    'gradientboostingregressor__min_samples_leaf': min_samples_leaf,
    'gradientboostingregressor__max_depth': max_depth,
}

In [None]:
# Cross-validated grid search over grid_gb, scored by R^2.
# The search must be constructed here; otherwise `clf` would still refer to a
# previously created search (or be undefined on a fresh kernel).
clf = GridSearchCV(pipeline_gb, grid_gb, n_jobs=1, verbose=True, scoring='r2')
clf.fit(x_train, y_train)

# Hold-out predictions of the fitted search, as a named Series.
clf_preds = pd.Series(clf.predict(x_test), name="Grid Search Predicted values")

In [None]:
# Show which result fields the gradient boosting grid search recorded.
clf.cv_results_.keys()

In [None]:
# NOTE: despite its name, `best_params` holds the "mean_test_score" entry of
# cv_results_, not the parameter values themselves.
best_params = clf.cv_results_["mean_test_score"]
best_params[:10]

### 3.3 AdaBoost

In [None]:
# Baseline AdaBoost model with default hyper-parameters on top of the shared preprocessing.
pipeline_ab = make_pipeline(processing_pipeline, AdaBoostRegressor())
pipeline_ab.fit(x_train, y_train)

In [None]:
# Preparing grid search
# Supported parameters for AdaBoostRegressor
# ---------------------
# estimator=None        (named base_estimator in older scikit-learn releases)
# n_estimators=50
# learning_rate=1.0
# loss='linear'         (one of 'linear', 'square', 'exponential')
# random_state=None
#
# Tree parameters such as min_samples_split, min_samples_leaf and max_depth are
# NOT parameters of AdaBoostRegressor itself, so they must not appear in this
# grid: GridSearchCV rejects parameter names the estimator does not have.

learning_rate = [0.01, 0.04, 0.1]
n_estimators = [100, 500, 1000]
loss = ['linear', 'square', 'exponential']

# Keys follow the pipeline convention "<step name>__<parameter name>".
grid_ab = {'adaboostregressor__learning_rate': learning_rate,
           'adaboostregressor__n_estimators': n_estimators,
           'adaboostregressor__loss': loss}

In [None]:
# Cross-validated grid search over grid_ab, scored by R^2.
# The search must be constructed here; otherwise `clf` would still refer to a
# previously created search (or be undefined on a fresh kernel).
clf = GridSearchCV(pipeline_ab, grid_ab, n_jobs=1, verbose=True, scoring='r2')
clf.fit(x_train, y_train)

# Hold-out predictions of the fitted search, as a named Series.
clf_preds = pd.Series(clf.predict(x_test), name="Grid Search Predicted values")

In [None]:
# Show which result fields the AdaBoost grid search recorded.
clf.cv_results_.keys()

In [None]:
# NOTE: despite its name, `best_params` holds the "mean_test_score" entry of
# cv_results_, not the parameter values themselves.
best_params = clf.cv_results_["mean_test_score"]
best_params[:10]

### 3.4 Bagging

In [None]:
# Baseline bagging model with default hyper-parameters on top of the shared preprocessing.
pipeline_ba = make_pipeline(processing_pipeline, BaggingRegressor())
pipeline_ba.fit(x_train, y_train)

In [None]:
# Preparing grid search
# Supported parameters for BaggingRegressor
# ---------------------
# estimator=None        (named base_estimator in older scikit-learn releases)
# n_estimators=10
# max_samples=1.0
# max_features=1.0
# bootstrap=True
# bootstrap_features=False
# oob_score=False
# warm_start=False
# n_jobs=None
# random_state=None
# verbose=0
#
# learning_rate, min_samples_split, min_samples_leaf and max_depth are NOT
# parameters of BaggingRegressor itself, so they must not appear in this grid:
# GridSearchCV rejects parameter names the estimator does not have.

n_estimators = [100, 500, 1000]
max_samples = [0.5, 0.75, 1.0]    # fraction of the training rows drawn for each base estimator
max_features = [0.5, 0.75, 1.0]   # fraction of the features drawn for each base estimator

# Keys follow the pipeline convention "<step name>__<parameter name>".
grid_ba = {'baggingregressor__n_estimators': n_estimators,
           'baggingregressor__max_samples': max_samples,
           'baggingregressor__max_features': max_features}

In [None]:
# Cross-validated grid search over grid_ba, scored by R^2.
# The search must be constructed here, and it must use the bagging pipeline and
# grid (pipeline_ba / grid_ba) rather than the AdaBoost ones named in the
# previously commented-out line.
clf = GridSearchCV(pipeline_ba, grid_ba, n_jobs=1, verbose=True, scoring='r2')
clf.fit(x_train, y_train)

# Hold-out predictions of the fitted search, as a named Series.
clf_preds = pd.Series(clf.predict(x_test), name="Grid Search Predicted values")

In [None]:
# Show which result fields the bagging grid search recorded.
clf.cv_results_.keys()

In [None]:
# NOTE: despite its name, `best_params` holds the "mean_test_score" entry of
# cv_results_, not the parameter values themselves.
best_params = clf.cv_results_["mean_test_score"]
best_params[:10]