# Advanced Pipelines with Grid Search (regression)
Rather than using pipelines to evaluate the models with defaults and THEN performing your grid search - why don't we try to do everything at once?!

# Import Modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
#from sklearn.externals import joblib
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load dataset
# We use gdown to download the Boston Housing dataset from Google Drive by its file ID.
# https://drive.google.com/file/d/1a0aNGSFWB-pf5ut1NsjE5ECIsbHHoAwI/view?usp=sharing
!gdown 1a0aNGSFWB-pf5ut1NsjE5ECIsbHHoAwI

# The download is saved locally as 'BostonHousing.csv'
# (if you are running in Colab, it appears in the file browser on the left).

Downloading...
From: https://drive.google.com/uc?id=1a0aNGSFWB-pf5ut1NsjE5ECIsbHHoAwI
To: /content/BostonHousing.csv
100% 35.2k/35.2k [00:00<00:00, 42.2MB/s]


In [None]:
# Read the downloaded CSV into a DataFrame, then inspect it:
# dtypes and non-null counts first, the first five rows as the cell's output.
df = pd.read_csv(filepath_or_buffer='BostonHousing.csv')
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


## Split Data (train/test)

In [None]:
# Hold out part of df for validation.
# 'medv' is the target variable; every remaining column is a covariate.
y = df['medv']
X = df.drop(columns='medv')

seed = 123  # fixed, so every run splits the same way and evaluates the SAME rows
validation_size = 0.20  # fraction of rows held out as the test set

# split!
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=validation_size, random_state=seed
)

## Build Pipeline

In [None]:
def _build_pipeline(regressor, use_pca=False):
    """Return a Pipeline: StandardScaler -> (optional PCA(0.95)) -> regressor.

    The step names ('scl', 'pca', 'clf') are what the grid-search parameter
    keys refer to (e.g. 'clf__max_depth'), so they must not be renamed here
    without updating the parameter grids as well.
    """
    steps = [('scl', StandardScaler())]
    if use_pca:
        steps.append(('pca', PCA(0.95)))
    steps.append(('clf', regressor))
    return Pipeline(steps)


# One pipeline per (model, with/without PCA) combination.
# Each call receives a fresh regressor instance, so no estimator is shared.
pipe_dt = _build_pipeline(DecisionTreeRegressor(random_state=42))
pipe_dt_pca = _build_pipeline(DecisionTreeRegressor(random_state=42), use_pca=True)
pipe_rf = _build_pipeline(RandomForestRegressor(random_state=42))
pipe_rf_pca = _build_pipeline(RandomForestRegressor(random_state=42), use_pca=True)


## Define your Parameters for Grid Search

In [None]:
# Candidate values shared by all tuned hyperparameters
param_range = [5, 10, 15, 25]


def _tree_param_grid(values):
    """Return the parameter grid used for both tree-based pipelines.

    Keys use the 'clf__' prefix (step name + two underscores) because the
    model step in every pipeline is named 'clf'. If that step is renamed,
    these keys must be renamed to match.
    """
    return [{
        'clf__min_samples_leaf': values,
        'clf__max_depth': values,
        # everything except the first value
        'clf__min_samples_split': values[1:],
    }]


# The decision tree and the random forest are searched over the same grid;
# each gets its own dict object.
grid_params_dt = _tree_param_grid(param_range)
grid_params_rf = _tree_param_grid(param_range)

## Define your Grid Search

In [None]:
def _make_search(pipeline, param_grid):
    """Wrap a pipeline in a 10-fold GridSearchCV scored by negated median absolute error."""
    return GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='neg_median_absolute_error',
        cv=10,
    )


# One grid search per pipeline
gs_dt = _make_search(pipe_dt, grid_params_dt)
gs_dt_pca = _make_search(pipe_dt_pca, grid_params_dt)
gs_rf = _make_search(pipe_rf, grid_params_rf)
gs_rf_pca = _make_search(pipe_rf_pca, grid_params_rf)

# Collected in a list so they can be fitted in a loop
grids = [gs_dt, gs_dt_pca, gs_rf, gs_rf_pca]

# Human-readable label for each position in `grids`
grid_dict = dict(enumerate([
    'Decision Tree',
    'Decision Tree w/PCA',
    'Random Forest',
    'Random Forest w/PCA',
]))


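**On your own:** Can you calculate how many models we are going to fit? Each grid has 4 × 4 × 3 = 48 hyperparameter combinations, and with `cv=10` every combination is fit once per fold, so that is 480 fits per grid search. With four grid searches that is 1,920 cross-validation fits (plus one final refit of the best combination per search, for 1,924 in total). Can you do the math and get the same answer?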

## Run it! Find the best model

In [None]:
from sklearn.metrics import median_absolute_error

In [None]:

# Fit every grid search, report its cross-validated error and its held-out
# test error, and remember which search performs best on the test split.
#
# NOTE(review): picking the winner by test-set error means the winner's
# reported test error is optimistically biased; selecting by the CV error
# and using the test set only for the final report would avoid that.
print('Performing model optimizations...')
best_err = np.inf  # lowest test-set median absolute error seen so far
best_clf = 0       # position in grids / grid_dict of the current best search
best_gs = None     # fitted GridSearchCV of the current best (None until one wins)
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    # Fit grid search (cross-validation happens on the training split only)
    gs.fit(X_train, y_train)
    # Best params
    print('Best params: %s' % gs.best_params_)
    # best_score_ is the mean cross-validated score of the best candidate, not
    # a training error. The searches score with 'neg_median_absolute_error',
    # so the sign is flipped to report it as a (positive) error.
    print('Best CV error: %.3f' % -gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(X_test)
    # Test data error of model with best params (computed once, reused below)
    test_err = median_absolute_error(y_test, y_pred)
    print('Test set error score for best params: %.3f ' % test_err)
    # Track best (lowest test error) model
    if test_err < best_err:
        best_err = test_err
        best_gs = gs
        best_clf = idx
print('\nModel with best test set error: %s' % grid_dict[best_clf])

Performing model optimizations...

Estimator: Decision Tree
Best params: {'clf__max_depth': 10, 'clf__min_samples_leaf': 5, 'clf__min_samples_split': 15}
Best training error: -1.931
Test set error score for best params: 2.277 

Estimator: Decision Tree w/PCA
Best params: {'clf__max_depth': 10, 'clf__min_samples_leaf': 5, 'clf__min_samples_split': 10}
Best training error: -2.304
Test set error score for best params: 1.770 

Estimator: Random Forest
Best params: {'clf__max_depth': 10, 'clf__min_samples_leaf': 5, 'clf__min_samples_split': 10}
Best training error: -1.786
Test set error score for best params: 1.438 

Estimator: Random Forest w/PCA
Best params: {'clf__max_depth': 15, 'clf__min_samples_leaf': 5, 'clf__min_samples_split': 15}
Best training error: -2.005
Test set error score for best params: 2.242 

Model with best test set error: Random Forest


Remember - in a classification problem, we start the best accuracy at 0 and then look for models that improve it towards 1.

* best_err = 0
* if accuracy(y_test, y_pred) > best_err:


In a regression problem, we start the best error at infinity and then look for models that improve it towards 0. Be careful - the comparison flips!

* best_err = np.inf
* if median_absolute_error(y_test, y_pred) < best_err:

# Now what? Refit the model and evaluate it!
So now you know the combination of hyperparameters that will yield an accurate model. You should:

* Re-run the model
* Store the train and test preds
* Make scatterplots and calculate error metrics
* Tell a story!

Note here that we have only fit two models (a decision tree and a random forest), each with and without PCA - why not go try ALL of the other regression models we showed during spot check models? Maybe try calculating some [polynomial features/interaction terms](https://machinelearningmastery.com/polynomial-features-transforms-for-machine-learning/) in the pipeline as well? Or expanding the grid search to try different hyperparameters?



In [None]:
# left to students to do as an exercise