# Model Selection

In [None]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_absolute_error

In [None]:
# Shared pseudo-random generator with a fixed seed of 2. This one instance is
# passed as `random_state` to several of the calls in the cells below.
rng = np.random.RandomState(seed=2)

## Read in dataset and split it

In [None]:
import os

# Decide where the notebook is running by looking for 'google.colab' in the
# string form of the active IPython shell object.
running_in_colab = 'google.colab' in str(get_ipython())

if not running_in_colab:
    # Local run: datasets live next to the notebook.
    base_dir = "."
else:
    # Colab run: mount Google Drive and read from the notebooks folder on it.
    from google.colab import drive
    drive.mount('/content/drive')
    base_dir = "./drive/My Drive/Colab Notebooks/" # You may need to change this, depending on where your notebooks are on Google Drive

# All data files are expected in a "datasets" sub-folder of base_dir.
dataset_dir = os.path.join(base_dir, "datasets")

In [None]:
# Load the housing data from the datasets folder into a DataFrame.
housing_csv_path = os.path.join(dataset_dir, "housing.csv")
df = pd.read_csv(housing_csv_path)

In [None]:
# Hold out 20% of the rows as the test set, using the shared `rng` as the
# random state for the shuffle.
train, test = train_test_split(df, test_size=0.2, random_state=rng)

# Predictor columns; "SalePrice" is the regression target.
# (The original line assigned `features` twice in one chained assignment.)
features = ["BasementArea", "GroundFloorArea", "Bedrooms", "Condition"]

X_train = train[features]
y_train = train["SalePrice"]
X_test = test[features]
y_test = test["SalePrice"]

## Choosing hyperparameter values - done wrong!

In [None]:
# Deliberately flawed demonstration: every candidate max_depth is scored on
# the TEST set, so the test data ends up steering the hyperparameter choice.
for max_depth in range(1, 11):
    decision_tree = DecisionTreeRegressor(max_depth=max_depth, random_state=rng)
    decision_tree.fit(X_train, y_train)
    # Metric arguments follow the documented (y_true, y_pred) order. MAE gives
    # the same value either way, but an asymmetric metric would not.
    print(max_depth, mean_absolute_error(y_test, decision_tree.predict(X_test)))

## Using a validation set

In [None]:
# Carve a validation set (25% of the training rows) out of the training data
# so that candidate models are compared without touching the test set.
X_ms_train, X_val, y_ms_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=rng)

for max_depth in range(1, 11):
    decision_tree = DecisionTreeRegressor(max_depth=max_depth, random_state=rng)
    decision_tree.fit(X_ms_train, y_ms_train)
    # Metric arguments follow the documented (y_true, y_pred) order.
    mae = mean_absolute_error(y_val, decision_tree.predict(X_val))
    print(max_depth, mae)

## Let's get the code to pick the winner and train and test the final model

In [None]:
# Same train/validation split as before, but with an integer seed so the
# split is identical every time this cell is run.
X_ms_train, X_val, y_ms_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=2)

# Candidate depths. The error array is sized from this range so the two can
# never drift out of step.
max_depths = range(1, 11)
val_errors = np.zeros(len(max_depths))

for i, max_depth in enumerate(max_depths):
    decision_tree = DecisionTreeRegressor(max_depth=max_depth, random_state=2)
    decision_tree.fit(X_ms_train, y_ms_train)
    # Metric arguments follow the documented (y_true, y_pred) order.
    mae = mean_absolute_error(y_val, decision_tree.predict(X_val))
    val_errors[i] = mae

# The winner is the depth whose validation MAE is smallest.
best_index = np.argmin(val_errors)
best_max_depth, lowest_val_error = max_depths[best_index], val_errors[best_index]

print(best_max_depth, lowest_val_error)

## We can fit, predict and compute the error in one line

In [None]:
# A single random split with 25% held out, expressed as a CV splitter object.
ss = ShuffleSplit(n_splits=1, test_size=0.25, random_state=2)

val_errors = np.zeros(10)

for slot, max_depth in enumerate(range(1, 11)):
    decision_tree = DecisionTreeRegressor(max_depth=max_depth, random_state=2)
    # Fit and score in one call. The requested scoring is the *negated* MAE,
    # and only the first (here: only) split's score is kept.
    mae = cross_val_score(
        decision_tree,
        X_train,
        y_train,
        scoring="neg_mean_absolute_error",
        cv=ss,
    )
    val_errors[slot] = mae[0]

# Because the stored values are negated errors, the largest one marks the
# best depth; `lowest_val_error` therefore holds a negated MAE.
best_pos = np.argmax(val_errors)
best_max_depth = best_pos + 1
lowest_val_error = val_errors[best_pos]

print(best_max_depth, lowest_val_error)

## We can simplify even further

In [None]:
# Search space: tree depths 1 through 10.
param_grid = dict(max_depth=range(1, 11))

# One shuffled split with 25% held out serves as the validation scheme.
ss = ShuffleSplit(n_splits=1, test_size=0.25, random_state=2)

decision_tree = DecisionTreeRegressor(random_state=2)

# The grid search tries every depth in param_grid and scores each candidate
# with negated MAE on the split defined by `ss`.
decision_tree_gs = GridSearchCV(
    estimator=decision_tree,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=ss,
)
decision_tree_gs.fit(X_train, y_train)

print(decision_tree_gs.best_params_, decision_tree_gs.best_score_)

Note that, after a grid search, the estimator is re-trained on the whole original training set using the winning hyperparameters. This final model is available to us as `decision_tree_gs.best_estimator_`.

## But this dataset is too small to split into three - let's use k-fold cross-validation

In [None]:
# Search space: tree depths 1 through 10.
param_grid = dict(max_depth=range(1, 11))

decision_tree = DecisionTreeRegressor(random_state=2)

# 10-fold cross-validation with shuffling and a fixed seed.
# NOTE: the name `ss` is reused from the earlier cells, although the object
# is now a KFold splitter rather than a ShuffleSplit.
ss = KFold(n_splits=10, shuffle=True, random_state=2)

decision_tree_gs = GridSearchCV(
    estimator=decision_tree,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=ss,
)
decision_tree_gs.fit(X_train, y_train)

print(decision_tree_gs.best_params_, decision_tree_gs.best_score_)

## And there's a shortcut - but you are responsible for shuffling the data

(Assuming you used train_test_split, you're grand: by default, it shuffles the data.)

In [None]:
# Search space: tree depths 1 through 10.
param_grid = dict(max_depth=range(1, 11))

decision_tree = DecisionTreeRegressor(random_state=2)

# Shortcut: pass the number of folds directly as `cv` instead of building a
# splitter object.
decision_tree_gs = GridSearchCV(
    estimator=decision_tree,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=10,
)
decision_tree_gs.fit(X_train, y_train)

print(decision_tree_gs.best_params_, decision_tree_gs.best_score_)

## Why is it a grid search? It will try all combinations!

In [None]:
# Two-step pipeline: standardise the features, then predict with kNN.
knn_steps = [
    ("scaler", StandardScaler()),
    ("predictor", KNeighborsRegressor()),
]
knn = Pipeline(steps=knn_steps)

# The "predictor__" prefix routes each setting to the pipeline step named
# "predictor". Every (n_neighbors, weights) pair is tried.
param_grid = {
    "predictor__n_neighbors": range(1, 11),
    "predictor__weights": ["uniform", "distance"],
}

knn_gs = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=10,
    n_jobs=-1,
)
knn_gs.fit(X_train, y_train)

print(knn_gs.best_params_, knn_gs.best_score_)

`n_jobs=-1` tells it to use all available CPUs in parallel.

## With ingenuity, we can include other things in the grid search

In [None]:
# The "scaler" step starts out as None; the search below substitutes each of
# the options listed under "scaler" in param_grid.
knn_steps = [
    ("scaler", None),
    ("predictor", KNeighborsRegressor()),
]
knn = Pipeline(steps=knn_steps)

# The choice of scaler (including None) is searched alongside the kNN
# settings, so every scaler/n_neighbors/weights combination is evaluated.
scaler_options = [None, MinMaxScaler(), RobustScaler(), StandardScaler()]
param_grid = {
    "scaler": scaler_options,
    "predictor__n_neighbors": range(1, 11),
    "predictor__weights": ["uniform", "distance"],
}

knn_gs = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=10,
    n_jobs=-1,
)
knn_gs.fit(X_train, y_train)

print(knn_gs.best_params_, knn_gs.best_score_)

Question: Having seen which configuration is the winner here, what should we probably do?

Question: It trained 801 kNN models. How did I get this number?

## Doing less work - but not guaranteed to be optimal

Above there were 80 configurations and we tried them all. Here, we try 40 of them, chosen at random.

In [None]:
# Same pipeline as the exhaustive search: the "scaler" step is a placeholder
# that the search fills in from the options in param_grid.
knn = Pipeline([
    ("scaler", None),
    ("predictor", KNeighborsRegressor())
])

param_grid = {"scaler" : [None, MinMaxScaler(), RobustScaler(), StandardScaler()],
              "predictor__n_neighbors" : range(1, 11),
              "predictor__weights" : ["uniform", "distance"]}

# n_iter=40 samples 40 of the parameter combinations instead of trying them
# all. random_state is fixed so the same 40 are drawn on every run, in line
# with the fixed seeds used in the rest of the notebook.
knn_rs = RandomizedSearchCV(knn, param_grid, scoring="neg_mean_absolute_error", cv=10, n_iter=40, n_jobs=-1, random_state=2)

knn_rs.fit(X_train, y_train)

print(knn_rs.best_params_, knn_rs.best_score_)

## Error estimation - if you're certain you've finished model selection, then you can use the test set

In [None]:
# Final error estimate: MAE of the grid search's best kNN pipeline on the
# held-out test set. Arguments follow the documented (y_true, y_pred) order.
mean_absolute_error(y_test, knn_gs.best_estimator_.predict(X_test))