# Select model

## Setup libraries

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np

In [3]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

In [4]:
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA 

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [7]:
from sklearn.model_selection import GridSearchCV

In [8]:
from lightgbm import LGBMRegressor

## Create utility functions

In [9]:
def cross_val_score_print(estimator, X_val, y_val):
    score = cross_val_score(
        estimator,
        X_val,
        y_val,
        cv=3,
        scoring="r2",
        n_jobs=-1,
        error_score="raise",
    )
    print(
        "R2 {:.3f} +/- {:.3f}, count {}".format(
            np.mean(score), np.std(score), np.size(score)
        )
    )

In [10]:
def get_params_str(params):
    params_str = []
    key_prefix = "regressor"
    for key, value in params.items():
        trimmed_key = (
            key[len(key_prefix) :] if key.startswith(key_prefix) else key
        )
        trimmed_key = (
            trimmed_key[2:] if trimmed_key.startswith("__") else trimmed_key
        )
        params_str.append(f"{trimmed_key}: {value}")
    return ", ".join(params_str)

In [11]:
def display_grid_cv_scores(grid):
    for idx, params in enumerate(grid.cv_results_["params"]):
        print(
            "{}: R2 {:.3f} +/- {:.3f}".format(
                get_params_str(params),
                grid.cv_results_["mean_test_score"][idx],
                grid.cv_results_["std_test_score"][idx],
            )
        )

## Get data

In [12]:
california_housing = fetch_california_housing(as_frame=True)

In [13]:
X_val = california_housing.data
y_val = california_housing.target

## Cross validation for one estimator

In [14]:
pipe = Pipeline([("scalar", StandardScaler()), ("regressor", SVR())])
cross_val_score_print(pipe, X_val, y_val)

R2 0.677 +/- 0.029, count 3


## Cross validation for multiple estimators

In [15]:
steps = [
    ("dim_reducer", "passthrough"),
    ("scaler", StandardScaler()),
    ("regressor", "passthrough"),
]

In [16]:
pipe = Pipeline(steps)

In [17]:
regressors = [
    LinearRegression(),
    SVR(),
    RandomForestRegressor(),
    LGBMRegressor(),
]

In [18]:
param_grid = [
    {
        "dim_reducer": [PCA(), FastICA()],
        "dim_reducer__n_components": [3, 6],
        "regressor": regressors,
    }
]
grid = GridSearchCV(pipe, cv=3, scoring="r2", n_jobs=-1, param_grid=param_grid)
grid.fit(X_val, y_val)
display_grid_cv_scores(grid)

dim_reducer: PCA(), dim_reducer__n_components: 3, : LinearRegression(): R2 -0.024 +/- 0.005
dim_reducer: PCA(), dim_reducer__n_components: 3, : SVR(): R2 0.036 +/- 0.040
dim_reducer: PCA(), dim_reducer__n_components: 3, : RandomForestRegressor(): R2 -0.004 +/- 0.012
dim_reducer: PCA(), dim_reducer__n_components: 3, : LGBMRegressor(): R2 0.074 +/- 0.001
dim_reducer: PCA(), dim_reducer__n_components: 6, : LinearRegression(): R2 0.483 +/- 0.030
dim_reducer: PCA(), dim_reducer__n_components: 6, : SVR(): R2 0.614 +/- 0.054
dim_reducer: PCA(), dim_reducer__n_components: 6, : RandomForestRegressor(): R2 0.581 +/- 0.026
dim_reducer: PCA(), dim_reducer__n_components: 6, : LGBMRegressor(): R2 0.610 +/- 0.031
dim_reducer: FastICA(n_components=6), dim_reducer__n_components: 3, : LinearRegression(): R2 -0.024 +/- 0.005
dim_reducer: FastICA(n_components=6), dim_reducer__n_components: 3, : SVR(): R2 0.036 +/- 0.040
dim_reducer: FastICA(n_components=6), dim_reducer__n_components: 3, : RandomForestRegr

In [19]:
param_grid = [
    {
        "scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],
        "regressor": regressors,
    }
]
grid = GridSearchCV(pipe, cv=3, scoring="r2", n_jobs=-1, param_grid=param_grid)
grid.fit(X_val, y_val)
display_grid_cv_scores(grid)

: LinearRegression(), scaler: StandardScaler(): R2 0.576 +/- 0.015
: LinearRegression(), scaler: MinMaxScaler(): R2 0.576 +/- 0.015
: LinearRegression(), scaler: RobustScaler(): R2 0.576 +/- 0.015
: SVR(), scaler: StandardScaler(): R2 0.677 +/- 0.029
: SVR(), scaler: MinMaxScaler(): R2 0.604 +/- 0.028
: SVR(), scaler: RobustScaler(): R2 0.644 +/- 0.017
: RandomForestRegressor(), scaler: StandardScaler(): R2 0.655 +/- 0.048
: RandomForestRegressor(), scaler: MinMaxScaler(): R2 0.658 +/- 0.047
: RandomForestRegressor(), scaler: RobustScaler(): R2 0.656 +/- 0.049
: LGBMRegressor(), scaler: StandardScaler(): R2 0.698 +/- 0.032
: LGBMRegressor(), scaler: MinMaxScaler(): R2 0.702 +/- 0.031
: LGBMRegressor(), scaler: RobustScaler(): R2 0.698 +/- 0.032


In [20]:
param_grid = [
    {"regressor": [LinearRegression()]},
    {"regressor": [SVR()], "regressor__C": [1.0, 2.0, 3.0]},
    {
        "regressor": [RandomForestRegressor()],
        "regressor__n_estimators": [100, 200],
    },
    {
        "regressor": [LGBMRegressor()],
        "regressor__num_leaves": [20, 40, 80],
        "regressor__n_estimators": [20, 40, 80],
    },
]
grid = GridSearchCV(pipe, cv=3, scoring="r2", n_jobs=-1, param_grid=param_grid)
grid.fit(X_val, y_val)
display_grid_cv_scores(grid)

: LinearRegression(): R2 0.576 +/- 0.015
: SVR(), C: 1.0: R2 0.677 +/- 0.029
: SVR(), C: 2.0: R2 0.682 +/- 0.028
: SVR(), C: 3.0: R2 0.684 +/- 0.028
: RandomForestRegressor(), n_estimators: 100: R2 0.655 +/- 0.048
: RandomForestRegressor(), n_estimators: 200: R2 0.658 +/- 0.048
: LGBMRegressor(n_estimators=80, num_leaves=40), n_estimators: 20, num_leaves: 20: R2 0.623 +/- 0.050
: LGBMRegressor(n_estimators=80, num_leaves=40), n_estimators: 20, num_leaves: 40: R2 0.645 +/- 0.046
: LGBMRegressor(n_estimators=80, num_leaves=40), n_estimators: 20, num_leaves: 80: R2 0.647 +/- 0.051
: LGBMRegressor(n_estimators=80, num_leaves=40), n_estimators: 40, num_leaves: 20: R2 0.672 +/- 0.039
: LGBMRegressor(n_estimators=80, num_leaves=40), n_estimators: 40, num_leaves: 40: R2 0.683 +/- 0.040
: LGBMRegressor(n_estimators=80, num_leaves=40), n_estimators: 40, num_leaves: 80: R2 0.676 +/- 0.047
: LGBMRegressor(n_estimators=80, num_leaves=40), n_estimators: 80, num_leaves: 20: R2 0.692 +/- 0.034
: LGBMR