In [0]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectKBest,f_regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Data loading

In [0]:
# Load the Boston housing data as a (features, target) pair of arrays.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on an older release (the recorded outputs below
# show `iid='deprecated'`, i.e. an old GridSearchCV) — pin the version or switch
# datasets before re-running on a current environment.
data,target = load_boston(return_X_y=True)

# Pipeline definition

In [0]:
# Two-step pipeline: univariate feature selection (F-test for regression)
# followed by an ordinary linear regression on the selected features.
# The number of kept features (`selector__k`) is tuned later by the grid search.
pipeline = Pipeline(
    steps=[
        ('selector', SelectKBest(score_func=f_regression)),
        ('model', LinearRegression()),
    ]
)

# Grid search with cross-validation for linear regression

Defining grid search parameters.

In [0]:
# Exhaustive search over the number of features kept by the 'selector' step,
# evaluated with 5-fold cross-validation. The scorer is the *negated* mean
# squared error, so larger (closer to zero) is better.
search = GridSearchCV(
    estimator=pipeline,
    param_grid={'selector__k': list(range(3, 11))},  # k = 3, 4, ..., 10
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,  # run the fits in parallel on all available workers
    verbose=3,
)

Performing grid search over the specified parameters.

In [0]:
# Run every (k, fold) combination; the fitted search object is also the
# cell's displayed value.
search.fit(X=data, y=target)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    1.2s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('selector',
                                        SelectKBest(k=10,
                                                    score_func=<function f_regression at 0x7f34c366c1e0>)),
                                       ('model',
                                        LinearRegression(copy_X=True,
                                                         fit_intercept=True,
                                                         n_jobs=None,
                                                         normalize=False))],
                                verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'selector__k': [3, 4, 5, 6, 7, 8, 9, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=3)

The best value for *k* (the number of features kept by `SelectKBest`) is:

In [0]:
# Parameter setting with the best mean cross-validated score for the
# linear-regression pipeline fitted above.
search.best_params_

{'selector__k': 3}

The best score achieved (negative mean squared error, so values closer to zero are better) is:

In [0]:
# Mean cross-validated score of the best setting. Because the scorer is
# "neg_mean_squared_error", this is the negated MSE (hence a negative number).
search.best_score_

-36.4236890153343

# Grid search with cross-validation for Random Forest Regressor

In [0]:
# Same selector as before, but the final estimator is now a random forest.
# NOTE: this rebinds `pipeline`, replacing the linear-regression pipeline
# defined earlier in the notebook.
pipeline = Pipeline(
    steps=[
        ('selector', SelectKBest(score_func=f_regression)),
        # Fixed seed so the forest (and therefore the search) is repeatable.
        ('model', RandomForestRegressor(random_state=0)),
    ]
)

In [0]:
# Joint grid over the number of selected features and the forest size,
# scored by negated MSE with 5-fold cross-validation.
# NOTE: this rebinds `search`, replacing the linear-regression search above.
search = GridSearchCV(
    estimator=pipeline,
    param_grid={
        'selector__k': [3, 4, 5, 6, 7, 8, 9, 10],
        'model__n_estimators': np.arange(10, 200, 10),  # 10, 20, ..., 190
    },
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=3,
)

In [0]:
# Fit all 152 candidates x 5 folds (760 fits); the recorded run below took
# about 2.4 minutes on 2 workers.
search.fit(X=data, y=target)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 5 folds for each of 152 candidates, totalling 760 fits


[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 338 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done 498 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 722 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 760 out of 760 | elapsed:  2.4min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('selector',
                                        SelectKBest(k=10,
                                                    score_func=<function f_regression at 0x7f34c366c1e0>)),
                                       ('model',
                                        RandomForestRegressor(bootstrap=True,
                                                              ccp_alpha=0.0,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              max_samples=None,
                                                              min_impurity_decrease=0.0,
     

In [0]:
# Best (n_estimators, k) combination found for the random-forest pipeline.
# `search` here is the object rebound and fitted in the cells just above.
search.best_params_

{'model__n_estimators': 110, 'selector__k': 6}

In [0]:
# Mean cross-validated negated MSE of the best random-forest setting;
# compare with the linear-regression score earlier (closer to zero is better).
search.best_score_

-22.170138432624004