In [103]:
from sklearn.datasets import fetch_california_housing

X, y = fetch_california_housing(return_X_y=True, as_frame=True)
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [104]:
y.head()

0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: MedHouseVal, dtype: float64

In [105]:
import sklearn

sklearn.set_config(display="diagram")

In [106]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline

model = make_pipeline(
    PolynomialFeatures(),
    StandardScaler(),
    Ridge(),
)

In [65]:
import pandas as pd
from sklearn.model_selection import cross_validate

cv_results = cross_validate(model, X, y)
cv_results = pd.DataFrame(cv_results)

In [66]:
cv_results

Unnamed: 0,fit_time,score_time,test_score
0,0.037823,0.003607,0.46765
1,0.020068,0.00294,0.552113
2,0.014576,0.002619,0.579568
3,0.011805,0.001643,0.500778
4,0.011756,0.001594,-4.211175


In [67]:
cv_results.aggregate(["mean", "std"])

Unnamed: 0,fit_time,score_time,test_score
mean,0.019206,0.002481,-0.422213
std,0.010944,0.000864,2.118542


In [68]:
for params in model.get_params():
    print(params)

memory
steps
verbose
polynomialfeatures
standardscaler
ridge
polynomialfeatures__degree
polynomialfeatures__include_bias
polynomialfeatures__interaction_only
polynomialfeatures__order
standardscaler__copy
standardscaler__with_mean
standardscaler__with_std
ridge__alpha
ridge__copy_X
ridge__fit_intercept
ridge__max_iter
ridge__normalize
ridge__positive
ridge__random_state
ridge__solver
ridge__tol


## Manual hyperparameters search

In [69]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)

In [82]:
import numpy as np

parameter_grid = {
    "polynomialfeatures__degree": np.arange(2, 5),
    "ridge__alpha": np.logspace(1, 3, num=5),
}

In [83]:
from collections import defaultdict
search_results = defaultdict(list)

for degree in parameter_grid["polynomialfeatures__degree"]:
    for alpha in parameter_grid["ridge__alpha"]:
        search_results["polynomialfeatures__degree"].append(degree)
        search_results["ridge__alpha"].append(alpha)
        model.set_params(
            polynomialfeatures__degree=degree,
            ridge__alpha=alpha,
        )
        model.fit(X_train, y_train)
        search_results["score"].append(model.score(X_test, y_test))
search_results = pd.DataFrame(search_results)

In [84]:
search_results.sort_values(by="score", ascending=False)

Unnamed: 0,polynomialfeatures__degree,ridge__alpha,score
4,2,1000.0,0.098531
11,4,31.622777,0.054431
3,2,316.227766,-1.222417
2,2,100.0,-3.373971
0,2,10.0,-3.796247
1,2,31.622777,-4.600567
9,3,1000.0,-6.715816
5,3,10.0,-13.677841
8,3,316.227766,-13.90918
7,3,100.0,-19.358309


## Grid hyperparameters search

In [85]:
from sklearn.model_selection import GridSearchCV

search_cv = GridSearchCV(model, param_grid=parameter_grid)
search_cv.fit(X_train, y_train)

In [86]:
search_cv.best_params_

{'polynomialfeatures__degree': 2, 'ridge__alpha': 316.22776601683796}

In [87]:
pd.DataFrame(search_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_polynomialfeatures__degree,param_ridge__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.016169,0.006292,0.002324,0.000488,2,10.0,"{'polynomialfeatures__degree': 2, 'ridge__alph...",0.655806,0.52857,0.362651,0.653063,0.665514,0.573121,0.116659,7
1,0.0097,0.000744,0.001672,0.000406,2,31.622777,"{'polynomialfeatures__degree': 2, 'ridge__alph...",0.657668,0.565873,0.422209,0.654168,0.66204,0.592392,0.092297,5
2,0.009078,0.000351,0.001597,0.000497,2,100.0,"{'polynomialfeatures__degree': 2, 'ridge__alph...",0.639703,0.604569,0.494025,0.665001,0.655253,0.61171,0.062322,3
3,0.008711,6.7e-05,0.001427,0.000181,2,316.227766,"{'polynomialfeatures__degree': 2, 'ridge__alph...",0.601179,0.629179,0.562315,0.640136,0.642539,0.615069,0.030193,1
4,0.009515,0.001165,0.002492,0.00104,2,1000.0,"{'polynomialfeatures__degree': 2, 'ridge__alph...",0.576208,0.615647,0.592287,0.585586,0.619774,0.5979,0.017012,4
5,0.030766,0.001347,0.003768,0.000973,3,10.0,"{'polynomialfeatures__degree': 3, 'ridge__alph...",-0.237696,0.442649,0.447031,0.230267,0.680427,0.312536,0.309802,13
6,0.034487,0.00244,0.003079,0.000296,3,31.622777,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.320556,0.515025,0.43425,0.280084,0.680808,0.446145,0.143737,10
7,0.030411,0.001753,0.002897,0.000125,3,100.0,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.588009,0.564294,0.418581,0.398021,0.679281,0.529637,0.10645,9
8,0.031365,0.001353,0.003104,0.000287,3,316.227766,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.665599,0.587401,0.475191,0.547993,0.669202,0.589077,0.073398,6
9,0.030238,0.000554,0.003402,0.000791,3,1000.0,"{'polynomialfeatures__degree': 3, 'ridge__alph...",0.616699,0.612782,0.536046,0.658794,0.649991,0.614862,0.043321,2


In [88]:
search_cv.best_estimator_

In [89]:
search_cv.score(X_test, y_test)

-1.222417346053133

In [90]:
cv_results = cross_validate(search_cv, X, y, return_estimator=True)
cv_results = pd.DataFrame(cv_results)

In [91]:
cv_results

Unnamed: 0,fit_time,score_time,estimator,test_score
0,5.356896,0.00435,GridSearchCV(estimator=Pipeline(steps=[('polyn...,0.258175
1,4.30081,0.001634,GridSearchCV(estimator=Pipeline(steps=[('polyn...,0.475082
2,4.452103,0.00244,GridSearchCV(estimator=Pipeline(steps=[('polyn...,0.561609
3,4.414243,0.001665,GridSearchCV(estimator=Pipeline(steps=[('polyn...,0.526413
4,4.488308,0.003557,GridSearchCV(estimator=Pipeline(steps=[('polyn...,-17.880146


In [92]:
cv_results["estimator"]

0    GridSearchCV(estimator=Pipeline(steps=[('polyn...
1    GridSearchCV(estimator=Pipeline(steps=[('polyn...
2    GridSearchCV(estimator=Pipeline(steps=[('polyn...
3    GridSearchCV(estimator=Pipeline(steps=[('polyn...
4    GridSearchCV(estimator=Pipeline(steps=[('polyn...
Name: estimator, dtype: object

In [94]:
for est in cv_results["estimator"]:
    print(est.best_params_)

{'polynomialfeatures__degree': 2, 'ridge__alpha': 31.622776601683793}
{'polynomialfeatures__degree': 2, 'ridge__alpha': 1000.0}
{'polynomialfeatures__degree': 2, 'ridge__alpha': 1000.0}
{'polynomialfeatures__degree': 2, 'ridge__alpha': 1000.0}
{'polynomialfeatures__degree': 3, 'ridge__alpha': 100.0}


## Randomized hyperparameters search

In [120]:
from scipy.stats import loguniform

parameter_distributions = {
    "polynomialfeatures__degree": np.arange(1, 5),
    "ridge__alpha": loguniform(1, 3),
}

In [124]:
from sklearn.model_selection import RandomizedSearchCV

search_cv = RandomizedSearchCV(
    model, param_distributions=parameter_distributions, n_iter=10,
)

In [125]:
cv_results = cross_validate(search_cv, X, y, return_estimator=True)
cv_results = pd.DataFrame(cv_results)

In [126]:
cv_results

Unnamed: 0,fit_time,score_time,estimator,test_score
0,1.437984,0.001013,RandomizedSearchCV(estimator=Pipeline(steps=[(...,0.548888
1,0.794178,0.000667,RandomizedSearchCV(estimator=Pipeline(steps=[(...,0.468178
2,2.621391,0.001329,RandomizedSearchCV(estimator=Pipeline(steps=[(...,0.550833
3,1.943822,0.000793,RandomizedSearchCV(estimator=Pipeline(steps=[(...,0.536806
4,2.65083,0.001313,RandomizedSearchCV(estimator=Pipeline(steps=[(...,0.660523


In [128]:
for est in cv_results["estimator"]:
    print(est.best_params_)

{'polynomialfeatures__degree': 1, 'ridge__alpha': 2.6212293822798984}
{'polynomialfeatures__degree': 1, 'ridge__alpha': 1.1446172910698547}
{'polynomialfeatures__degree': 1, 'ridge__alpha': 2.7070303203315333}
{'polynomialfeatures__degree': 1, 'ridge__alpha': 2.6446728743460817}
{'polynomialfeatures__degree': 1, 'ridge__alpha': 1.9106128161271672}


## Model exposing an efficient internal cross-validation

In [130]:
from sklearn.linear_model import RidgeCV

model = make_pipeline(
    PolynomialFeatures(),
    StandardScaler(),
    RidgeCV(alphas=np.logspace(1, 3, num=50)),
)

In [131]:
cv_results = cross_validate(model, X, y, return_estimator=True)
cv_results = pd.DataFrame(cv_results)
cv_results

Unnamed: 0,fit_time,score_time,estimator,test_score
0,0.170703,0.001835,"(PolynomialFeatures(), StandardScaler(), Ridge...",0.492637
1,0.1165,0.001674,"(PolynomialFeatures(), StandardScaler(), Ridge...",0.48306
2,0.120018,0.001678,"(PolynomialFeatures(), StandardScaler(), Ridge...",0.565591
3,0.132901,0.002846,"(PolynomialFeatures(), StandardScaler(), Ridge...",0.535315
4,0.141918,0.001722,"(PolynomialFeatures(), StandardScaler(), Ridge...",-6.222846


In [134]:
for est in cv_results["estimator"]:
    print(est[-1].alpha_)

222.29964825261956
625.0551925273969
754.3120063354615
754.3120063354615
54.286754393238596


## Exercise