<a href="https://colab.research.google.com/github/fboldt/aulasml/blob/master/aula6a_gridsearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.datasets import fetch_california_housing
# Load the California housing data directly as a (features, target) pair.
X, y = fetch_california_housing(return_X_y=True)
# Print the dimensions of the feature matrix, then of the target vector.
for arr in (X, y):
  print(arr.shape)

(20640, 8)
(20640,)


In [2]:
from sklearn.model_selection import train_test_split
# Hold out a test split. No test_size is passed, so the library default
# proportion applies; the seed keeps the split reproducible.
splits = train_test_split(X, y, random_state=42)
Xtr, Xte, ytr, yte = splits
# Display the shape of each piece in the order it was returned.
tuple(part.shape for part in splits)

((15480, 8), (5160, 8), (15480,), (5160,))

In [3]:
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate, RepeatedKFold
from sklearn.neighbors import KNeighborsRegressor
import numpy as np

# Baseline: k-NN regressor with the library's default hyperparameters.
modeloKnn = KNeighborsRegressor()

# MSE wrapped as a scorer; greater_is_better=False is why the reported
# scores come out negative.
neg_mse = make_scorer(mean_squared_error, greater_is_better=False)
repeated_cv = RepeatedKFold(random_state=42)

scores = cross_validate(modeloKnn, Xtr, ytr, cv=repeated_cv, scoring=neg_mse)
# Average score over every split produced by the repeated k-fold scheme.
np.mean(scores['test_score'])

-1.1769315482657852

In [4]:
from sklearn.model_selection import GridSearchCV

# Candidate neighbourhood sizes to try.
param = {'n_neighbors': [1, 7, 15]}

neg_mse = make_scorer(mean_squared_error, greater_is_better=False)

# No `cv` argument is given, so the search uses the library's default splitting.
modeloGS15 = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param,
    scoring=neg_mse,
    verbose=3,
)
modeloGS15.fit(Xtr, ytr)
modeloGS15.best_params_, modeloGS15.best_score_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ....................n_neighbors=1;, score=-1.629 total time=   0.1s
[CV 2/5] END ....................n_neighbors=1;, score=-1.581 total time=   0.0s
[CV 3/5] END ....................n_neighbors=1;, score=-1.663 total time=   0.0s
[CV 4/5] END ....................n_neighbors=1;, score=-1.641 total time=   0.0s
[CV 5/5] END ....................n_neighbors=1;, score=-1.747 total time=   0.1s
[CV 1/5] END ....................n_neighbors=7;, score=-1.177 total time=   0.0s
[CV 2/5] END ....................n_neighbors=7;, score=-1.106 total time=   0.0s
[CV 3/5] END ....................n_neighbors=7;, score=-1.193 total time=   0.0s
[CV 4/5] END ....................n_neighbors=7;, score=-1.129 total time=   0.1s
[CV 5/5] END ....................n_neighbors=7;, score=-1.184 total time=   0.1s
[CV 1/5] END ...................n_neighbors=15;, score=-1.205 total time=   0.1s
[CV 2/5] END ...................n_neighbors=15;, 

({'n_neighbors': 7}, -1.157725048976956)

In [5]:
# Inspect the full results table of the first search: fit/score timings,
# per-split test scores, and the mean, std and rank for each candidate.
modeloGS15.cv_results_

{'mean_fit_time': array([0.02307773, 0.02196712, 0.02759647]),
 'std_fit_time': array([0.00272691, 0.00311301, 0.01193773]),
 'mean_score_time': array([0.02379603, 0.03559113, 0.03924627]),
 'std_score_time': array([0.00426107, 0.00965566, 0.00765736]),
 'param_n_neighbors': masked_array(data=[1, 7, 15],
              mask=[False, False, False],
        fill_value=999999),
 'params': [{'n_neighbors': 1}, {'n_neighbors': 7}, {'n_neighbors': 15}],
 'split0_test_score': array([-1.62906669, -1.17686686, -1.20473666]),
 'split1_test_score': array([-1.58073762, -1.10643004, -1.12155902]),
 'split2_test_score': array([-1.66312837, -1.19311375, -1.20469472]),
 'split3_test_score': array([-1.64065279, -1.12860307, -1.14912536]),
 'split4_test_score': array([-1.74667338, -1.18361152, -1.19391883]),
 'mean_test_score': array([-1.65205177, -1.15772505, -1.17480692]),
 'std_test_score': array([0.05444245, 0.03396512, 0.03361314]),
 'rank_test_score': array([3, 1, 2], dtype=int32)}

In [6]:
# Same k-NN search as modeloGS15, but scored with repeated k-fold
# (RepeatedKFold, seeded) instead of the default splitting.
#
# The grid is spelled out here instead of reading the notebook-wide `param`
# name: other cells rebind `param` to different grids, so re-running this
# cell after any of them would silently search the wrong parameters (or fail
# on parameter names that KNeighborsRegressor does not have).
knn_grid = {'n_neighbors': [1, 7, 15]}

modeloGS150 = GridSearchCV(KNeighborsRegressor(), knn_grid, verbose=3,
                           cv=RepeatedKFold(random_state=42),
                           scoring=make_scorer(mean_squared_error,
                                               greater_is_better=False))
modeloGS150.fit(Xtr, ytr)
modeloGS150.best_params_, modeloGS150.best_score_

Fitting 50 folds for each of 3 candidates, totalling 150 fits
[CV 1/50] END ...................n_neighbors=1;, score=-1.664 total time=   0.0s
[CV 2/50] END ...................n_neighbors=1;, score=-1.643 total time=   0.0s
[CV 3/50] END ...................n_neighbors=1;, score=-1.679 total time=   0.0s
[CV 4/50] END ...................n_neighbors=1;, score=-1.622 total time=   0.1s
[CV 5/50] END ...................n_neighbors=1;, score=-1.685 total time=   0.1s
[CV 6/50] END ...................n_neighbors=1;, score=-1.584 total time=   0.1s
[CV 7/50] END ...................n_neighbors=1;, score=-1.654 total time=   0.1s
[CV 8/50] END ...................n_neighbors=1;, score=-1.662 total time=   0.1s
[CV 9/50] END ...................n_neighbors=1;, score=-1.651 total time=   0.1s
[CV 10/50] END ..................n_neighbors=1;, score=-1.676 total time=   0.0s
[CV 11/50] END ..................n_neighbors=1;, score=-1.681 total time=   0.0s
[CV 12/50] END ..................n_neighbors=1;

({'n_neighbors': 7}, -1.1632495612704894)

In [7]:
# Widen the search: neighbourhood size crossed with three distance metrics.
param = {
    'n_neighbors': [1, 7, 15],
    'metric': ['cosine', 'cityblock', 'euclidean'],
}

neg_mse = make_scorer(mean_squared_error, greater_is_better=False)

modeloGS45 = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param,
    cv=5,
    scoring=neg_mse,
    verbose=3,
)
modeloGS45.fit(Xtr, ytr)
modeloGS45.best_params_, modeloGS45.best_score_

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END .....metric=cosine, n_neighbors=1;, score=-1.201 total time=   0.6s
[CV 2/5] END .....metric=cosine, n_neighbors=1;, score=-1.155 total time=   0.5s
[CV 3/5] END .....metric=cosine, n_neighbors=1;, score=-1.205 total time=   0.5s
[CV 4/5] END .....metric=cosine, n_neighbors=1;, score=-1.123 total time=   0.5s
[CV 5/5] END .....metric=cosine, n_neighbors=1;, score=-1.175 total time=   0.5s
[CV 1/5] END .....metric=cosine, n_neighbors=7;, score=-0.791 total time=   0.7s
[CV 2/5] END .....metric=cosine, n_neighbors=7;, score=-0.761 total time=   0.8s
[CV 3/5] END .....metric=cosine, n_neighbors=7;, score=-0.857 total time=   0.9s
[CV 4/5] END .....metric=cosine, n_neighbors=7;, score=-0.771 total time=   0.9s
[CV 5/5] END .....metric=cosine, n_neighbors=7;, score=-0.811 total time=   0.9s
[CV 1/5] END ....metric=cosine, n_neighbors=15;, score=-0.835 total time=   0.7s
[CV 2/5] END ....metric=cosine, n_neighbors=15;, 

({'metric': 'cosine', 'n_neighbors': 7}, -0.7979977106272027)

In [8]:
# Fitted searches to compare on the held-out test split, paired with the
# label printed next to each result.
modelos = [
    ("modeloGS15", modeloGS15),
    ("modeloGS150", modeloGS150),
    ("modeloGS45", modeloGS45)
    ]

for nome, modelo in modelos:
  # scikit-learn metrics take (y_true, y_pred) in that order. MSE is
  # symmetric, so the value is the same either way, but following the
  # documented order keeps this loop correct if an asymmetric metric is
  # swapped in later.
  mse = mean_squared_error(yte, modelo.predict(Xte))
  print(nome, mse)


modeloGS15 1.1185550426887638
modeloGS150 1.1185550426887638
modeloGS45 0.7450083756017327


In [9]:
neg_mse = make_scorer(mean_squared_error, greater_is_better=False)

# Cross-validate each search object itself on the full dataset, so the
# hyperparameter search is repeated inside every outer training fold.
for nome, modelo in modelos:
  modelo.verbose = 0  # silence the per-fit log lines of the inner search
  scores = cross_validate(modelo, X, y, scoring=neg_mse)
  print(nome, np.mean(scores['test_score']))

modeloGS15 -1.2340087401769488
modeloGS150 -1.230186033171933
modeloGS45 -0.8737280962690767


In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise the features before k-NN; the neighbourhood size is fixed at 7.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', KNeighborsRegressor(n_neighbors=7)),
])

neg_mse = make_scorer(mean_squared_error, greater_is_better=False)
scores = cross_validate(pipe, X, y, scoring=neg_mse)
np.mean(scores['test_score'])

-0.5794714514842296

In [11]:
neg_mse = make_scorer(mean_squared_error, greater_is_better=False)

# Put a scaler in front of each search object and cross-validate the
# combined pipeline on the full dataset.
for nome, modelo in modelos:
  modelo.verbose = 0  # keep the inner searches quiet
  pipe = Pipeline(steps=[('scaler', StandardScaler()), ('modelo', modelo)])
  scores = cross_validate(pipe, X, y, scoring=neg_mse)
  print(nome, np.mean(scores['test_score']))

modeloGS15 -0.5602334294995837
modeloGS150 -0.5658099997863507
modeloGS45 -0.5552164909398922


In [12]:
# Scaler + k-NN pipeline whose steps are both tuned by the search below.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', KNeighborsRegressor()),
])

# `<step>__<parameter>` keys address parameters of the named pipeline step.
param = {
    'clf__n_neighbors': [1, 7, 15],
    'clf__metric': ['cosine', 'cityblock', 'euclidean'],
    'scaler__with_mean': [True, False],
}

neg_mse = make_scorer(mean_squared_error, greater_is_better=False)

# Inner loop: 5-fold grid search over the pipeline parameters.
modeloGS90 = GridSearchCV(pipe, param, cv=5, verbose=1, scoring=neg_mse)

# Outer loop: cross-validate the whole search, keeping each fitted search
# so its selected parameters can be inspected afterwards.
scores = cross_validate(
    modeloGS90, X, y,
    scoring=neg_mse,
    return_estimator=True,
    verbose=3,
)
np.mean(scores['test_score'])

Fitting 5 folds for each of 18 candidates, totalling 90 fits


  _data = np.array(data, dtype=dtype, copy=copy,


[CV] END ......................................, score=-0.517 total time=  44.1s
Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END ......................................, score=-0.536 total time=  45.4s
Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END ......................................, score=-0.589 total time=  42.8s
Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END ......................................, score=-0.565 total time=  41.2s
Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END ......................................, score=-0.568 total time=  41.7s


-0.5552164909398922

In [13]:
# One fitted search per outer fold: show what each one selected and the
# inner cross-validation score it reported for that selection.
for modelo in scores['estimator']:
  chosen, inner_score = modelo.best_params_, modelo.best_score_
  print(chosen, inner_score)

{'clf__metric': 'cityblock', 'clf__n_neighbors': 15, 'scaler__with_mean': True} -0.5380868189174538
{'clf__metric': 'cosine', 'clf__n_neighbors': 15, 'scaler__with_mean': True} -0.5764514000528239
{'clf__metric': 'cosine', 'clf__n_neighbors': 15, 'scaler__with_mean': True} -0.5924712389663743
{'clf__metric': 'cityblock', 'clf__n_neighbors': 15, 'scaler__with_mean': True} -0.5017991991430303
{'clf__metric': 'cityblock', 'clf__n_neighbors': 15, 'scaler__with_mean': True} -0.5189740649095438


In [14]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Scale, expand into polynomial features, then fit a linear regression.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('clf', LinearRegression()),
])

# Tune the polynomial degree and the `positive` option of the regression.
param = {
    'poly__degree': [1, 2, 3],
    'clf__positive': [True, False],
}

neg_mse = make_scorer(mean_squared_error, greater_is_better=False)

modeloPoly = GridSearchCV(pipe, param, cv=5, verbose=1, scoring=neg_mse)

# Cross-validate the whole search and keep the fitted search of every fold.
scores = cross_validate(modeloPoly, X, y, scoring=neg_mse,
                        return_estimator=True)
np.mean(scores['test_score'])

Fitting 5 folds for each of 6 candidates, totalling 30 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits


-0.5582901717686554

In [15]:
# Report the parameters selected in each outer fold together with the
# score the inner search reported for them.
for modelo in scores['estimator']:
  chosen, inner_score = modelo.best_params_, modelo.best_score_
  print(chosen, inner_score)

{'clf__positive': False, 'poly__degree': 1} -0.5847593560581984
{'clf__positive': False, 'poly__degree': 1} -0.5552131176776777
{'clf__positive': False, 'poly__degree': 1} -0.5571502116400604
{'clf__positive': False, 'poly__degree': 1} -0.5618397318830617
{'clf__positive': False, 'poly__degree': 1} -0.624427192591885


In [16]:
from sklearn.svm import SVR

# Scaled support-vector regressor with default hyperparameters.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', SVR()),
])

neg_mse = make_scorer(mean_squared_error, greater_is_better=False)
scores = cross_validate(pipe, X, y, scoring=neg_mse,
                        return_estimator=True, verbose=1)

# Mean score across folds, followed by the mean fit time per fold.
mean_score = np.mean(scores['test_score'])
mean_fit_time = np.mean(scores['fit_time'])
print(mean_score, mean_fit_time)

-0.41663446793477227 11.92108793258667


In [17]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the SVR step of `pipe`: 3 values of C x 4 kernels = 12
# combinations.
param = {'clf__C': [0.1, 1, 10],
         'clf__kernel': ['poly', 'rbf', 'sigmoid', 'linear']}

# Randomized search tries only n_iter=5 of the 12 combinations. The sampling
# is seeded so the same 5 candidates are drawn on every run, in line with the
# other seeded components of this notebook; without random_state the
# candidates (and therefore the selected model) change between runs.
modeloSVR = RandomizedSearchCV(pipe, param, cv=3, verbose=1, n_iter=5,
                        random_state=42,
                        scoring=make_scorer(mean_squared_error,
                                            greater_is_better=False))

# NOTE(review): the nested evaluation below is left disabled, presumably
# because of run time (a single default SVR fit already averaged ~12 s per
# fold in the previous cell) - confirm before re-enabling.
# scores = cross_validate(modeloSVR, X, y, return_estimator=True, verbose=1,
#                         scoring=make_scorer(mean_squared_error,
#                                             greater_is_better=False))
# print(np.mean(scores['test_score']), np.mean(scores['fit_time']))
# for modelo in scores['estimator']:
#   print(modelo.best_params_, modelo.best_score_)

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV

# Seeded random forest used as the base estimator of the halving search.
clf = RandomForestRegressor(random_state=0)

param_grid = {
    "max_depth": [3, None],
    "min_samples_split": [5, 10],
}

# The resource handed out by the halving search is the number of trees
# (`n_estimators`), capped at 10.
modeloHGS = HalvingGridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    resource='n_estimators',
    max_resources=10,
    random_state=0,
)
modeloHGS.fit(Xtr, ytr)
modeloHGS.best_params_, modeloHGS.best_score_

({'max_depth': None, 'min_samples_split': 10, 'n_estimators': 9},
 0.7772668333341842)

In [19]:
# Empty grid: no hyperparameter is varied, so only the `n_estimators`
# resource is chosen by the search.
param_grid = {}

modeloHGS = HalvingGridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    resource='n_estimators',
    max_resources=10,
    random_state=0,
)
modeloHGS.fit(Xtr, ytr)
modeloHGS.best_params_, modeloHGS.best_score_

({'n_estimators': 10}, 0.7770443874582622)

In [20]:
neg_mse = make_scorer(mean_squared_error, greater_is_better=False)

# Cross-validate the halving search itself, keeping each fold's fitted search.
scores = cross_validate(modeloHGS, X, y, scoring=neg_mse,
                        return_estimator=True, verbose=1)

# NOTE(review): best_score_ comes from the inner search, which was built
# without a `scoring` argument, so it is presumably not on the same scale as
# the negated-MSE outer scores - confirm before comparing the two.
for modelo in scores['estimator']:
  chosen, inner_score = modelo.best_params_, modelo.best_score_
  print(chosen, inner_score)

{'n_estimators': 10} 0.600121423652453
{'n_estimators': 10} 0.532807009266565
{'n_estimators': 10} 0.5020498592816481
{'n_estimators': 10} 0.6113534355486194
{'n_estimators': 10} 0.6052370103233692


In [21]:
# Mean outer-fold score followed by the individual fold scores.
fold_scores = scores['test_score']
print(np.mean(fold_scores), fold_scores)

-0.46800609280935995 [-0.55313122 -0.37432603 -0.40511508 -0.51747387 -0.48998427]
