In [28]:
from sklearn.datasets import fetch_california_housing
# Load the California housing dataset bundle.
housing = fetch_california_housing()
# Show which fields the returned bundle exposes.
housing.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [29]:
# Pull the feature matrix and the regression target out of the bundle.
X = housing.data
y = housing.target
# Display both shapes to confirm they have the same number of rows.
(X.shape, y.shape)

((20640, 8), (20640,))

In [33]:
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsRegressor
import numpy as np

# Baseline: KNeighborsRegressor with default settings on the raw features,
# evaluated with cross_validate using RMSE as the score.
modelo = KNeighborsRegressor()
# RMSE is written out as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so forwarding it through make_scorer fails on current releases.
# The scorer stays positive (not negated), so lower is better.
scores = cross_validate(
    modelo, X, y,
    scoring=make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))),
)
print(scores['test_score'])
print(scores)
# Average RMSE across folds, kept for comparison with the standardized runs.
sempad = np.mean(scores['test_score'])
print(f"Sem padronização: {sempad}")

[1.06204206 1.0429052  1.18034623 1.11082086 1.20043084]
{'fit_time': array([0.01677084, 0.01621294, 0.01542091, 0.01412678, 0.01494312]), 'score_time': array([0.02024746, 0.0184505 , 0.01600766, 0.01720572, 0.02355742]), 'test_score': array([1.06204206, 1.0429052 , 1.18034623, 1.11082086, 1.20043084])}
Sem padronização: 1.1193090385892301


In [37]:
from sklearn.model_selection import GridSearchCV

# Candidate neighbourhood sizes explored by the grid search.
parametros = {'n_neighbors': [3,5,7]}

# Use the built-in negated-RMSE scorer (the same string the pipeline cells
# further down pass to GridSearchCV) instead of
# make_scorer(mean_squared_error, greater_is_better=False, squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
# Scores are still negative RMSE, so the candidate ranking is unchanged.
modelo = GridSearchCV(KNeighborsRegressor(), parametros, scoring='neg_root_mean_squared_error')
modelo.fit(X, y)

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [3, 5, 7]},
             scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False))

In [20]:
# Inspect the per-candidate, per-split timings and scores recorded by the fitted grid search.
modelo.cv_results_

{'mean_fit_time': array([0.01793947, 0.01431694, 0.01421213]),
 'std_fit_time': array([0.00250932, 0.00031473, 0.00017878]),
 'mean_score_time': array([0.02062955, 0.0168407 , 0.01875701]),
 'std_score_time': array([0.00372312, 0.00101215, 0.00095394]),
 'param_n_neighbors': masked_array(data=[3, 5, 7],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 3}, {'n_neighbors': 5}, {'n_neighbors': 7}],
 'split0_test_score': array([-1.09610429, -1.06204206, -1.05085651]),
 'split1_test_score': array([-1.08422426, -1.0429052 , -1.02501342]),
 'split2_test_score': array([-1.19488801, -1.18034623, -1.18530073]),
 'split3_test_score': array([-1.14627172, -1.11082086, -1.09626923]),
 'split4_test_score': array([-1.22886959, -1.20043084, -1.1786578 ]),
 'mean_test_score': array([-1.15007158, -1.11930904, -1.10721954]),
 'std_test_score': array([0.05564246, 0.0624426 , 0.06519915]),
 'rank_test_score': array([3, 2, 1], dtype=int

In [38]:
# Show the estimator configured with the best-ranked parameters of the search.
modelo.best_estimator_

KNeighborsRegressor(n_neighbors=7)

In [39]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Predict on the very same X that `modelo` was fitted on in the grid-search
# cell, so the value below is an in-sample (training) RMSE, not a
# generalization estimate.
ypred = modelo.predict(X)
mse = mean_squared_error(y_true=y, y_pred=ypred)
# Square root of the MSE gives the RMSE in the target's units.
sqrt(mse)

0.8920520764852705

In [40]:
# Cross-validate the grid search itself: `modelo` is refitted on each training
# split, and return_estimator=True keeps the fitted searches so the selected
# parameters can be inspected per fold afterwards.
# RMSE is written out as sqrt(MSE) because the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
scores = cross_validate(
    modelo, X, y,
    scoring=make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))),
    return_estimator=True,
)
print(scores['test_score'])
# Average RMSE across the outer folds (features still unscaled).
sempad = np.mean(scores['test_score'])
print(f"Sem padronização: {sempad}")

[1.05085651 1.02501342 1.18530073 1.09626923 1.1786578 ]
Sem padronização: 1.1072195374897678


In [41]:
# Display the full cross_validate result: timings, per-fold scores and the fitted estimators.
scores

{'fit_time': array([0.39500642, 0.39957285, 0.37156892, 0.37499475, 0.38206172]),
 'score_time': array([0.02429104, 0.01844668, 0.0185101 , 0.01926804, 0.020262  ]),
 'estimator': [GridSearchCV(estimator=KNeighborsRegressor(),
               param_grid={'n_neighbors': [3, 5, 7]},
               scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False)),
  GridSearchCV(estimator=KNeighborsRegressor(),
               param_grid={'n_neighbors': [3, 5, 7]},
               scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False)),
  GridSearchCV(estimator=KNeighborsRegressor(),
               param_grid={'n_neighbors': [3, 5, 7]},
               scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False)),
  GridSearchCV(estimator=KNeighborsRegressor(),
               param_grid={'n_neighbors': [3, 5, 7]},
               scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False)),
  GridSearchCV(estimator=

In [42]:
# One fitted search per outer fold; print the configuration each one selected.
fitted_searches = scores['estimator']
for estimator in fitted_searches:
    print(estimator.best_estimator_)

KNeighborsRegressor(n_neighbors=7)
KNeighborsRegressor(n_neighbors=7)
KNeighborsRegressor(n_neighbors=7)
KNeighborsRegressor(n_neighbors=7)
KNeighborsRegressor(n_neighbors=7)


Pipeline + GridSearch

In [46]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Candidate neighbourhood sizes for the search nested inside the pipeline.
parametros = {'n_neighbors': [7 ,9 ,11 ,13]}

GridSearchKNN = GridSearchCV(KNeighborsRegressor(), parametros, scoring='neg_root_mean_squared_error')

# Standardize the features, then hand the scaled data to the grid search.
# NOTE(review): with this ordering the scaler is fitted on the whole training
# split before the grid search does its own internal splitting, so the inner
# validation folds contribute to the scaling statistics. Searching over a
# scaler+KNN pipeline avoids that; confirm which variant is intended.
modelo = Pipeline([
    ("padronização", StandardScaler()),
    ("gsknn", GridSearchKNN)
])
# RMSE is written out as sqrt(MSE) because the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
scores = cross_validate(
    modelo, X, y,
    scoring=make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))),
)
print(scores['test_score'])
# Average RMSE across folds with standardized features.
compad = np.mean(scores['test_score'])
print(f"Com padronização: {compad}")

[0.70619214 0.75376284 0.76648122 0.75602218 0.76324968]
Com padronização: 0.7491416118627162


In [45]:
# Scaler and regressor bundled together so the grid search refits both on
# every one of its internal training folds.
pipeline = Pipeline([
    ("padronização", StandardScaler()),
    ("knn", KNeighborsRegressor())
])

# The `knn__` prefix routes the parameter to the pipeline step named "knn".
parametros = {'knn__n_neighbors': [7,9,11, 13]}

modelo = GridSearchCV(pipeline, parametros, scoring='neg_root_mean_squared_error')

# Outer cross-validation of the whole search. RMSE is written out as sqrt(MSE)
# because the `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
scores = cross_validate(
    modelo, X, y,
    scoring=make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))),
)
print(scores['test_score'])
# Average RMSE across folds with standardized features.
compad = np.mean(scores['test_score'])
print(f"Com padronização: {compad}")

[0.70619214 0.75376284 0.76648122 0.75602218 0.76324968]
Com padronização: 0.7491416118627162
