In [1]:
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
diabetes.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename'])

In [2]:
X, y = diabetes.data, diabetes.target
X.shape, y.shape

((442, 10), (442,))

In [3]:
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsRegressor
import numpy as np

modelo = KNeighborsRegressor()
scores = cross_validate(modelo, X, y, scoring=make_scorer(mean_squared_error, squared=False))
print(scores['test_score'])
sempad = np.mean(scores['test_score'])
print(f"Sem padronização: {sempad}")

[56.2778101  63.42066441 59.63695849 59.59086644 61.70551029]
Sem padronização: 60.126361947859564


In [4]:
from sklearn.model_selection import GridSearchCV

parametros = {'n_neighbors': [3,5,7]}

modelo = GridSearchCV(KNeighborsRegressor(), parametros, scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False))
modelo.fit(X, y)

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [3, 5, 7]},
             scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False))

In [5]:
modelo.cv_results_

{'mean_fit_time': array([0.0005321 , 0.00041428, 0.00037537]),
 'std_fit_time': array([2.46477602e-04, 3.82528029e-05, 1.78273337e-05]),
 'mean_score_time': array([0.00092211, 0.00101962, 0.00091867]),
 'std_score_time': array([6.07119886e-05, 1.21127207e-04, 5.68251865e-05]),
 'param_n_neighbors': masked_array(data=[3, 5, 7],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 3}, {'n_neighbors': 5}, {'n_neighbors': 7}],
 'split0_test_score': array([-58.07341774, -56.2778101 , -56.99180879]),
 'split1_test_score': array([-67.43002106, -63.42066441, -62.93518794]),
 'split2_test_score': array([-62.28861222, -59.63695849, -59.57483095]),
 'split3_test_score': array([-62.02315061, -59.59086644, -55.63774296]),
 'split4_test_score': array([-67.25015489, -61.70551029, -60.43146504]),
 'mean_test_score': array([-63.41307131, -60.12636195, -59.11420714]),
 'std_test_score': array([3.53743647, 2.3951518 , 2.5743976 ]),
 'ra

In [6]:
modelo.best_estimator_

KNeighborsRegressor(n_neighbors=7)

In [7]:
from sklearn.metrics import mean_squared_error
from math import sqrt

ypred = modelo.predict(X)
mse = mean_squared_error(y, ypred)
sqrt(mse)

51.15149901888715

In [8]:
scores = cross_validate(modelo, X, y, scoring=make_scorer(mean_squared_error, squared=False), return_estimator=True)
print(scores['test_score'])
sempad = np.mean(scores['test_score'])
print(f"Sem padronização: {sempad}")

[56.99180879 62.93518794 59.57483095 55.63774296 60.43146504]
Sem padronização: 59.114207135103996


In [9]:
scores

{'fit_time': array([0.02166939, 0.02064991, 0.01992059, 0.01957273, 0.01936269]),
 'score_time': array([0.00091386, 0.00120926, 0.00092387, 0.0008595 , 0.00086474]),
 'estimator': (GridSearchCV(estimator=KNeighborsRegressor(),
               param_grid={'n_neighbors': [3, 5, 7]},
               scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False)),
  GridSearchCV(estimator=KNeighborsRegressor(),
               param_grid={'n_neighbors': [3, 5, 7]},
               scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False)),
  GridSearchCV(estimator=KNeighborsRegressor(),
               param_grid={'n_neighbors': [3, 5, 7]},
               scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False)),
  GridSearchCV(estimator=KNeighborsRegressor(),
               param_grid={'n_neighbors': [3, 5, 7]},
               scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False)),
  GridSearchCV(estimator=

In [10]:
for estimator in scores['estimator']:
    print(estimator.best_estimator_)

KNeighborsRegressor(n_neighbors=7)
KNeighborsRegressor(n_neighbors=7)
KNeighborsRegressor(n_neighbors=7)
KNeighborsRegressor(n_neighbors=7)
KNeighborsRegressor(n_neighbors=7)


# Pipeline + GridSearch

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

parametros = {'n_neighbors': [3,5,7]}

GridSearchKNN = GridSearchCV(KNeighborsRegressor(), parametros, scoring='neg_root_mean_squared_error')

modelo = Pipeline([
    ("padronização", StandardScaler()),
    ("gsknn", GridSearchKNN)
])
scores = cross_validate(modelo, X, y, scoring=make_scorer(mean_squared_error, squared=False))
print(scores['test_score'])
compad = np.mean(scores['test_score'])
print(f"Com padronização: {compad}")

[56.82092904 63.05450375 60.15515846 54.92728369 59.33477082]
Com padronização: 58.858529150715086


In [12]:
pipeline = Pipeline([
    ("padronização", StandardScaler()),
    ("knn", KNeighborsRegressor())
])

parametros = {'knn__n_neighbors': [3,5,7]}

modelo = GridSearchCV(pipeline, parametros, scoring='neg_root_mean_squared_error')

scores = cross_validate(modelo, X, y, scoring=make_scorer(mean_squared_error, squared=False))
print(scores['test_score'])
compad = np.mean(scores['test_score'])
print(f"Com padronização: {compad}")

[56.82092904 63.05450375 60.15515846 54.92728369 59.33477082]
Com padronização: 58.858529150715086
