In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

In [2]:
df = load_diabetes(as_frame=True)['frame']
df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930,220.0


## Aplicando Cross-Validation

Vamos implementar um KNN para resolver o problema de regressão do nosso dataset.

Aplicamos um Cross Validation com 5 Folds e utilizamos uma métrica de otimização adequada ao problema de regressão.

Também é utilizado GridSearch para verificar qual a combinação de hiperparâmetros traz o melhor resultado entre:
- **Número de vizinhos**: [5, 10, 15, 20]
- **Métrica de distância entre vizinhos**: [Uniforme, Distância Euclidiana]

O target, para se apromimar melhor dos valores, foi aplicado $log(x + 1)$ via função `log1p()`. Para reverter podemos usar $e^{x - 1}$ via `expm1()`.

Será separado 80% do conjuntos de dados para treino e 20% para testes.

O conjunto de validação já é avaliado em cada Fold, sendo que escolhemos Fold = 5 para este caso.

Obs: O "neg" usado na métrica se dá ao fato do sklearn padronizar a maximização em seus modelos

In [3]:
X, y = df.iloc[:, :-1], df.iloc[:, -1]
y = np.log1p(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2)

In [4]:
param_grid = {
    'n_neighbors': [5, 10, 15, 20],
    'weights': ['uniform', 'distance']
}

knn_regressor = KNeighborsRegressor()
grid_search = GridSearchCV(estimator=knn_regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

scores = cross_val_score(grid_search, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
mse_scores = -scores

print("Cross-validation scores:", ', '.join(mse_scores.round(4).astype(str)))
print("Mean MSE:", mse_scores.mean().round(4))
print("Standard Deviation of MSE:", mse_scores.std().round(4))

Cross-validation scores: 0.1776, 0.1865, 0.1818, 0.2071, 0.2047
Mean MSE: 0.1916
Standard Deviation of MSE: 0.0121


## Verificando melhor Parâmetros e Resultado no Teste

É visto que os melhores parâmetros são 15 vizinhos e a métrica utilizada é distância euclidiana.

Por fim visualizamos valores reais, valores previstos pelo modelo e o erro na regressão via KNN.

In [5]:
gs = grid_search.fit(X_train, y_train)
print("Best Params:", gs.best_params_)

y_pred = grid_search.predict(X_test)

df_result = pd.DataFrame(zip(y_test.tolist(), y_pred.tolist()), columns=["Y-True", "Y-Pred"])
print("Negative Mean Squared Error:", -grid_search.best_score_.round(4))
df_result.iloc[:10]

Best Params: {'n_neighbors': 20, 'weights': 'distance'}
Negative Mean Squared Error: 0.1894


Unnamed: 0,Y-True,Y-Pred
0,5.030438,4.553032
1,5.493061,4.917038
2,5.153292,4.804102
3,4.532599,4.45418
4,5.398163,5.514942
5,4.962845,4.709949
6,5.834811,5.440244
7,4.025352,4.422247
8,4.934474,4.332919
9,5.313206,5.062656
