#### Data preparation

In [1]:
import numpy as np
# Seed NumPy's global random number generator so that code drawing from it
# produces the same values on every fresh Restart & Run All.
np.random.seed(seed=42)

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

In [3]:
# Load the breast-cancer dataset shipped with scikit-learn and pull out the
# feature matrix and the target vector.
dataset = load_breast_cancer()
x = dataset.data
y = dataset.target

# Hold out 20% of the samples for the final evaluation.
# random_state is pinned explicitly instead of relying on whatever state the
# global NumPy RNG happens to be in, so re-running this cell on its own (or
# out of order) always reproduces the same split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
print(f"x_train shape: {x_train.shape} x_test.shape: {x_test.shape}")

x_train shape: (455, 30) x_test.shape: (114, 30)


#### Grid Search

In [4]:
from sklearn.model_selection import GridSearchCV

In [5]:
# Hyper-parameter grid for KNN: every combination of neighbourhood size and
# vote weighting below is evaluated (4 x 2 = 8 candidates).
parameters = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
)

# Exhaustive search with 10-fold cross-validation on the training data only;
# the test split is not touched here.
clf = KNeighborsClassifier()
grid_cv = GridSearchCV(estimator=clf, param_grid=parameters, cv=10)
grid_cv.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [3, 5, 7, 9],
                         'weights': ['uniform', 'distance']})

In [6]:
# List the name of every entry stored in the cross-validation results dict.
print("GridSearch Keys:")
for result_key in grid_cv.cv_results_:
    print(f"\t{result_key}")

GridSearch Keys:
	mean_fit_time
	std_fit_time
	mean_score_time
	std_score_time
	param_n_neighbors
	param_weights
	params
	split0_test_score
	split1_test_score
	split2_test_score
	split3_test_score
	split4_test_score
	split5_test_score
	split6_test_score
	split7_test_score
	split8_test_score
	split9_test_score
	mean_test_score
	std_test_score
	rank_test_score


In [7]:
# Show the hyper-parameter combinations recorded in the search results.
candidate_params = grid_cv.cv_results_['params']
print(f"GridSearch Params: {candidate_params}")

GridSearch Params: [{'n_neighbors': 3, 'weights': 'uniform'}, {'n_neighbors': 3, 'weights': 'distance'}, {'n_neighbors': 5, 'weights': 'uniform'}, {'n_neighbors': 5, 'weights': 'distance'}, {'n_neighbors': 7, 'weights': 'uniform'}, {'n_neighbors': 7, 'weights': 'distance'}, {'n_neighbors': 9, 'weights': 'uniform'}, {'n_neighbors': 9, 'weights': 'distance'}]


In [8]:
# Report the winning combination, then one line per candidate with its mean
# cross-validated score and a +/- band of two standard deviations across folds.
print(f"Best parameters set found on development set: {grid_cv.best_params_}\n")

means = grid_cv.cv_results_['mean_test_score']
stds = grid_cv.cv_results_['std_test_score']

for idx, candidate in enumerate(grid_cv.cv_results_['params']):
    spread = 2 * stds[idx]
    print(f"{means[idx]:.3f} (+/-{spread:.3f}) for {candidate}")

Best parameters set found on development set: {'n_neighbors': 3, 'weights': 'distance'}

0.927 (+/-0.095) for {'n_neighbors': 3, 'weights': 'uniform'}
0.932 (+/-0.095) for {'n_neighbors': 3, 'weights': 'distance'}
0.919 (+/-0.095) for {'n_neighbors': 5, 'weights': 'uniform'}
0.918 (+/-0.105) for {'n_neighbors': 5, 'weights': 'distance'}
0.921 (+/-0.101) for {'n_neighbors': 7, 'weights': 'uniform'}
0.923 (+/-0.107) for {'n_neighbors': 7, 'weights': 'distance'}
0.919 (+/-0.099) for {'n_neighbors': 9, 'weights': 'uniform'}
0.923 (+/-0.099) for {'n_neighbors': 9, 'weights': 'distance'}


#### Best found model

In [9]:
# Build the final classifier from the hyper-parameters chosen by the grid
# search instead of hard-coding them, so this cell stays in sync if the
# parameter grid, the data, or the split ever changes.
clf = KNeighborsClassifier(**grid_cv.best_params_)
clf.fit(x_train, y_train)

# Evaluate on the held-out test split, which was not used during the search.
score = clf.score(x_test, y_test)
print(f"Accuracy: {score}")

Accuracy: 0.9385964912280702
