In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [2]:
# Load scikit-learn's bundled handwritten-digits dataset, then pull out its
# data array (X) and its target array (y).
digits = datasets.load_digits()
X, y = digits.data, digits.target

In [3]:
# Hold-out split: 40% of the samples are set aside as the test set.
# The fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=666
)

In [4]:
# Manual grid search over two KNN hyperparameters:
#   k - number of neighbours, 2..10
#   p - power parameter of the distance metric, 1..5
# Each candidate is fitted on the training split and ranked by its score on
# the test split; the first candidate that reaches the highest score wins.
# NOTE: the test split drives the selection here, so best_score is an
# optimistic estimate of how well the chosen model generalises.
best_k = best_p = best_score = 0
for k in range(2, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(n_neighbors=k, p=p, weights="distance")
        score = knn_clf.fit(X_train, y_train).score(X_test, y_test)
        if score > best_score:
            best_score = score
            best_k, best_p = k, p

print("Best K =", best_k)
print("Best P =", best_p)
print("Best Score =", best_score)

Best K = 3
Best P = 4
Best Score = 0.9860917941585535


In [5]:
# Grid search over k (2..10) and p (1..5) in which every candidate is ranked
# by its mean 5-fold cross-validation score, computed on the training split
# only. GridSearchCV offers a higher-level wrapper around this kind of loop.
cv_options = dict(
    cv=5,          # number of cross-validation folds
    n_jobs=-1,     # -1: use every available CPU core
    scoring=None,  # None: fall back to the estimator's own score method
                   # (a custom scorer can be built with make_scorer)
)

best_k = best_p = best_score = 0
for k in range(2, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(n_neighbors=k, p=p, weights="distance")
        # cross_val_score partitions the training data into train/validation
        # folds itself and returns one score per fold.
        scores = cross_val_score(knn_clf, X_train, y_train, **cv_options)
        score = scores.mean()
        if score > best_score:
            best_score = score
            best_k, best_p = k, p

print("Best K =", best_k)
print("Best P =", best_p)
print("Best Score =", best_score)

Best K = 2
Best P = 2
Best Score = 0.9851473095532756


In [6]:
# Refit a KNN classifier with the selected hyperparameters on the training
# split, then score it on the test split, which serves as the final measure
# of model performance.
best_knn_clf = KNeighborsClassifier(
    n_neighbors=best_k, p=best_p, weights="distance"
).fit(X_train, y_train)
best_knn_clf.score(X_test, y_test)

0.980528511821975