# 超参数的寻找

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load the digits dataset and separate the feature matrix from the labels.
mydata = datasets.load_digits()
X, y = mydata.data, mydata.target

# Split the dataset into train data and test data (20% held out for testing;
# random_state is pinned so the split is the same on every run).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)

# Train a model with KNeighborsClassifier from sklearn.neighbors (k = 3)
# and predict the labels of the test samples.
sklKnnClf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_predict = sklKnnClf.predict(X_test)

# Measure the accuracy directly with the classifier's own score method.
sklAccuracy = sklKnnClf.score(X_test, y_test)
sklAccuracy

0.9888888888888889

# 1.寻找最优的超参数k

In [2]:
# Try every n_neighbors in 1..10 and remember the one with the highest accuracy.
# NOTE: each candidate is scored on X_test/y_test, i.e. the same split whose
# score is printed below, so best_score is an optimistic estimate.
bestk = 0
best_score = 0
for k in np.arange(1, 11):
    sklKnnClf = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    sklAccuracy = sklKnnClf.score(X_test, y_test)
    # Strict comparison: on a tie the earlier (smaller) k is kept.
    if sklAccuracy > best_score:
        bestk, best_score = k, sklAccuracy
print("bestk=", bestk)
print("best_score=", best_score)
# If the final bestk lies close to the edge of the search range, widen the
# range to look for a higher accuracy.
# e.g. if bestk = 10, search again over 7..16 to see whether a better k exists.

bestk= 4
best_score= 0.9916666666666667


# 2.寻找最优的超参数weights,即是否考虑距离权重 

In [3]:
# Joint search over the neighbour weighting scheme and n_neighbors (1..10).
# 'uniform' ignores distance; 'distance' weights each neighbour by distance.
# NOTE: candidates are scored on X_test/y_test, the same split whose score is
# printed below, so best_score is an optimistic estimate.
myweights = ['uniform', 'distance']
best_weight = ''
bestk = 0
best_score = 0
for myweight in myweights:
    for k in np.arange(1, 11):
        sklKnnClf = KNeighborsClassifier(
            n_neighbors=k, weights=myweight
        ).fit(X_train, y_train)
        sklAccuracy = sklKnnClf.score(X_test, y_test)
        # Strict comparison: on a tie the first combination tried is kept.
        if sklAccuracy > best_score:
            best_weight, bestk, best_score = myweight, k, sklAccuracy
print("best_weight=", best_weight)
print("bestk=", bestk)
print("best_score=", best_score)

best_weight= uniform
bestk= 4
best_score= 0.9916666666666667


# 3.寻找最优的超参数p(即闵可夫斯基距离的阶数：p=1 为曼哈顿距离，p=2 为欧氏距离，其他 p 为一般的闵可夫斯基距离)

In [4]:
# Note: while searching for the best p, weights is set to 'distance' here
# (could weights be 'uniform' instead? why?)
# NOTE: candidates are scored on X_test/y_test, the same split whose score is
# printed below, so best_score is an optimistic estimate.
bestp = 0
bestk = 0
best_score = 0

# Joint search: n_neighbors in 1..10 (outer loop) and p in 1..5 (inner loop).
for k in np.arange(1, 11):
    for p in np.arange(1, 6):
        sklKnnClf = KNeighborsClassifier(
            n_neighbors=k, weights='distance', p=p
        ).fit(X_train, y_train)
        sklAccuracy = sklKnnClf.score(X_test, y_test)
        # Strict comparison: on a tie the first combination tried is kept.
        if sklAccuracy > best_score:
            bestp, bestk, best_score = p, k, sklAccuracy
print('bestp=', bestp)
print("bestk=", bestk)
print("best_score=", best_score)

bestp= 2
bestk= 3
best_score= 0.9888888888888889
