## 超参数

In [1]:
import numpy as np
from sklearn import datasets

In [2]:
digits = datasets.load_digits()
X = digits.data
y = digits.target

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 666)

In [7]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors = 3)
knn_clf.fit(X_train, y_train) # fit作用于训练集
knn_clf.score(X_test, y_test) # score作用于测试集

0.98888888888888893

超参数和模型参数
寻找好的超参数

- 领域知识：比如特定医学领域啦
- 经验数值：库的默认数值
- 实验搜索：默认数值不好时就自己找，因为具体问题

不要总觉得自己没有，你要相信自己得到过，得没得到不就是记忆的区别吗，只要记忆足够合理，没有得到也是得到

## 寻找最好的k
这门课程的目的是为了学习scikit-learn的封装原理

In [9]:
for k in range(1, 11):
    print(k)

1
2
3
4
5
6
7
8
9
10


In [12]:
best_score = 0.0
best_k = -1
for k in range(1, 11):
    # 因为超参数是模型开始前送入的，所以每次都要实例化类
    knn_clf = KNeighborsClassifier(n_neighbors = k)
    knn_clf.fit(X_train, y_train)
    score = knn_clf.score(X_test, y_test)
    if score > best_score:
        best_score = score
        best_k = k
print(best_k)
print("best_k =", best_k)
print("best_score =", best_score)

4
best_k = 4
best_score = 0.991666666667


## KNN中有很多超参数，而不是只有n_neighbors，因为没考虑一些因素

![](https://i.loli.net/2018/02/02/5a746a712a3cb.png)

## weights底层实现这里就不做了
## 考虑距离？不考虑距离？也就是weights啦

In [14]:
# 每多一个超参数就加到最前面
best_method = ""
best_score = 0.0
best_k = -1
for method in ["uniform", "distance"]:
    for k in range(1, 11):
        # 因为超参数是模型开始前送入的，所以每次都要实例化类
        knn_clf = KNeighborsClassifier(n_neighbors = k, weights = method)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        if score > best_score: # 准确度
            best_k = k
            best_score = score
            # best_k = k
            best_method = method
print(best_k)
print("best_k =", best_k)
print("best_score =", best_score)
print("best_method =", method)

4
best_k = 4
best_score = 0.991666666667
best_method = distance


## 更多的关于距离的定义

## 欧拉距离

## 曼哈顿距离
![](https://i.loli.net/2018/02/02/5a746ca6b8ca9.png)

## 距离
![](https://i.loli.net/2018/02/02/5a746cf6b1995.png)

## 数学一致性
![](https://i.loli.net/2018/02/02/5a746d2d7e8b6.png)
![](https://i.loli.net/2018/02/02/5a746d56b425d.png)

## 搜索明可夫斯基距离相应的p

In [16]:
%%time

# 每多一个超参数就加到最前面
# best_method = ""
best_p = -1
best_score = 0.0
best_k = -1
# for method in ["uniform", "distance"]:
for p in range(1, 6):
    for k in range(1, 11):
        # 因为超参数是模型开始前送入的，所以每次都要实例化类
        knn_clf = KNeighborsClassifier(n_neighbors = k, weights = "distance", p = p)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        if score > best_score: # 准确度
            best_k = k
            best_score = score
            # best_k = k
            best_method = method
print(best_k)
print("best_k =", best_k)
print("best_score =", best_score)
# print("best_method =", method)
print("best_p =", best_p)

5
best_k = 5
best_score = 0.988888888889
best_p = -1
Wall time: 42.1 s
