# k近邻算法教程

### 0. 添加依赖

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris # 用来加载iris数据
from sklearn.model_selection import train_test_split # 切分训练集和测试集
from sklearn.metrics import accuracy_score # 计算分类数据的预测准确度

### 1. 数据加载和预处理

In [2]:
# Load the iris dataset bundle; later cells read its .data, .target,
# .feature_names and .target_names attributes.
iris = load_iris()

In [4]:
# Put the feature columns into a DataFrame, one column per feature name
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
print(iris.target_names)
# Add the class label as text. Indexing target_names with the integer codes
# works for any number of classes, instead of a hard-coded {0, 1, 2} mapping.
df['class'] = np.asarray(iris.target_names)[iris.target]
print(df)

['setosa' 'versicolor' 'virginica']
     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                  5.1               3.5                1.4               0.2   
1                  4.9               3.0                1.4               0.2   
2                  4.7               3.2                1.3               0.2   
3                  4.6               3.1                1.5               0.2   
4                  5.0               3.6                1.4               0.2   
..                 ...               ...                ...               ...   
145                6.7               3.0                5.2               2.3   
146                6.3               2.5                5.0               1.9   
147                6.5               3.0                5.2               2.0   
148                6.2               3.4                5.4               2.3   
149                5.9               3.0                5.1              

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of df
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [8]:
# Feature matrix, plus the integer labels reshaped into a single column
x, y = iris.data, iris.target.reshape(-1, 1)
print(x)
print(y)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [9]:
# Split into training and test sets: 30% held out for testing, stratified by y,
# with a fixed random_state so the split is the same on every run
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(105, 4) (105, 1)
(45, 4) (45, 1)


### 2. 算法实现

In [11]:
# Distance functions
# L1 (Manhattan) distance: the sum of absolute coordinate differences.
# No square root is involved.
def l1_dist(a, b):
    """Return the L1 (Manhattan) distance between ``a`` and ``b``.

    Both arguments are converted with ``np.asarray`` and subtracted using
    NumPy broadcasting; the absolute differences are then summed over the
    last axis. A 2-D ``a`` (one sample per row) against a 1-D ``b`` yields
    one distance per row, and two 1-D vectors yield a single scalar.
    """
    diff = np.asarray(a) - np.asarray(b)
    # axis=-1 (rather than axis=1) so plain 1-D vectors are accepted as well
    return np.sum(np.abs(diff), axis=-1)

# L2 (Euclidean) distance
# a and b may be single vectors, or a matrix of row vectors against one vector
def l2_dist(a, b):
    """Return the L2 (Euclidean) distance between ``a`` and ``b``.

    Both arguments are converted with ``np.asarray`` and subtracted using
    NumPy broadcasting; the squared differences are summed over the last
    axis and the square root of that sum is returned. A 2-D ``a`` (one sample
    per row) against a 1-D ``b`` yields one distance per row, and two 1-D
    vectors yield a single scalar.
    """
    diff = np.asarray(a) - np.asarray(b)
    # axis=-1 (rather than axis=1) so plain 1-D vectors are accepted as well
    return np.sqrt(np.sum(diff ** 2, axis=-1))

In [12]:
# Classifier implementation

class kNN(object):
    """Brute-force k-nearest-neighbours classifier.

    Attributes:
        k_neighbors: number of nearest training samples that vote (default 1).
        disc_func: distance function called as ``disc_func(x_train, sample)``;
            it must return one distance per training row (default ``l1_dist``).
    """

    def __init__(self, k_neighbors = 1, disc_func = l1_dist):
        self.k_neighbors = k_neighbors
        self.disc_func = disc_func

    def fit(self, x, y):
        """Store the training samples ``x`` and their labels ``y``.

        Both are converted to NumPy arrays so that ``predict`` can index them
        by position, whatever container the caller passed in.
        """
        self.x_train = np.asarray(x)
        self.y_train = np.asarray(y)

    def predict(self, test):
        """Predict a label for every row of ``test``.

        Returns an array of shape ``(len(test), 1)`` with the dtype of the
        stored training labels.

        Raises:
            ValueError: if ``k_neighbors`` is smaller than 1.
        """
        k = self.k_neighbors
        if k < 1:
            # A zero k leaves no votes, and a negative k would silently slice
            # off the farthest neighbours instead of selecting the nearest.
            raise ValueError("k_neighbors must be >= 1, got {!r}".format(k))

        test = np.asarray(test)
        # One predicted label per test row, kept as a column vector
        y_pred = np.zeros((test.shape[0], 1), dtype = self.y_train.dtype)

        for i, x_test in enumerate(test):
            # Distance from this test sample to every training sample
            distances = self.disc_func(self.x_train, x_test)

            # Training indices ordered from nearest to farthest
            nn_index = np.argsort(distances)

            # Labels of the k nearest training samples
            nn_labels = self.y_train[nn_index[:k]].ravel()

            # Majority vote. np.unique returns sorted labels, so a tie goes to
            # the smallest label, and any label dtype works (not only
            # non-negative integers).
            labels, counts = np.unique(nn_labels, return_counts=True)
            y_pred[i] = labels[np.argmax(counts)]

        return y_pred

In [13]:
# Slicing refresher: `[:3]` keeps the first three items of a sequence
a = list(range(1, 6))
a[:3]

[1, 2, 3]

### 3. 测试

In [15]:
# Fit a classifier with k = 9 (default distance function) and score it on the test split
knn = kNN(k_neighbors=9)
knn.fit(x_train, y_train)

y_pred = knn.predict(x_test)

print(f"分类准确率: {accuracy_score(y_test, y_pred) * 100:.5f}%")

分类准确率: 93.33333%


In [16]:
knn = kNN()
knn.fit(x_train, y_train)

# Sweep both distance functions (L1 norm, then L2 norm) and every odd k
# in 1, 3, 5, ..., 19, recording the test accuracy in percent for each pair.
result_list = []
for dist_name, dist_func in (('l1_dist', l1_dist), ('l2_dist', l2_dist)):
    knn.disc_func = dist_func

    for k in range(1, 20, 2):
        knn.k_neighbors = k
        y_pred = knn.predict(x_test)
        acc = accuracy_score(y_test, y_pred) * 100
        result_list.append([k, dist_name, acc])

# NOTE: this rebinds `df`, which earlier held the iris DataFrame.
df = pd.DataFrame(result_list, columns=['k', '距离函数', '分类准确率'])
df

Unnamed: 0,k,距离函数,分类准确率
0,1,l1_dist,91.111111
1,3,l1_dist,93.333333
2,5,l1_dist,93.333333
3,7,l1_dist,93.333333
4,9,l1_dist,93.333333
5,11,l1_dist,95.555556
6,13,l1_dist,95.555556
7,15,l1_dist,95.555556
8,17,l1_dist,93.333333
9,19,l1_dist,95.555556
