# k近邻算法教程

### 0. 添加依赖

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris # 用来加载iris数据
from sklearn.model_selection import train_test_split # 切分训练集和测试集
from sklearn.metrics import accuracy_score # 计算分类数据的预测准确度

### 1. 数据加载和预处理

In [3]:
# Load the iris dataset. The cells below read its .data, .target,
# .feature_names and .target_names attributes.
iris = load_iris()

In [5]:
# One row per sample, one column per measured feature.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Add the class label as a readable name. The code -> name mapping is built
# from iris.target_names (position i names class code i) rather than
# hard-coding the indices 0, 1 and 2, and the column is assigned once instead
# of being written as codes and then overwritten with names.
df['class'] = pd.Series(iris.target, index=df.index).map(dict(enumerate(iris.target_names)))

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the four numeric feature columns.
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [7]:
# Feature matrix, and the integer class codes as a column vector: indexing
# with np.newaxis appends a length-1 axis, so y has one row per sample.
x = iris.data
y = iris.target[:, np.newaxis]
print(x.shape, y.shape)

(150, 4) (150, 1)


In [9]:
# Split into training and test sets: 30% of the samples are held out,
# the split is stratified on the labels, and the random seed is fixed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Shapes of the training split on the first line, test split on the second.
print("{} {}\n{} {}".format(x_train.shape, y_train.shape, x_test.shape, y_test.shape))

(105, 4) (105, 1)
(45, 4) (45, 1)


### 2. 算法实现

In [11]:
# 距离函数
def l1_dist(a, b):
    """Return the L1 (Manhattan) distance between ``a`` and ``b`` for each row.

    ``a - b`` is formed with normal NumPy broadcasting, and the absolute
    differences are summed along axis 1, giving one distance per row.
    """
    abs_diff = np.abs(a - b)
    return abs_diff.sum(axis=1)

def l2_dist(a, b):
    """Return the L2 (Euclidean) distance between ``a`` and ``b`` for each row.

    ``a - b`` is formed with normal NumPy broadcasting; the squared
    differences are summed along axis 1 and the square root of each sum is
    returned, giving one distance per row.
    """
    diff = a - b
    squared_sum = np.sum(diff ** 2, axis=1)
    return np.sqrt(squared_sum)

In [21]:
# 分类器实现

class kNN(object):
    """Brute-force k-nearest-neighbours classifier.

    ``fit`` only stores the training data. ``predict`` ranks every training
    sample by its distance to each query row and returns the most frequent
    label among the ``k_neighbors`` closest samples.

    Attributes:
        k_neighbors: Number of nearest neighbours that vote. It is read on
            every ``predict`` call, so it may be reassigned between calls.
        disc_func: Callable invoked as ``disc_func(x_train, query_row)`` that
            must return one distance per training row. Also read on every
            ``predict`` call.
    """

    def __init__(self, k_neighbors = 1, disc_func = l1_dist):
        self.k_neighbors = k_neighbors
        self.disc_func = disc_func

    def fit(self, x, y):
        """Store the training features ``x`` and their labels ``y``.

        Args:
            x: Training samples, one per row.
            y: One label per training sample, either 1-D or a column vector
                (it is flattened when the votes are counted).

        Raises:
            ValueError: If ``x`` and ``y`` do not have the same number of rows.
        """
        # asarray returns ndarrays unchanged and also accepts lists / DataFrames.
        x = np.asarray(x)
        y = np.asarray(y)
        if x.shape[0] != y.shape[0]:
            raise ValueError(
                "x and y must have the same number of samples, got {} and {}".format(
                    x.shape[0], y.shape[0]))
        self.x_train = x
        self.y_train = y

    def predict(self, test):
        """Predict a label for every row of ``test``; ``fit`` must be called first.

        Args:
            test: Query samples, one per row, with the same features as the
                training data.

        Returns:
            Array of shape ``(n_queries, 1)`` with the dtype of the training
            labels. When several labels are equally frequent among the
            neighbours, the smallest label wins.

        Raises:
            ValueError: If ``k_neighbors`` is smaller than 1.
        """
        # A non-positive k would either leave no voters (k == 0) or, through
        # negative slicing, silently vote over the wrong neighbours (k < 0).
        if self.k_neighbors < 1:
            raise ValueError(
                "k_neighbors must be at least 1, got {}".format(self.k_neighbors))

        test = np.asarray(test)
        # Column vector of predictions, same dtype as the training labels.
        y_pred = np.zeros((test.shape[0], 1), dtype=self.y_train.dtype)

        for i, x_test in enumerate(test):
            # Distance from this query row to every training sample.
            distances = self.disc_func(self.x_train, x_test)

            # Training-sample indices ordered from nearest to farthest.
            nn_index = np.argsort(distances)

            # Labels of the k nearest neighbours, flattened to 1-D.
            nn_pred = self.y_train[nn_index[:self.k_neighbors]].ravel()

            # Majority vote. np.unique (unlike np.bincount) is not limited to
            # non-negative integer labels; it returns the labels sorted, and
            # argmax takes the first maximum, so ties go to the smallest label.
            labels, counts = np.unique(nn_pred, return_counts=True)
            y_pred[i] = labels[np.argmax(counts)]

        return y_pred

### 3. 测试

In [30]:
# Build a classifier that votes over the 9 nearest neighbours (the distance
# function is left at the constructor default) and give it the training split.
knn = kNN(k_neighbors = 9)
knn.fit(x_train, y_train)

# Predict a label for every test sample.
y_pred = knn.predict(x_test)

# accuracy_score yields a fraction; scale it by 100 to print a percentage.
print("分类准确率: {:.5f}%".format(accuracy_score(y_test, y_pred)*100))

分类准确率: 93.33333%


In [35]:
# Evaluate every odd k from 1 to 19 under both distance functions, reusing a
# single fitted classifier and only swapping its settings between runs.
knn = kNN()
knn.fit(x_train, y_train)

result_list = []
for dist_name, dist_func in (('l1_dist', l1_dist), ('l2_dist', l2_dist)):
    knn.disc_func = dist_func

    for k in range(1, 20, 2):
        knn.k_neighbors = k
        y_pred = knn.predict(x_test)
        acc = accuracy_score(y_test, y_pred) * 100
        result_list.append([k, dist_name, acc])

# One row per (k, distance function) pair.
# NOTE: this rebinds `df`, replacing the iris DataFrame built earlier.
df = pd.DataFrame(result_list, columns=['k', '距离函数', '分类准确率'])
df

Unnamed: 0,k,距离函数,分类准确率
0,1,l1_dist,91.111111
1,3,l1_dist,93.333333
2,5,l1_dist,93.333333
3,7,l1_dist,93.333333
4,9,l1_dist,93.333333
5,11,l1_dist,95.555556
6,13,l1_dist,95.555556
7,15,l1_dist,95.555556
8,17,l1_dist,93.333333
9,19,l1_dist,95.555556
