In [1]:
import os
import numpy as np
from sklearn.neighbors import KNeighborsClassifier as KNC

from IPython.core.interactiveshell import InteractiveShell
# Show the value of every bare expression in a cell, not only the last one.
# The evaluation cells further down rely on this to display several result
# tuples from a single cell.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Data set locations (relative paths, resolved against the current working directory)
test_path = 'dataset/testDigits'
train_path = 'dataset/trainingDigits'

In [16]:
# Load and preprocess the data sets
def load_digit_dir(dir_path):
    """Read every sample file in ``dir_path`` into feature vectors and labels.

    Each file's content is stripped of whitespace and every remaining
    character is converted to an int, giving one 1-D feature vector per file.
    The label is the integer before the first ``_`` in the file name.

    Parameters
    ----------
    dir_path : str
        Directory containing the sample files.

    Returns
    -------
    file_names : list of str
        The file names that were loaded, in sorted order.
    features : list of np.ndarray
        One 1-D int array per file, in the same order as ``file_names``.
    labels : np.ndarray
        Integer label of each file, in the same order as ``file_names``.

    Raises
    ------
    ValueError
        If a file name does not start with an integer label or a file
        contains a non-digit, non-whitespace character.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps sample
    # indices (and KNN tie-breaking, which depends on training order)
    # reproducible across platforms. Hidden files such as .DS_Store are skipped.
    file_names = sorted(
        name for name in os.listdir(dir_path) if not name.startswith('.')
    )
    features = []
    labels = []
    for name in file_names:
        with open(os.path.join(dir_path, name), 'r') as fp:
            text = fp.read()
        # Drop newlines and any other whitespace before splitting into digits
        digits = ''.join(text.split())
        features.append(np.array(list(digits)).astype(int))
        labels.append(int(name.split('_')[0]))
    return file_names, features, np.array(labels, dtype=int)


test_files, test_x, test_y = load_digit_dir(test_path)
train_files, train_x, train_y = load_digit_dir(train_path)

In [17]:
# Core building blocks of the KNN algorithm
def get_distance(X, Y, method='euclidean'):
    """Return the distance between two vectors; smaller always means closer.

    Parameters
    ----------
    X, Y : array-like
        1-D numeric vectors of equal length.
    method : str
        Which distance to compute:
        'euclidean' - L2 norm of ``X - Y``;
        'cosine'    - cosine distance, ``1 - cos(X, Y)``, in [0, 2];
        'manhattan' - sum of absolute differences;
        'chebyshev' - largest absolute difference.

    Returns
    -------
    numpy scalar (or Python float for the zero-vector cosine case)

    Raises
    ------
    ValueError
        If ``method`` is not one of the names above.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    if method == 'euclidean':
        return np.linalg.norm(X - Y)
    elif method == 'cosine':
        norm_product = np.linalg.norm(X) * np.linalg.norm(Y)
        if norm_product == 0:
            # A zero vector has no direction; treat its similarity to anything
            # as 0 instead of producing NaN from 0/0.
            return 1.0
        # Return a distance, not the raw similarity: callers pick the
        # *smallest* values as nearest neighbours, so a similarity (where
        # larger means closer) would select the most dissimilar samples.
        return 1.0 - np.dot(X, Y) / norm_product
    elif method == 'manhattan':
        return np.sum(np.abs(X - Y))
    elif method == 'chebyshev':
        return np.abs(X - Y).max()
    else:
        raise ValueError('没有此距离参数')


def knn_algorithm(test_X, train_X, train_Y, distance='euclidean', k: int=3):
    """Classify each test sample by majority vote among its k nearest training samples.

    Parameters
    ----------
    test_X : sequence of 1-D arrays
        Samples to classify.
    train_X : sequence of 1-D arrays
        Training samples.
    train_Y : array-like of int
        Label of each training sample. Labels must be non-negative integers
        because the vote is counted with ``np.bincount``.
    distance : str
        Passed to ``get_distance`` as ``method``; the returned value is
        treated as "smaller means closer".
    k : int
        Number of nearest training samples that vote, ``1 <= k <= len(train_X)``.

    Returns
    -------
    list
        Predicted label for each sample in ``test_X``, in order. When several
        labels tie for the most votes, the smallest label wins (``np.argmax``
        returns the first maximum).

    Raises
    ------
    ValueError
        If ``k`` is out of range or ``train_X`` and ``train_Y`` differ in length.
    """
    train_Y = np.asarray(train_Y)  # allow plain lists; fancy indexing needs an array
    n_train = len(train_X)
    if len(train_Y) != n_train:
        raise ValueError('train_X and train_Y must have the same length')
    if not 1 <= k <= n_train:
        raise ValueError('k must satisfy 1 <= k <= number of training samples')

    test_Y = []  # predicted labels, one per test sample
    for tX in test_X:
        # Distance from the current test sample to every training sample
        dists = np.array([get_distance(tX, rX, method=distance) for rX in train_X])
        if k < n_train:
            # Partitioning around position k puts the k smallest distances
            # (in no particular order) in the first k slots.
            idx = np.argpartition(dists, k)[:k]
        else:
            # k == n_train: position k would be out of bounds for
            # argpartition, and every training sample votes anyway.
            idx = np.arange(n_train)
        votes = np.bincount(train_Y[idx])  # votes[label] = number of neighbours with that label
        test_Y.append(np.argmax(votes))
    return test_Y

In [18]:
# Run the hand-written KNN on the test set to predict its labels
# (uses the function's defaults: distance='euclidean', k=3)
predict_y1 = np.array(knn_algorithm(test_x, train_x, train_y))

In [19]:
# Evaluate the hand-written KNN: accuracy plus a breakdown of the mistakes
accuracy1 = np.equal(predict_y1, test_y).mean()  # fraction of correct predictions
'准确率', accuracy1
err_label_idx1 = np.flatnonzero(np.not_equal(predict_y1, test_y))  # positions of the misclassified samples
err_label_num1 = err_label_idx1.size  # number of misclassified samples
'错误分类个数', err_label_num1
err_pred_label1 = predict_y1[err_label_idx1]  # what was predicted for them
err_label1 = test_y[err_label_idx1]  # their true labels
'正确标签', err_label1, '预测的标签', err_pred_label1

('准确率', 0.9883720930232558)

('错误分类个数', 11)

('正确标签',
 array([1, 3, 5, 5, 8, 8, 8, 8, 9, 9, 9]),
 '预测的标签',
 array([7, 9, 3, 6, 6, 3, 1, 1, 1, 7, 5], dtype=int64))

In [20]:
# Build the equivalent classifier with sklearn's KNN implementation for comparison
clf = KNC(n_neighbors=3)  # same k as the hand-written version (k=3)
clf.fit(train_x, train_y)  # fit on the training set; the returned estimator is displayed as cell output

KNeighborsClassifier(n_neighbors=3)

In [21]:
# Predict the test-set labels with the sklearn classifier
predict_y2 = clf.predict(test_x)

In [22]:
# Evaluate the sklearn classifier: mean accuracy plus a breakdown of the mistakes
accuracy2 = clf.score(test_x, test_y)
'准确率', accuracy2
err_label_idx2 = np.flatnonzero(np.not_equal(predict_y2, test_y))  # positions of the misclassified samples
err_label_num2 = err_label_idx2.size  # number of misclassified samples
'错误分类个数', err_label_num2
err_pred_label2 = predict_y2[err_label_idx2]  # what was predicted for them
err_label2 = test_y[err_label_idx2]  # their true labels
'正确标签', err_label2, '预测的标签', err_pred_label2

('准确率', 0.9873150105708245)

('错误分类个数', 12)

('正确标签',
 array([1, 3, 5, 5, 8, 8, 8, 8, 8, 9, 9, 9]),
 '预测的标签',
 array([7, 9, 3, 6, 6, 3, 1, 1, 1, 1, 7, 5]))

## 总结对比
|            | knn_algorithm | KNeighborsClassifier |
| ---------- | ------------- | -------------------- |
| 准确率（%） | 98.84         | 98.73                |
| 误分类个数 | 11            | 12                   |