# kNN

## 电影分类


In [None]:
import numpy as np
import operator
from os import listdir


def createDataSet():
    """Build the small hard-coded sample set used by the movie example.

    Returns:
        group - 4x2 integer array, one row per sample
        labels - list of four label strings; labels[i] belongs to group[i]
    """
    # Keeping each sample next to its label makes the pairing explicit.
    samples = [
        ([1, 101], '爱情片'),
        ([5, 89], '爱情片'),
        ([108, 5], '动作片'),
        ([115, 8], '动作片'),
    ]
    group = np.array([features for features, _ in samples])
    labels = [label for _, label in samples]

    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Distances are Euclidean and are computed against every row of ``dataSet``.

    Args:
        inX: query vector with N features; array-like of shape (N,) or (1, N).
        dataSet: known samples, array-like of shape (M, N).
        labels: sequence of M hashable labels; labels[i] belongs to dataSet[i].
        k: number of neighbours that vote; must satisfy 1 <= k <= M.

    Returns:
        The label with the most votes among the k nearest samples. On a tie
        the label met first, walking from the nearest neighbour outwards, wins.

    Raises:
        ValueError: if k is smaller than 1 or larger than the number of samples.
    """
    # asarray lets callers pass plain nested lists as well as ndarrays.
    samples = np.asarray(dataSet)
    sampleCount = samples.shape[0]
    if not 1 <= k <= sampleCount:
        raise ValueError(
            "k must be between 1 and the number of samples (%d), got %r"
            % (sampleCount, k))

    # Broadcasting subtracts the query from every row without copying it
    # sampleCount times.
    diffMat = samples - np.asarray(inX)
    # Element-wise square, summed per row, then square root: Euclidean distance.
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    # Indices of the samples ordered from nearest to farthest.
    nearest = distances.argsort()[:k]

    # Count the votes; dict insertion order records which label was met first.
    classCount = {}
    for sampleIndex in nearest:
        voteIlabel = labels[sampleIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # The strict comparison keeps the first-met label when counts are equal.
    bestLabel, bestVotes = None, 0
    for label, votes in classCount.items():
        if votes > bestVotes:
            bestLabel, bestVotes = label, votes

    return bestLabel


# Build the sample set, then classify the point [0, 0] with its 3 nearest neighbours.
groups, labels = createDataSet()
# The result is not assigned or printed; presumably it is shown as the cell's
# output when this runs in a notebook.
classify0([0, 0], groups, labels, 3)

## 改进约会网站的配对效果


In [None]:
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Every non-blank line holds three numeric feature columns followed by a
    class label in the last column. The label is either a string of digits or
    one of the names 'didntLike' (1), 'smallDoses' (2), 'largeDoses' (3).

    Parameters:
        filename - path of the file to read

    Returns:
        returnMat - float matrix with one row per data line and 3 columns
        classLabelVector - list of integer labels, one per data line; a textual
            label that is not one of the three known names is stored as None
    """
    love_dictionary = {'largeDoses': 3, 'smallDoses': 2, 'didntLike': 1}

    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # Blank lines (for instance a trailing newline at the end of the file)
        # carry no data and are skipped instead of breaking the float parse.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        # The first three columns are the features.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # The last column is the label, given as digits or as a name.
        rawLabel = listFromLine[-1]
        if rawLabel.isdigit():
            classLabelVector.append(int(rawLabel))
        else:
            classLabelVector.append(love_dictionary.get(rawLabel))

    return returnMat, classLabelVector


# Load the dating data set, then print the feature matrix and the first 20 labels.
datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
print(datingDataMat, datingLabels[0:20])

In [None]:
'''
Created on Oct 27, 2010

@author: Peter
'''
from numpy import *
import matplotlib
import matplotlib.pyplot as plt

# Scatter-plot column 1 of the dating data against column 2 on a single axes.
fig = plt.figure()
ax = fig.add_subplot(111)
datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
# ax.scatter(datingDataMat[:,1], datingDataMat[:,2])
# The third and fourth positional arguments of scatter are marker size and
# colour; both are 15 times the class label, so every class gets its own size
# and colour. `array` comes from the `from numpy import *` above.
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2], 15.0 * array(datingLabels), 15.0 * array(datingLabels))
# Fixed view window: x from -2 to 25, y from -0.2 to 2.0.
ax.axis([-2, 25, -0.2, 2.0])
plt.xlabel('Percentage of Time Spent Playing Video Games')
plt.ylabel('Liters of Ice Cream Consumed Per Week')
plt.show()

In [None]:
from matplotlib.font_manager import FontProperties
import matplotlib.lines as mlines


def showdatas(datingDataMat, datingLabels):
    """Draw pairwise scatter plots of the three dating features, coloured by label.

    Three panels of a 2x2 grid are used (the bottom-right one stays empty):
    column 0 against column 1, column 0 against column 2 and column 1 against
    column 2. Label 1 is drawn black, 2 orange and 3 red.

    Parameters:
        datingDataMat - feature matrix; columns 0, 1 and 2 are plotted
        datingLabels - class label of every row (1, 2 or 3)

    Returns:
        None
    """
    # Font used for the Chinese titles and axis labels.
    # NOTE(review): "Arial Unicode MS" has to be installed for this to render
    # as intended - confirm on the target system.
    font = FontProperties("Arial Unicode MS", size=14)
    # 2x2 grid of independent axes on a 13x8 figure.
    fig, axs = plt.subplots(nrows=2, ncols=2, sharex=False, sharey=False, figsize=(13, 8))

    # One colour per point. Labels outside the palette add no colour, exactly
    # like the previous if-chain.
    colorOfLabel = {1: 'black', 2: 'orange', 3: 'red'}
    LabelsColors = [colorOfLabel[label] for label in datingLabels if label in colorOfLabel]

    # Legend entries, shared by all three panels.
    legendHandles = [
        mlines.Line2D([], [], color='black', marker='.', markersize=6, label='didntLike'),
        mlines.Line2D([], [], color='orange', marker='.', markersize=6, label='smallDoses'),
        mlines.Line2D([], [], color='red', marker='.', markersize=6, label='largeDoses'),
    ]

    # Axis captions for feature columns 0, 1 and 2.
    milesText = u'每年获得的飞行常客里程数'
    gamesText = u'玩视频游戏所消耗时间占比'
    iceCreamText = u'每周消费的冰淇淋公升数'
    # (axes, x column, y column, x caption, y caption) for every panel.
    panels = [
        (axs[0][0], 0, 1, milesText, gamesText),
        (axs[0][1], 0, 2, milesText, iceCreamText),
        (axs[1][0], 1, 2, gamesText, iceCreamText),
    ]

    for ax, xCol, yCol, xText, yText in panels:
        # Marker size 15, half transparent.
        ax.scatter(x=datingDataMat[:, xCol], y=datingDataMat[:, yCol], color=LabelsColors, s=15, alpha=.5)
        # The title is "<x caption>与<y caption>".
        titleText = ax.set_title(xText + u'与' + yText, fontproperties=font)
        xLabelText = ax.set_xlabel(xText, fontproperties=font)
        yLabelText = ax.set_ylabel(yText, fontproperties=font)
        plt.setp(titleText, size=9, weight='bold', color='red')
        plt.setp(xLabelText, size=7, weight='bold', color='black')
        plt.setp(yLabelText, size=7, weight='bold', color='black')
        ax.legend(handles=legendHandles)

    plt.show()


# Plot the module-level dating features and labels obtained from file2matrix.
showdatas(datingDataMat, datingLabels)

In [None]:
# 查看系统可用字体
from matplotlib.font_manager import FontManager

# Gather the distinct names of the fonts listed in the manager's ttflist.
fm = FontManager()
mat_fonts = {font.name for font in fm.ttflist}
print(mat_fonts)

In [None]:
def autoNorm(dataSet):
    """Rescale every feature column with min-max normalisation.

    Each value x becomes (x - min) / (max - min), where min and max are taken
    per column, so every column ends up in the range [0, 1].

    Parameters:
        dataSet - feature matrix (array-like), one row per sample

    Returns:
        normDataSet - normalised matrix with the same shape as dataSet
        ranges - per-column divisor (max - min); a column whose values are all
            equal reports 1 here, so the divisor is never zero
        minVals - per-column minimum
    """
    dataSet = np.asarray(dataSet)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has max == min; dividing by that zero would fill the
    # column with NaN. Using 1 as divisor maps such a column to 0 instead.
    ranges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / ranges

    return normDataSet, ranges, minVals

In [None]:
def datingClassTest(filename='datingTestSet.txt', hoRatio=0.10, k=3):
    """Measure the kNN error rate on a hold-out slice of the dating data.

    The file is loaded and normalised; the first ``hoRatio`` share of the rows
    is classified against the remaining rows. Every prediction is printed,
    followed by the error rate and the number of errors.

    Parameters:
        filename - data file passed to file2matrix
        hoRatio - share of the rows (taken from the top) used as test set
        k - number of neighbours passed to classify0

    Returns:
        None

    Raises:
        ValueError - if hoRatio selects no test rows at all
    """
    datingDataMat, datingLabels = file2matrix(filename)
    # Only the normalised matrix is needed here.
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        # Without test rows the error rate below would divide by zero.
        raise ValueError(
            "hoRatio=%r selects no test rows out of %d" % (hoRatio, m))

    # Rows after the hold-out slice form the training set; it is the same for
    # every test row, so it is sliced once.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat, trainingLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)

In [None]:
def classifyPerson():
    """Ask for one person's three features and print the predicted class.

    The three values are read from standard input, normalised with the minimum
    and range of the dating data set, and classified with classify0 using 4
    neighbours. The outcome is printed; nothing is returned.
    """
    # Index 0, 1, 2 corresponds to class label 1, 2, 3.
    resultList = ['not at all', 'in small doses', 'in large doses']

    # Read the three features from the user.
    percentTats = float(input("玩视频游戏所消耗时间百分比："))
    ffMiles = float(input("每年获得的飞行常客里程数："))
    iceCream = float(input("每周消费的冰淇淋公升数："))

    filename = "datingTestSet.txt"
    datingDataMat, datingLabels = file2matrix(filename)
    # Normalise the training data and keep the scaling parameters.
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # The query must follow the column order of the data file, which the plots
    # in this file label as: frequent flyer miles, game-time percentage,
    # ice cream. The prompts above ask in a different order.
    inArr = np.array([ffMiles, percentTats, iceCream])
    # Scale the query with the same parameters as the training data.
    norminArr = (inArr - minVals) / ranges
    classifierResult = classify0(norminArr, normMat, datingLabels, 4)

    print("你可能 %s 这个人" % (resultList[classifierResult - 1]))

In [None]:
import time

# Time the hold-out evaluation followed by the interactive classification.
# classifyPerson reads its values with input(), so the measured interval
# includes the time spent waiting for the user.
start = time.perf_counter()
datingClassTest()
classifyPerson()
end = time.perf_counter()
# Print the elapsed time in seconds.
print('Running time: %f Seconds' % (end - start))

## 手写识别系统


In [None]:
def img2vector(filename):
    """Flatten a 32x32 text image of digit characters into a 1x1024 vector.

    The first 32 characters of each of the first 32 lines are converted with
    int() and stored row after row.

    Parameters:
        filename - path of the text image

    Returns:
        returnVect - float array of shape (1, 1024)
    """
    returnVect = np.zeros((1, 1024))
    # The context manager closes the file even when a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                # Row i of the image occupies positions 32*i .. 32*i + 31.
                returnVect[0, 32 * i + j] = int(lineStr[j])

    return returnVect


# Convert one sample file and look at the beginning of the resulting vector.
testVector = img2vector('testDigits/0_13.txt')
# NOTE(review): the slice 0:31 selects 31 values, one short of a full
# 32-character image row - confirm whether 0:32 was intended.
testVector[0, 0:31]

In [None]:
def handwritingClassTest(trainingDir='trainingDigits', testDir='testDigits', k=3):
    """Classify every test digit against the training digits and report errors.

    File names must look like ``<digit>_<anything>``: the part before the first
    underscore is the true class. Every prediction is printed, followed by the
    number of errors and the error rate in percent.

    Parameters:
        trainingDir - directory holding the training images
        testDir - directory holding the test images
        k - number of neighbours passed to classify0

    Returns:
        None

    Raises:
        ValueError - if testDir contains no files
    """
    trainingMat, hwLabels = _loadDigitDirectory(trainingDir)
    testMat, testLabels = _loadDigitDirectory(testDir)

    mTest = len(testLabels)
    if mTest == 0:
        # Without test files the error rate below would divide by zero.
        raise ValueError("no test files found in %r" % (testDir,))

    errorCount = 0
    for i in range(mTest):
        classNumber = testLabels[i]
        classifierResult = classify0(testMat[i], trainingMat, hwLabels, k)
        print("分类返回结果为%d\t真实结果为%d" % (classifierResult, classNumber))
        if classifierResult != classNumber:
            errorCount += 1
    print("总共错了%d个数据\n错误率为%f%%" % (errorCount, errorCount / mTest * 100))


def _loadDigitDirectory(directory):
    """Load every image file of ``directory`` into a matrix plus its labels."""
    fileNames = listdir(directory)
    # One 1024-value row per file.
    vectors = np.zeros((len(fileNames), 1024))
    digits = []
    for row, fileNameStr in enumerate(fileNames):
        # The class is the number in front of the first underscore.
        digits.append(int(fileNameStr.split('_')[0]))
        vectors[row, :] = img2vector('%s/%s' % (directory, fileNameStr))
    return vectors, digits


# Run the handwritten-digit evaluation; it prints each prediction and the error rate.
handwritingClassTest()

## 项目:鸢尾花多分类任务的 sklearn 包实现


In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
import mglearn
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

# Load the iris data and split it into a training and a test part;
# random_state=0 fixes the split.
iris_dataset = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris_dataset['data'], iris_dataset['target'], random_state=0)
iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)

# Scatter-plot matrix of the training features, coloured by y_train.
grr = scatter_matrix(
    iris_dataframe,
    c=y_train,
    figsize=(15, 15),
    marker='o',
    hist_kwds={'bins': 20},
    s=60,
    alpha=.8,
    cmap=mglearn.cm3)
plt.show()

# Fit a 3-nearest-neighbours classifier on the training part.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Classify one new sample; predict expects a 2-D array, hence the nested list.
X_new = np.array([[5, 2.9, 1, 0.2]])
print("X_new.shape: {}".format(X_new.shape))
prediction = knn.predict(X_new)
print("Prediction: {}".format(prediction))
print("Predicted target name: {}".format(iris_dataset['target_names'][prediction]))

# Evaluate the model on the held-out test part.
print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))

In [None]:
'''
Created on Oct 6, 2010

@author: Peter
'''
from numpy import *
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle


# Generate n random samples in three classes, write them to testSet.txt as
# tab-separated rows and scatter-plot the first two features.
n = 1000  # number of points to create
xcord = zeros((n))
ycord = zeros((n))
markers = []
colors = []
# The context manager closes testSet.txt even if a write fails part-way.
with open('testSet.txt', 'w') as fw:
    for i in range(n):
        [r0, r1] = random.standard_normal(2)
        # myClass selects one of four clusters; the first two share class 1.
        myClass = random.uniform(0, 1)
        if myClass <= 0.16:
            fFlyer = random.uniform(22000, 60000)
            tats = 3 + 1.6 * r1
            markers.append(20)
            colors.append(2.1)
            classLabel = 1  # 'didntLike'
        elif myClass <= 0.33:
            fFlyer = 6000 * r0 + 70000
            tats = 10 + 3 * r1 + 2 * r0
            markers.append(20)
            colors.append(1.1)
            classLabel = 1  # 'didntLike'
        elif myClass <= 0.66:
            fFlyer = 5000 * r0 + 10000
            tats = 3 + 2.8 * r1
            markers.append(30)
            colors.append(1.1)
            classLabel = 2  # 'smallDoses'
        else:
            fFlyer = 10000 * r0 + 35000
            tats = 10 + 2.0 * r1
            markers.append(50)
            colors.append(0.1)
            classLabel = 3  # 'largeDoses'
        # Negative values are clamped to zero before anything is reported, so
        # the printed line always matches what is written to the file.
        if tats < 0:
            tats = 0
        if fFlyer < 0:
            fFlyer = 0
        print("%d, %f, class%d" % (fFlyer, tats, classLabel))
        xcord[i] = fFlyer
        ycord[i] = tats
        # The third column is an extra uniform random feature in [0.0, 1.7).
        fw.write("%d\t%f\t%f\t%d\n" % (fFlyer, tats, random.uniform(0.0, 1.7), classLabel))

fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xcord, ycord, c=colors, s=markers)
# Single points plotted only to obtain handles for the legend.
# NOTE(review): the data points take their colour from the numeric `colors`
# values while these handles use red/green/blue - confirm the legend colours
# match what is drawn.
type1 = ax.scatter([-10], [-10], s=20, c='red')
type2 = ax.scatter([-10], [-15], s=30, c='green')
type3 = ax.scatter([-10], [-20], s=50, c='blue')
ax.legend([type1, type2, type3], ["Class 1", "Class 2", "Class 3"], loc=2)
# ax.axis([-5000,100000,-2,25])
plt.xlabel('Frequent Flyier Miles Earned Per Year')
plt.ylabel('Percentage of Body Covered By Tatoos')
plt.show()

In [None]:
'''
Created on Oct 6, 2010

@author: Peter
'''
from numpy import *
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle


# Generate n random samples in three classes and scatter-plot each class as
# its own series.
n = 1000  # number of points to create
xcord1 = []
ycord1 = []
xcord2 = []
ycord2 = []
xcord3 = []
ycord3 = []
# markers and colors are filled like in the generator cell; the plot below
# does not use them.
markers = []
colors = []
# No file is opened here: this cell only plots, and opening testSet.txt in
# 'w' mode without writing to it would empty the data generated earlier.
for i in range(n):
    [r0, r1] = random.standard_normal(2)
    # myClass selects one of four clusters; the first two share class 1.
    myClass = random.uniform(0, 1)
    if myClass <= 0.16:
        fFlyer = random.uniform(22000, 60000)
        tats = 3 + 1.6 * r1
        markers.append(20)
        colors.append(2.1)
        classLabel = 1  # 'didntLike'
        xTarget, yTarget = xcord1, ycord1
    elif myClass <= 0.33:
        fFlyer = 6000 * r0 + 70000
        tats = 10 + 3 * r1 + 2 * r0
        markers.append(20)
        colors.append(1.1)
        classLabel = 1  # 'didntLike'
        xTarget, yTarget = xcord1, ycord1
    elif myClass <= 0.66:
        fFlyer = 5000 * r0 + 10000
        tats = 3 + 2.8 * r1
        markers.append(30)
        colors.append(1.1)
        classLabel = 2  # 'smallDoses'
        xTarget, yTarget = xcord2, ycord2
    else:
        fFlyer = 10000 * r0 + 35000
        tats = 10 + 2.0 * r1
        markers.append(50)
        colors.append(0.1)
        classLabel = 3  # 'largeDoses'
        xTarget, yTarget = xcord3, ycord3
    # Negative values are clamped to zero for every cluster, including the
    # first one.
    if tats < 0:
        tats = 0
    if fFlyer < 0:
        fFlyer = 0
    xTarget.append(fFlyer)
    yTarget.append(tats)

fig = plt.figure()
ax = fig.add_subplot(111)
# ax.scatter(xcord,ycord, c=colors, s=markers)
type1 = ax.scatter(xcord1, ycord1, s=20, c='red')
type2 = ax.scatter(xcord2, ycord2, s=30, c='green')
type3 = ax.scatter(xcord3, ycord3, s=50, c='blue')
ax.legend([type1, type2, type3], ["Did Not Like", "Liked in Small Doses", "Liked in Large Doses"], loc=2)
# Fixed view window: x from -5000 to 100000, y from -2 to 25.
ax.axis([-5000, 100000, -2, 25])
plt.xlabel('Frequent Flyier Miles Earned Per Year')
plt.ylabel('Percentage of Time Spent Playing Video Games')
plt.show()