# knn原理

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from math import sqrt

In [None]:
# Toy 2-D training samples: one [feature_1, feature_2] pair per row.
raw_data_X = [
    [3.393533211, 2.331273381],
    [3.110073408, 1.781539638],
    [1.343808831, 3.368360945],
    [3.582294042, 4.679179110],
    [2.280362439, 2.866990263],
    [7.423436942, 4.696522875],
    [5.745051997, 3.533989803],
    [9.172168622, 2.511101045],
    [7.792783481, 3.424088941],
    [7.939820817, 0.791637231],
]
# Class label of each row above: the first five rows are class 0,
# the last five are class 1.
raw_data_y = [0] * 5 + [1] * 5


# Dataset to display

# Training features/labels as NumPy arrays, plus the single query point
# that the kNN walk-through classifies.
X_train, y_train = np.array(raw_data_X), np.array(raw_data_y)
x = np.array([8.093607318, 3.365731514])

# Plot: class 0 in green, class 1 in red, the query point in blue.
# Transposing the selected rows yields (first-column, second-column), which
# unpacks straight into scatter's two positional coordinate arguments.
plt.scatter(*X_train[y_train == 0].T, color='g')
plt.scatter(*X_train[y_train == 1].T, color='r')
plt.scatter(*x, color='b')
plt.show()


# kNN procedure

# 1. Euclidean distance from the query point x to every training sample.
distances = [sqrt(((sample - x) ** 2).sum()) for sample in X_train]

# 2. Order the sample indices by ascending distance and keep the labels of
#    the k closest ones.
k = 6
nearest = np.argsort(distances)
topK_y = list(y_train[nearest[:k]])

# 3. Majority vote among those k labels.
# NOTE: k is even, so with two classes a 3-3 tie is possible.  Counter orders
# equal counts by first appearance, and topK_y is sorted nearest-first, so a
# tie resolves to the label of the nearest neighbour.
votes = Counter(topK_y)
predict_y = votes.most_common(1)[0][0]
predict_y

# 测试自己封装的库

In [None]:
%%time
import os, sys
sys.path.append(os.path.abspath('..'))
%run ../testML/1.knn.py

# 使用scikit-learn

In [None]:
# coding:utf-8
# Outline of the scikit-learn workflow followed by the cells below
# (English rendering of the string literal that follows):
#   1. load the dataset
#   2. basic data processing
#   3. feature engineering
#   4. machine learning (kNN algorithm)
#   5. model evaluation
# The string is a bare expression: it is never assigned or printed.  The
# trailing ';' is presumably there to suppress the notebook's echo of the
# cell value -- confirm.
"""
1.获取数据集
2.数据基本处理
3.特征工程
4.机器学习(knn算法)
5.模型评估
""";

In [None]:
from sklearn.datasets import load_iris                  # 1.获取数据集
from sklearn.model_selection import train_test_split    # 2.数据分割
from sklearn.preprocessing import StandardScaler        # 3.数据标准化
from sklearn.neighbors import KNeighborsClassifier      # 4-5.knn模型
from sklearn.model_selection import GridSearchCV        # 4.2网格搜索、交叉验证

In [None]:
# 1. Load the dataset
iris = load_iris()


# 2. Basic data processing
# 2.1 Outlier / missing-value handling: nothing is done for this dataset.

# 2.2 Split into training and test sets; test_size=0.2 holds out 20% of the
#     samples for testing and random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=22,
)


# 3. Feature engineering
# 3.1 Create the transformer
transfer = StandardScaler()

# 3.2 Standardise the features to remove the effect of differing scales.
#     The scaling statistics are learned from the training split only and
#     then reused, unchanged, on the test split.
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)


# 4. Machine learning
# 4.1 / 4.2 Wrap a kNN classifier in a grid search over k with 10-fold
#     cross-validation.  n_neighbors=4 is only the base estimator's initial
#     value; the search evaluates each value listed in param_grid.
# NOTE(review): the scaler above was fit on the whole training split before
#     cross-validation, so every validation fold has already influenced the
#     scaling statistics.  If unbiased CV scores matter, consider putting the
#     scaler and classifier in one Pipeline and searching over that.
param_grid = {"n_neighbors": [1, 3, 5, 7, 9]}
estimater = GridSearchCV(
    KNeighborsClassifier(n_neighbors=4),
    param_grid=param_grid,
    cv=10,
    n_jobs=1,
)

# 4.3 Train the model (runs the search)
estimater.fit(x_train, y_train)

# 5. Model evaluation
# 5.1 Predicted labels next to the true labels of the test split
y_pre = estimater.predict(x_test)
print("预测值是:\n", y_pre)    # "predicted values"
print("真实值是:\n", y_test)   # "true values"

# 5.2 Score on the held-out test split
ret = estimater.score(x_test, y_test)
print("准确率是:\n", ret)      # "accuracy"

# 5.3 Other results of the search: best estimator, best cross-validation
#     score, and the full cross-validation result table
print("最好的模型:\n", estimater.best_estimator_)
print("最好的结果:\n", estimater.best_score_)
print("整体模型结果:\n", estimater.cv_results_)