In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

**KNN（K 近邻）：找出距离待预测样本最近的 K 个邻居（neighbors），按照少数服从多数的原则，用这些邻居中出现次数最多的标签作为该样本的预测标签。**

In [2]:
# Read the Iris CSV, using its `Id` column as the row index, then preview the first rows.
data_df = pd.read_csv(filepath_or_buffer='../data/Iris.csv', index_col='Id')
data_df.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# The first four columns of the frame hold the measurements used as model features.
FEAT_COLS = data_df.columns[:4].tolist()
FEAT_COLS

['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

In [4]:
# Map each species name found in the `Species` column to an integer class id (0, 1, 2),
# numbered in the order setosa, versicolor, virginica.
SPECIES_LABEL_DICT = {
    species: class_id
    for class_id, species in enumerate(('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'))
}

In [5]:
# Add an integer-encoded copy of the species name as a `label` column. This step is
# optional: the model in this notebook is trained on the string `Species` values directly.
data_df['label'] = data_df.loc[:, 'Species'].map(SPECIES_LABEL_DICT)

In [6]:
# Preview the first five rows, which now include the `label` column.
data_df.head(n=5)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,label
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa,0
2,4.9,3.0,1.4,0.2,Iris-setosa,0
3,4.7,3.2,1.3,0.2,Iris-setosa,0
4,4.6,3.1,1.5,0.2,Iris-setosa,0
5,5.0,3.6,1.4,0.2,Iris-setosa,0


In [7]:
# Feature matrix and target vector as NumPy arrays; the targets are the species-name strings.
X, y = data_df[FEAT_COLS].values, data_df['Species'].values

**note2**

- 不加 `.values` 也可以进行下面的模型训练，并且可以算出 score：

  ```python
  X = data_df[FEAT_COLS]
  y = data_df['Species']
  ```

  但此时 `X_test` 是 DataFrame 而不是 NumPy 数组，后面的 `X_test[idx, :]` 不能这样进行 slicing（DataFrame 需要改用 `X_test.iloc[idx, :]`）。
- 报错：`TypeError: unhashable type: 'slice'`


In [8]:
# Hold out one third of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=10,
)

- `KNeighborsClassifier` 的构造函数签名及默认参数：
```python
KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=1, **kwargs)
```

In [11]:
# Fit one KNN classifier per candidate neighbour count and report its accuracy on the test split.
# After the loop, `knn` and `acc` refer to the model and score of the last k in `k_list`.
k_list = [3, 5, 7]
for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    acc = knn.score(X_test, y_test)
    print('k=', k, '-> Accuracy: ', acc)

k= 3 -> Accuracy:  0.96
k= 5 -> Accuracy:  0.96
k= 7 -> Accuracy:  0.98


In [12]:
# Predict one held-out sample. `knn` is the classifier left over from the last
# iteration of the k-loop cell (k=7 with the k_list used there).
idx = 10
# predict() takes a batch of samples, so the single feature row must be wrapped in a list.
test_sample_feat = [X_test[idx]]
true_label = y_test[idx]
pred_label = knn.predict(test_sample_feat)
print('真实标签{}，预测标签{}'.format(true_label, pred_label))

真实标签Iris-versicolor，预测标签['Iris-versicolor']


```python
"""
    任务：鸢尾花识别
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Location of the Iris CSV file, relative to the working directory.
DATA_FILE = './data_ai/Iris.csv'

# Integer class id assigned to each species name: setosa -> 0, versicolor -> 1, virginica -> 2.
SPECIES_LABEL_DICT = dict(zip(
    ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica'),
    range(3),
))

# Columns used as model features.
FEAT_COLS = [
    'SepalLengthCm',
    'SepalWidthCm',
    'PetalLengthCm',
    'PetalWidthCm',
]


def main():
    """
    Entry point: train a default KNN classifier on the Iris CSV and print the results.

    Reads DATA_FILE, encodes the species names as integers via SPECIES_LABEL_DICT,
    holds out one third of the rows for testing, prints the test accuracy as a
    percentage, then prints the true and predicted label of one test sample.
    """
    # Load the data set (indexed by `Id`) and add the integer-encoded label column.
    iris_data = pd.read_csv(DATA_FILE, index_col='Id')
    iris_data['Label'] = iris_data['Species'].map(SPECIES_LABEL_DICT)

    # Feature matrix and label vector as NumPy arrays.
    features = iris_data[FEAT_COLS].values
    labels = iris_data['Label'].values

    # Hold out one third of the samples; the fixed seed keeps the split repeatable.
    train_feats, test_feats, train_labels, test_labels = train_test_split(
        features, labels, test_size=1/3, random_state=10
    )

    # Build and fit a KNN classifier with the library defaults, then score it on the test split.
    knn_model = KNeighborsClassifier().fit(train_feats, train_labels)
    accuracy = knn_model.score(test_feats, test_labels)
    print('预测准确率:{:.2f}%'.format(accuracy * 100))

    # Compare the true and predicted label of a single test sample.
    # predict() takes a batch of samples, so the one feature row is wrapped in a list.
    idx = 25
    test_sample_feat = [test_feats[idx]]
    y_true, y_pred = test_labels[idx], knn_model.predict(test_sample_feat)
    print('真实标签{}，预测标签{}'.format(y_true, y_pred))


# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
```