### 测试算法性能

In [4]:
import numpy as np
from sklearn import datasets

In [5]:
# Load the iris dataset, then list the fields available on the returned object
iris = datasets.load_iris()
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [6]:
# Print the description text that ships with the dataset
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [7]:
iris.data[:10]  # first 10 rows of the four iris feature columns

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [8]:
iris.target_names  # names of the three iris species

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [9]:
# Integer codes of the species for the first 10 samples.
# Note: rows of `data` and entries of `target` correspond one-to-one.
iris.target[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [10]:
# Feature matrix and label vector used throughout the rest of the notebook
X, y = iris.data, iris.target

In [11]:
X.shape  # 150 rows of data in total

(150, 4)

In [12]:
y.shape  # shape of the label vector, for comparison with X.shape

(150,)

train_test_split

将数据集拆分为两部分：一部分用于训练模型，另一部分用于测试模型的性能。

In [13]:
# Inspecting the labels shows that they are stored grouped by class. To keep the
# trained model accurate the data has to be shuffled, so that every sample has the
# same chance of taking part in training.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [14]:
# Random permutation of the row positions 0..len(X)-1. Indexing X and y with the
# same permutation keeps each feature row paired with its label.
# NOTE(review): no seed is set in the visible cells, so this permutation (and the
# split and accuracy computed from it further down) will differ on every run.
shuffled_indexes = np.random.permutation(len(X))
shuffled_indexes

array([127,  57, 105, 134, 122,  78, 106,  60,  69,  24,  61,  97, 135,
        16,  98, 131,  37,  76,  91,  38, 129, 114,  74,   1, 108,  70,
         9,  87,  85, 149, 146,  96,  20, 110,  53,  36,  50,  17,  64,
        86,  75, 144,  43,  51,  48,  26, 148, 145,  82,  89,  68,  32,
        88,  58,  42, 123, 113, 103,   7,  28, 141,  39,   8, 112,  41,
       117,  67,  56,  71, 116, 101, 136, 133,  80,  21,  83, 125, 139,
       102,   5, 111,  72, 137,  31, 132, 147,  10,  27,  18,  93,  54,
        47,  52,  94, 138,  46,  30,  23,  35,  44,   3,  59,  65,  66,
       100,  90,   4,  49,  84, 126, 130,  62, 107,  92,  14,  13,  19,
        33,  29,  95, 121,  15, 115,  11,  77,  12,  34, 119,   0, 104,
        25, 140,  79, 142, 109, 124,  55, 143, 118,   2,  63,  99,  73,
        81, 128, 120,  40,   6,  45,  22])

In [15]:
# Fraction of the samples held out for testing
test_ratio = 0.2
# Number of held-out samples, truncated to an integer
test_size = int(test_ratio * len(X))
test_size

30

In [16]:
# Cut the shuffled positions once at test_size: the leading part indexes the
# test samples, the remainder indexes the training samples.
test_indexes, train_indexes = np.split(shuffled_indexes, [test_size])

In [17]:
# Materialise the two subsets; features and labels are picked with the same
# index array so they stay aligned.
X_test, y_test = X[test_indexes], y[test_indexes]
X_train, y_train = X[train_indexes], y[train_indexes]

In [18]:
# Shapes of the training features and labels, one per line
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [19]:
# Shapes of the test features and labels, one per line
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


使用封装后的train_test_split算法

In [22]:
from train_split.model_selection import train_test_split

# Split with the hand-written train_test_split, fixing its random seed at 10
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, seed=10)

# Shapes of the resulting training features and labels, one per line
print(X_train2.shape, y_train2.shape, sep="\n")

(120, 4)
(120,)


In [26]:
# Import the hand-written kNN implementation and try it out.
# X_train / y_train / X_test here are the arrays from the manual index-based split.

from train_split.kNN import KNNClassifier

my_knn_clf = KNNClassifier(3)  # initialise the classifier; 3 is presumably k (neighbour count) — confirm in KNNClassifier
my_knn_clf.fit(X_train, y_train)  # fit
y_predict = my_knn_clf.predict(X_test)
y_predict

array([2, 1, 2, 2, 2, 1, 1, 1, 1, 0, 1, 1, 2, 0, 1, 2, 0, 1, 1, 0, 2, 2,
       1, 0, 2, 2, 0, 1, 1, 2])

In [28]:
# Show the test labels produced by the earlier split
y_test

array([2, 1, 2, 2, 2, 1, 2, 1, 1, 0, 1, 1, 2, 0, 1, 2, 0, 1, 1, 0, 2, 2,
       1, 0, 2, 1, 0, 1, 1, 2])

In [31]:
# Accuracy: number of predictions equal to the test label, divided by the
# number of test samples
np.sum(y_predict == y_test) / len(y_test)

0.9333333333333333

### sklearn中的train_test_split

自己封装的train_test_split，其思路借鉴了sklearn库model_selection模块中的train_test_split函数。

In [32]:
# NOTE(review): this import rebinds the name train_test_split, which an earlier
# cell imported from train_split.model_selection; from here on the name refers to
# the scikit-learn function.
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for testing, with a fixed random_state
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X, y, test_size=0.1, random_state=10
)

In [34]:
# Shapes of the scikit-learn training features and labels, one per line
print(X_train3.shape, y_train3.shape, sep="\n")

(135, 4)
(135,)


In [36]:
# Shapes of the scikit-learn test features and labels, one per line
print(X_test3.shape, y_test3.shape, sep="\n")


(15, 4)
(15,)
