# 判断机器学习算法的性能
> **train test split**

<img src='./picture/3-1.png' style='width:700px;height:300px;float:left'>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
# Load the iris dataset bundled with scikit-learn, then pull the feature
# matrix and the label vector out of the returned container.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [3]:
# Show the dimensions of X (assigned from iris.data above).
X.shape

(150, 4)

In [4]:
# Show the dimensions of y (assigned from iris.target above).
y.shape

(150,)

## train_test_split
> **提取前80%的数据**

> **首先将 X, y 合并成新的数组，将新的数组打乱，最后再拆分成 X, y**

> **或建立一个乱序的索引数组，将其中的元素作为索引值，利用 fancy indexing 传入一维索引向量**

In [9]:
# Call permutation to produce the row indices 0 .. len(X)-1 in random order.
# No seed is set here, so the order differs from run to run.
shuffle_indexes = np.random.permutation(X.shape[0])
shuffle_indexes

array([ 80,   3, 133, 143,  85,  45,  22, 116,  37, 127,  60,  94,  33,
         4,  64,  88,  32,  29,  87, 102, 140,  74,  50, 100,  23,  31,
       121,  71, 118,  43,  36,  27, 104,  70,  72,  95,  86,  28,  12,
        78,  34, 110,  47,  91,   7,  38,  61,  99,  67,  57,  30,  89,
       134,  39, 120,  42, 142, 101,  25,  48, 112,  18,  56,  97,  44,
        54, 111,  84, 108, 139,  65,  10,  59,  20, 123,  15,  92,   0,
       126, 103,   6,  63,  66,  19,  77,  69,  35, 113, 107, 130, 114,
        14,  24,  17, 115,  79,  82, 138,   5,  13,  53,   2, 141,  40,
       124, 129,  68,  16,  51, 149,  83, 135,  52, 144, 119,  55,   1,
       136, 131,  62, 128,  81, 106, 132, 145, 105, 146,  49, 147,  90,
        11,   8,  73,  58, 125,  76,  41,  26,  96, 148, 117,  75,  98,
         9,  21,  46,  93, 109, 137, 122])

In [10]:
# Fraction of the samples held out for testing.
test_ratio = 0.2
# int() truncates, so the test set never exceeds the requested fraction.
test_size = int(test_ratio * len(X))
test_size

30

In [11]:
# The first `test_size` shuffled indices become the test set; the rest train.
test_indexes, train_indexes = (
    shuffle_indexes[:test_size],
    shuffle_indexes[test_size:],
)

---
采用 fancy indexing 获得 X_train、y_train 以及 X_test、y_test

In [14]:
# Fancy indexing: passing an index array selects the matching rows, so the
# same index array keeps each feature row paired with its label.
X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]

# Print both shapes, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(120, 4)
(30, 4)


## 将自己的算法封装
> **编写train_test_split函数**

> **利用python的特性自行解包————x1,x2,x3,x4 = function(x, y)**

> **利用训练集fit形成模型, 利用测试集查看预测的结果，最后<font color='red'>通过向量比对（==）</font>返回布尔数组**

In [16]:
import sys

# Add the parent directory to the module search path (presumably so the
# local `ml_python` package imported in the following cells resolves).
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path a second time.
if r'../' not in sys.path:
    sys.path.append(r'../')

In [17]:
from ml_python.model_selection import train_test_split

In [18]:
# Split with the train_test_split imported from ml_python.model_selection;
# the four return values are unpacked directly into the split arrays.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Print both shapes, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(120, 4)
(30, 4)


In [19]:
from ml_python.kNN_new import KNNClassifier

In [20]:
# Build the classifier with k=3, then fit it on the training split.
# The value returned by fit() is the cell's last expression, so it is
# what the notebook displays.
my_knn_clf = KNNClassifier(k=3)
my_knn_clf.fit(X_train, y_train)

knn(k=3)

In [24]:
# Predict a label for every test sample with the fitted classifier, then
# display the predictions.
y_predict = my_knn_clf.predict(X_test)
y_predict

array([0, 2, 2, 1, 0, 2, 2, 2, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       2, 1, 2, 1, 1, 0, 0, 1])

In [25]:
# Display the true test labels for comparison with y_predict.
y_test

array([0, 2, 2, 1, 0, 2, 2, 2, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       2, 1, 2, 1, 1, 0, 0, 2])

In [26]:
# Element-wise == gives a boolean array; summing it counts the matches.
np.sum(y_predict == y_test)

29

In [27]:
# Accuracy = number of correct predictions / number of test samples.
np.sum(y_predict == y_test) / len(y_test)

0.9666666666666667

## sklearn中的train_test_split
> **通过传入参数 test_size 和 random_state 指定测试集比例和随机种子**

In [31]:
# NOTE: this import rebinds `train_test_split`, replacing the version
# imported earlier from ml_python.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples; a fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)

In [32]:
# Print the shapes of the training and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(120, 4)
(30, 4)
