![对测试数据集如何归一化](https://i.loli.net/2018/02/03/5a75b9570f2b4.png)
scikit-learn中的scalar类
![](https://i.loli.net/2018/02/03/5a75b9b32dfd7.png)

原始数据集需要拆分训练数据+测试数据  
对训练数据集进行归一化  
数据预处理应该在数据集分批之后，我这样理解的
测试数据时模拟真实环境
* 真实环境很可能无法得到所有测试，比如真实环境中新来了一朵鸢尾花
数据的均值和方差
* 对数据的归一化也是算法的一部分
要保存训练数据集得到的均值和方差
使得Scalar的操作和算法模型的使用方法类似，得到关键模型（某种参数就作为fit的结果），然后transform，可以理解为参数作用于训练或者测试数据集

In [1]:
import numpy as np
from sklearn import datasets

In [2]:
iris = datasets.load_iris()

In [3]:
X = iris.data
y = iris.target

In [4]:
X[:10,:] # float类型

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1]])

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=666)

In [6]:
## scikit-learn中的StandardScaler

In [7]:
# 数据预处理
from sklearn.preprocessing import StandardScaler

In [8]:
standardScaler = StandardScaler() # 首字母小写命名

In [9]:
# 均值方差归一化的关键信息
standardScaler.fit(X_train) # 得到需要的参数就是均值和标准差，根据训练数据部分

StandardScaler(copy=True, with_mean=True, with_std=True)

In [10]:
standardScaler.mean_

array([ 5.83416667,  3.0825    ,  3.70916667,  1.16916667])

In [11]:
# standardScaler.std_ # 下划线

In [12]:
standardScaler.scale_ # 描述数据变化范围，标准差只是描述数据分布范围的一种指标

array([ 0.81019502,  0.44076874,  1.76295187,  0.75429833])

In [13]:
standardScaler.transform(X_train) # 对数据进行归一化处理了

array([[-0.90616043,  0.94720873, -1.30982967, -1.28485856],
       [-1.15301457, -0.18717298, -1.30982967, -1.28485856],
       [-0.16559799, -0.64092567,  0.22169257,  0.17345038],
       [ 0.45153738,  0.72033239,  0.95909217,  1.49918578],
       [-0.90616043, -1.3215547 , -0.40226093, -0.0916967 ],
       [ 1.43895396,  0.2665797 ,  0.56203085,  0.30602392],
       [ 0.3281103 , -1.09467835,  1.07253826,  0.30602392],
       [ 2.1795164 , -0.18717298,  1.63976872,  1.2340387 ],
       [-0.78273335,  2.30846679, -1.25310662, -1.4174321 ],
       [ 0.45153738, -2.00218372,  0.44858475,  0.43859746],
       [ 1.80923518, -0.41404933,  1.46959958,  0.83631808],
       [ 0.69839152,  0.2665797 ,  0.90236912,  1.49918578],
       [ 0.20468323,  0.72033239,  0.44858475,  0.571171  ],
       [-0.78273335, -0.86780201,  0.10824648,  0.30602392],
       [-0.53587921,  1.40096142, -1.25310662, -1.28485856],
       [-0.65930628,  1.40096142, -1.25310662, -1.28485856],
       [-1.0295875 ,  0.

In [14]:
X_train # X_train本身没有改变，因为只是输入作用了一个函数，并没有改变变量

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 5.7,  2.8,  4.1,  1.3],
       [ 6.2,  3.4,  5.4,  2.3],
       [ 5.1,  2.5,  3. ,  1.1],
       [ 7. ,  3.2,  4.7,  1.4],
       [ 6.1,  2.6,  5.6,  1.4],
       [ 7.6,  3. ,  6.6,  2.1],
       [ 5.2,  4.1,  1.5,  0.1],
       [ 6.2,  2.2,  4.5,  1.5],
       [ 7.3,  2.9,  6.3,  1.8],
       [ 6.4,  3.2,  5.3,  2.3],
       [ 6. ,  3.4,  4.5,  1.6],
       [ 5.2,  2.7,  3.9,  1.4],
       [ 5.4,  3.7,  1.5,  0.2],
       [ 5.3,  3.7,  1.5,  0.2],
       [ 5. ,  3.5,  1.6,  0.6],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 5.8,  2.7,  3.9,  1.2],
       [ 5.2,  3.4,  1.4,  0.2],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 6.5,  3.2,  5.1,  2. ],
       [ 5.7,  2.9,  4.2,  1.3],
       [ 6.6,  3. ,  4.4,  1.4],
       [ 6. ,  2.9,  4.5,  1.5],
       [ 4.7,  3.2,  1.6,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 6.7,  3.1,  5.6,  2.4],
       [ 6.3,  2.7,  4.9,  1.8],
       [ 6.1,  2.8,  4.7,  1.2],
       [ 6

In [15]:
X_train = standardScaler.transform(X_train)

In [16]:
X_train

array([[-0.90616043,  0.94720873, -1.30982967, -1.28485856],
       [-1.15301457, -0.18717298, -1.30982967, -1.28485856],
       [-0.16559799, -0.64092567,  0.22169257,  0.17345038],
       [ 0.45153738,  0.72033239,  0.95909217,  1.49918578],
       [-0.90616043, -1.3215547 , -0.40226093, -0.0916967 ],
       [ 1.43895396,  0.2665797 ,  0.56203085,  0.30602392],
       [ 0.3281103 , -1.09467835,  1.07253826,  0.30602392],
       [ 2.1795164 , -0.18717298,  1.63976872,  1.2340387 ],
       [-0.78273335,  2.30846679, -1.25310662, -1.4174321 ],
       [ 0.45153738, -2.00218372,  0.44858475,  0.43859746],
       [ 1.80923518, -0.41404933,  1.46959958,  0.83631808],
       [ 0.69839152,  0.2665797 ,  0.90236912,  1.49918578],
       [ 0.20468323,  0.72033239,  0.44858475,  0.571171  ],
       [-0.78273335, -0.86780201,  0.10824648,  0.30602392],
       [-0.53587921,  1.40096142, -1.25310662, -1.28485856],
       [-0.65930628,  1.40096142, -1.25310662, -1.28485856],
       [-1.0295875 ,  0.

In [17]:
# 对测试数据进行处理
X_test = standardScaler.transform(X_test)
X_test_standard = standardScaler.transform(X_test) # 使用的是X_train的关键参数
# 标签不需要归一化，因为标签不需要训练，需要归一化的是不同样本的特征值分布

In [18]:
from sklearn.neighbors import KNeighborsClassifier

In [19]:
knn_clf = KNeighborsClassifier(n_neighbors = 3)

In [20]:
knn_clf.fit(X_train, y_train) 
# 这里为什么要y_train呢？也就是训练模型为什么要标签数据集呢？
# 比如说knn，准确讲是你要把fit的东西输给predict使用，那自然奥y_train了

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [21]:
knn_clf.score(X_test_standard, y_test)

0.6333333333333333

In [22]:
knn_clf.score(X_test, y_test)

1.0

In [23]:
[print(i) for i in range(10)]

0
1
2
3
4
5
6
7
8
9


[None, None, None, None, None, None, None, None, None, None]

## 使用我们自己写的preprocessing.py来做

In [24]:
import sys
sys.path.append("../../")
from playML.preprocessing import StandardScaler # 标准归一化 StandardScaler

In [25]:
from sklearn import datasets
iris = datasets.load_iris() # 注意函数加括号
X = iris.data
y = iris.target

In [26]:
X[:10,:] # float类型

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1]])

In [27]:
from playML.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_ratio = 0.2, seed = 666)

In [28]:
# 求训练数据集的关键参数
standardScaler = StandardScaler() # 也没有什么初始化的输入参数
standardScaler.fit(X_train)

<playML.preprocessing.StandardScaler at 0x1c5ccbbf4e0>

In [29]:
# 必须清理了重新运行
X_train = standardScaler.transform(X_train)

In [30]:
standardScaler.mean_

array([ 5.81833333,  3.02      ,  3.7925    ,  1.22416667])

In [31]:
standardScaler.scale_

array([ 0.80103926,  0.4007493 ,  1.71795239,  0.76015304])

In [32]:
X_test_standard = standardScaler.transform(X_test)

In [33]:
from playML.kNN import KNNClassifier # KNeighborsClassifier

In [34]:
my_knn_clf = KNNClassifier(k=3)

In [35]:
my_knn_clf.fit(X_train, y_train)

KNN(k=3)

In [36]:
my_knn_clf.score(X_test, y_test)

0.26666666666666666

In [37]:
my_knn_clf.score(X_test_standard, y_test) # 注意应该是用Scaler过后的X_test

0.96666666666666667