## 此脚本用于学习研究sklearn中的数据预处理部分

### 一、数据无量纲化

#### 1.preprocessing.MinMaxScaler(归一化)

In [64]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Toy dataset: four samples with two features each.
data = [
    [-1, 2],
    [-0.5, 6],
    [0, 10],
    [1, 18],
]
# Wrap the nested list in a DataFrame and preview its first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,0,1
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,18


In [65]:
# Instantiate the scaler, fit it on data, then transform the same data.
scaler = MinMaxScaler()
scaler.fit(data)   # fit essentially obtains the min and max of data
scaler.transform(data) # export the result through the transform interface; returns an ndarray

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [66]:
result = scaler.fit_transform(data)  # or do fit and transform in a single step

In [67]:
scaler.inverse_transform(result) # reverse the normalization

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [68]:
# Use MinMaxScaler's feature_range parameter to normalize data into a range other than [0, 1]
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
# feature_range is documented as a (min, max) tuple. A tuple is accepted by every
# scikit-learn version, whereas a list can be rejected by parameter validation
# in recent releases.
scaler = MinMaxScaler(feature_range=(5, 10))  # still instantiate first
scaler.fit_transform(data)

array([[ 5.  ,  5.  ],
       [ 6.25,  6.25],
       [ 7.5 ,  7.5 ],
       [10.  , 10.  ]])

In [69]:
# Original note: when the number of features in X is very large, fit raises an error
# saying the data is too large to compute; in that case use partial_fit as the
# training interface.
# NOTE(review): scikit-learn documents partial_fit for a very large number of
# samples or streamed data rather than many features -- confirm the wording.
scaler = scaler.partial_fit(data)
scaler.transform(data)


array([[ 5.  ,  5.  ],
       [ 6.25,  6.25],
       [ 7.5 ,  7.5 ],
       [10.  , 10.  ]])

#### 2.preprocessing.StandardScaler(标准化)

In [70]:
from sklearn.preprocessing import StandardScaler
# Same toy data; instantiate a StandardScaler, fit it, then transform the data.
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = StandardScaler()
scaler.fit(data)
scaler.transform(data)

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [71]:
scaler.fit_transform(data)   # fit and transform in a single call

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [72]:
# A few attributes of the standardization object
scaler.mean_

array([-0.125,  9.   ])

In [73]:
# var_ attribute of the fitted scaler
scaler.var_

array([ 0.546875, 35.      ])

In [74]:
# Round trip: apply inverse_transform to the output of fit_transform
scaler.inverse_transform(scaler.fit_transform(data))

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

### 二、缺失值处理

In [75]:
import pandas as pd
import numpy as np

In [76]:
# Load the CSV, using its first column as the row index, and preview the first rows.
data = pd.read_csv(r'C:\Users\Mypc\Desktop\菜菜\源文件\03-数据预处理与特征工程\Narrativedata.csv',index_col=0)
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


class sklearn.impute.SimpleImputer(missing_values=nan, strategy='mean', fill_value=None, verbose=0,
copy=True)

In [77]:
# Print the column summary (non-null counts and dtypes) to spot missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age         714 non-null float64
Sex         891 non-null object
Embarked    889 non-null object
Survived    891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [78]:
# Impute the Age column
# Pull Age out as an array and reshape it to a single-column 2-D array
Age = data.loc[:,'Age'].values.reshape(-1,1)
Age.shape

(891, 1)

In [79]:
from sklearn.impute import SimpleImputer
# Build one imputer per strategy. im_mean and imp_most are created but not used in this cell.
imp_mean =SimpleImputer(missing_values=np.nan,strategy='mean') # mean
im_mean = SimpleImputer() # the default strategy is the mean
imp_median = SimpleImputer(missing_values=np.nan,strategy='median')# median
imp_zero = SimpleImputer(missing_values=np.nan,strategy='constant',fill_value= 0) # constant 0
imp_most = SimpleImputer(missing_values= np.nan,strategy='most_frequent') # most frequent value
# Each assignment overwrites the Age column of data, so the last one is what remains.
data.loc[:,'Age'] = imp_mean.fit_transform(Age)
data.loc[:,'Age'] = imp_median.fit_transform(Age)
data.loc[:,'Age'] = imp_zero.fit_transform(Age)

In [80]:
# Handle missing values with pandas and numpy
# Reload the CSV, then fill missing Age values with the column median.
data = pd.read_csv(r'C:\Users\Mypc\Desktop\菜菜\源文件\03-数据预处理与特征工程\Narrativedata.csv',index_col=0)
data.loc[:,'Age'] = data.loc[:,'Age'].fillna(data.loc[:,'Age'].median())

In [81]:
# Drop every row that still contains a missing value, modifying data in place.
# The row index is not reset here.
data.dropna(axis=0,inplace=True)

### 三、处理分类型特征：编码处理和哑变量处理

#### 3.1编码处理：将文字型数据转化为数值型数据

* preprocessing.LabelEncoder 标签专用
* preprocessing.OrdinalEncoder 特征专用

In [82]:
from sklearn.preprocessing import LabelEncoder as LE
from sklearn.preprocessing import OrdinalEncoder as OE

In [83]:
# Take the last column of data as the label and preview it
y =data.iloc[:,-1]
y.head()

0     No
1    Yes
2    Yes
3    Yes
4     No
Name: Survived, dtype: object

In [84]:
# Instantiate the label encoder, fit it on y, then transform y
le = LE()
le = le.fit(y)
le.transform(y)

array([0, 2, 2, 2, 0, 0, 0, 0, 2, 2, 1, 2, 0, 0, 0, 1, 0, 2, 0, 2, 1, 2,
       2, 2, 0, 1, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 1,
       2, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 0, 0, 0, 0, 0, 2, 2,
       0, 2, 0, 0, 0, 0, 0, 2, 1, 0, 1, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 2, 0, 1, 0, 0,
       2, 0, 0, 2, 0, 0, 0, 1, 1, 2, 0, 0, 0, 2, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 1, 0, 2, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 2, 0, 2, 2, 0, 0, 2, 0, 2, 1, 2, 2, 0, 0, 1,
       0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 2, 1, 2, 1, 0, 0, 2, 2, 0, 2, 0, 2,
       0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0,
       0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 1, 0, 2, 2, 2, 2, 2, 0, 2, 0, 1, 0,
       0, 1, 2, 2, 1, 0, 2, 2, 0, 2, 2, 0, 0, 1, 1, 0, 0, 0, 2, 0, 0, 2,
       0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2,

In [85]:
le.classes_  # the classes_ attribute shows which categories the label contains

array(['No', 'Unknown', 'Yes'], dtype=object)

In [88]:
le.fit_transform(y) # fit_transform also does it in a single step

array([0, 2, 2, 2, 0, 0, 0, 0, 2, 2, 1, 2, 0, 0, 0, 1, 0, 2, 0, 2, 1, 2,
       2, 2, 0, 1, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 1,
       2, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 0, 0, 0, 0, 0, 2, 2,
       0, 2, 0, 0, 0, 0, 0, 2, 1, 0, 1, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 2, 0, 1, 0, 0,
       2, 0, 0, 2, 0, 0, 0, 1, 1, 2, 0, 0, 0, 2, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 1, 0, 2, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 2, 0, 2, 2, 0, 0, 2, 0, 2, 1, 2, 2, 0, 0, 1,
       0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 2, 1, 2, 1, 0, 0, 2, 2, 0, 2, 0, 2,
       0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0,
       0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 1, 0, 2, 2, 2, 2, 2, 0, 2, 0, 1, 0,
       0, 1, 2, 2, 1, 0, 2, 2, 0, 2, 2, 0, 0, 1, 1, 0, 0, 0, 2, 0, 0, 2,
       0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2,

In [89]:
le.inverse_transform(le.fit_transform(y))  # reverse the encoding

array(['No', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'Yes',
       'Unknown', 'Yes', 'No', 'No', 'No', 'Unknown', 'No', 'Yes', 'No',
       'Yes', 'Unknown', 'Yes', 'Yes', 'Yes', 'No', 'Unknown', 'No', 'No',
       'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No',
       'No', 'Yes', 'No', 'No', 'No', 'Unknown', 'Yes', 'No', 'No', 'Yes',
       'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No',
       'Yes', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes',
       'No', 'No', 'No', 'No', 'No', 'Yes', 'Unknown', 'No', 'Unknown',
       'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No',
       'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Unknown', 'Yes',
       'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes',
       'No', 'Yes', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes',
       'No', 'Unknown', 'No', 'No', 'Yes', 'No', 'No', 'Ye

In [90]:
data.iloc[:,-1] = LE().fit_transform(data.iloc[:,-1]) # writing it like this is usually enough

In [91]:
# Copy data into data_ and preview the copy
data_ = data.copy()
data_.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


In [92]:
# Fit an OrdinalEncoder on the columns between the first and the last one
oe = OE()
oe = oe.fit(data_.iloc[:,1:-1])
oe.categories_ # equivalent to LabelEncoder's classes_ attribute
oe.transform(data_.iloc[:,1:-1])[:5]   # better not to include columns that are already numeric; they might get altered!!

array([[1., 2.],
       [0., 0.],
       [0., 2.],
       [0., 2.],
       [1., 2.]])

In [93]:
data_.iloc[:,1:-1] = OE().fit_transform(data_.iloc[:,1:-1])# usually written like this

#### 3.2哑变量处理

* preprocessing.OneHotEncoder

In [94]:
from sklearn.preprocessing import OneHotEncoder as OHE
# Select the columns between the first and the last one as the features to one-hot encode
x=data.iloc[:,1:-1]
ohe = OHE(categories='auto') # automatically detect the categories of each feature and assemble the array
result = ohe.fit_transform(x).toarray() 

In [95]:
ohe.inverse_transform(ohe.fit_transform(x).toarray()) # reverse the one-hot encoding

array([['male', 'S'],
       ['female', 'C'],
       ['female', 'S'],
       ...,
       ['female', 'S'],
       ['male', 'C'],
       ['male', 'Q']], dtype=object)

In [96]:
# These are the category labels produced by the dummy-variable encoding.
# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2: call it when it
# exists (this keeps the legacy x0_/x1_ prefixes) and otherwise fall back to
# get_feature_names_out().
(ohe.get_feature_names() if hasattr(ohe, "get_feature_names") else ohe.get_feature_names_out())

array(['x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S'], dtype=object)

In [97]:
# Current column names of data as an array
data.columns.values 

array(['Age', 'Sex', 'Embarked', 'Survived'], dtype=object)

In [98]:
# data had rows removed by dropna(), so its index is no longer a contiguous 0..n-1
# range, while result is a plain array. Reuse data's index for the one-hot frame so
# concat pairs the rows one-to-one instead of aligning on mismatched labels, which
# would shift the dummy columns and introduce NaN rows.
newdata = pd.concat([data, pd.DataFrame(result, index=data.index)], axis=1)
newdata.head()

Unnamed: 0,Age,Sex,Embarked,Survived,0,1,2,3,4
0,22.0,male,S,0.0,0.0,1.0,0.0,0.0,1.0
1,38.0,female,C,2.0,1.0,0.0,1.0,0.0,0.0
2,26.0,female,S,2.0,1.0,0.0,0.0,0.0,1.0
3,35.0,female,S,2.0,1.0,0.0,0.0,0.0,1.0
4,35.0,male,S,0.0,0.0,1.0,0.0,0.0,1.0


In [99]:
# This assignment fails: data still has only its own 4 columns while 9 names are supplied.
data.columns = np.hstack((data.columns.values,ohe.get_feature_names())).tolist() # pandas needs the data merged first, and only then the columns renamed!

ValueError: Length mismatch: Expected axis has 4 elements, new values have 9 elements

In [100]:
# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2: call it when it
# exists (this keeps the legacy x0_/x1_ prefixes) and otherwise fall back to
# get_feature_names_out().
ohe_names = ohe.get_feature_names() if hasattr(ohe, "get_feature_names") else ohe.get_feature_names_out()
# Name every column of the merged frame: data's own column names followed by the dummy-variable names.
newdata.columns = np.hstack((data.columns.values, ohe_names)).tolist()

In [101]:
# Preview the merged frame after renaming its columns
newdata.head()

Unnamed: 0,Age,Sex,Embarked,Survived,x0_female,x0_male,x1_C,x1_Q,x1_S
0,22.0,male,S,0.0,0.0,1.0,0.0,0.0,1.0
1,38.0,female,C,2.0,1.0,0.0,1.0,0.0,0.0
2,26.0,female,S,2.0,1.0,0.0,0.0,0.0,1.0
3,35.0,female,S,2.0,1.0,0.0,0.0,0.0,1.0
4,35.0,male,S,0.0,0.0,1.0,0.0,0.0,1.0


In [102]:
# Drop the Sex and Embarked columns from newdata in place
newdata.drop(['Sex','Embarked'],axis=1,inplace=True)

### 四、处理连续型变量：二值化与分段

#### 4.1二值化sklearn.preprocessing.Binarizer

In [104]:
# Work on a separate copy of data for the binarization example
data_2 = data.copy()
from sklearn.preprocessing import Binarizer

In [122]:
X = data_2.iloc[:,0].values.reshape(-1,1) # Binarizer is for features only, so a 1-D array cannot be used
# The median is computed here but the value is neither stored nor used below.
pd.Series(X.flatten()).median()
binarizer = Binarizer(threshold=28) # split into two classes using a threshold of 28
transformer = binarizer.fit_transform(X)
transformer

array([[0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],

#### 4.2分箱函数sklearn.preprocessing.KBinsDiscretizer

* 能够将连续型变量排序后按顺序分箱，再进行编码。

In [138]:
from sklearn.preprocessing import KBinsDiscretizer
import scipy 

In [147]:
# First column of data as a single-column 2-D array
X = data.iloc[:,0].values.reshape(-1,1)
kbd = KBinsDiscretizer(n_bins=3,strategy='quantile',encode='onehot') # 3 bins, quantile (equal-frequency) split, one-hot output
kbd.fit_transform(X).toarray()

X = data.iloc[:,0].values.reshape(-1,1)
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform') # 3 bins, equal-width split, ordinal output
est.fit_transform(X)
# Inspect the bins after the transform: a single column holding three bin values
set(est.fit_transform(X).ravel()) # this is an ndarray

kbd = KBinsDiscretizer(n_bins=3,strategy='uniform',encode='onehot') # 3 bins, equal-width split, one-hot output
kbd.fit_transform(X).toarray()

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])