In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import KBinsDiscretizer

In [2]:
# Load the raw table; the first CSV column is used as the row index.
csv_path = './datas/Narrativedata.csv'
data = pd.read_csv(csv_path, index_col=0)
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [3]:
# Missing-value handling

# Impute Age. sklearn transformers expect a 2-D feature matrix, hence the reshape.
Age = data.loc[:,"Age"].values.reshape(-1,1)

# One imputer per strategy. They get their own names so the imputer objects
# are not overwritten by the arrays they produce.
mean_imputer = SimpleImputer() # default strategy: mean
median_imputer = SimpleImputer(strategy="median") # fill with the median
zero_imputer = SimpleImputer(strategy="constant",fill_value=0) # fill with 0

# Filled versions of Age, each of shape (n_samples, 1).
imp_mean = mean_imputer.fit_transform(Age)
imp_median = median_imputer.fit_transform(Age)
imp_0 = zero_imputer.fit_transform(Age)

# The median-filled version is the one kept. Flatten to 1-D so that a single
# column receives a 1-D array instead of an (n_samples, 1) matrix.
data.loc[:,"Age"] = imp_median.ravel()

# Embarked is filled with its most frequent value (the mode).
Embarked = data.loc[:,"Embarked"].values.reshape(-1,1)
imp_mode = SimpleImputer(strategy = "most_frequent")
data.loc[:,"Embarked"] = imp_mode.fit_transform(Embarked).ravel()

# Inspect the frame after imputation.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
Age         891 non-null float64
Sex         891 non-null object
Embarked    891 non-null object
Survived    891 non-null object
dtypes: float64(1), object(3)
memory usage: 34.8+ KB


In [4]:
# 分类特征




In [5]:
# preprocessing.LabelEncoder: encoder meant for the label (target) column
#
# NOTE: this cell overwrites the label column of `data`; re-running it without
# reloading `data` would fit the encoder on the integer codes instead of the
# original strings.

y = data.iloc[:,-1] # the label, not a feature matrix, so 1-D input is allowed
le = LabelEncoder()
label = le.fit_transform(y)
print(le.classes_) # the distinct classes found in the label column
print()

y1 = le.inverse_transform(label) # map the integer codes back to the original labels
print(y1)

# Replace the column by label instead of `data.iloc[:,-1] = label`: positional
# assignment may write the integers into the existing object-dtype column,
# whereas assigning by column label swaps in a proper integer column.
label_column = data.columns[-1]
data[label_column] = label
data.head()

# Compact form: data[data.columns[-1]] = LabelEncoder().fit_transform(data.iloc[:,-1])

['No' 'Unknown' 'Yes']

['No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'Yes' 'Unknown' 'Yes'
 'No' 'No' 'No' 'Unknown' 'No' 'Yes' 'No' 'Yes' 'Unknown' 'Yes' 'Yes'
 'Yes' 'No' 'Unknown' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'No'
 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'No' 'Unknown' 'Yes' 'No' 'No' 'Yes' 'No'
 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes'
 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'Yes'
 'Unknown' 'No' 'Unknown' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'No' 'Yes' 'Yes'
 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Unknown' 'Yes' 'Yes'
 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'No' 'No'
 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'Yes' 'No'
 'Yes' 'Yes' 'No' 'Unknown' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'No'
 'Unknown' 'Unknown' 'Yes' 'No' 'No' 'No' 'Yes' 'No' 'No' 'Unknown' 'No'
 'Unknown' 'Unknown' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'No'
 'No' 'No' 'Yes' 'Yes' 'No' 

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,2
2,26.0,female,S,2
3,35.0,female,S,2
4,35.0,male,S,0


In [6]:
# preprocessing.OrdinalEncoder: encoder meant for categorical feature columns

# Work on a copy so the original `data` keeps its string categories.
data_ = data.copy()

# Every column except the first (Age) and the last (label) is encoded.
categorical_columns = list(data_.columns[1:-1])

# Fit once and reuse the fitted encoder for both inspection and transformation.
ordinal_encoder = OrdinalEncoder().fit(data_[categorical_columns])

# categories_ plays the same role as LabelEncoder.classes_: one array of
# categories per encoded column.
ordinal_categories = ordinal_encoder.categories_

# Assign by column label so the encoded columns take the dtype of the encoder
# output instead of being written into the existing object-dtype columns.
data_[categorical_columns] = ordinal_encoder.transform(data_[categorical_columns])
data_.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,1.0,2.0,0
1,38.0,0.0,0.0,2
2,26.0,0.0,2.0,2
3,35.0,0.0,2.0,2
4,35.0,1.0,2.0,0


In [7]:
# preprocessing.OneHotEncoder: one-hot encoding, i.e. creating dummy variables

X = data.iloc[:,1:-1]

enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()
print(result)
print()

# One-step equivalent:
#OneHotEncoder(categories='auto').fit_transform(X).toarray()

# The encoding is reversible: inverse_transform recovers the original categories.
restored = pd.DataFrame(enc.inverse_transform(result))

# Name of each one-hot column. get_feature_names() no longer exists in newer
# scikit-learn releases, so prefer get_feature_names_out() when it is available.
if hasattr(enc, "get_feature_names_out"):
    feature_names = enc.get_feature_names_out()
else:
    feature_names = enc.get_feature_names()
print(feature_names)

# axis=1 joins the tables side by side (column-wise); axis=0 would stack them
# vertically. The one-hot frame reuses data's index so that rows are matched
# one-to-one even if that index is not 0..n-1.
onehot = pd.DataFrame(result, index=data.index)
newdata = pd.concat([data,onehot],axis=1).drop(["Sex","Embarked"],axis=1)
newdata.columns = ["Age","Survived","Female","Male","Embarked_C","Embarked_Q","Embarked_S"]
newdata.head()

[[0. 1. 0. 0. 1.]
 [1. 0. 1. 0. 0.]
 [1. 0. 0. 0. 1.]
 ...
 [1. 0. 0. 0. 1.]
 [0. 1. 1. 0. 0.]
 [0. 1. 0. 1. 0.]]

['x0_female' 'x0_male' 'x1_C' 'x1_Q' 'x1_S']


Unnamed: 0,Age,Survived,Female,Male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,0,0.0,1.0,0.0,0.0,1.0
1,38.0,2,1.0,0.0,1.0,0.0,0.0
2,26.0,2,1.0,0.0,0.0,0.0,1.0
3,35.0,2,1.0,0.0,0.0,0.0,1.0
4,35.0,0,0.0,1.0,0.0,0.0,1.0


In [8]:
# 连续特征




In [9]:
# preprocessing.Binarizer

data_2 = data.copy()
# Binarizer is a feature transformer and therefore needs 2-D input; selecting
# the first column with a list keeps it as an (n_samples, 1) matrix.
X = data_2.iloc[:,[0]].values
binarizer = Binarizer(threshold=30)
transformer = binarizer.fit_transform(X)
transformer[:5]

array([[0.],
       [1.],
       [0.],
       [1.],
       [1.]])

In [10]:
# preprocessing.KBinsDiscretizer

X = data.iloc[:,0].values.reshape(-1,1)

# Ordinal output: a single column holding the bin index of every sample.
# Fit once and reuse the result instead of fitting a second time for the print.
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
ordinal_bins = est.fit_transform(X)

# Distinct bin indices after the transform: one column, three bins.
print(set(ordinal_bins.ravel()))

# One-hot output: one indicator column per bin.
est = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')
est.fit_transform(X).toarray()

{0.0, 1.0, 2.0}


array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])