In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

## 归一化

sklearn的归一化数据，会将二维数组按列方向广播，进行归一化

In [4]:
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
df = pd.DataFrame(data)
df

Unnamed: 0,0,1
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,18


In [5]:
scaler = MinMaxScaler()
result = scaler.fit_transform(df)
result

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [8]:
data = [[2, 6], [2, 6], [2, 6], [2, 6]]
df = pd.DataFrame(data)
df
scaler = StandardScaler()
result = scaler.fit_transform(df)
result

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

## 缺失值
可以采用sklearn.impute模块来处理，也可用pandas来处理NaN   
几种常用策略：   
+ 均值   
+ 中位数  
+ 众数  
+ 常数  
+ 舍弃

In [22]:
data = [[2, 3], [1, 4], [2, 6], [2, 6], [np.nan,7]]
df = pd.DataFrame(data)
print(df)
print(df.loc[:,0].fillna(df.loc[:,0].mean()))
print(df.loc[:,0].fillna(df.loc[:,0].median()))
print(df.loc[:,0].fillna(df.loc[:,0].mode()[0]))
print(df.loc[:,0].fillna(0))

     0  1
0  2.0  3
1  1.0  4
2  2.0  6
3  2.0  6
4  NaN  7
0    2.00
1    1.00
2    2.00
3    2.00
4    1.75
Name: 0, dtype: float64
0    2.0
1    1.0
2    2.0
3    2.0
4    2.0
Name: 0, dtype: float64
0    2.0
1    1.0
2    2.0
3    2.0
4    2.0
Name: 0, dtype: float64
0    2.0
1    1.0
2    2.0
3    2.0
4    0.0
Name: 0, dtype: float64


In [24]:
print(df)
df_ = df.dropna() # 参数axis=0,inplace=False
df_

     0  1
0  2.0  3
1  1.0  4
2  2.0  6
3  2.0  6
4  NaN  7


Unnamed: 0,0,1
0,2.0,3
1,1.0,4
2,2.0,6
3,2.0,6


## 特征编码和哑变量
通常特征变量有三种类型：  
1. 无序离散变量，也称为分类变量/特征，例如性别：男，女
2. 有序离散变量，离散值之间有顺序关系，例如学位：学士，硕士，博士
3. 连续变量，变量值之间存在距离关系

针对离散变量，需要将文字转换为数值，这个过程叫编码，（可以采用Ordinal Encoding）  
例如学位：  
学士-0  
硕士-1  
博士-2  

针对分类变量，需要将类别转换成一个编码，彼此互不相关 （常见的采用One-Hot Encoding）  
例如性别：  
男-01    
女-10   


In [59]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [119]:
df = pd.read_csv('./Narrativedata.csv', index_col=0)
# df.info()
df = df.dropna().reset_index(drop=True) # reset_index(drop=True) drop后要重置索引！否则后面concat就对不上
x_data = df.iloc[:,0:-1]
y_data = df.iloc[:,-1]
x_data

Unnamed: 0,Age,Sex,Embarked
0,22.0,male,S
1,38.0,female,C
2,26.0,female,S
3,35.0,female,S
4,35.0,male,S
...,...,...,...
707,39.0,female,Q
708,27.0,male,S
709,19.0,female,S
710,26.0,male,C


In [61]:
y_data = LabelEncoder().fit_transform(y_data)
y_data

In [80]:
enc = OrdinalEncoder(categories='auto', dtype=int)
x_data[['Sex','Embarked']] = enc.fit_transform( x_data[['Sex','Embarked']])
print(enc.categories_)
print(enc.feature_names_in_)
x_data

[array([0, 1]), array([0, 1, 2])]
['Sex' 'Embarked']


Unnamed: 0,Age,Sex,Embarked
0,22.0,1,2
1,38.0,0,0
2,26.0,0,2
3,35.0,0,2
4,35.0,1,2
...,...,...,...
885,39.0,0,1
886,27.0,1,2
887,19.0,0,2
889,26.0,1,0


In [116]:
enc = OneHotEncoder(dtype=int)
result = enc.fit_transform( x_data[['Sex','Embarked']]).toarray()
print(enc.categories_)
print(enc.feature_names_in_) #参与on-hot编码的特征名字
result

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]
['Sex' 'Embarked']


array([[0, 1, 0, 0, 1],
       [1, 0, 1, 0, 0],
       [1, 0, 0, 0, 1],
       ...,
       [1, 0, 0, 0, 1],
       [0, 1, 1, 0, 0],
       [0, 1, 0, 1, 0]])

In [91]:
# one-hot编码后列特征名
column_names = enc.get_feature_names_out()
column_names

array(['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype=object)

In [121]:
x_data_new = pd.concat([x_data,pd.DataFrame(result)], axis=1)
x_data_new.drop(columns=enc.feature_names_in_,inplace=True)
x_data_new


Unnamed: 0,Age,0,1,2,3,4
0,22.0,0,1,0,0,1
1,38.0,1,0,1,0,0
2,26.0,1,0,0,0,1
3,35.0,1,0,0,0,1
4,35.0,0,1,0,0,1
...,...,...,...,...,...,...
707,39.0,1,0,0,1,0
708,27.0,0,1,0,0,1
709,19.0,1,0,0,0,1
710,26.0,0,1,1,0,0


In [125]:
# 修改列名
keys = range(column_names.shape[0])
x_data_new.rename(columns=dict(zip(keys, column_names)), inplace=True)
x_data_new.columns

Index(['Age', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')

In [126]:
x_data_new

Unnamed: 0,Age,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,0,1,0,0,1
1,38.0,1,0,1,0,0
2,26.0,1,0,0,0,1
3,35.0,1,0,0,0,1
4,35.0,0,1,0,0,1
...,...,...,...,...,...,...
707,39.0,1,0,0,1,0
708,27.0,0,1,0,0,1
709,19.0,1,0,0,0,1
710,26.0,0,1,1,0,0
