In [1]:
import numpy as np 
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder    # 特徵縮放、編碼，為計算其變異數
from sklearn.compose import make_column_transformer    # 建立流程
from sklearn.feature_selection import VarianceThreshold    # 移除低變異特徵

### Data Import

In [2]:
# Load the object stored in data_pre.pkl; the cell output shows it is a dict
# with the keys 'target', 'features', 'others' and 'features_pre'.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
dict_df_pre = pd.read_pickle('data_pre.pkl')
dict_df_pre.keys()

dict_keys(['target', 'features', 'others', 'features_pre'])

In [3]:
# Work on a copy of the 'features_pre' entry so the object held in the loaded dict is left untouched.
X = dict_df_pre['features_pre'].copy()

### Remove Low-variance Features by Variance Threshold

* 特徵編碼

In [4]:
# Feature names handled by each encoder: categorical columns are split by
# whether their categorical dtype is ordered.
cat_cols = X.select_dtypes(include='category').columns
cols_to_DumEnc = [name for name in cat_cols if not X[name].dtype.ordered]
cols_to_OrdEnc = [name for name in cat_cols if X[name].dtype.ordered]

# Encoders
# uint8 covers 0 .. (2**8)-1 — assumes no ordered feature has more than 256 levels; TODO confirm
dummy_encoder = OneHotEncoder(drop='first', dtype='uint8', sparse_output=False)    # dummy encoding
ordinal_encoder = OrdinalEncoder(dtype='uint8')

# Encoding pipeline
col_trans = make_column_transformer(
    (dummy_encoder, cols_to_DumEnc),
    (ordinal_encoder, cols_to_OrdEnc),
    remainder='passthrough',    # leave the remaining columns as they are
    verbose_feature_names_out=False,    # no transformer-name prefix on output columns
)
col_trans.set_output(transform='pandas')    # return a DataFrame instead of an array

# Run the encoding
X_enc = col_trans.fit_transform(X)

* 特徵縮放

In [5]:
# Columns to rescale: every uint32 column of the encoded frame plus the ordinal-encoded columns.
# NOTE(review): numeric columns of any other dtype are not picked up here — confirm all numeric features are uint32.
uint32_cols = X_enc.select_dtypes(include='uint32').columns.tolist()
cols_to_scale = uint32_cols + cols_to_OrdEnc

# Min-max scaler; feature_range=(0, 1) is also the default
mms = MinMaxScaler(feature_range=(0, 1))
mms.set_output(transform='pandas')

# Fit and apply the scaling, storing the result as float32
X_scale = mms.fit_transform(X_enc[cols_to_scale]).astype('float32')

# Scaled columns first, followed by the encoded columns that were not scaled
unscaled_part = X_enc.drop(columns=cols_to_scale)
X_vt0 = pd.concat([X_scale, unscaled_part], axis=1)

In [6]:
# Release intermediates that no later cell references; X_enc, cols_to_scale and X_vt0 are still needed below.
del X, cols_to_DumEnc, cols_to_OrdEnc, col_trans, mms, X_scale

* 特徵篩選

In [7]:
# Feature-selection threshold: the proportion of cases in which expired food was found,
# taken as the mean of the 'target' entry.
t = dict_df_pre['target'].mean()

# Instantiate the VarianceThreshold selector (its default threshold would be 0.0)
vt = VarianceThreshold(threshold=t)
vt.set_output(transform='pandas')    # return a DataFrame instead of an array

# Run the selection on the scaled / encoded frame
X_vt = vt.fit_transform(X_vt0)

In [8]:
print(f'variance threshold: {t:.4f}')

# Selection summary: one row per column of the frame fed to the selector, with its
# variance (rounded to 4 decimals) and whether the selector dropped it.
feature_names = list(X_vt0.columns)
variances = np.var(X_vt0, axis=0).round(decimals=4)
is_excluded = [name not in X_vt.columns for name in feature_names]

df_vtRsl = pd.DataFrame(
    data={'特徵': feature_names, '變異數': variances, '是否排除': is_excluded},
)

variance threshold: 0.0054


* 縮放特徵轉回原始特徵（SMOTE -> Standardization）

In [9]:
# Return the selected features in their original (unscaled) values.
# Only the columns that survived VarianceThreshold are taken from the encoded frame.
# Selecting X_enc[cols_to_scale] wholesale would bring back scaled columns the selector
# removed and contradict the exclusion flags recorded in df_vtRsl.
# Column order follows X_vt: surviving scaled columns first, then the other survivors.
X_select = X_enc[X_vt.columns]

In [10]:
# Release the remaining intermediates; X_select, t and df_vtRsl are kept for the save cell.
del cols_to_scale, X_enc, X_vt0, vt, X_vt

### Data Save

In [None]:
# Attach the feature-selection outputs to the loaded dict under three new keys.
dict_df_pre.update({
    'features_sel': X_select,    # selected features
    'vt': t,    # variance threshold that was applied
    'vt_result': df_vtRsl,    # per-feature variance / exclusion table
})

# Writing the dict to disk is currently disabled.
# pd.to_pickle(dict_df_pre, 'data_fs.pkl')