# 餐飲業逾期風險預測模型
由我國「應建立食品追溯追蹤系統之食品業者」規定可知，須於本署非追不可系統申報食品流向資料者多為製造與輸入業者；販售與餐飲業者則無相關資料可用，無法推算其庫存，也就無從進一步評估其是否可能存放逾期食品。因此，針對販售與餐飲業者，改為透過後市場稽查資料及業者相關資訊，嘗試以監督式學習方法建構業者逾期食品違規風險預測模型；此處為餐飲業資料的建模評估。

### 程式架構
* 資料前處理(Preprocessing.ipynb)
* **特徵篩選(FeatureSelection.ipynb)**
* 資料不平衡處理(HandlingImbData.ipynb)
* 模型配適(ModelFitting.ipynb)

## 特徵篩選

### Package

In [None]:
import numpy as np 
import pandas as pd

from sklearn.preprocessing import MinMaxScaler    # 特徵縮放
# from sklearn.feature_selection import VarianceThreshold    # 移除低變異特徵

### Data Import

In [2]:
# Load the pickled bundle of preprocessed data (presumably written by Preprocessing.ipynb).
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
dict_dfs_pre = pd.read_pickle(filepath_or_buffer='../data/data_pre.pkl')
dict_dfs_pre.keys()    # display the top-level keys of the bundle

dict_keys(['others', 'train', 'test'])

In [3]:
# Work on an independent (deep) copy so the loaded training features are not modified.
X_train = dict_dfs_pre['train']['X'].copy(deep=True)

### Remove Low-variance Features
<div>移除訓練集低變異特徵，包含：</div>
<ul>
    <li>類別特徵：計算樣本數最多類別占比，排除高於目標最多類別占比之特徵。</li>
    <li>數值特徵：縮放至0~1，計算變異數，排除低於目標變異數之特徵。</li>
</ul>

##### 類別特徵：計算樣本數最多類別占比

In [4]:
# Proxy low-variance screen for categorical / uint8 features, based on the share of
# the most frequent level in the training set.
# Threshold = 1 - mean(y); per the original note this is the share of non-violation
# cases (assumes y is a 0/1 label -- TODO confirm).
thres_cat = 1 - dict_dfs_pre['train']['y'].mean()

# Candidate columns: pandas 'category' dtype plus 'uint8' columns.
cols_cat = X_train.select_dtypes(include=['category', 'uint8']).columns.tolist()

# Count of the most frequent level divided by the total number of training rows.
topRatio_cat = [
    X_train[name].value_counts().max() / len(X_train)
    for name in cols_cat
]

# One row per feature; a feature is flagged for removal when its top share
# exceeds the threshold.
df_proxyVTRsl = pd.DataFrame({
    '特徵': cols_cat,
    '最多類別占比': np.round(topRatio_cat, 4),
    '是否排除': [ratio > thres_cat for ratio in topRatio_cat],
})
print(f'閾值：{thres_cat: .4f}')


閾值： 0.9947


In [5]:
# Release the temporaries of the categorical screen; only the threshold and the
# result table are kept.
del cols_cat
del topRatio_cat

##### 數值特徵：計算變異數

In [6]:
# Variance screen for numeric features after min-max scaling.
# Threshold = variance of y with ddof=0; the feature variances below also use
# ddof=0 so both sides of the comparison are computed on the same basis.
thres_num = dict_dfs_pre['train']['y'].var(ddof=0)

# Only columns stored as uint32 are treated as numeric features here.
cols_num = X_train.select_dtypes(include='uint32').columns.tolist()

# Rescale every numeric column with a default MinMaxScaler and keep the result
# as a DataFrame (set_output configures the scaler in place).
mms = MinMaxScaler()
mms.set_output(transform='pandas')
X_mms = mms.fit(X_train[cols_num]).transform(X_train[cols_num])
var_num = list(X_mms.var(ddof=0))

# One row per feature; a feature is flagged for removal when its scaled
# variance is below the threshold.
df_VTRsl = pd.DataFrame({
    '特徵': cols_num,
    '變異數': var_num,
    '是否排除': [v < thres_num for v in var_num],
})
print(f'閾值：{thres_num: .4f}')


閾值： 0.0053


In [7]:
# Release the temporaries of the numeric screen; only the threshold and the
# result table are kept.
del cols_num
del mms
del X_mms
del var_num

##### 特徵篩選

In [8]:
# Names flagged for removal by either screen: categorical results first,
# then numeric results.
cols_rm = [
    *df_proxyVTRsl[df_proxyVTRsl['是否排除']]['特徵'],
    *df_VTRsl[df_VTRsl['是否排除']]['特徵'],
]

# Apply the selection to the training features.
X_train_select = X_train.drop(cols_rm, axis='columns')

# The test set takes no part in choosing the features; it only receives the
# same column removal that was decided on the training set.
X_test_select = dict_dfs_pre['test']['X'].drop(cols_rm, axis='columns')

### Data Save

In [None]:
# Store the reduced feature matrices alongside the original entries of each split.
dict_dfs_pre['train']['X_select'] = X_train_select
dict_dfs_pre['test']['X_select'] = X_test_select

# Keep both thresholds and both per-feature result tables for later reference.
dict_vt = dict(
    threshold_cat=thres_cat, result_cat=df_proxyVTRsl,
    threshold_num=thres_num, result_num=df_VTRsl,
)
dict_dfs_pre['feature_selection'] = dict_vt

# Serialization is currently disabled; uncomment the next line to write the bundle to disk.
# pd.to_pickle(dict_dfs_pre, '../data/data_fs.pkl')

##### 特徵編碼

In [10]:
# NOTE(review): disabled draft of the feature-encoding step, kept for reference only.
# It uses make_column_transformer, OneHotEncoder and OrdinalEncoder, none of which are
# imported in the visible cells, and it encodes X_train rather than X_train_select --
# confirm both points before re-enabling.

# # Feature names handled by each encoder (unordered vs. ordered categoricals)
# cols_to_DumEnc = [col for col in X_train.select_dtypes(include='category').columns if not X_train[col].dtype.ordered]
# cols_to_OrdEnc = [col for col in X_train.select_dtypes(include='category').columns if X_train[col].dtype.ordered]

# # Encoding pipeline
# enc_trans = make_column_transformer(
#     (OneHotEncoder(drop='first', dtype='uint8', sparse_output=False), cols_to_DumEnc),    # dummy encoding
#     (OrdinalEncoder(dtype='uint8'), cols_to_OrdEnc),    # uint8: 0 to (2**8)-1 (recommend)
#     remainder='passthrough',    # pass the remaining columns through unchanged
#     force_int_remainder_cols=False,    # corresponds to the `remainder` (default: True)
#     verbose_feature_names_out=False    # drop the transformer-name prefix
# ).set_output(transform='pandas')    # output a DataFrame

# # enc_trans.fit(X_train)    # fit training set
# # X_enc = enc_trans.transform(X_train)    # transform training set

# X_enc = enc_trans.fit_transform(X_train)