In [6]:
import pandas as pd

In [7]:
# --- Column roles shared by the training and prediction datasets ------------
# Columns that stay numeric; every other remaining column is treated as categorical.
INT_COLUMNS = ['locdt', 'loctm', 'flam1', 'csmam']
FLOAT_COLUMNS = ['conam', 'iterm']

# Categorical columns whose missing values are replaced by a sentinel category.
columns_to_fill = ['etymd', 'mcc', 'stocn', 'scity', 'hcefg', 'csmcu']
MISSING_CATEGORY = '-1'

# stscd (status code) is almost entirely missing and is not expected to be an
# important feature; txkey is unique for every row, so it is not used as a feature.
COLUMNS_TO_DROP = ['stscd', 'txkey']


def _prepare_features(raw_df):
    """Return a cleaned copy of a raw transaction frame, ready for modelling.

    The same steps are applied to the training set and to the prediction set
    so both end up with identical dtypes and missing-value handling:

    1. Drop the columns listed in ``COLUMNS_TO_DROP``.
    2. Cast ``INT_COLUMNS`` to int64, ``FLOAT_COLUMNS`` to float64 and every
       other remaining column to ``category``.
    3. In each column of ``columns_to_fill``, replace missing values with the
       sentinel category ``MISSING_CATEGORY``.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame as read from CSV. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original column order (minus dropped columns).

    Raises
    ------
    KeyError
        If an expected column is absent from ``raw_df``.
    ValueError
        If a numeric column cannot be cast (e.g. it contains missing values).
    """
    # Drop first so the unique-per-row txkey is never converted to a
    # categorical just to be thrown away afterwards.
    features = raw_df.drop(columns=COLUMNS_TO_DROP)

    # Most columns are categorical, so default everything to 'category' and
    # override the few numeric ones; a single astype call sets all dtypes.
    dtype_by_column = {column: 'category' for column in features.columns}
    dtype_by_column.update({column: 'int64' for column in INT_COLUMNS})
    dtype_by_column.update({column: 'float64' for column in FLOAT_COLUMNS})
    features = features.astype(dtype_by_column)

    for column in columns_to_fill:
        filled = features[column]
        # add_categories raises ValueError when the category already exists,
        # so only register the sentinel when it is not there yet.
        if MISSING_CATEGORY not in filled.cat.categories:
            filled = filled.cat.add_categories(MISSING_CATEGORY)
        features[column] = filled.fillna(MISSING_CATEGORY)

    return features


# Prediction dataset. It is only needed once the model has been trained.
public_df = pd.read_csv('public_processed.csv')

# Keep the txkey column for the final upload. The explicit copy makes final_df
# independent of public_df, so columns can be added to it later without
# pandas chained-assignment warnings.
final_df = public_df[['txkey']].copy()

public_df = _prepare_features(public_df)

# Training dataset, cleaned with exactly the same steps.
# The uniqueness of txkey can be re-checked on the raw frame with:
#   uni_txkey = raw_df['txkey'].value_counts().reset_index()
#   print(uni_txkey.loc[uni_txkey['count'] != 1])
train_df = _prepare_features(pd.read_csv('training.csv'))

## PyCaret Tutorial

In [8]:
# Convert label to a numeric (int64) dtype; per the original author, leaving
# it unconverted causes an error in the modelling steps.
train_df['label'] = train_df['label'].astype('int64')

In [1]:
# Import PyCaret's ClassificationExperiment class and create the experiment
# object (`exp`) that the following cells configure and run.
from pycaret.classification import ClassificationExperiment
exp = ClassificationExperiment()

In [9]:
# Collect the names of all category-dtype columns, leaving out the target.
categorical_feature = []
for column_name, column_dtype in train_df.dtypes.items():
    if column_dtype == 'category' and column_name != 'label':
        categorical_feature.append(column_name)

print(categorical_feature)
print('Number of categorical feature:', len(categorical_feature))

['chid', 'cano', 'contp', 'etymd', 'mchno', 'acqic', 'mcc', 'ecfg', 'insfg', 'bnsfg', 'stocn', 'scity', 'ovrlt', 'flbmk', 'hcefg', 'csmcu', 'flg_3dsmk']
Number of categorical feature: 17


### Setup

In [11]:
# Configure the experiment on the training frame with `label` as the target.
setup_options = dict(
    target='label',
    # Rebalance the classes with the RandomOverSampler method.
    fix_imbalance=True,
    fix_imbalance_method='RandomOverSampler',
    n_jobs=10,
    fold=5,
    # Log the run under a named experiment.
    log_experiment=True,
    experiment_name='01_FirstExp',
    # Fixed session id.
    session_id=123,
)
exp.setup(train_df, **setup_options)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Binary
3,Original data shape,"(8688526, 24)"
4,Transformed data shape,"(14725654, 51)"
5,Transformed train set shape,"(12119096, 51)"
6,Transformed test set shape,"(2606558, 51)"
7,Ordinal features,6
8,Numeric features,6
9,Categorical features,17


2023/11/13 15:03:58 INFO mlflow.tracking.fluent: Experiment with name '01_FirstExp' does not exist. Creating a new experiment.


<pycaret.classification.oop.ClassificationExperiment at 0x104f84bb0>

In [None]:
# Show the table returned by exp.models(); presumably the catalogue of
# available estimators whose IDs are used for model selection — confirm
# against the PyCaret docs.
exp.models()

In [None]:
# Benchmark a shortlist of models (given by their PyCaret IDs) and keep the
# one ranked first when sorting by F1.
include_models = [
    'lr',
    'nb',
    'dt',
    'svm',
    'mlp',
    'rf',
    'lightgbm',
]
best = exp.compare_models(sort='F1', include=include_models)

In [None]:
#exp.evaluate_model(best)
#exp.plot_model(best, plot='feature')

In [None]:
# Persist the best model from the comparison under the name
# '01_first_model_pipeline'.
exp.save_model(best, '01_first_model_pipeline')

In [None]:
# Tune the best model from the comparison, optimizing for F1.
tune_best = exp.tune_model(best, optimize='F1')

In [None]:
# Display the tuned model as the cell output.
tune_best

In [None]:
# Persist the tuned model under the name '02_tuned_model_pipeline'.
exp.save_model(tune_best, '02_tuned_model_pipeline')