In [1]:
import numpy as np
import pandas as pd
from pycaret.classification import ClassificationExperiment
from sklearn.metrics import classification_report

In [2]:
# Load the training transactions from the CSV file in the working directory.
train_df = pd.read_csv(filepath_or_buffer='training.csv')

In [3]:
# Every txkey value is unique, so the column is dropped as unused.
# errors='ignore' keeps this cell safe to re-run after the column is already gone
# (the previous in-place drop raised KeyError on a second execution).
train_df = train_df.drop(columns='txkey', errors='ignore')

## Set up data types

In [4]:
# Columns to store as pandas categoricals.
category_columns = [
    'chid', 'cano', 'contp', 'etymd', 'mchno', 'acqic', 'mcc', 'ecfg', 'insfg', 'bnsfg',
    'stocn', 'scity', 'stscd', 'ovrlt', 'flbmk', 'hcefg', 'csmcu', 'flg_3dsmk', 'label',
]
train_df[category_columns] = train_df[category_columns].astype('category')

# Downcast whatever is still int64 after the categorical conversion to int32.
int64_columns = train_df.select_dtypes(include='int64').columns
train_df[int64_columns] = train_df[int64_columns].astype('int32')

# Downcast float64 columns to float32.
float64_columns = train_df.select_dtypes(include='float64').columns
train_df[float64_columns] = train_df[float64_columns].astype('float32')

# Show the resulting dtypes and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8688526 entries, 0 to 8688525
Data columns (total 25 columns):
 #   Column     Dtype   
---  ------     -----   
 0   locdt      int32   
 1   loctm      int32   
 2   chid       category
 3   cano       category
 4   contp      category
 5   etymd      category
 6   mchno      category
 7   acqic      category
 8   mcc        category
 9   conam      float32 
 10  ecfg       category
 11  insfg      category
 12  iterm      float32 
 13  bnsfg      category
 14  flam1      int32   
 15  stocn      category
 16  scity      category
 17  stscd      category
 18  ovrlt      category
 19  flbmk      category
 20  hcefg      category
 21  csmcu      category
 22  csmam      int32   
 23  flg_3dsmk  category
 24  label      category
dtypes: category(19), float32(2), int32(4)
memory usage: 502.4 MB


In [5]:
# Count the missing values in each column.
print(train_df.isna().sum())

locdt              0
loctm              0
chid               0
cano               0
contp              0
etymd         203455
mchno              0
acqic              0
mcc             4550
conam              0
ecfg               0
insfg              0
iterm              0
bnsfg              0
flam1              0
stocn            600
scity         266066
stscd        8665195
ovrlt              0
flbmk              0
hcefg         286656
csmcu         498657
csmam              0
flg_3dsmk          0
label              0
dtype: int64


### Drop the 'stscd' status code column (8,665,195 of its 8,688,526 values are missing)

In [6]:
# Drop 'stscd'. errors='ignore' keeps this cell safe to re-run once the column is gone
# (the previous in-place drop raised KeyError on a second execution).
train_df = train_df.drop(columns='stscd', errors='ignore')

## Add a per-`chid` transaction sequence column

In [7]:
# Number each row 1, 2, 3, ... within its chid, ordered by (locdt, loctm).
# cumcount() keeps the original row labels, so the assignment aligns back onto train_df's rows.
chid_time_order = ['chid', 'locdt', 'loctm']
sequence_number = train_df.sort_values(by=chid_time_order).groupby('chid').cumcount().add(1)
train_df['sequence'] = sequence_number.astype('int32')

In [8]:
# Categorical columns whose missing values are replaced with a sentinel category.
columns_to_fill = ['etymd', 'mcc', 'stocn', 'scity', 'hcefg', 'csmcu']
MISSING_CATEGORY = '-1'

for column in columns_to_fill:
    # add_categories raises ValueError when the category already exists, so the
    # sentinel is registered only when absent; this keeps the cell safe to re-run.
    if MISSING_CATEGORY not in train_df[column].cat.categories:
        train_df[column] = train_df[column].cat.add_categories(MISSING_CATEGORY)
    train_df[column] = train_df[column].fillna(MISSING_CATEGORY)

# Confirm that no missing values remain.
print(train_df.isnull().sum())
# info() prints its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
train_df.info()

locdt        0
loctm        0
chid         0
cano         0
contp        0
etymd        0
mchno        0
acqic        0
mcc          0
conam        0
ecfg         0
insfg        0
iterm        0
bnsfg        0
flam1        0
stocn        0
scity        0
ovrlt        0
flbmk        0
hcefg        0
csmcu        0
csmam        0
flg_3dsmk    0
label        0
sequence     0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8688526 entries, 0 to 8688525
Data columns (total 25 columns):
 #   Column     Dtype   
---  ------     -----   
 0   locdt      int32   
 1   loctm      int32   
 2   chid       category
 3   cano       category
 4   contp      category
 5   etymd      category
 6   mchno      category
 7   acqic      category
 8   mcc        category
 9   conam      float32 
 10  ecfg       category
 11  insfg      category
 12  iterm      float32 
 13  bnsfg      category
 14  flam1      int32   
 15  stocn      category
 16  scity      category
 17  ovrlt      category

## PyCaret setup

In [9]:
# Convert label to a numeric dtype; the author notes that skipping this conversion produces an error.
label_as_int = train_df['label'].astype('int64')
train_df['label'] = label_as_int

In [10]:
# Create the experiment object that the setup / create_model / tune_model / predict_model
# cells below call into. ClassificationExperiment is imported in the notebook's top import cell.
exp = ClassificationExperiment()

In [11]:
# Collect the names of all category-dtype columns other than the target.
categorical_feature = [
    name
    for name, column_dtype in train_df.dtypes.items()
    if name != 'label' and isinstance(column_dtype, pd.CategoricalDtype)
]
print(categorical_feature)
print('Number of categorical feature:', len(categorical_feature))

['chid', 'cano', 'contp', 'etymd', 'mchno', 'acqic', 'mcc', 'ecfg', 'insfg', 'bnsfg', 'stocn', 'scity', 'ovrlt', 'flbmk', 'hcefg', 'csmcu', 'flg_3dsmk']
Number of categorical feature: 17


In [12]:
# Configure the experiment on the training frame with 'label' as the target.
# The setup() call is kept as the cell's last expression so its return value is displayed.
setup_options = dict(
    target='label',
    fix_imbalance=True,
    fix_imbalance_method='RandomOverSampler',
    n_jobs=8,
    fold=5,
    log_experiment=True,
    experiment_name='Exp_1121_add_time',
    session_id=696,
)
exp.setup(train_df, **setup_options)

Unnamed: 0,Description,Value
0,Session id,696
1,Target,label
2,Target type,Binary
3,Original data shape,"(8688526, 25)"
4,Transformed data shape,"(14725654, 52)"
5,Transformed train set shape,"(12119096, 52)"
6,Transformed test set shape,"(2606558, 52)"
7,Ordinal features,6
8,Numeric features,7
9,Categorical features,17


2023/11/21 18:09:17 INFO mlflow.tracking.fluent: Experiment with name 'Exp_1121_add_time' does not exist. Creating a new experiment.


<pycaret.classification.oop.ClassificationExperiment at 0x7f6d49b775b0>

### LightGBM

In [13]:
# Train the baseline LightGBM classifier; the displayed table reports its per-fold metrics.
LightGBM = exp.create_model(estimator='lightgbm')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9986,0.8597,0.705,0.8914,0.7873,0.7866,0.7921
1,0.9985,0.8583,0.6974,0.8784,0.7775,0.7767,0.782
2,0.9986,0.9736,0.7079,0.8976,0.7915,0.7908,0.7965
3,0.9986,0.8598,0.7016,0.8922,0.7855,0.7848,0.7905
4,0.9986,0.9677,0.7027,0.8944,0.7871,0.7864,0.7921
Mean,0.9986,0.9038,0.7029,0.8908,0.7858,0.7851,0.7906
Std,0.0,0.0546,0.0035,0.0066,0.0046,0.0046,0.0048


In [14]:
# Persist the baseline model together with its transformation pipeline.
exp.save_model(model=LightGBM, model_name='Exp_1121_add_time_lightgbm_cv5_pipeline')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['locdt', 'loctm', 'conam', 'iterm',
                                              'flam1', 'csmam', 'sequence'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean',
                                                               verbose='deprecated'))),
                 ('categorical_imputer',
                  Tra...
                  LGBMClassifier(boosting_type='gbdt', class_weigh

In [15]:
# Tune the baseline model's hyperparameters, selecting candidates by F1.
tune_lightgbm = exp.tune_model(estimator=LightGBM, optimize='F1')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9988,0.9822,0.7074,0.9432,0.8085,0.8079,0.8163
1,0.9987,0.984,0.7016,0.9316,0.8004,0.7998,0.8079
2,0.9988,0.9937,0.7112,0.9418,0.8104,0.8098,0.8179
3,0.9987,0.986,0.7023,0.9439,0.8054,0.8048,0.8136
4,0.9988,0.9941,0.7027,0.9445,0.8059,0.8053,0.8141
Mean,0.9987,0.988,0.705,0.941,0.8061,0.8055,0.814
Std,0.0,0.005,0.0037,0.0048,0.0034,0.0034,0.0034


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Number of positive: 4847638, number of negative: 4847638
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.273364 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2837
[LightGBM] [Info] Number of data points in the train set: 9695276, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 4847638, number of negative: 4847638
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.107224 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2814
[LightGBM] [Info] Number of data points in the train set: 9695276, number of used features: 50


In [16]:
# Persist the tuned model together with its transformation pipeline.
exp.save_model(model=tune_lightgbm, model_name='Tuned_1121_add_time_lightgbm_cv5_pipeline')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['locdt', 'loctm', 'conam', 'iterm',
                                              'flam1', 'csmam', 'sequence'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean',
                                                               verbose='deprecated'))),
                 ('categorical_imputer',
                  Tra...
                                 boosting_type='gbdt', class_weigh

### RF

### Evaluate models on the public dataset

In [17]:
# Load the public evaluation transactions from the CSV file in the working directory.
public_df = pd.read_csv(filepath_or_buffer='public.csv')

In [18]:
# Columns to store as pandas categoricals. 'txkey' and 'stscd' are deliberately left out:
# both are dropped immediately after this step, so building categoricals for them
# is wasted work.
public_category_columns = [
    'chid', 'cano', 'contp', 'etymd', 'mchno', 'acqic', 'mcc', 'ecfg', 'insfg', 'bnsfg',
    'stocn', 'scity', 'ovrlt', 'flbmk', 'hcefg', 'csmcu', 'flg_3dsmk',
]
public_df[public_category_columns] = public_df[public_category_columns].astype('category')

# Downcast whatever is still int64 after the categorical conversion to int32.
public_int64_columns = public_df.select_dtypes(include='int64').columns
public_df[public_int64_columns] = public_df[public_int64_columns].astype('int32')

# Downcast float64 columns to float32.
public_float64_columns = public_df.select_dtypes(include='float64').columns
public_df[public_float64_columns] = public_df[public_float64_columns].astype('float32')

In [19]:
# Remove the same two columns that were removed from the training frame.
# One drop() call replaces the two in-place calls, and errors='ignore' keeps the cell
# safe to re-run (the in-place version raised KeyError on a second execution).
public_df = public_df.drop(columns=['txkey', 'stscd'], errors='ignore')

In [20]:
# Number each row 1, 2, 3, ... within its chid, ordered by (locdt, loctm).
# cumcount() keeps the original row labels, so the assignment aligns back onto public_df's rows.
# NOTE(review): the counter is computed from public_df alone, so it restarts at 1 for every
# chid in this file — confirm that this matches what the model's 'sequence' feature is meant to represent.
public_chid_time_order = ['chid', 'locdt', 'loctm']
public_sequence_number = public_df.sort_values(by=public_chid_time_order).groupby('chid').cumcount().add(1)
public_df['sequence'] = public_sequence_number.astype('int32')

In [21]:
# Categorical columns whose missing values are replaced with a sentinel category.
columns_to_fill = ['etymd', 'mcc', 'stocn', 'scity', 'hcefg', 'csmcu']
MISSING_CATEGORY = '-1'

for column in columns_to_fill:
    # add_categories raises ValueError when the category already exists, so the
    # sentinel is registered only when absent; this keeps the cell safe to re-run.
    if MISSING_CATEGORY not in public_df[column].cat.categories:
        public_df[column] = public_df[column].cat.add_categories(MISSING_CATEGORY)
    public_df[column] = public_df[column].fillna(MISSING_CATEGORY)

# Confirm that no missing values remain.
print(public_df.isnull().sum())
# info() prints its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
public_df.info()

locdt        0
loctm        0
chid         0
cano         0
contp        0
etymd        0
mchno        0
acqic        0
mcc          0
conam        0
ecfg         0
insfg        0
iterm        0
bnsfg        0
flam1        0
stocn        0
scity        0
ovrlt        0
flbmk        0
hcefg        0
csmcu        0
csmam        0
flg_3dsmk    0
label        0
sequence     0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 609040 entries, 0 to 609039
Data columns (total 25 columns):
 #   Column     Non-Null Count   Dtype   
---  ------     --------------   -----   
 0   locdt      609040 non-null  int32   
 1   loctm      609040 non-null  int32   
 2   chid       609040 non-null  category
 3   cano       609040 non-null  category
 4   contp      609040 non-null  category
 5   etymd      609040 non-null  category
 6   mchno      609040 non-null  category
 7   acqic      609040 non-null  category
 8   mcc        609040 non-null  category
 9   conam      609040 non-null  float3

In [23]:
# Separate the target column from the feature columns of the public set.
y_public = public_df['label']
X_public = public_df.drop(columns='label')

In [26]:
# Score the public features with the tuned model.
predictions = exp.predict_model(estimator=tune_lightgbm, data=X_public)



In [39]:
# Per-class precision / recall / F1 of the predicted labels against the public-set labels.
# classification_report is imported in the notebook's top import cell.
print(classification_report(y_public, predictions['prediction_label'].values))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    606856
           1       0.89      0.26      0.40      2184

    accuracy                           1.00    609040
   macro avg       0.94      0.63      0.70    609040
weighted avg       1.00      1.00      1.00    609040

