### Task Schedule:
1. 訓練模型,調整參數(預計使用lgb，速度較快)(更:使用catboost,效果較好)
2. 嘗試使用不同模型,做Ensemble(blending, stacking)
3. Anomaly detection

### 注意事項:
1. 因為test data和train data時間不相關,在驗證時採取前60天訓練61~90天驗證,但仍需小心時間差異造成的影響
2. Anomaly detection: 看這類的模型能不能取代boosting(似乎是不行，盜刷數據並沒有那麼Anomaly),但可以嘗試將Anomaly結果當成新feature

### <font color=green>Results:</font>

#### Catboost:
    * FE1~4,catboost訓練 validation:0.5, LB:0.55
    * FE1,3,4 validation:0.5313149, LB:0.6(th=0.37), default parameter

#### LGB:
    * 不做處理,直接丟lgb訓練 leaderboard score:0.45

## 讀取,轉換字串成可以訓練的資料

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import math

import lightgbm as lgb
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from catboost import CatBoostClassifier, Pool

%matplotlib inline
# --- Notebook-wide configuration ---------------------------------------------
data_path = '../data'
random_seed = 20

# Preprocessed CSV files that are loaded and joined column-wise.
data_list = [
    'raw_data.csv',
    'FE_data1.csv',
    'FE_data2.csv',
    'FE_data3.csv',
    'FE_data4.csv',
]

# Drop features that may overfit, follow a different distribution between
# train and test, or are strongly affected by time.
delete_list1 = ['bacno', 'locdt', 'loctm', 'cano', 'fraud_ind']
delete_list2 = ['mchno', 'acqic', 'mcc']
delete_list3 = ['stocn', 'scity', 'csmcu']
delete_list4 = ['iterm']
# Defined but not part of delete_list below.
delete_list5 = ['contp', 'etymd', 'hcefg', 'insfg', 'ovrlt', 'flbmk', 'flg_3dsmk']
delete_list6 = ['mchno_fraud_mean', 'mcc_fraud_mean', 'acqic_fraud_mean']
delete_list7 = [
    'bacno_locdt_skew', 'bacno_locdt_kurt',
    'cano_locdt_skew', 'cano_locdt_kurt',
]
delete_list8 = ['bacno_lastlocdt', 'cano_lastlocdt']
# bacno_cano_nunique

# Columns removed from every feature matrix (delete_list5 is intentionally left out).
delete_list = [
    *delete_list1,
    *delete_list2,
    *delete_list3,
    *delete_list4,
    *delete_list6,
    *delete_list7,
    'txkey',
    *delete_list8,
]

In [2]:
def load_data(data_list, data_dir='../data/preprocess',
              fill_null_files=('FE_data1.csv', 'FE_data2.csv'), fill_value=-1):
    """Read several preprocessed CSV files and join them column-wise.

    For each file the shape and the per-column null counts are printed before
    any filling happens. Files listed in ``fill_null_files`` then have their
    nulls replaced by ``fill_value``; other files are left untouched.

    Parameters
    ----------
    data_list : list of str
        CSV file names, read in order from ``data_dir``.
    data_dir : str, optional
        Directory containing the CSV files.
    fill_null_files : collection of str, optional
        File names whose null values are replaced with ``fill_value``.
    fill_value : scalar, optional
        Replacement value used for the files in ``fill_null_files``.

    Returns
    -------
    pandas.DataFrame
        All frames concatenated along ``axis=1``.

    Raises
    ------
    ValueError
        If the files do not all have the same number of rows (a column-wise
        concat would otherwise silently pad the shorter ones with NaN), or if
        ``data_list`` is empty (raised by ``pd.concat``).
    """
    frames = []
    for file_name in data_list:
        frame = pd.read_csv(os.path.join(data_dir, file_name))
        null_counts = frame.isnull().sum()

        print('\n', file_name, frame.shape)
        print("Null columns:\n", null_counts[null_counts > 0])

        if file_name in fill_null_files:
            # Rebind instead of mutating in place so the frame that was just
            # summarised above is never modified behind the reader's back.
            frame = frame.fillna(value=fill_value)

        frames.append(frame)

    row_counts = sorted({len(frame) for frame in frames})
    if len(row_counts) > 1:
        raise ValueError(
            'Cannot join files column-wise: row counts differ {}'.format(row_counts)
        )

    all_data = pd.concat(frames, axis=1)
    all_data_numsum = all_data.isnull().sum()
    print('ALL data null:')
    print(all_data_numsum[all_data_numsum > 0])
    return all_data

# Read every file named in data_list and join them column-wise into one frame.
all_data = load_data(data_list)

raw_data.csv (1943452, 23)
Null columns:
 fraud_ind    421665
dtype: int64
FE_data1.csv (1943452, 50)
Null columns:
 cano_conam_skew      92612
cano_conam_kurt     155720
cano_conam_var       38678
bacno_locdt_skew     58303
bacno_locdt_kurt    101191
cano_locdt_skew      92612
cano_locdt_kurt     155720
dtype: int64
FE_data2.csv (1943452, 4)
Null columns:
 Series([], dtype: int64)
FE_data3.csv (1943452, 3)
Null columns:
 Series([], dtype: int64)
FE_data4.csv (1943452, 6)
Null columns:
 Series([], dtype: int64)
ALL data null:
fraud_ind    421665
dtype: int64


In [4]:
# Columns to be treated as categorical rather than numeric.
category_list = [
    'csmcu', 'hcefg', 'stscd', 'scity', 'stocn',
    'mcc', 'acqic', 'mchno', 'etymd', 'contp',
]
# Cast the whole block once, then write it back over the same columns.
categorical_block = all_data[category_list].astype('category')
all_data[category_list] = categorical_block

In [5]:
## Build three different time-based train/validation splits


def _split_at(train_last_day, valid_last_day=90):
    """Split all_data by `locdt` into a training part and a validation part.

    Rows with locdt <= train_last_day form the training part; rows with
    train_last_day < locdt <= valid_last_day form the validation part.
    Returns (X_train, y_train, X_valid, y_valid), where the X frames have the
    delete_list columns removed and the y series are the `fraud_ind` column.
    """
    day = all_data['locdt']
    in_train = day <= train_last_day
    in_valid = (day > train_last_day) & (day <= valid_last_day)
    return (
        all_data[in_train].drop(columns=delete_list),
        all_data[in_train]['fraud_ind'],
        all_data[in_valid].drop(columns=delete_list),
        all_data[in_valid]['fraud_ind'],
    )


X_train1, y_train1, X_test1, y_test1 = _split_at(60)
X_train2, y_train2, X_test2, y_test2 = _split_at(45)
X_train3, y_train3, X_test3, y_test3 = _split_at(30)

# Rows up to day 90 are used for the final fit; rows after day 90 are the
# set to predict (keep their txkey values alongside the features).
up_to_day_90 = all_data['locdt'] <= 90
after_day_90 = all_data['locdt'] > 90

test_data_txkey = all_data[after_day_90]['txkey'].copy().values
X_train_all = all_data[up_to_day_90].drop(columns=delete_list)
y_train_all = all_data[up_to_day_90]['fraud_ind']

X_test_all = all_data[after_day_90].drop(columns=delete_list)
# y_test_all = all_data[all_data['locdt']>90]['fraud_ind']

## Train on catboost
* https://catboost.ai/docs/concepts/python-reference_parameters-list.html
* 研究有哪些可以用的function

In [6]:
# Positions (not labels) of the categorical columns that survived the drop;
# these positions are what the CatBoost calls below receive as cat_features.
categorical_features_indices = np.where(X_train1.columns.isin(category_list))[0]

# Use .iloc for positional selection: indexing a label-indexed Series with
# integers via [] relies on a deprecated positional fallback in pandas.
categorical_dtypes = X_train1.dtypes.iloc[categorical_features_indices]
print(categorical_dtypes)
category_list2 = categorical_dtypes.index

contp    category
etymd    category
hcefg    category
stscd    category
dtype: object


In [7]:
# Keyword arguments passed to CatBoostClassifier(**param_cat).
param_cat = dict(
    loss_function='Logloss',
    eval_metric='F1',

    iterations=2000,
    learning_rate=0.1,
    l2_leaf_reg=3,
    bagging_temperature=0.5,

    depth=6,
    one_hot_max_size=300,

    rsm=1,
    scale_pos_weight=1,
    target_border=0.5,
    random_seed=random_seed,
    verbose=True,
)
# Currently disabled options, kept for reference:
#     sampling_frequency='PerTreeLevel'
#     min_data_in_leaf=1
#     max_leaves=31
#     task_type="GPU"
#     devices='1'

In [8]:
print(categorical_features_indices)

# Validation run: fit on split 1 and monitor the hold-out window while training.
validation_fit_kwargs = dict(
    cat_features=categorical_features_indices,
    eval_set=(X_test1, y_test1),
    early_stopping_rounds=200,
    silent=False,
)
# Not passed at the moment: use_best_model=True, plot=True

model = CatBoostClassifier(**param_cat)
model.fit(X_train1, y_train1, **validation_fit_kwargs)

print('Model is fitted: ' + str(model.is_fitted()))
print('Model params:')
print(model.get_params())


# preds_class = model.predict(test_data)
# preds_proba = model.predict_proba(test_data)

[1 3 6 9]
0:	learn: 0.5677877	test: 0.3194412	best: 0.3194412 (0)	total: 278ms	remaining: 9m 16s
1:	learn: 0.5943315	test: 0.3245407	best: 0.3245407 (1)	total: 520ms	remaining: 8m 38s
2:	learn: 0.5651751	test: 0.3290827	best: 0.3290827 (2)	total: 739ms	remaining: 8m 12s
3:	learn: 0.5843635	test: 0.3381172	best: 0.3381172 (3)	total: 960ms	remaining: 7m 58s
4:	learn: 0.5913974	test: 0.3392721	best: 0.3392721 (4)	total: 1.21s	remaining: 8m 1s
5:	learn: 0.5990081	test: 0.3514023	best: 0.3514023 (5)	total: 1.43s	remaining: 7m 53s
6:	learn: 0.5859512	test: 0.3482359	best: 0.3514023 (5)	total: 1.65s	remaining: 7m 50s
7:	learn: 0.5965710	test: 0.3649345	best: 0.3649345 (7)	total: 1.89s	remaining: 7m 51s
8:	learn: 0.6053277	test: 0.3760617	best: 0.3760617 (8)	total: 2.13s	remaining: 7m 50s
9:	learn: 0.6147860	test: 0.3774945	best: 0.3774945 (9)	total: 2.34s	remaining: 7m 46s
10:	learn: 0.6093117	test: 0.3779177	best: 0.3779177 (10)	total: 2.55s	remaining: 7m 41s
11:	learn: 0.6161331	test: 0.384

93:	learn: 0.7055620	test: 0.4663264	best: 0.4663264 (93)	total: 22.3s	remaining: 7m 32s
94:	learn: 0.7065226	test: 0.4698421	best: 0.4698421 (94)	total: 22.6s	remaining: 7m 32s
95:	learn: 0.7078023	test: 0.4702085	best: 0.4702085 (95)	total: 22.8s	remaining: 7m 32s
96:	learn: 0.7080533	test: 0.4731744	best: 0.4731744 (96)	total: 23s	remaining: 7m 32s
97:	learn: 0.7087752	test: 0.4732331	best: 0.4732331 (97)	total: 23.3s	remaining: 7m 31s
98:	learn: 0.7088145	test: 0.4733640	best: 0.4733640 (98)	total: 23.5s	remaining: 7m 32s
99:	learn: 0.7091312	test: 0.4748201	best: 0.4748201 (99)	total: 23.8s	remaining: 7m 31s
100:	learn: 0.7120484	test: 0.4747613	best: 0.4748201 (99)	total: 24s	remaining: 7m 31s
101:	learn: 0.7130462	test: 0.4763086	best: 0.4763086 (101)	total: 24.3s	remaining: 7m 32s
102:	learn: 0.7145787	test: 0.4765209	best: 0.4765209 (102)	total: 24.6s	remaining: 7m 32s
103:	learn: 0.7150536	test: 0.4748201	best: 0.4765209 (102)	total: 24.8s	remaining: 7m 32s
104:	learn: 0.7159

184:	learn: 0.7524423	test: 0.4944893	best: 0.4947291 (182)	total: 45.2s	remaining: 7m 23s
185:	learn: 0.7526312	test: 0.4949792	best: 0.4949792 (185)	total: 45.5s	remaining: 7m 23s
186:	learn: 0.7529991	test: 0.4944233	best: 0.4949792 (185)	total: 45.7s	remaining: 7m 23s
187:	learn: 0.7532831	test: 0.4945446	best: 0.4949792 (185)	total: 46s	remaining: 7m 22s
188:	learn: 0.7537844	test: 0.4957076	best: 0.4957076 (188)	total: 46.2s	remaining: 7m 22s
189:	learn: 0.7535387	test: 0.4953993	best: 0.4957076 (188)	total: 46.5s	remaining: 7m 22s
190:	learn: 0.7540388	test: 0.4943600	best: 0.4957076 (188)	total: 46.7s	remaining: 7m 22s
191:	learn: 0.7542200	test: 0.4944785	best: 0.4957076 (188)	total: 46.9s	remaining: 7m 21s
192:	learn: 0.7544557	test: 0.4946026	best: 0.4957076 (188)	total: 47.2s	remaining: 7m 21s
193:	learn: 0.7547870	test: 0.4947821	best: 0.4957076 (188)	total: 47.5s	remaining: 7m 21s
194:	learn: 0.7556279	test: 0.4944151	best: 0.4957076 (188)	total: 47.7s	remaining: 7m 21s
1

675:	learn: 0.8361332	test: 0.5228420	best: 0.5245538 (667)	total: 2m 53s	remaining: 5m 39s
676:	learn: 0.8363083	test: 0.5226601	best: 0.5245538 (667)	total: 2m 53s	remaining: 5m 39s
677:	learn: 0.8362965	test: 0.5226069	best: 0.5245538 (667)	total: 2m 53s	remaining: 5m 39s
678:	learn: 0.8365374	test: 0.5229064	best: 0.5245538 (667)	total: 2m 54s	remaining: 5m 38s
679:	learn: 0.8365374	test: 0.5221250	best: 0.5245538 (667)	total: 2m 54s	remaining: 5m 38s
680:	learn: 0.8368625	test: 0.5229708	best: 0.5245538 (667)	total: 2m 54s	remaining: 5m 38s
681:	learn: 0.8370679	test: 0.5227889	best: 0.5245538 (667)	total: 2m 55s	remaining: 5m 38s
682:	learn: 0.8370931	test: 0.5231527	best: 0.5245538 (667)	total: 2m 55s	remaining: 5m 38s
683:	learn: 0.8370982	test: 0.5223495	best: 0.5245538 (667)	total: 2m 55s	remaining: 5m 37s
684:	learn: 0.8374113	test: 0.5221566	best: 0.5245538 (667)	total: 2m 55s	remaining: 5m 37s
685:	learn: 0.8373154	test: 0.5236337	best: 0.5245538 (667)	total: 2m 56s	remain

765:	learn: 0.8463147	test: 0.5255240	best: 0.5265361 (745)	total: 3m 17s	remaining: 5m 17s
766:	learn: 0.8465669	test: 0.5257058	best: 0.5265361 (745)	total: 3m 17s	remaining: 5m 17s
767:	learn: 0.8465780	test: 0.5254070	best: 0.5265361 (745)	total: 3m 17s	remaining: 5m 17s
768:	learn: 0.8466501	test: 0.5251603	best: 0.5265361 (745)	total: 3m 17s	remaining: 5m 16s
769:	learn: 0.8468274	test: 0.5250432	best: 0.5265361 (745)	total: 3m 18s	remaining: 5m 16s
770:	learn: 0.8468384	test: 0.5249014	best: 0.5265361 (745)	total: 3m 18s	remaining: 5m 16s
771:	learn: 0.8469825	test: 0.5250308	best: 0.5265361 (745)	total: 3m 18s	remaining: 5m 16s
772:	learn: 0.8471182	test: 0.5247195	best: 0.5265361 (745)	total: 3m 18s	remaining: 5m 15s
773:	learn: 0.8469105	test: 0.5251603	best: 0.5265361 (745)	total: 3m 19s	remaining: 5m 15s
774:	learn: 0.8470156	test: 0.5252899	best: 0.5265361 (745)	total: 3m 19s	remaining: 5m 15s
775:	learn: 0.8471071	test: 0.5254070	best: 0.5265361 (745)	total: 3m 19s	remain

855:	learn: 0.8547162	test: 0.5282367	best: 0.5289338 (847)	total: 3m 41s	remaining: 4m 55s
856:	learn: 0.8546751	test: 0.5280414	best: 0.5289338 (847)	total: 3m 41s	remaining: 4m 55s
857:	learn: 0.8547983	test: 0.5279901	best: 0.5289338 (847)	total: 3m 41s	remaining: 4m 55s
858:	learn: 0.8549728	test: 0.5286506	best: 0.5289338 (847)	total: 3m 42s	remaining: 4m 54s
859:	learn: 0.8549930	test: 0.5282228	best: 0.5289338 (847)	total: 3m 42s	remaining: 4m 54s
860:	learn: 0.8551877	test: 0.5287526	best: 0.5289338 (847)	total: 3m 42s	remaining: 4m 54s
861:	learn: 0.8553518	test: 0.5288829	best: 0.5289338 (847)	total: 3m 42s	remaining: 4m 54s
862:	learn: 0.8553211	test: 0.5272369	best: 0.5289338 (847)	total: 3m 43s	remaining: 4m 53s
863:	learn: 0.8553725	test: 0.5275484	best: 0.5289338 (847)	total: 3m 43s	remaining: 4m 53s
864:	learn: 0.8557617	test: 0.5278600	best: 0.5289338 (847)	total: 3m 43s	remaining: 4m 53s
865:	learn: 0.8556900	test: 0.5281716	best: 0.5289338 (847)	total: 3m 43s	remain

945:	learn: 0.8612003	test: 0.5323670	best: 0.5330223 (943)	total: 4m 5s	remaining: 4m 33s
946:	learn: 0.8612925	test: 0.5331370	best: 0.5331370 (946)	total: 4m 5s	remaining: 4m 33s
947:	learn: 0.8612717	test: 0.5327768	best: 0.5331370 (946)	total: 4m 5s	remaining: 4m 32s
948:	learn: 0.8612925	test: 0.5331040	best: 0.5331370 (946)	total: 4m 6s	remaining: 4m 32s
949:	learn: 0.8615670	test: 0.5327275	best: 0.5331370 (946)	total: 4m 6s	remaining: 4m 32s
950:	learn: 0.8616790	test: 0.5333824	best: 0.5333824 (950)	total: 4m 6s	remaining: 4m 32s
951:	learn: 0.8618008	test: 0.5330061	best: 0.5333824 (950)	total: 4m 6s	remaining: 4m 31s
952:	learn: 0.8622374	test: 0.5329077	best: 0.5333824 (950)	total: 4m 7s	remaining: 4m 31s
953:	learn: 0.8619730	test: 0.5330715	best: 0.5333824 (950)	total: 4m 7s	remaining: 4m 31s
954:	learn: 0.8621870	test: 0.5336444	best: 0.5336444 (954)	total: 4m 7s	remaining: 4m 30s
955:	learn: 0.8623101	test: 0.5330223	best: 0.5336444 (954)	total: 4m 7s	remaining: 4m 30s

1034:	learn: 0.8693766	test: 0.5364060	best: 0.5380561 (1015)	total: 4m 27s	remaining: 4m 9s
1035:	learn: 0.8696796	test: 0.5361128	best: 0.5380561 (1015)	total: 4m 27s	remaining: 4m 9s
1036:	learn: 0.8695776	test: 0.5368305	best: 0.5380561 (1015)	total: 4m 27s	remaining: 4m 8s
1037:	learn: 0.8696889	test: 0.5365017	best: 0.5380561 (1015)	total: 4m 28s	remaining: 4m 8s
1038:	learn: 0.8696363	test: 0.5366511	best: 0.5380561 (1015)	total: 4m 28s	remaining: 4m 8s
1039:	learn: 0.8699239	test: 0.5374158	best: 0.5380561 (1015)	total: 4m 28s	remaining: 4m 8s
1040:	learn: 0.8700320	test: 0.5371415	best: 0.5380561 (1015)	total: 4m 29s	remaining: 4m 7s
1041:	learn: 0.8699207	test: 0.5365375	best: 0.5380561 (1015)	total: 4m 29s	remaining: 4m 7s
1042:	learn: 0.8700412	test: 0.5373683	best: 0.5380561 (1015)	total: 4m 29s	remaining: 4m 7s
1043:	learn: 0.8700505	test: 0.5374341	best: 0.5380561 (1015)	total: 4m 29s	remaining: 4m 7s
1044:	learn: 0.8703625	test: 0.5374525	best: 0.5380561 (1015)	total: 4

1122:	learn: 0.8761520	test: 0.5404809	best: 0.5409776 (1067)	total: 4m 50s	remaining: 3m 46s
1123:	learn: 0.8761830	test: 0.5399436	best: 0.5409776 (1067)	total: 4m 50s	remaining: 3m 46s
1124:	learn: 0.8762229	test: 0.5403681	best: 0.5409776 (1067)	total: 4m 50s	remaining: 3m 46s
1125:	learn: 0.8762850	test: 0.5401424	best: 0.5409776 (1067)	total: 4m 50s	remaining: 3m 45s
1126:	learn: 0.8766919	test: 0.5401890	best: 0.5409776 (1067)	total: 4m 51s	remaining: 3m 45s
1127:	learn: 0.8766919	test: 0.5400098	best: 0.5409776 (1067)	total: 4m 51s	remaining: 3m 45s
1128:	learn: 0.8769792	test: 0.5397644	best: 0.5409776 (1067)	total: 4m 51s	remaining: 3m 45s
1129:	learn: 0.8771470	test: 0.5400098	best: 0.5409776 (1067)	total: 4m 52s	remaining: 3m 44s
1130:	learn: 0.8772265	test: 0.5399436	best: 0.5409776 (1067)	total: 4m 52s	remaining: 3m 44s
1131:	learn: 0.8773906	test: 0.5397448	best: 0.5409776 (1067)	total: 4m 52s	remaining: 3m 44s
1132:	learn: 0.8772091	test: 0.5399706	best: 0.5409776 (1067

1210:	learn: 0.8841659	test: 0.5422483	best: 0.5428047 (1206)	total: 5m 11s	remaining: 3m 22s
1211:	learn: 0.8843629	test: 0.5423604	best: 0.5428047 (1206)	total: 5m 11s	remaining: 3m 22s
1212:	learn: 0.8843792	test: 0.5422277	best: 0.5428047 (1206)	total: 5m 11s	remaining: 3m 22s
1213:	learn: 0.8844022	test: 0.5420950	best: 0.5428047 (1206)	total: 5m 12s	remaining: 3m 22s
1214:	learn: 0.8843547	test: 0.5420950	best: 0.5428047 (1206)	total: 5m 12s	remaining: 3m 21s
1215:	learn: 0.8844728	test: 0.5423397	best: 0.5428047 (1206)	total: 5m 12s	remaining: 3m 21s
1216:	learn: 0.8845041	test: 0.5424061	best: 0.5428047 (1206)	total: 5m 12s	remaining: 3m 21s
1217:	learn: 0.8848814	test: 0.5422483	best: 0.5428047 (1206)	total: 5m 13s	remaining: 3m 20s
1218:	learn: 0.8848570	test: 0.5422483	best: 0.5428047 (1206)	total: 5m 13s	remaining: 3m 20s
1219:	learn: 0.8848733	test: 0.5421156	best: 0.5428047 (1206)	total: 5m 13s	remaining: 3m 20s
1220:	learn: 0.8851175	test: 0.5420034	best: 0.5428047 (1206

1298:	learn: 0.8907321	test: 0.5446090	best: 0.5455881 (1265)	total: 5m 32s	remaining: 2m 59s
1299:	learn: 0.8913952	test: 0.5440529	best: 0.5455881 (1265)	total: 5m 32s	remaining: 2m 59s
1300:	learn: 0.8911302	test: 0.5441860	best: 0.5455881 (1265)	total: 5m 32s	remaining: 2m 58s
1301:	learn: 0.8911225	test: 0.5438747	best: 0.5455881 (1265)	total: 5m 33s	remaining: 2m 58s
1302:	learn: 0.8912623	test: 0.5436299	best: 0.5455881 (1265)	total: 5m 33s	remaining: 2m 58s
1303:	learn: 0.8914494	test: 0.5444757	best: 0.5455881 (1265)	total: 5m 33s	remaining: 2m 58s
1304:	learn: 0.8916052	test: 0.5446308	best: 0.5455881 (1265)	total: 5m 33s	remaining: 2m 57s
1305:	learn: 0.8915967	test: 0.5449871	best: 0.5455881 (1265)	total: 5m 34s	remaining: 2m 57s
1306:	learn: 0.8915341	test: 0.5444308	best: 0.5455881 (1265)	total: 5m 34s	remaining: 2m 57s
1307:	learn: 0.8918701	test: 0.5443642	best: 0.5455881 (1265)	total: 5m 34s	remaining: 2m 56s
1308:	learn: 0.8918539	test: 0.5446090	best: 0.5455881 (1265

1386:	learn: 0.8971747	test: 0.5447871	best: 0.5455881 (1265)	total: 5m 53s	remaining: 2m 36s
1387:	learn: 0.8971505	test: 0.5447423	best: 0.5455881 (1265)	total: 5m 53s	remaining: 2m 35s
1388:	learn: 0.8971891	test: 0.5446756	best: 0.5455881 (1265)	total: 5m 53s	remaining: 2m 35s
1389:	learn: 0.8972277	test: 0.5444975	best: 0.5455881 (1265)	total: 5m 54s	remaining: 2m 35s
1390:	learn: 0.8972277	test: 0.5450985	best: 0.5455881 (1265)	total: 5m 54s	remaining: 2m 35s
1391:	learn: 0.8970804	test: 0.5447204	best: 0.5455881 (1265)	total: 5m 54s	remaining: 2m 34s
1392:	learn: 0.8970104	test: 0.5447204	best: 0.5455881 (1265)	total: 5m 54s	remaining: 2m 34s
1393:	learn: 0.8972250	test: 0.5450318	best: 0.5455881 (1265)	total: 5m 55s	remaining: 2m 34s
1394:	learn: 0.8971479	test: 0.5447204	best: 0.5455881 (1265)	total: 5m 55s	remaining: 2m 34s
1395:	learn: 0.8971865	test: 0.5449651	best: 0.5455881 (1265)	total: 5m 55s	remaining: 2m 33s
1396:	learn: 0.8972107	test: 0.5449204	best: 0.5455881 (1265

1474:	learn: 0.9009991	test: 0.5477676	best: 0.5478112 (1471)	total: 6m 14s	remaining: 2m 13s
1475:	learn: 0.9011004	test: 0.5476569	best: 0.5478112 (1471)	total: 6m 14s	remaining: 2m 13s
1476:	learn: 0.9011526	test: 0.5474354	best: 0.5478112 (1471)	total: 6m 15s	remaining: 2m 12s
1477:	learn: 0.9011526	test: 0.5476803	best: 0.5478112 (1471)	total: 6m 15s	remaining: 2m 12s
1478:	learn: 0.9014596	test: 0.5475462	best: 0.5478112 (1471)	total: 6m 15s	remaining: 2m 12s
1479:	learn: 0.9013022	test: 0.5476132	best: 0.5478112 (1471)	total: 6m 15s	remaining: 2m 12s
1480:	learn: 0.9014940	test: 0.5477239	best: 0.5478112 (1471)	total: 6m 16s	remaining: 2m 11s
1481:	learn: 0.9013406	test: 0.5477006	best: 0.5478112 (1471)	total: 6m 16s	remaining: 2m 11s
1482:	learn: 0.9014940	test: 0.5477676	best: 0.5478112 (1471)	total: 6m 16s	remaining: 2m 11s
1483:	learn: 0.9013583	test: 0.5478112	best: 0.5478112 (1471)	total: 6m 16s	remaining: 2m 11s
1484:	learn: 0.9013858	test: 0.5476336	best: 0.5478112 (1471

1562:	learn: 0.9058377	test: 0.5495231	best: 0.5506360 (1542)	total: 6m 35s	remaining: 1m 50s
1563:	learn: 0.9058312	test: 0.5491683	best: 0.5506360 (1542)	total: 6m 36s	remaining: 1m 50s
1564:	learn: 0.9059073	test: 0.5489908	best: 0.5506360 (1542)	total: 6m 36s	remaining: 1m 50s
1565:	learn: 0.9060281	test: 0.5486358	best: 0.5506360 (1542)	total: 6m 36s	remaining: 1m 49s
1566:	learn: 0.9062946	test: 0.5488805	best: 0.5506360 (1542)	total: 6m 36s	remaining: 1m 49s
1567:	learn: 0.9062630	test: 0.5491252	best: 0.5506360 (1542)	total: 6m 37s	remaining: 1m 49s
1568:	learn: 0.9062565	test: 0.5485924	best: 0.5506360 (1542)	total: 6m 37s	remaining: 1m 49s
1569:	learn: 0.9061869	test: 0.5485924	best: 0.5506360 (1542)	total: 6m 37s	remaining: 1m 48s
1570:	learn: 0.9063392	test: 0.5485924	best: 0.5506360 (1542)	total: 6m 37s	remaining: 1m 48s
1571:	learn: 0.9062630	test: 0.5490148	best: 0.5506360 (1542)	total: 6m 38s	remaining: 1m 48s
1572:	learn: 0.9064913	test: 0.5497920	best: 0.5506360 (1542

1650:	learn: 0.9093562	test: 0.5458094	best: 0.5508133 (1573)	total: 6m 57s	remaining: 1m 28s
1651:	learn: 0.9093436	test: 0.5463189	best: 0.5508133 (1573)	total: 6m 57s	remaining: 1m 27s
1652:	learn: 0.9092362	test: 0.5454767	best: 0.5508133 (1573)	total: 6m 57s	remaining: 1m 27s
1653:	learn: 0.9094131	test: 0.5453437	best: 0.5508133 (1573)	total: 6m 58s	remaining: 1m 27s
1654:	learn: 0.9097036	test: 0.5456318	best: 0.5508133 (1573)	total: 6m 58s	remaining: 1m 27s
1655:	learn: 0.9095900	test: 0.5452108	best: 0.5508133 (1573)	total: 6m 58s	remaining: 1m 26s
1656:	learn: 0.9096216	test: 0.5447452	best: 0.5508133 (1573)	total: 6m 58s	remaining: 1m 26s
1657:	learn: 0.9098047	test: 0.5447006	best: 0.5508133 (1573)	total: 6m 58s	remaining: 1m 26s
1658:	learn: 0.9098173	test: 0.5446341	best: 0.5508133 (1573)	total: 6m 59s	remaining: 1m 26s
1659:	learn: 0.9097162	test: 0.5447006	best: 0.5508133 (1573)	total: 6m 59s	remaining: 1m 25s
1660:	learn: 0.9095772	test: 0.5454767	best: 0.5508133 (1573

1738:	learn: 0.9130676	test: 0.5483596	best: 0.5508133 (1573)	total: 7m 18s	remaining: 1m 5s
1739:	learn: 0.9131249	test: 0.5491773	best: 0.5508133 (1573)	total: 7m 18s	remaining: 1m 5s
1740:	learn: 0.9132062	test: 0.5488236	best: 0.5508133 (1573)	total: 7m 18s	remaining: 1m 5s
1741:	learn: 0.9131882	test: 0.5488236	best: 0.5508133 (1573)	total: 7m 19s	remaining: 1m 5s
1742:	learn: 0.9133645	test: 0.5489574	best: 0.5508133 (1573)	total: 7m 19s	remaining: 1m 4s
1743:	learn: 0.9133962	test: 0.5500978	best: 0.5508133 (1573)	total: 7m 19s	remaining: 1m 4s
1744:	learn: 0.9135092	test: 0.5502322	best: 0.5508133 (1573)	total: 7m 19s	remaining: 1m 4s
1745:	learn: 0.9134219	test: 0.5502322	best: 0.5508133 (1573)	total: 7m 20s	remaining: 1m 4s
1746:	learn: 0.9133405	test: 0.5494989	best: 0.5508133 (1573)	total: 7m 20s	remaining: 1m 3s
1747:	learn: 0.9134475	test: 0.5491873	best: 0.5508133 (1573)	total: 7m 20s	remaining: 1m 3s
1748:	learn: 0.9133662	test: 0.5486553	best: 0.5508133 (1573)	total: 7

In [20]:
# NOTE: despite its name, train_pool wraps the validation split (X_test1 / y_test1).
train_pool = Pool(X_test1, y_test1, cat_features=categorical_features_indices)
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_test1.columns

# Highest importance first; ties fall back to the feature name, also descending.
ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)
for importance, feature in ranked_features:
    print('{}: {}'.format(feature, importance))

stocn_value_counts: 6.280433477464825
stocn_cano_nunique: 5.709603284489864
etymd: 5.625040551524719
csmcu_value_counts: 4.931195412045831
acqic_cano_nunique: 4.142678456900994
cano_conam_skew: 4.07661403013302
bacno_max_conam: 3.542405055083008
stocn_bacno_nunique: 3.3450273510055393
cano_conam_kurt: 3.3139455025886564
contp: 3.026949464548556
cano_conam_var: 3.014354066458374
bacno_mean_conam: 2.9833815329133744
bacno_txkey_nunique: 2.784879698370222
bacno_ratio_ecfg: 2.579797361833718
bacno_min_conam: 2.5347561135586
ecfg: 2.526727804453583
cano_lastlocdt2: 2.1642838919456255
bacno_stocn_nunique: 2.1594127220292303
bacno_cano_nunique: 2.084445754847033
mchno_bacno_nunique: 2.041926099612818
cano_ratio_ecfg: 1.9656501263972856
mcc_cano_nunique: 1.905698965603659
conam: 1.8883953562724627
mchno_cano_nunique: 1.7739557807709712
cano_mean_conam: 1.6933095295770457
mcc_bacno_nunique: 1.5451998376445635
cano_scity_mode: 1.4623611100720963
scity_bacno_nunique: 1.325042873584116
cano_scale_

In [34]:
## Open question: in theory, should th=0.5 always be the best threshold?
#
# NOTE(review): `model` is whatever was fitted most recently in this kernel.
# In the saved notebook this cell's execution count is later than the cell
# that refits `model` on X_train_all (days <= 90), so the recorded score below
# may have been computed by a model that already saw this validation window.
# Confirm by re-running top to bottom on a fresh kernel.

# Keep the raw probabilities so other thresholds can be tried without
# calling predict_proba again.
y_test1_proba = model.predict_proba(X_test1, verbose=True)[:, 1]
th = 0.35

# 1.0 where the positive-class probability exceeds th, else 0.0.
y_test1_pred = (y_test1_proba > th).astype(y_test1_proba.dtype)
print('F1 score', f1_score(y_test1, y_test1_pred))

tn, fp, fn, tp = confusion_matrix(y_test1, y_test1_pred).ravel()
print('tn fp fn tp')
print(tn, fp, fn, tp)
print('Percision', tp/(tp+fp))
print('Recall', tp/(tp+fn))


F1 score 0.8345204913802003
tn fp fn tp
501351 496 1107 4042
Percision 0.8907007492287351
Recall 0.7850067974363955


In [11]:
# Final fit on every row with locdt <= 90; no eval_set is passed here.
# NOTE: this rebinds `model`, so the validation model fitted on X_train1 is
# no longer reachable under that name after this cell runs.
final_fit_kwargs = dict(
    cat_features=categorical_features_indices,
    silent=False,
)
model = CatBoostClassifier(**param_cat)
model.fit(X_train_all, y_train_all, **final_fit_kwargs)


0:	learn: 0.5241322	total: 292ms	remaining: 9m 43s
1:	learn: 0.4834533	total: 603ms	remaining: 10m 2s
2:	learn: 0.4811321	total: 892ms	remaining: 9m 53s
3:	learn: 0.4914018	total: 1.18s	remaining: 9m 46s
4:	learn: 0.4966170	total: 1.48s	remaining: 9m 51s
5:	learn: 0.5031643	total: 1.79s	remaining: 9m 56s
6:	learn: 0.5170365	total: 2.08s	remaining: 9m 53s
7:	learn: 0.5129933	total: 2.36s	remaining: 9m 47s
8:	learn: 0.5242500	total: 2.65s	remaining: 9m 45s
9:	learn: 0.5274479	total: 2.95s	remaining: 9m 47s
10:	learn: 0.5281929	total: 3.24s	remaining: 9m 45s
11:	learn: 0.5431017	total: 3.53s	remaining: 9m 44s
12:	learn: 0.5430696	total: 3.83s	remaining: 9m 44s
13:	learn: 0.5384334	total: 4.12s	remaining: 9m 44s
14:	learn: 0.5429131	total: 4.41s	remaining: 9m 43s
15:	learn: 0.5404094	total: 4.7s	remaining: 9m 43s
16:	learn: 0.5459972	total: 5s	remaining: 9m 42s
17:	learn: 0.5411882	total: 5.29s	remaining: 9m 43s
18:	learn: 0.5422287	total: 5.59s	remaining: 9m 42s
19:	learn: 0.5420816	total

159:	learn: 0.6904415	total: 46.8s	remaining: 8m 58s
160:	learn: 0.6919578	total: 47.2s	remaining: 8m 58s
161:	learn: 0.6928493	total: 47.4s	remaining: 8m 58s
162:	learn: 0.6934435	total: 47.7s	remaining: 8m 57s
163:	learn: 0.6935455	total: 48s	remaining: 8m 57s
164:	learn: 0.6940241	total: 48.3s	remaining: 8m 57s
165:	learn: 0.6937116	total: 48.6s	remaining: 8m 57s
166:	learn: 0.6942953	total: 48.9s	remaining: 8m 56s
167:	learn: 0.6943208	total: 49.2s	remaining: 8m 56s
168:	learn: 0.6942105	total: 49.5s	remaining: 8m 56s
169:	learn: 0.6947886	total: 49.8s	remaining: 8m 55s
170:	learn: 0.6950569	total: 50.1s	remaining: 8m 55s
171:	learn: 0.6961647	total: 50.4s	remaining: 8m 55s
172:	learn: 0.6965021	total: 50.7s	remaining: 8m 55s
173:	learn: 0.6967157	total: 51s	remaining: 8m 54s
174:	learn: 0.6973439	total: 51.2s	remaining: 8m 54s
175:	learn: 0.6973107	total: 51.5s	remaining: 8m 54s
176:	learn: 0.6971178	total: 51.8s	remaining: 8m 53s
177:	learn: 0.6974128	total: 52.1s	remaining: 8m 5

313:	learn: 0.7343976	total: 1m 32s	remaining: 8m 14s
314:	learn: 0.7349464	total: 1m 32s	remaining: 8m 14s
315:	learn: 0.7346384	total: 1m 32s	remaining: 8m 14s
316:	learn: 0.7351530	total: 1m 33s	remaining: 8m 13s
317:	learn: 0.7350679	total: 1m 33s	remaining: 8m 13s
318:	learn: 0.7350620	total: 1m 33s	remaining: 8m 13s
319:	learn: 0.7354607	total: 1m 33s	remaining: 8m 13s
320:	learn: 0.7353944	total: 1m 34s	remaining: 8m 12s
321:	learn: 0.7357624	total: 1m 34s	remaining: 8m 12s
322:	learn: 0.7358262	total: 1m 34s	remaining: 8m 12s
323:	learn: 0.7361424	total: 1m 35s	remaining: 8m 12s
324:	learn: 0.7361637	total: 1m 35s	remaining: 8m 11s
325:	learn: 0.7362520	total: 1m 35s	remaining: 8m 11s
326:	learn: 0.7369516	total: 1m 36s	remaining: 8m 11s
327:	learn: 0.7369242	total: 1m 36s	remaining: 8m 11s
328:	learn: 0.7373040	total: 1m 36s	remaining: 8m 11s
329:	learn: 0.7370914	total: 1m 36s	remaining: 8m 10s
330:	learn: 0.7374498	total: 1m 37s	remaining: 8m 10s
331:	learn: 0.7377290	total:

467:	learn: 0.7618966	total: 2m 17s	remaining: 7m 30s
468:	learn: 0.7619941	total: 2m 17s	remaining: 7m 30s
469:	learn: 0.7619752	total: 2m 18s	remaining: 7m 29s
470:	learn: 0.7616286	total: 2m 18s	remaining: 7m 29s
471:	learn: 0.7619535	total: 2m 18s	remaining: 7m 29s
472:	learn: 0.7621646	total: 2m 19s	remaining: 7m 28s
473:	learn: 0.7616393	total: 2m 19s	remaining: 7m 28s
474:	learn: 0.7617233	total: 2m 19s	remaining: 7m 28s
475:	learn: 0.7620401	total: 2m 19s	remaining: 7m 27s
476:	learn: 0.7620889	total: 2m 20s	remaining: 7m 27s
477:	learn: 0.7625519	total: 2m 20s	remaining: 7m 27s
478:	learn: 0.7625572	total: 2m 20s	remaining: 7m 27s
479:	learn: 0.7627143	total: 2m 21s	remaining: 7m 26s
480:	learn: 0.7628333	total: 2m 21s	remaining: 7m 26s
481:	learn: 0.7626815	total: 2m 21s	remaining: 7m 26s
482:	learn: 0.7632933	total: 2m 21s	remaining: 7m 25s
483:	learn: 0.7633636	total: 2m 22s	remaining: 7m 25s
484:	learn: 0.7636829	total: 2m 22s	remaining: 7m 25s
485:	learn: 0.7640341	total:

620:	learn: 0.7825037	total: 3m 3s	remaining: 6m 46s
621:	learn: 0.7825794	total: 3m 3s	remaining: 6m 46s
622:	learn: 0.7825452	total: 3m 3s	remaining: 6m 46s
623:	learn: 0.7825232	total: 3m 4s	remaining: 6m 45s
624:	learn: 0.7824572	total: 3m 4s	remaining: 6m 45s
625:	learn: 0.7825940	total: 3m 4s	remaining: 6m 45s
626:	learn: 0.7824890	total: 3m 4s	remaining: 6m 44s
627:	learn: 0.7826576	total: 3m 5s	remaining: 6m 44s
628:	learn: 0.7826014	total: 3m 5s	remaining: 6m 44s
629:	learn: 0.7826258	total: 3m 5s	remaining: 6m 44s
630:	learn: 0.7829409	total: 3m 6s	remaining: 6m 43s
631:	learn: 0.7828432	total: 3m 6s	remaining: 6m 43s
632:	learn: 0.7829213	total: 3m 6s	remaining: 6m 43s
633:	learn: 0.7832608	total: 3m 6s	remaining: 6m 42s
634:	learn: 0.7835833	total: 3m 7s	remaining: 6m 42s
635:	learn: 0.7835515	total: 3m 7s	remaining: 6m 42s
636:	learn: 0.7838643	total: 3m 7s	remaining: 6m 41s
637:	learn: 0.7837276	total: 3m 8s	remaining: 6m 41s
638:	learn: 0.7835833	total: 3m 8s	remaining: 

773:	learn: 0.7982842	total: 3m 48s	remaining: 6m 1s
774:	learn: 0.7983399	total: 3m 48s	remaining: 6m 1s
775:	learn: 0.7981503	total: 3m 48s	remaining: 6m 1s
776:	learn: 0.7985073	total: 3m 49s	remaining: 6m
777:	learn: 0.7987859	total: 3m 49s	remaining: 6m
778:	learn: 0.7989199	total: 3m 49s	remaining: 6m
779:	learn: 0.7991760	total: 3m 50s	remaining: 5m 59s
780:	learn: 0.7991761	total: 3m 50s	remaining: 5m 59s
781:	learn: 0.7992430	total: 3m 50s	remaining: 5m 59s
782:	learn: 0.7992763	total: 3m 51s	remaining: 5m 59s
783:	learn: 0.7993988	total: 3m 51s	remaining: 5m 58s
784:	learn: 0.7992092	total: 3m 51s	remaining: 5m 58s
785:	learn: 0.7995880	total: 3m 52s	remaining: 5m 58s
786:	learn: 0.7997216	total: 3m 52s	remaining: 5m 58s
787:	learn: 0.7997662	total: 3m 52s	remaining: 5m 57s
788:	learn: 0.7998553	total: 3m 52s	remaining: 5m 57s
789:	learn: 0.7998775	total: 3m 53s	remaining: 5m 57s
790:	learn: 0.7999332	total: 3m 53s	remaining: 5m 56s
791:	learn: 0.7998776	total: 3m 53s	remaini

926:	learn: 0.8123376	total: 4m 33s	remaining: 5m 16s
927:	learn: 0.8125691	total: 4m 34s	remaining: 5m 16s
928:	learn: 0.8128455	total: 4m 34s	remaining: 5m 16s
929:	learn: 0.8127332	total: 4m 34s	remaining: 5m 15s
930:	learn: 0.8126019	total: 4m 34s	remaining: 5m 15s
931:	learn: 0.8127849	total: 4m 35s	remaining: 5m 15s
932:	learn: 0.8127849	total: 4m 35s	remaining: 5m 15s
933:	learn: 0.8129506	total: 4m 35s	remaining: 5m 14s
934:	learn: 0.8131042	total: 4m 36s	remaining: 5m 14s
935:	learn: 0.8131042	total: 4m 36s	remaining: 5m 14s
936:	learn: 0.8130369	total: 4m 36s	remaining: 5m 13s
937:	learn: 0.8129816	total: 4m 37s	remaining: 5m 13s
938:	learn: 0.8136389	total: 4m 37s	remaining: 5m 13s
939:	learn: 0.8139439	total: 4m 37s	remaining: 5m 13s
940:	learn: 0.8139214	total: 4m 37s	remaining: 5m 12s
941:	learn: 0.8143713	total: 4m 38s	remaining: 5m 12s
942:	learn: 0.8143713	total: 4m 38s	remaining: 5m 12s
943:	learn: 0.8144694	total: 4m 38s	remaining: 5m 11s
944:	learn: 0.8147862	total:

1078:	learn: 0.8243395	total: 5m 18s	remaining: 4m 32s
1079:	learn: 0.8244945	total: 5m 19s	remaining: 4m 31s
1080:	learn: 0.8246914	total: 5m 19s	remaining: 4m 31s
1081:	learn: 0.8246817	total: 5m 19s	remaining: 4m 31s
1082:	learn: 0.8248203	total: 5m 20s	remaining: 4m 31s
1083:	learn: 0.8245397	total: 5m 20s	remaining: 4m 30s
1084:	learn: 0.8244979	total: 5m 20s	remaining: 4m 30s
1085:	learn: 0.8243977	total: 5m 21s	remaining: 4m 30s
1086:	learn: 0.8245234	total: 5m 21s	remaining: 4m 29s
1087:	learn: 0.8245590	total: 5m 21s	remaining: 4m 29s
1088:	learn: 0.8246009	total: 5m 21s	remaining: 4m 29s
1089:	learn: 0.8246394	total: 5m 22s	remaining: 4m 28s
1090:	learn: 0.8248395	total: 5m 22s	remaining: 4m 28s
1091:	learn: 0.8249006	total: 5m 22s	remaining: 4m 28s
1092:	learn: 0.8249973	total: 5m 23s	remaining: 4m 28s
1093:	learn: 0.8249102	total: 5m 23s	remaining: 4m 27s
1094:	learn: 0.8249973	total: 5m 23s	remaining: 4m 27s
1095:	learn: 0.8255482	total: 5m 23s	remaining: 4m 27s
1096:	lear

1229:	learn: 0.8350493	total: 6m 4s	remaining: 3m 47s
1230:	learn: 0.8350313	total: 6m 4s	remaining: 3m 47s
1231:	learn: 0.8351265	total: 6m 4s	remaining: 3m 47s
1232:	learn: 0.8352714	total: 6m 4s	remaining: 3m 47s
1233:	learn: 0.8351990	total: 6m 5s	remaining: 3m 46s
1234:	learn: 0.8351672	total: 6m 5s	remaining: 3m 46s
1235:	learn: 0.8357160	total: 6m 5s	remaining: 3m 46s
1236:	learn: 0.8357298	total: 6m 6s	remaining: 3m 45s
1237:	learn: 0.8355982	total: 6m 6s	remaining: 3m 45s
1238:	learn: 0.8358111	total: 6m 6s	remaining: 3m 45s
1239:	learn: 0.8358567	total: 6m 7s	remaining: 3m 44s
1240:	learn: 0.8360151	total: 6m 7s	remaining: 3m 44s
1241:	learn: 0.8360290	total: 6m 7s	remaining: 3m 44s
1242:	learn: 0.8360151	total: 6m 7s	remaining: 3m 44s
1243:	learn: 0.8358339	total: 6m 8s	remaining: 3m 43s
1244:	learn: 0.8361240	total: 6m 8s	remaining: 3m 43s
1245:	learn: 0.8361557	total: 6m 8s	remaining: 3m 43s
1246:	learn: 0.8362419	total: 6m 9s	remaining: 3m 42s
1247:	learn: 0.8363864	total

1379:	learn: 0.8446455	total: 6m 48s	remaining: 3m 3s
1380:	learn: 0.8448309	total: 6m 49s	remaining: 3m 3s
1381:	learn: 0.8447599	total: 6m 49s	remaining: 3m 3s
1382:	learn: 0.8447599	total: 6m 49s	remaining: 3m 2s
1383:	learn: 0.8450102	total: 6m 49s	remaining: 3m 2s
1384:	learn: 0.8450643	total: 6m 50s	remaining: 3m 2s
1385:	learn: 0.8451978	total: 6m 50s	remaining: 3m 1s
1386:	learn: 0.8451978	total: 6m 50s	remaining: 3m 1s
1387:	learn: 0.8453770	total: 6m 51s	remaining: 3m 1s
1388:	learn: 0.8453290	total: 6m 51s	remaining: 3m 1s
1389:	learn: 0.8453312	total: 6m 51s	remaining: 3m
1390:	learn: 0.8452001	total: 6m 52s	remaining: 3m
1391:	learn: 0.8452481	total: 6m 52s	remaining: 3m
1392:	learn: 0.8452565	total: 6m 52s	remaining: 2m 59s
1393:	learn: 0.8453647	total: 6m 52s	remaining: 2m 59s
1394:	learn: 0.8454021	total: 6m 53s	remaining: 2m 59s
1395:	learn: 0.8454250	total: 6m 53s	remaining: 2m 58s
1396:	learn: 0.8453625	total: 6m 53s	remaining: 2m 58s
1397:	learn: 0.8454792	total: 6m

1530:	learn: 0.8549511	total: 7m 33s	remaining: 2m 18s
1531:	learn: 0.8550206	total: 7m 33s	remaining: 2m 18s
1532:	learn: 0.8550436	total: 7m 34s	remaining: 2m 18s
1533:	learn: 0.8551899	total: 7m 34s	remaining: 2m 18s
1534:	learn: 0.8551899	total: 7m 34s	remaining: 2m 17s
1535:	learn: 0.8555361	total: 7m 34s	remaining: 2m 17s
1536:	learn: 0.8556901	total: 7m 35s	remaining: 2m 17s
1537:	learn: 0.8556596	total: 7m 35s	remaining: 2m 16s
1538:	learn: 0.8557979	total: 7m 35s	remaining: 2m 16s
1539:	learn: 0.8557902	total: 7m 36s	remaining: 2m 16s
1540:	learn: 0.8558287	total: 7m 36s	remaining: 2m 15s
1541:	learn: 0.8557289	total: 7m 36s	remaining: 2m 15s
1542:	learn: 0.8555520	total: 7m 36s	remaining: 2m 15s
1543:	learn: 0.8555902	total: 7m 37s	remaining: 2m 15s
1544:	learn: 0.8555287	total: 7m 37s	remaining: 2m 14s
1545:	learn: 0.8555439	total: 7m 37s	remaining: 2m 14s
1546:	learn: 0.8555591	total: 7m 38s	remaining: 2m 14s
1547:	learn: 0.8557286	total: 7m 38s	remaining: 2m 13s
1548:	lear

1681:	learn: 0.8646522	total: 8m 18s	remaining: 1m 34s
1682:	learn: 0.8646059	total: 8m 18s	remaining: 1m 33s
1683:	learn: 0.8645900	total: 8m 19s	remaining: 1m 33s
1684:	learn: 0.8646812	total: 8m 19s	remaining: 1m 33s
1685:	learn: 0.8647795	total: 8m 19s	remaining: 1m 33s
1686:	learn: 0.8648417	total: 8m 19s	remaining: 1m 32s
1687:	learn: 0.8649777	total: 8m 20s	remaining: 1m 32s
1688:	learn: 0.8649849	total: 8m 20s	remaining: 1m 32s
1689:	learn: 0.8649704	total: 8m 20s	remaining: 1m 31s
1690:	learn: 0.8646277	total: 8m 21s	remaining: 1m 31s
1691:	learn: 0.8646884	total: 8m 21s	remaining: 1m 31s
1692:	learn: 0.8647419	total: 8m 21s	remaining: 1m 30s
1693:	learn: 0.8647419	total: 8m 22s	remaining: 1m 30s
1694:	learn: 0.8648634	total: 8m 22s	remaining: 1m 30s
1695:	learn: 0.8651150	total: 8m 22s	remaining: 1m 30s
1696:	learn: 0.8653579	total: 8m 22s	remaining: 1m 29s
1697:	learn: 0.8653579	total: 8m 23s	remaining: 1m 29s
1698:	learn: 0.8653882	total: 8m 23s	remaining: 1m 29s
1699:	lear

1832:	learn: 0.8709893	total: 9m 3s	remaining: 49.5s
1833:	learn: 0.8710357	total: 9m 3s	remaining: 49.2s
1834:	learn: 0.8709961	total: 9m 4s	remaining: 48.9s
1835:	learn: 0.8710331	total: 9m 4s	remaining: 48.6s
1836:	learn: 0.8710357	total: 9m 4s	remaining: 48.3s
1837:	learn: 0.8712026	total: 9m 5s	remaining: 48s
1838:	learn: 0.8712095	total: 9m 5s	remaining: 47.7s
1839:	learn: 0.8712929	total: 9m 5s	remaining: 47.5s
1840:	learn: 0.8713204	total: 9m 6s	remaining: 47.2s
1841:	learn: 0.8714545	total: 9m 6s	remaining: 46.9s
1842:	learn: 0.8715337	total: 9m 6s	remaining: 46.6s
1843:	learn: 0.8715870	total: 9m 6s	remaining: 46.3s
1844:	learn: 0.8715939	total: 9m 7s	remaining: 46s
1845:	learn: 0.8716540	total: 9m 7s	remaining: 45.7s
1846:	learn: 0.8716677	total: 9m 7s	remaining: 45.4s
1847:	learn: 0.8716910	total: 9m 8s	remaining: 45.1s
1848:	learn: 0.8720750	total: 9m 8s	remaining: 44.8s
1849:	learn: 0.8721284	total: 9m 8s	remaining: 44.5s
1850:	learn: 0.8720149	total: 9m 8s	remaining: 44.

1985:	learn: 0.8773226	total: 9m 49s	remaining: 4.16s
1986:	learn: 0.8772461	total: 9m 49s	remaining: 3.86s
1987:	learn: 0.8772629	total: 9m 50s	remaining: 3.56s
1988:	learn: 0.8772629	total: 9m 50s	remaining: 3.26s
1989:	learn: 0.8772863	total: 9m 50s	remaining: 2.97s
1990:	learn: 0.8773460	total: 9m 50s	remaining: 2.67s
1991:	learn: 0.8773058	total: 9m 51s	remaining: 2.37s
1992:	learn: 0.8772695	total: 9m 51s	remaining: 2.08s
1993:	learn: 0.8775716	total: 9m 51s	remaining: 1.78s
1994:	learn: 0.8775781	total: 9m 52s	remaining: 1.48s
1995:	learn: 0.8775819	total: 9m 52s	remaining: 1.19s
1996:	learn: 0.8774989	total: 9m 52s	remaining: 890ms
1997:	learn: 0.8775223	total: 9m 53s	remaining: 594ms
1998:	learn: 0.8774327	total: 9m 53s	remaining: 297ms
1999:	learn: 0.8775223	total: 9m 53s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f4f46730eb8>

In [12]:
# Column 1 of predict_proba (the class-1 score) for every row of the full test set.
fraud_proba = model.predict_proba(X_test_all)[:, 1]

print(X_test_all.index)

# Turn scores into hard labels, kept as floats: strictly above the threshold -> 1, otherwise 0.
th = 0.37
y_test_pred_cat = np.where(fraud_proba > th, 1.0, 0.0)

Int64Index([1521787, 1521788, 1521789, 1521790, 1521791, 1521792, 1521793,
            1521794, 1521795, 1521796,
            ...
            1943442, 1943443, 1943444, 1943445, 1943446, 1943447, 1943448,
            1943449, 1943450, 1943451],
           dtype='int64', length=421665)


## write csv

In [18]:
import time

# Scratch check: show the current local time, then the month+day tag built from it.
print(time.localtime())
a = time.localtime()
print('{}{}'.format(a.tm_mon, a.tm_mday))

time.struct_time(tm_year=2019, tm_mon=10, tm_mday=23, tm_hour=19, tm_min=50, tm_sec=36, tm_wday=2, tm_yday=296, tm_isdst=0)
1023


In [19]:
import csv
import time

result = y_test_pred_cat
# txkey of every row with locdt > 90 (the rows this notebook treats as test data).
test_data_txkey = all_data.loc[all_data['locdt'] > 90, 'txkey'].values

# A length mismatch would pair predictions with the wrong transactions, so fail loudly
# instead of silently truncating or raising an IndexError halfway through the file.
if test_data_txkey.shape[0] != result.shape[0]:
    raise ValueError('txkey count ({}) does not match prediction count ({})'.format(
        test_data_txkey.shape[0], result.shape[0]))

print('{}: prediction positive ratio'.format(result.sum()/result.shape[0]))
print('{}: training positive ratio'.format(y_train_all.sum()/y_train_all.shape[0]))

# Zero-padded MMDDHHMM tag: concatenating the raw integers is ambiguous
# (1/23 and 12/3 both give "123") and could make two runs share a file name.
t_now = time.localtime()
t = time.strftime('%m%d%H%M', t_now)
print('Now:',t)

submit_file_name='submit_cat_th{}_time{}.csv'.format(th,t)
# newline='' lets the csv module control line endings (avoids blank rows on Windows).
with open('../prediction/{}'.format(submit_file_name),'w',newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['txkey','fraud_ind'])
    writer.writerows(zip(test_data_txkey, result))

# Record which feature-drop list produced this submission.
# NOTE(review): mode 'w' keeps only the latest record; switch to 'a' if a history is wanted.
with open('../prediction/log.txt','w') as f:
    print('{}'.format(submit_file_name),file=f)
    print('delete_list:\n{}'.format(delete_list),file=f)

0.013385033142423487: prediction positive ratio
0.013375722095142093: training positive ratio
Now: 10231952


## 贝叶斯调参
* http://nohup.cc/article/258/
* https://github.com/fmfn/BayesianOptimization

Results:
{'params': {'bagging_temperature': 1.0,
  'learning_rate': 0.2,
  'reg_lambda': 17.267046492047776},
 'target': 0.5339179149920837}

In [14]:
# Disabled experiment: Bayesian optimisation (bayes_opt) of three CatBoost
# hyper-parameters (bagging_temperature, reg_lambda, learning_rate).
# cat_train fits one CatBoost model per proposed point and returns the best
# validation F1, which the optimiser maximises.
# from bayes_opt import BayesianOptimization
# def cat_train(bagging_temperature, reg_lambda, learning_rate):
#     params = {
#         'iterations':2000,
#         'depth':6,
#         'bagging_temperature':bagging_temperature,
#         'reg_lambda':reg_lambda,
#         'learning_rate':learning_rate,
#         'loss_function':'Logloss',
#         'eval_metric':'F1',
#         'random_seed':random_seed,
#         'verbose':30
#     }
 
#     model = CatBoostClassifier(**params)
#     # The eval set is the validation split; the evaluation metric is F1 (see 'eval_metric' above)
#     model.fit(X_train1, y_train1,\
#               eval_set=(X_test1, y_test1),\
#               cat_features=categorical_features_indices,\
#               early_stopping_rounds=200) 
     
#     print(params)
#     score_max = model.best_score_.get('validation').get('F1')
#     return score_max
 
# cat_opt = BayesianOptimization(cat_train, 
#                            {
#                               'bagging_temperature': (1, 50),  
#                               'reg_lambda': (1, 200),
#                               'learning_rate':(0.05, 0.2)
#                             })
 
# NOTE(review): init_points is given random_seed; in bayes_opt init_points looks like
# the number of random exploration steps, so the seed probably belongs in
# BayesianOptimization(..., random_state=random_seed) - confirm before re-enabling.
# cat_opt.maximize(n_iter=15, init_points=random_seed)
# cat_opt.max

|   iter    |  target   | baggin... | learni... | reg_la... |
-------------------------------------------------------------
0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 272ms	remaining: 9m 3s
30:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 7.55s	remaining: 7m 59s
60:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 14.4s	remaining: 7m 37s
90:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 21.3s	remaining: 7m 27s
120:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 28.2s	remaining: 7m 18s
150:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 35.2s	remaining: 7m 10s
180:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 42.1s	remaining: 7m 2s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0
bestIteration = 0

Shrink model to first 1 iterations.
{'eval_metric': 'F1', 'learning_rate': 0.18080951378529359, 'bagging_temperature': 40.878019189903966, 'loss_function': 'Logloss', 'verb

KeyboardInterrupt: 

## 將使用者分類來訓練模型:
1. 根據cano的個數分類
2. 根據txkey的個數分類
3. 根據stocn的眾數分類

murmur: 感覺這方法怪怪的,tree模型應該就能涵蓋進去

In [45]:
## txkey: bucket rows into equal-frequency groups of bacno_txkey_nunique
n_groups = 3
txkey_qcut_id = pd.qcut(all_data['bacno_txkey_nunique'], q=n_groups, labels=list(range(n_groups)))

# Debug aids:
# print(all_data['bacno_txkey_nunique'])
# print(txkey_qcut_id)

# Placeholder for one model per bucket; the loop body is still a stub.
models = []
for i in range(n_groups):
    print(i)

0           11
1           11
2           11
3           11
4           11
5           11
6           11
7           11
8           11
9           11
10          11
11         158
12         158
13         158
14         158
15         158
16         158
17         158
18         158
19         158
20         158
21         158
22         158
23         158
24         158
25         158
26         158
27         158
28         158
29         158
          ... 
1943422     17
1943423     17
1943424     17
1943425     17
1943426     17
1943427     17
1943428     17
1943429     17
1943430     17
1943431     17
1943432     17
1943433     17
1943434     17
1943435     17
1943436     17
1943437     17
1943438     17
1943439     11
1943440     11
1943441     11
1943442     11
1943443     11
1943444     11
1943445     11
1943446     11
1943447     11
1943448     11
1943449     11
1943450      2
1943451      2
Name: bacno_txkey_nunique, Length: 1943452, dtype: int64
0          0
1          0
2 

## Anomaly detection
* one class svm
* isolation tree
* replicator NN
* Kmeans?
* KNN(take too much time)

## 異常偵測
wiki<br>
https://zh.wikipedia.org/wiki/%E5%BC%82%E5%B8%B8%E6%A3%80%E6%B5%8B#cite_note-9

因為盜刷很可能都是outlier，一般的機器學習方法在outlier上表現會很差，因此可以用來解釋為什麼會train不好的原因
http://www.cainiaoxueyuan.com/suanfa/7017.html<br>
https://towardsdatascience.com/outlier-detection-with-isolation-forest-3d190448d45e<br>
https://medium.com/@cyeninesky3/oneclass-svm-%E7%95%B0%E5%B8%B8%E6%AA%A2%E6%B8%AC%E4%BB%BB%E5%8B%99-anomaly-detection-%E7%9A%84%E7%AE%97%E6%B3%95%E7%90%86%E8%A7%A3%E8%88%87%E5%AF%A6%E8%B8%90-cf5f0bbb01c0<br>

isolation tree<br>
https://zhuanlan.zhihu.com/p/25040651<br>
https://scikit-learn.org/stable/auto_examples/ensemble/plot_isolation_forest.html<br>
https://towardsdatascience.com/outlier-detection-with-extended-isolation-forest-1e248a3fe97b

oneclass svm<br>
https://scikit-learn.org/stable/auto_examples/svm/plot_oneclass.html

Replicator NN<br>
https://togaware.com/papers/dawak02.pdf

one class kmeans?<br>
https://ai100-2.cupoy.com/mission/D57

## 製作特徵
XGB, LGB, PCA, Isolation Forest, Kmean距離？, oneclass SVM?
當作新feature

In [None]:
import xgboost as xgb


def xgb_f1_error(y_pred, dtrain):
    """Custom XGBoost evaluation metric: 1 - F1 at a 0.5 decision threshold.

    The LightGBM helper lgb_f1_score cannot be reused here: it is defined in a
    later cell, takes (y_true, y_pred) and returns a 3-tuple.

    NOTE(review): assumes the XGBoost sklearn-wrapper contract in which a callable
    eval_metric receives (predictions, DMatrix), returns a (name, value) pair and is
    minimised for early stopping - confirm against the installed xgboost version.

    Args:
        y_pred: predicted scores for the evaluated rows.
        dtrain: evaluated data set; its labels are read with get_label().

    Returns:
        ('f1_err', 1 - F1), so that a lower value means a better model.
    """
    y_true = dtrain.get_label()
    y_hat = (np.asarray(y_pred) > 0.5).astype(int)
    return 'f1_err', 1.0 - f1_score(y_true, y_hat)


param_dist_xgb = {'learning_rate':0.01, # library default is 0.3
              'n_estimators':1000, # number of trees
#               'max_depth':5,
#               'min_child_weight':1,
#               'gamma':0.2,
#               'subsample':0.8,
#               'colsample_bytree':0.8,
#               'objective': 'binary:logistic', # logistic-regression loss
#               'nthread':4,  # number of CPU threads
#               'scale_pos_weight':1,
              'seed':random_seed}  # random seed

xgb_clf = xgb.XGBClassifier(**param_dist_xgb)
xgb_clf.fit(X_train, y_train,
        eval_set=[(X_train, y_train),(X_test, y_test)],
        eval_metric=xgb_f1_error,
        early_stopping_rounds=600,
        verbose=True,
        )

# Per-round metric history recorded by the fitted model for both eval sets.
evals_result = xgb_clf.evals_result()

print('F1',f1_score(y_test, xgb_clf.predict(X_test)))
# Leaf index reached in every tree, per sample - kept as candidate new features.
xgb_X_train = xgb_clf.apply(X_train)
xgb_X_test = xgb_clf.apply(X_test)

## Train on LGB(未調參數)(效果不好)

In [None]:
# Summary of the run: dropped features, training shape and the positive-label ratio per split.
print(delete_list)
print('Training num', X_train1.shape)
for split_name, split_y in (('train', y_train1), ('test', y_test1)):
    print('positive label ratio-{}'.format(split_name), split_y.sum() / split_y.shape[0])

def lgb_f1_score(y_true, y_pred):
    """Custom LightGBM evaluation metric: F1 of the rounded predictions.

    Also prints the confusion-matrix counts on every call, as a training-time diagnostic.

    Args:
        y_true: ground-truth 0/1 labels.
        y_pred: predicted scores; rounded to 0/1 before scoring
            (presumably probabilities in [0, 1] - confirm for the objective in use).

    Returns:
        ('f1', f1_value, True) - the trailing True marks the metric as higher-is-better.
    """
    y_pred = np.round(y_pred) # scikit-learn's f1_score needs hard labels, not probabilities
    # labels=[0, 1] forces a 2x2 matrix, so the 4-way unpack still works when
    # only one class is present in the evaluated rows.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    print()
    print('tn, fp, fn, tp')
    print(tn, fp, fn, tp)
    return 'f1', f1_score(y_true, y_pred), True

# Untuned LightGBM settings. Candidates that were tried and left disabled:
# num_leaves=45, max_depth=5, subsample=1, colsample_bytree=0.5,
# lambda_l1=0.1, lambda_l2=0, min_child_weight=1.
param_dist_lgb = dict(
    learning_rate=0.1,
    n_estimators=600,
    objective='binary',
    random_state=random_seed,
)
# Filled in by the record_evaluation callback during fit.
evals_result = {}

lgb_clf = LGBMClassifier(**param_dist_lgb)
lgb_clf.fit(
    X_train1, y_train1,
    eval_set=[(X_train1, y_train1), (X_test1, y_test1)],
    eval_metric=lgb_f1_score,
    early_stopping_rounds=50,
    verbose=True,
    callbacks=[lgb.record_evaluation(evals_result)],
)

# Validation-split score and confusion-matrix counts for the fitted model.
y_test_pred = lgb_clf.predict(X_test1)
print('F1', f1_score(y_test1, y_test_pred))
tn, fp, fn, tp = confusion_matrix(y_test1, y_test_pred).ravel()
print(tn, fp, fn, tp)

In [None]:
# Index of the tree to draw (the 4th tree, zero-based).
tree_idx = 3

# F1 curve recorded per boosting round for each eval set.
print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='f1')
plt.show()

# Top 30 features by importance.
print('Plotting feature importances...')
ax = lgb.plot_importance(lgb_clf, max_num_features=30)
plt.show()

# Same tree twice: once through matplotlib, once through graphviz.
print('Plotting 4th tree...')  # one tree use categorical feature to split
ax = lgb.plot_tree(lgb_clf, tree_index=tree_idx, figsize=(15, 15), show_info=['split_gain'])
plt.show()

print('Plotting 4th tree with graphviz...')
graph = lgb.create_tree_digraph(lgb_clf, tree_index=tree_idx, name='Tree4')
graph.render(view=True)

In [None]:
# Build the table from named columns so `importance` keeps its numeric dtype
# (stacking names and counts into one array coerces both columns to object),
# and sort without mutating in place so the cell is safe to re-run.
feature_importance = pd.DataFrame({
    'feature_name': X_train1.columns.values,
    'importance': lgb_clf.feature_importances_,
}).sort_values(by='importance', ascending=False)
print(feature_importance)

## PCA visualization in one person who has fraud data

In [None]:
from sklearn.decomposition import PCA
def PCA_plot(x,label):
    """Scatter-plot the first two principal components of `x`, coloured by `label`.

    Columns listed in the notebook-level `delete_list` are dropped, the rest are
    standardised, projected to 2-D with PCA and drawn with rows where label == 1
    in red (large markers) and rows where label == 0 in blue (small markers).

    Args:
        x: DataFrame of transactions; must contain the columns in `delete_list`.
        label: 0/1 labels aligned row-by-row with `x`.
    """
    from sklearn.preprocessing import StandardScaler

    x = x.drop(columns=delete_list)

    ## Ideally: dummy-encode first, then standardise, then PCA
#     dummy_list=['contp','etymd','stscd','hcefg']
#     dummy_list2=['stocn','scity','csmcu']#'mchno','acqic','mcc',
#     x[dummy_list] = x[dummy_list].astype(object)
#     x[dummy_list2] = x[dummy_list2].astype(object)
#     x = pd.get_dummies(x)

    x = StandardScaler().fit_transform(x)
    print(x.shape,label.sum())

    train_data_pca = PCA(n_components=2).fit_transform(x)
    # Plain array so the boolean masks are positional, whatever index `label` carries.
    label_values = np.asarray(label)
    train_data_pca1 = train_data_pca[label_values==1]
    train_data_pca0 = train_data_pca[label_values==0]

    # One explicit figure per call: the earlier plt.clf() + plt.figure() pair left an
    # extra empty figure behind each time the function ran.
    fig, ax = plt.subplots(figsize=(10,10))
    ax.scatter(train_data_pca1[:, 0], train_data_pca1[:, 1], c='r',label='fraud transaction',s=100)
    ax.scatter(train_data_pca0[:, 0], train_data_pca0[:, 1], c='b',label='normal transaction',s=3)
    ax.legend()
    plt.show()
    # Release the figure; this function is called in a loop over accounts.
    plt.close(fig)
    plt.show()
    
# Accounts (bacno) that have at least one row flagged fraud_ind == 1.
fraud_rows = all_data[all_data['fraud_ind']==1]
bacno_hasfraud = fraud_rows['bacno'].unique()
print(bacno_hasfraud.shape[0])
print(fraud_rows.shape[0])

# Count rows per account once, instead of re-filtering the whole table
# several times for every fraud account.
bacno_counts = all_data['bacno'].value_counts()

# Only accounts with more than 300 transactions are plotted.
for bacno in bacno_hasfraud:
    if bacno_counts.get(bacno, 0)>300:
        print('Ploting PCA on bacno-{}'.format(bacno))
        bacno_rows = all_data[all_data['bacno']==bacno]
        PCA_plot(bacno_rows, bacno_rows['fraud_ind'])

## TSNE, Kmeans作圖?

## Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score

# Share of positive labels in the training target, used as the expected contamination.
c_ratio = y_train.sum()/y_train.shape[0]

# NOTE(review): max_features=1 is an int; scikit-learn reads an int as "use exactly one
# feature per tree", while 1.0 means "all features" - confirm which one was intended.
clf = IsolationForest(
    behaviour='new',
    max_samples=0.8,
    max_features=1,
    random_state=random_seed,
    contamination=c_ratio,
)
clf.fit(X_train)

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# predict() marks outliers as -1 and inliers as +1; recode to 1 = outlier, 0 = inlier
# so the result is comparable with the 0/1 target.
y_pred_test2 = np.where(y_pred_test == -1, 1, 0)
y_pred_train2 = np.where(y_pred_train == -1, 1, 0)

print(f1_score(y_train, y_pred_train2))
print(f1_score(y_test, y_pred_test2))

# Raw anomaly scores, kept as candidate new features.
isolationtree_X_train = clf.score_samples(X_train)
isolationtree_X_test = clf.score_samples(X_test)

print(isolationtree_X_train)

## One class SVM

In [None]:
from sklearn import svm
from sklearn.metrics import f1_score

clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma='scale',verbose=True, random_state=random_seed)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# predict() marks outliers as -1 and inliers as +1; recode to 1 = outlier, 0 = inlier.
# Both splits are recoded here: previously y_pred_test2 was never recomputed, so the
# test F1 printed below was still scoring the Isolation Forest predictions.
y_pred_train2 = np.where(y_pred_train == -1, 1, 0)
y_pred_test2 = np.where(y_pred_test == -1, 1, 0)

print(f1_score(y_train, y_pred_train2))
print(f1_score(y_test, y_pred_test2))

# Raw SVM scores, kept as candidate new features.
svm_X_train = clf.score_samples(X_train)
svm_X_test = clf.score_samples(X_test)

# Show this model's scores (the earlier version printed the Isolation Forest ones).
print(svm_X_train)

## one class Kmeans

In [None]:
# 用hinge loss(當SVM)

In [None]:
# X_train['cents']
# encoding data

# GroupKfold
# vanilla KFold