### Task Schedule:
1. 訓練模型,調整參數(預計使用lgb，速度較快)(更:使用catboost,效果較好)
2. 嘗試使用不同模型,做Ensemble(blending, stacking)
3. Anomaly detection

### 注意事項:
1. 因為test data和train data時間不相關,在驗證時採取前60天訓練61~90天驗證,但仍需小心時間差異造成的影響
2. Anomaly detection: 看這類的模型能不能取代boosting(似乎是不行，盜刷數據並沒有那麼Anomaly),但可以嘗試將Anomaly結果當成新feature

### <font color=green>Results:</font>

#### Catboost:
    * FE1~4,catboost訓練 validation:0.5, LB:0.55

#### LGB:
    * 不做處理,直接丟lgb訓練 leaderboard score:0.45

## 讀取,轉換字串成可以訓練的資料

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import math

import lightgbm as lgb
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

%matplotlib inline
# Base data directory, given relative to the notebook's working directory.
data_path = '../data'

# Seed passed to the CatBoost parameters ('random_seed') so runs are repeatable.
random_seed = 20

In [2]:
# Files to combine column-wise into one frame; FE_data2.csv is currently left out.
data_list = ['raw_data.csv', 'FE_data1.csv', 'FE_data3.csv', 'FE_data4.csv']  # ,'FE_data2.csv'

# Files whose missing values are replaced by the sentinel -1. In the recorded
# run the nulls of FE_data1.csv are in its skew/kurt/var statistic columns.
files_filled_with_sentinel = ('FE_data1.csv', 'FE_data2.csv')

data = []
for file_name in data_list:
    # Build the path from the configured data directory rather than repeating it.
    frame = pd.read_csv(os.path.join(data_path, 'preprocess', file_name))
    print(file_name, frame.shape)
    null_counts = frame.isnull().sum()
    print("Null columns:\n", null_counts[null_counts > 0])

    if file_name in files_filled_with_sentinel:
        # Rebind instead of mutating in place so the step is explicit.
        frame = frame.fillna(value=-1)

    data.append(frame)

# concat(axis=1) aligns on the row index and would silently pad shorter files
# with NaN, so require every file to describe the same number of rows first.
row_counts = {name: len(frame) for name, frame in zip(data_list, data)}
assert len(set(row_counts.values())) == 1, (
    'feature files have different row counts: {}'.format(row_counts)
)

all_data = pd.concat(data, axis=1)

# After filling, report any column that still contains nulls.
all_data_numsum = all_data.isnull().sum()
print('ALL data null:')
print(all_data_numsum[all_data_numsum > 0])

raw_data.csv (1943452, 23)
Null columns:
 fraud_ind    421665
dtype: int64
FE_data1.csv (1943452, 50)
Null columns:
 cano_conam_skew      92612
cano_conam_kurt     155720
cano_conam_var       38678
bacno_locdt_skew     58303
bacno_locdt_kurt    101191
cano_locdt_skew      92612
cano_locdt_kurt     155720
dtype: int64
FE_data3.csv (1943452, 3)
Null columns:
 Series([], dtype: int64)
FE_data4.csv (1943452, 6)
Null columns:
 Series([], dtype: int64)
ALL data null:
fraud_ind    421665
dtype: int64


In [3]:
## Drop features that may overfit, follow a different distribution,
## or are strongly affected by time.

# Column groups; each one can be toggled by adding/removing it below.
delete_list1 = [
    'bacno', 'locdt', 'loctm', 'cano',
    'fraud_ind',  # the target column, used as y when the splits are built
]
delete_list2 = ['mchno', 'acqic', 'mcc']
delete_list3 = ['stocn', 'scity', 'csmcu']
delete_list4 = ['iterm']
delete_list6 = [
    'mchno_fraud_mean',
    'mcc_fraud_mean',
    'acqic_fraud_mean',
]
delete_list7 = [
    'bacno_locdt_skew', 'bacno_locdt_kurt',
    'cano_locdt_skew', 'cano_locdt_kurt',
]
delete_list8 = ['bacno_lastlocdt', 'cano_lastlocdt']

# Defined but NOT part of delete_list, so these columns remain features.
delete_list5 = [
    'contp', 'etymd', 'hcefg', 'insfg',
    'ovrlt', 'flbmk', 'flg_3dsmk',
]
# cano_hasfraud_before

# Final list of columns to drop, in the same order as before; 'txkey' sits
# between group 7 and group 8.
delete_list = []
for group in (
    delete_list1, delete_list2, delete_list3, delete_list4,
    delete_list6, delete_list7, ['txkey'], delete_list8,
):
    delete_list.extend(group)

In [4]:
# Columns to store with pandas' 'category' dtype.
category_list = [
    'csmcu', 'hcefg', 'stscd', 'scity', 'stocn',
    'mcc', 'acqic', 'mchno', 'etymd', 'contp',
]
as_categories = all_data[category_list].astype('category')
all_data[category_list] = as_categories

In [5]:
## Build three time-based train/validation splits, plus the final train/test split.

# Last day (locdt) used for training/validation; later rows form the prediction set.
# NOTE(review): presumably rows with locdt > 90 are the unlabeled test period
# (the load output reports 421665 null fraud_ind values) - confirm.
LAST_TRAIN_DAY = 90


def split_by_day(frame, train_end, valid_end=LAST_TRAIN_DAY):
    """Split `frame` on `locdt` into a training part and a later validation part.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain `locdt`, `fraud_ind` and every column named in `delete_list`.
    train_end : int
        Rows with ``locdt <= train_end`` form the training part.
    valid_end : int
        Rows with ``train_end < locdt <= valid_end`` form the validation part.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid)``; the X frames have the
        `delete_list` columns removed, the y series are `fraud_ind`.
    """
    day = frame['locdt']
    # Evaluate each mask once and reuse it for both features and target.
    train_rows = day <= train_end
    valid_rows = (day > train_end) & (day <= valid_end)
    return (
        frame[train_rows].drop(columns=delete_list),
        frame.loc[train_rows, 'fraud_ind'],
        frame[valid_rows].drop(columns=delete_list),
        frame.loc[valid_rows, 'fraud_ind'],
    )


# Three validation schemes: train up to day 60 / 45 / 30, validate up to day 90.
X_train1, y_train1, X_test1, y_test1 = split_by_day(all_data, 60)
X_train2, y_train2, X_test2, y_test2 = split_by_day(all_data, 45)
X_train3, y_train3, X_test3, y_test3 = split_by_day(all_data, 30)

# Final fit uses every row up to LAST_TRAIN_DAY; the remaining rows are predicted.
train_all_rows = all_data['locdt'] <= LAST_TRAIN_DAY
test_all_rows = all_data['locdt'] > LAST_TRAIN_DAY

# Keep txkey of the prediction rows separately because delete_list drops it.
test_data_txkey = all_data.loc[test_all_rows, 'txkey'].copy().values
X_train_all = all_data[train_all_rows].drop(columns=delete_list)
y_train_all = all_data.loc[train_all_rows, 'fraud_ind']

X_test_all = all_data[test_all_rows].drop(columns=delete_list)
# y_test_all = all_data[all_data['locdt']>90]['fraud_ind']

## Train on catboost
* https://catboost.ai/docs/concepts/python-reference_parameters-list.html
* 研究有哪些可以用的function

In [6]:
# Integer positions of the categorical columns that are still present after the
# column drop; these positions are later passed to CatBoost as `cat_features`.
categorical_features_indices = np.where(X_train1.columns.isin(category_list))[0]

# Select by position explicitly with .iloc: indexing a string-labelled Series
# with plain [] and integers relies on a positional fallback that newer pandas
# versions deprecate.
categorical_dtypes = X_train1.dtypes.iloc[categorical_features_indices]
print(categorical_dtypes)

# Names of those categorical columns.
category_list2 = categorical_dtypes.index

contp    category
etymd    category
hcefg    category
stscd    category
dtype: object


In [7]:
# Keyword arguments for CatBoostClassifier, shared by the validation fit and
# the final fit.
param_cat = dict(
    # Objective and the metric shown in the training log.
    loss_function='Logloss',
    eval_metric='F1',

    # Boosting schedule and regularisation.
    iterations=2000,
    learning_rate=0.1,
    l2_leaf_reg=3,
    # sampling_frequency='PerTreeLevel',

    # Tree shape and categorical handling.
    depth=6,
    one_hot_max_size=300,

    # min_data_in_leaf=1,
    # max_leaves=31,
    # task_type="GPU",
    # devices='1',
    rsm=1,
    scale_pos_weight=1,
    target_border=0.5,
    random_seed=random_seed,
    verbose=True,
)

In [8]:
from catboost import CatBoostClassifier, Pool

# Show which column positions are treated as categorical.
print(categorical_features_indices)

# Validation run on split 1: fit on X_train1/y_train1 and monitor X_test1/y_test1.
validation_pair = (X_test1, y_test1)
model = CatBoostClassifier(**param_cat)
model.fit(
    X_train1,
    y_train1,
    cat_features=categorical_features_indices,
    eval_set=validation_pair,
    early_stopping_rounds=200,
    # use_best_model=True,
    silent=False,
    # plot=True,
)

# Report the fitted state and the parameters the model was built with.
print('Model is fitted: {}'.format(model.is_fitted()))
print('Model params:')
print(model.get_params())


# preds_class = model.predict(test_data)
# preds_proba = model.predict_proba(test_data)

[1 3 6 9]
0:	learn: 0.5536888	test: 0.2988986	best: 0.2988986 (0)	total: 378ms	remaining: 12m 35s
1:	learn: 0.5141290	test: 0.2812723	best: 0.2988986 (0)	total: 671ms	remaining: 11m 10s
2:	learn: 0.5705763	test: 0.3198699	best: 0.3198699 (2)	total: 975ms	remaining: 10m 49s
3:	learn: 0.5822443	test: 0.3431858	best: 0.3431858 (3)	total: 1.28s	remaining: 10m 36s
4:	learn: 0.5854157	test: 0.3429101	best: 0.3431858 (3)	total: 1.59s	remaining: 10m 34s
5:	learn: 0.5949167	test: 0.3471338	best: 0.3471338 (5)	total: 1.91s	remaining: 10m 33s
6:	learn: 0.5942397	test: 0.3436299	best: 0.3471338 (5)	total: 2.22s	remaining: 10m 32s
7:	learn: 0.6065934	test: 0.3517524	best: 0.3517524 (7)	total: 2.5s	remaining: 10m 23s
8:	learn: 0.6056378	test: 0.3542571	best: 0.3542571 (8)	total: 2.82s	remaining: 10m 24s
9:	learn: 0.6077526	test: 0.3550574	best: 0.3550574 (9)	total: 3.13s	remaining: 10m 22s
10:	learn: 0.6091383	test: 0.3557394	best: 0.3557394 (10)	total: 3.45s	remaining: 10m 23s
11:	learn: 0.6106957	

93:	learn: 0.6983384	test: 0.4511108	best: 0.4524615 (92)	total: 28s	remaining: 9m 28s
94:	learn: 0.6984684	test: 0.4530428	best: 0.4530428 (94)	total: 28.3s	remaining: 9m 28s
95:	learn: 0.6987839	test: 0.4544201	best: 0.4544201 (95)	total: 28.6s	remaining: 9m 27s
96:	learn: 0.6989210	test: 0.4542492	best: 0.4544201 (95)	total: 28.9s	remaining: 9m 27s
97:	learn: 0.6993731	test: 0.4574362	best: 0.4574362 (97)	total: 29.2s	remaining: 9m 27s
98:	learn: 0.6999144	test: 0.4574362	best: 0.4574362 (97)	total: 29.5s	remaining: 9m 27s
99:	learn: 0.7005804	test: 0.4564565	best: 0.4574362 (97)	total: 29.9s	remaining: 9m 27s
100:	learn: 0.7011288	test: 0.4561491	best: 0.4574362 (97)	total: 30.2s	remaining: 9m 27s
101:	learn: 0.7012805	test: 0.4558768	best: 0.4574362 (97)	total: 30.5s	remaining: 9m 27s
102:	learn: 0.7011208	test: 0.4558547	best: 0.4574362 (97)	total: 30.8s	remaining: 9m 27s
103:	learn: 0.7032890	test: 0.4570425	best: 0.4574362 (97)	total: 31.1s	remaining: 9m 27s
104:	learn: 0.70320

184:	learn: 0.7404828	test: 0.4750062	best: 0.4771372 (173)	total: 55.8s	remaining: 9m 7s
185:	learn: 0.7408597	test: 0.4743446	best: 0.4771372 (173)	total: 56.1s	remaining: 9m 7s
186:	learn: 0.7416778	test: 0.4744036	best: 0.4771372 (173)	total: 56.4s	remaining: 9m 7s
187:	learn: 0.7421720	test: 0.4747236	best: 0.4771372 (173)	total: 56.8s	remaining: 9m 7s
188:	learn: 0.7422310	test: 0.4753508	best: 0.4771372 (173)	total: 57.1s	remaining: 9m 6s
189:	learn: 0.7429771	test: 0.4750777	best: 0.4771372 (173)	total: 57.4s	remaining: 9m 6s
190:	learn: 0.7432912	test: 0.4746985	best: 0.4771372 (173)	total: 57.7s	remaining: 9m 6s
191:	learn: 0.7433939	test: 0.4741551	best: 0.4771372 (173)	total: 58s	remaining: 9m 6s
192:	learn: 0.7436319	test: 0.4744625	best: 0.4771372 (173)	total: 58.3s	remaining: 9m 5s
193:	learn: 0.7439127	test: 0.4747111	best: 0.4771372 (173)	total: 58.6s	remaining: 9m 5s
194:	learn: 0.7435486	test: 0.4755280	best: 0.4771372 (173)	total: 58.9s	remaining: 9m 5s
195:	learn: 

275:	learn: 0.7623149	test: 0.4822888	best: 0.4839390 (244)	total: 1m 23s	remaining: 8m 40s
276:	learn: 0.7625127	test: 0.4822888	best: 0.4839390 (244)	total: 1m 23s	remaining: 8m 39s
277:	learn: 0.7627464	test: 0.4831948	best: 0.4839390 (244)	total: 1m 23s	remaining: 8m 39s
278:	learn: 0.7626312	test: 0.4841792	best: 0.4841792 (278)	total: 1m 24s	remaining: 8m 38s
279:	learn: 0.7629224	test: 0.4835628	best: 0.4841792 (278)	total: 1m 24s	remaining: 8m 38s
280:	learn: 0.7646904	test: 0.4834511	best: 0.4841792 (278)	total: 1m 24s	remaining: 8m 38s
281:	learn: 0.7645906	test: 0.4858981	best: 0.4858981 (281)	total: 1m 25s	remaining: 8m 37s
282:	learn: 0.7648123	test: 0.4833313	best: 0.4858981 (281)	total: 1m 25s	remaining: 8m 37s
283:	learn: 0.7646460	test: 0.4857178	best: 0.4858981 (281)	total: 1m 25s	remaining: 8m 37s
284:	learn: 0.7646704	test: 0.4860252	best: 0.4860252 (284)	total: 1m 25s	remaining: 8m 37s
285:	learn: 0.7649207	test: 0.4835274	best: 0.4860252 (284)	total: 1m 26s	remain

365:	learn: 0.7817978	test: 0.4939371	best: 0.4941206 (364)	total: 1m 50s	remaining: 8m 14s
366:	learn: 0.7819639	test: 0.4941846	best: 0.4941846 (366)	total: 1m 51s	remaining: 8m 14s
367:	learn: 0.7819019	test: 0.4936254	best: 0.4941846 (366)	total: 1m 51s	remaining: 8m 13s
368:	learn: 0.7819932	test: 0.4933746	best: 0.4941846 (366)	total: 1m 51s	remaining: 8m 13s
369:	learn: 0.7822375	test: 0.4938700	best: 0.4941846 (366)	total: 1m 51s	remaining: 8m 13s
370:	learn: 0.7825241	test: 0.4938088	best: 0.4941846 (366)	total: 1m 52s	remaining: 8m 12s
371:	learn: 0.7824003	test: 0.4935001	best: 0.4941846 (366)	total: 1m 52s	remaining: 8m 12s
372:	learn: 0.7823743	test: 0.4941817	best: 0.4941846 (366)	total: 1m 52s	remaining: 8m 12s
373:	learn: 0.7825989	test: 0.4938730	best: 0.4941846 (366)	total: 1m 53s	remaining: 8m 11s
374:	learn: 0.7826478	test: 0.4947511	best: 0.4947511 (374)	total: 1m 53s	remaining: 8m 11s
375:	learn: 0.7826803	test: 0.4941293	best: 0.4947511 (374)	total: 1m 53s	remain

455:	learn: 0.7919869	test: 0.4977107	best: 0.4977118 (422)	total: 2m 17s	remaining: 7m 47s
456:	learn: 0.7919138	test: 0.4975851	best: 0.4977118 (422)	total: 2m 18s	remaining: 7m 46s
457:	learn: 0.7922713	test: 0.4974619	best: 0.4977118 (422)	total: 2m 18s	remaining: 7m 46s
458:	learn: 0.7922262	test: 0.4976479	best: 0.4977118 (422)	total: 2m 18s	remaining: 7m 46s
459:	learn: 0.7923599	test: 0.4977712	best: 0.4977712 (459)	total: 2m 19s	remaining: 7m 45s
460:	learn: 0.7923740	test: 0.4980208	best: 0.4980208 (460)	total: 2m 19s	remaining: 7m 45s
461:	learn: 0.7925851	test: 0.4975851	best: 0.4980208 (460)	total: 2m 19s	remaining: 7m 45s
462:	learn: 0.7925837	test: 0.4979562	best: 0.4980208 (460)	total: 2m 20s	remaining: 7m 44s
463:	learn: 0.7923585	test: 0.4983282	best: 0.4983282 (463)	total: 2m 20s	remaining: 7m 44s
464:	learn: 0.7924472	test: 0.4983282	best: 0.4983282 (463)	total: 2m 20s	remaining: 7m 44s
465:	learn: 0.7927189	test: 0.4980198	best: 0.4983282 (463)	total: 2m 20s	remain

545:	learn: 0.8028216	test: 0.5018541	best: 0.5034014 (514)	total: 2m 44s	remaining: 7m 17s
546:	learn: 0.8032361	test: 0.5029037	best: 0.5034014 (514)	total: 2m 44s	remaining: 7m 17s
547:	learn: 0.8032659	test: 0.5021014	best: 0.5034014 (514)	total: 2m 45s	remaining: 7m 17s
548:	learn: 0.8033554	test: 0.5021634	best: 0.5034014 (514)	total: 2m 45s	remaining: 7m 16s
549:	learn: 0.8040082	test: 0.5023486	best: 0.5034014 (514)	total: 2m 45s	remaining: 7m 16s
550:	learn: 0.8044817	test: 0.5021634	best: 0.5034014 (514)	total: 2m 45s	remaining: 7m 16s
551:	learn: 0.8046446	test: 0.5029674	best: 0.5034014 (514)	total: 2m 46s	remaining: 7m 15s
552:	learn: 0.8047785	test: 0.5030918	best: 0.5034014 (514)	total: 2m 46s	remaining: 7m 15s
553:	learn: 0.8047649	test: 0.5023497	best: 0.5034014 (514)	total: 2m 46s	remaining: 7m 15s
554:	learn: 0.8050310	test: 0.5023497	best: 0.5034014 (514)	total: 2m 47s	remaining: 7m 14s
555:	learn: 0.8048817	test: 0.5033391	best: 0.5034014 (514)	total: 2m 47s	remain

635:	learn: 0.8151722	test: 0.5030887	best: 0.5068714 (609)	total: 3m 11s	remaining: 6m 51s
636:	learn: 0.8152622	test: 0.5029037	best: 0.5068714 (609)	total: 3m 12s	remaining: 6m 51s
637:	learn: 0.8152458	test: 0.5035242	best: 0.5068714 (609)	total: 3m 12s	remaining: 6m 51s
638:	learn: 0.8156219	test: 0.5035847	best: 0.5068714 (609)	total: 3m 12s	remaining: 6m 50s
639:	learn: 0.8155783	test: 0.5037696	best: 0.5068714 (609)	total: 3m 13s	remaining: 6m 50s
640:	learn: 0.8156519	test: 0.5038319	best: 0.5068714 (609)	total: 3m 13s	remaining: 6m 50s
641:	learn: 0.8156655	test: 0.5036451	best: 0.5068714 (609)	total: 3m 13s	remaining: 6m 49s
642:	learn: 0.8157875	test: 0.5039545	best: 0.5068714 (609)	total: 3m 14s	remaining: 6m 49s
643:	learn: 0.8157875	test: 0.5038942	best: 0.5068714 (609)	total: 3m 14s	remaining: 6m 49s
644:	learn: 0.8158011	test: 0.5040791	best: 0.5068714 (609)	total: 3m 14s	remaining: 6m 48s
645:	learn: 0.8159017	test: 0.5039545	best: 0.5068714 (609)	total: 3m 14s	remain

725:	learn: 0.8237149	test: 0.5073557	best: 0.5076029 (680)	total: 3m 38s	remaining: 6m 24s
726:	learn: 0.8236890	test: 0.5075402	best: 0.5076029 (680)	total: 3m 39s	remaining: 6m 23s
727:	learn: 0.8237149	test: 0.5074184	best: 0.5076029 (680)	total: 3m 39s	remaining: 6m 23s
728:	learn: 0.8237580	test: 0.5072966	best: 0.5076029 (680)	total: 3m 39s	remaining: 6m 23s
729:	learn: 0.8236674	test: 0.5070457	best: 0.5076029 (680)	total: 3m 40s	remaining: 6m 22s
730:	learn: 0.8235768	test: 0.5072930	best: 0.5076029 (680)	total: 3m 40s	remaining: 6m 22s
731:	learn: 0.8235035	test: 0.5071676	best: 0.5076029 (680)	total: 3m 40s	remaining: 6m 22s
732:	learn: 0.8236976	test: 0.5075992	best: 0.5076029 (680)	total: 3m 40s	remaining: 6m 21s
733:	learn: 0.8240598	test: 0.5073557	best: 0.5076029 (680)	total: 3m 41s	remaining: 6m 21s
734:	learn: 0.8240340	test: 0.5074774	best: 0.5076029 (680)	total: 3m 41s	remaining: 6m 21s
735:	learn: 0.8240038	test: 0.5072930	best: 0.5076029 (680)	total: 3m 41s	remain

815:	learn: 0.8332544	test: 0.5131628	best: 0.5131628 (814)	total: 4m 5s	remaining: 5m 55s
816:	learn: 0.8331815	test: 0.5130360	best: 0.5131628 (814)	total: 4m 5s	remaining: 5m 55s
817:	learn: 0.8333212	test: 0.5131628	best: 0.5131628 (814)	total: 4m 5s	remaining: 5m 54s
818:	learn: 0.8335762	test: 0.5130424	best: 0.5131628 (814)	total: 4m 5s	remaining: 5m 54s
819:	learn: 0.8335822	test: 0.5132262	best: 0.5132262 (819)	total: 4m 6s	remaining: 5m 54s
820:	learn: 0.8336551	test: 0.5131628	best: 0.5132262 (819)	total: 4m 6s	remaining: 5m 54s
821:	learn: 0.8336430	test: 0.5132831	best: 0.5132831 (821)	total: 4m 6s	remaining: 5m 53s
822:	learn: 0.8337887	test: 0.5127381	best: 0.5132831 (821)	total: 4m 7s	remaining: 5m 53s
823:	learn: 0.8339586	test: 0.5129790	best: 0.5132831 (821)	total: 4m 7s	remaining: 5m 53s
824:	learn: 0.8339768	test: 0.5124845	best: 0.5132831 (821)	total: 4m 7s	remaining: 5m 53s
825:	learn: 0.8339889	test: 0.5131058	best: 0.5132831 (821)	total: 4m 8s	remaining: 5m 52s

905:	learn: 0.8405302	test: 0.5101435	best: 0.5137274 (849)	total: 4m 32s	remaining: 5m 28s
906:	learn: 0.8405723	test: 0.5102016	best: 0.5137274 (849)	total: 4m 32s	remaining: 5m 28s
907:	learn: 0.8406376	test: 0.5098912	best: 0.5137274 (849)	total: 4m 32s	remaining: 5m 27s
908:	learn: 0.8407523	test: 0.5101965	best: 0.5137274 (849)	total: 4m 32s	remaining: 5m 27s
909:	learn: 0.8409017	test: 0.5100173	best: 0.5137274 (849)	total: 4m 33s	remaining: 5m 27s
910:	learn: 0.8409132	test: 0.5112485	best: 0.5137274 (849)	total: 4m 33s	remaining: 5m 26s
911:	learn: 0.8409363	test: 0.5111331	best: 0.5137274 (849)	total: 4m 33s	remaining: 5m 26s
912:	learn: 0.8410163	test: 0.5108225	best: 0.5137274 (849)	total: 4m 34s	remaining: 5m 26s
913:	learn: 0.8412687	test: 0.5107015	best: 0.5137274 (849)	total: 4m 34s	remaining: 5m 26s
914:	learn: 0.8413298	test: 0.5111331	best: 0.5137274 (849)	total: 4m 34s	remaining: 5m 25s
915:	learn: 0.8413833	test: 0.5110067	best: 0.5137274 (849)	total: 4m 34s	remain

995:	learn: 0.8469934	test: 0.5141515	best: 0.5162802 (981)	total: 4m 58s	remaining: 5m 1s
996:	learn: 0.8471710	test: 0.5137342	best: 0.5162802 (981)	total: 4m 59s	remaining: 5m
997:	learn: 0.8473793	test: 0.5150204	best: 0.5162802 (981)	total: 4m 59s	remaining: 5m
998:	learn: 0.8472127	test: 0.5129284	best: 0.5162802 (981)	total: 4m 59s	remaining: 5m
999:	learn: 0.8470461	test: 0.5125541	best: 0.5162802 (981)	total: 5m	remaining: 5m
1000:	learn: 0.8471906	test: 0.5120653	best: 0.5162802 (981)	total: 5m	remaining: 4m 59s
1001:	learn: 0.8473878	test: 0.5129348	best: 0.5162802 (981)	total: 5m	remaining: 4m 59s
1002:	learn: 0.8476682	test: 0.5127381	best: 0.5162802 (981)	total: 5m 1s	remaining: 4m 59s
1003:	learn: 0.8477209	test: 0.5128015	best: 0.5162802 (981)	total: 5m 1s	remaining: 4m 58s
1004:	learn: 0.8479156	test: 0.5133465	best: 0.5162802 (981)	total: 5m 1s	remaining: 4m 58s
1005:	learn: 0.8477625	test: 0.5137773	best: 0.5162802 (981)	total: 5m 1s	remaining: 4m 58s
1006:	learn: 0.

1084:	learn: 0.8538913	test: 0.5167016	best: 0.5173859 (1041)	total: 5m 26s	remaining: 4m 35s
1085:	learn: 0.8542431	test: 0.5160256	best: 0.5173859 (1041)	total: 5m 26s	remaining: 4m 35s
1086:	learn: 0.8543661	test: 0.5163995	best: 0.5173859 (1041)	total: 5m 27s	remaining: 4m 34s
1087:	learn: 0.8544074	test: 0.5164632	best: 0.5173859 (1041)	total: 5m 27s	remaining: 4m 34s
1088:	learn: 0.8546030	test: 0.5164551	best: 0.5173859 (1041)	total: 5m 27s	remaining: 4m 34s
1089:	learn: 0.8546030	test: 0.5160813	best: 0.5173859 (1041)	total: 5m 28s	remaining: 4m 33s
1090:	learn: 0.8547882	test: 0.5157869	best: 0.5173859 (1041)	total: 5m 28s	remaining: 4m 33s
1091:	learn: 0.8548497	test: 0.5158583	best: 0.5173859 (1041)	total: 5m 28s	remaining: 4m 33s
1092:	learn: 0.8544381	test: 0.5154919	best: 0.5173859 (1041)	total: 5m 29s	remaining: 4m 33s
1093:	learn: 0.8545723	test: 0.5156751	best: 0.5173859 (1041)	total: 5m 29s	remaining: 4m 32s
1094:	learn: 0.8545311	test: 0.5150469	best: 0.5173859 (1041

1172:	learn: 0.8607822	test: 0.5189562	best: 0.5203893 (1163)	total: 5m 53s	remaining: 4m 9s
1173:	learn: 0.8607922	test: 0.5186553	best: 0.5203893 (1163)	total: 5m 54s	remaining: 4m 9s
1174:	learn: 0.8606387	test: 0.5182904	best: 0.5203893 (1163)	total: 5m 54s	remaining: 4m 9s
1175:	learn: 0.8605878	test: 0.5189562	best: 0.5203893 (1163)	total: 5m 55s	remaining: 4m 8s
1176:	learn: 0.8607304	test: 0.5187739	best: 0.5203893 (1163)	total: 5m 55s	remaining: 4m 8s
1177:	learn: 0.8607613	test: 0.5190746	best: 0.5203893 (1163)	total: 5m 55s	remaining: 4m 8s
1178:	learn: 0.8607822	test: 0.5192568	best: 0.5203893 (1163)	total: 5m 56s	remaining: 4m 7s
1179:	learn: 0.8606796	test: 0.5193207	best: 0.5203893 (1163)	total: 5m 56s	remaining: 4m 7s
1180:	learn: 0.8606078	test: 0.5193112	best: 0.5203893 (1163)	total: 5m 56s	remaining: 4m 7s
1181:	learn: 0.8606478	test: 0.5190652	best: 0.5203893 (1163)	total: 5m 56s	remaining: 4m 7s
1182:	learn: 0.8606787	test: 0.5191835	best: 0.5203893 (1163)	total: 5

1261:	learn: 0.8653564	test: 0.5174068	best: 0.5203893 (1163)	total: 6m 22s	remaining: 3m 43s
1262:	learn: 0.8655808	test: 0.5171692	best: 0.5203893 (1163)	total: 6m 22s	remaining: 3m 43s
1263:	learn: 0.8655691	test: 0.5168595	best: 0.5203893 (1163)	total: 6m 23s	remaining: 3m 43s
1264:	learn: 0.8656097	test: 0.5167959	best: 0.5203893 (1163)	total: 6m 23s	remaining: 3m 42s
1265:	learn: 0.8658724	test: 0.5169231	best: 0.5203893 (1163)	total: 6m 23s	remaining: 3m 42s
1266:	learn: 0.8658532	test: 0.5173432	best: 0.5203893 (1163)	total: 6m 24s	remaining: 3m 42s
1267:	learn: 0.8659845	test: 0.5173432	best: 0.5203893 (1163)	total: 6m 24s	remaining: 3m 41s
1268:	learn: 0.8659750	test: 0.5172795	best: 0.5203893 (1163)	total: 6m 24s	remaining: 3m 41s
1269:	learn: 0.8660465	test: 0.5175255	best: 0.5203893 (1163)	total: 6m 25s	remaining: 3m 41s
1270:	learn: 0.8660059	test: 0.5179538	best: 0.5203893 (1163)	total: 6m 25s	remaining: 3m 41s
1271:	learn: 0.8658201	test: 0.5174705	best: 0.5203893 (1163

1349:	learn: 0.8712094	test: 0.5221759	best: 0.5224430 (1330)	total: 6m 48s	remaining: 3m 16s
1350:	learn: 0.8709367	test: 0.5221228	best: 0.5224430 (1330)	total: 6m 49s	remaining: 3m 16s
1351:	learn: 0.8711105	test: 0.5214513	best: 0.5224430 (1330)	total: 6m 49s	remaining: 3m 16s
1352:	learn: 0.8709988	test: 0.5204908	best: 0.5224430 (1330)	total: 6m 49s	remaining: 3m 15s
1353:	learn: 0.8709275	test: 0.5199460	best: 0.5224430 (1330)	total: 6m 49s	remaining: 3m 15s
1354:	learn: 0.8711381	test: 0.5209713	best: 0.5224430 (1330)	total: 6m 50s	remaining: 3m 15s
1355:	learn: 0.8711197	test: 0.5210991	best: 0.5224430 (1330)	total: 6m 50s	remaining: 3m 14s
1356:	learn: 0.8713119	test: 0.5212166	best: 0.5224430 (1330)	total: 6m 50s	remaining: 3m 14s
1357:	learn: 0.8712716	test: 0.5207899	best: 0.5224430 (1330)	total: 6m 51s	remaining: 3m 14s
1358:	learn: 0.8712313	test: 0.5201816	best: 0.5224430 (1330)	total: 6m 51s	remaining: 3m 14s
1359:	learn: 0.8713119	test: 0.5203631	best: 0.5224430 (1330

1438:	learn: 0.8760001	test: 0.5228919	best: 0.5239671 (1420)	total: 7m 14s	remaining: 2m 49s
1439:	learn: 0.8765507	test: 0.5232016	best: 0.5239671 (1420)	total: 7m 15s	remaining: 2m 49s
1440:	learn: 0.8766305	test: 0.5230731	best: 0.5239671 (1420)	total: 7m 15s	remaining: 2m 48s
1441:	learn: 0.8765195	test: 0.5235641	best: 0.5239671 (1420)	total: 7m 15s	remaining: 2m 48s
1442:	learn: 0.8764309	test: 0.5237569	best: 0.5239671 (1420)	total: 7m 16s	remaining: 2m 48s
1443:	learn: 0.8763909	test: 0.5238329	best: 0.5239671 (1420)	total: 7m 16s	remaining: 2m 48s
1444:	learn: 0.8765419	test: 0.5238329	best: 0.5239671 (1420)	total: 7m 16s	remaining: 2m 47s
1445:	learn: 0.8767727	test: 0.5235114	best: 0.5239671 (1420)	total: 7m 17s	remaining: 2m 47s
1446:	learn: 0.8764708	test: 0.5238212	best: 0.5239671 (1420)	total: 7m 17s	remaining: 2m 47s
1447:	learn: 0.8766042	test: 0.5235756	best: 0.5239671 (1420)	total: 7m 17s	remaining: 2m 46s
1448:	learn: 0.8766305	test: 0.5240025	best: 0.5240025 (1448

1526:	learn: 0.8812250	test: 0.5283945	best: 0.5284961 (1510)	total: 7m 41s	remaining: 2m 23s
1527:	learn: 0.8813812	test: 0.5283945	best: 0.5284961 (1510)	total: 7m 42s	remaining: 2m 22s
1528:	learn: 0.8813643	test: 0.5295271	best: 0.5295271 (1528)	total: 7m 42s	remaining: 2m 22s
1529:	learn: 0.8815146	test: 0.5294622	best: 0.5295271 (1528)	total: 7m 42s	remaining: 2m 22s
1530:	learn: 0.8813499	test: 0.5294622	best: 0.5295271 (1528)	total: 7m 43s	remaining: 2m 21s
1531:	learn: 0.8812874	test: 0.5287272	best: 0.5295271 (1528)	total: 7m 43s	remaining: 2m 21s
1532:	learn: 0.8812790	test: 0.5284822	best: 0.5295271 (1528)	total: 7m 43s	remaining: 2m 21s
1533:	learn: 0.8814773	test: 0.5286765	best: 0.5295271 (1528)	total: 7m 44s	remaining: 2m 20s
1534:	learn: 0.8813439	test: 0.5289864	best: 0.5295271 (1528)	total: 7m 44s	remaining: 2m 20s
1535:	learn: 0.8815193	test: 0.5278391	best: 0.5295271 (1528)	total: 7m 44s	remaining: 2m 20s
1536:	learn: 0.8815817	test: 0.5281491	best: 0.5295271 (1528

1615:	learn: 0.8855113	test: 0.5288202	best: 0.5302326 (1547)	total: 8m 9s	remaining: 1m 56s
1616:	learn: 0.8853699	test: 0.5288851	best: 0.5302326 (1547)	total: 8m 9s	remaining: 1m 56s
1617:	learn: 0.8854244	test: 0.5288061	best: 0.5302326 (1547)	total: 8m 10s	remaining: 1m 55s
1618:	learn: 0.8854082	test: 0.5288568	best: 0.5302326 (1547)	total: 8m 10s	remaining: 1m 55s
1619:	learn: 0.8854476	test: 0.5286257	best: 0.5302326 (1547)	total: 8m 10s	remaining: 1m 55s
1620:	learn: 0.8857708	test: 0.5281216	best: 0.5302326 (1547)	total: 8m 11s	remaining: 1m 54s
1621:	learn: 0.8858728	test: 0.5279412	best: 0.5302326 (1547)	total: 8m 11s	remaining: 1m 54s
1622:	learn: 0.8861332	test: 0.5281216	best: 0.5302326 (1547)	total: 8m 11s	remaining: 1m 54s
1623:	learn: 0.8860938	test: 0.5282001	best: 0.5302326 (1547)	total: 8m 12s	remaining: 1m 53s
1624:	learn: 0.8861645	test: 0.5279549	best: 0.5302326 (1547)	total: 8m 12s	remaining: 1m 53s
1625:	learn: 0.8861725	test: 0.5279549	best: 0.5302326 (1547)	

1703:	learn: 0.8902753	test: 0.5315062	best: 0.5317354 (1694)	total: 8m 37s	remaining: 1m 29s
1704:	learn: 0.8904394	test: 0.5315558	best: 0.5317354 (1694)	total: 8m 37s	remaining: 1m 29s
1705:	learn: 0.8905336	test: 0.5311315	best: 0.5317354 (1694)	total: 8m 38s	remaining: 1m 29s
1706:	learn: 0.8907450	test: 0.5310016	best: 0.5317354 (1694)	total: 8m 38s	remaining: 1m 29s
1707:	learn: 0.8907368	test: 0.5303123	best: 0.5317354 (1694)	total: 8m 38s	remaining: 1m 28s
1708:	learn: 0.8907681	test: 0.5302474	best: 0.5317354 (1694)	total: 8m 39s	remaining: 1m 28s
1709:	learn: 0.8906977	test: 0.5310320	best: 0.5317354 (1694)	total: 8m 39s	remaining: 1m 28s
1710:	learn: 0.8908469	test: 0.5303827	best: 0.5317354 (1694)	total: 8m 39s	remaining: 1m 27s
1711:	learn: 0.8909174	test: 0.5300086	best: 0.5317354 (1694)	total: 8m 40s	remaining: 1m 27s
1712:	learn: 0.8909879	test: 0.5294693	best: 0.5317354 (1694)	total: 8m 40s	remaining: 1m 27s
1713:	learn: 0.8909879	test: 0.5309713	best: 0.5317354 (1694

1791:	learn: 0.8938776	test: 0.5322423	best: 0.5325357 (1789)	total: 9m 4s	remaining: 1m 3s
1792:	learn: 0.8939479	test: 0.5317043	best: 0.5325357 (1789)	total: 9m 5s	remaining: 1m 2s
1793:	learn: 0.8939628	test: 0.5319331	best: 0.5325357 (1789)	total: 9m 5s	remaining: 1m 2s
1794:	learn: 0.8939645	test: 0.5321280	best: 0.5325357 (1789)	total: 9m 5s	remaining: 1m 2s
1795:	learn: 0.8939885	test: 0.5321930	best: 0.5325357 (1789)	total: 9m 6s	remaining: 1m 2s
1796:	learn: 0.8940812	test: 0.5323723	best: 0.5325357 (1789)	total: 9m 6s	remaining: 1m 1s
1797:	learn: 0.8940978	test: 0.5328449	best: 0.5328449 (1797)	total: 9m 6s	remaining: 1m 1s
1798:	learn: 0.8942220	test: 0.5326167	best: 0.5328449 (1797)	total: 9m 7s	remaining: 1m 1s
1799:	learn: 0.8941665	test: 0.5327959	best: 0.5328449 (1797)	total: 9m 7s	remaining: 1m
1800:	learn: 0.8941590	test: 0.5328610	best: 0.5328610 (1800)	total: 9m 7s	remaining: 1m
1801:	learn: 0.8941276	test: 0.5328610	best: 0.5328610 (1800)	total: 9m 8s	remaining: 

1880:	learn: 0.8981771	test: 0.5335611	best: 0.5341312 (1874)	total: 9m 32s	remaining: 36.2s
1881:	learn: 0.8985110	test: 0.5337401	best: 0.5341312 (1874)	total: 9m 32s	remaining: 35.9s
1882:	learn: 0.8985670	test: 0.5326658	best: 0.5341312 (1874)	total: 9m 32s	remaining: 35.6s
1883:	learn: 0.8986230	test: 0.5312805	best: 0.5341312 (1874)	total: 9m 33s	remaining: 35.3s
1884:	learn: 0.8986546	test: 0.5318837	best: 0.5341312 (1874)	total: 9m 33s	remaining: 35s
1885:	learn: 0.8987320	test: 0.5320630	best: 0.5341312 (1874)	total: 9m 33s	remaining: 34.7s
1886:	learn: 0.8986759	test: 0.5313455	best: 0.5341312 (1874)	total: 9m 34s	remaining: 34.4s
1887:	learn: 0.8987146	test: 0.5312805	best: 0.5341312 (1874)	total: 9m 34s	remaining: 34.1s
1888:	learn: 0.8986444	test: 0.5313951	best: 0.5341312 (1874)	total: 9m 34s	remaining: 33.8s
1889:	learn: 0.8987706	test: 0.5313302	best: 0.5341312 (1874)	total: 9m 34s	remaining: 33.5s
1890:	learn: 0.8987849	test: 0.5315744	best: 0.5341312 (1874)	total: 9m 

1969:	learn: 0.9016365	test: 0.5317848	best: 0.5341312 (1874)	total: 9m 59s	remaining: 9.13s
1970:	learn: 0.9017835	test: 0.5311812	best: 0.5341312 (1874)	total: 9m 59s	remaining: 8.82s
1971:	learn: 0.9018536	test: 0.5313608	best: 0.5341312 (1874)	total: 9m 59s	remaining: 8.52s
1972:	learn: 0.9018151	test: 0.5314104	best: 0.5341312 (1874)	total: 10m	remaining: 8.21s
1973:	learn: 0.9018783	test: 0.5312309	best: 0.5341312 (1874)	total: 10m	remaining: 7.91s
1974:	learn: 0.9019306	test: 0.5312309	best: 0.5341312 (1874)	total: 10m	remaining: 7.6s
1975:	learn: 0.9019938	test: 0.5306771	best: 0.5341312 (1874)	total: 10m 1s	remaining: 7.3s
1976:	learn: 0.9019938	test: 0.5306771	best: 0.5341312 (1874)	total: 10m 1s	remaining: 7s
1977:	learn: 0.9021655	test: 0.5306621	best: 0.5341312 (1874)	total: 10m 1s	remaining: 6.69s
1978:	learn: 0.9022425	test: 0.5307767	best: 0.5341312 (1874)	total: 10m 1s	remaining: 6.39s
1979:	learn: 0.9023578	test: 0.5312653	best: 0.5341312 (1874)	total: 10m 2s	remainin

In [9]:
# Feature importances of the validation model, computed on the split-1 training data.
train_pool = Pool(X_train1, y_train1, cat_features=categorical_features_indices)
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train1.columns

# Rank (score, name) pairs from highest to lowest and print one "name: score" per line.
ranked_features = list(zip(feature_importances, feature_names))
ranked_features.sort(reverse=True)
for score, name in ranked_features:
    print(name, score, sep=': ')

bacno_txkey_nunique: 9.341650868031778
stocn_bacno_nunique: 9.046105986848705
bacno_cano_nunique: 6.591355550958711
cano_conam_skew: 4.745902852103425
bacno_ratio_ecfg: 4.687138207053269
etymd: 4.0317346706534645
bacno_max_conam: 3.8854150498706823
cano_conam_kurt: 3.5427145189734697
cano_conam_var: 3.4518859312980945
conam: 2.7161383081703927
cano_lastlocdt2: 2.496235208211179
ecfg: 2.328453062932891
mchno_bacno_nunique: 2.3153844211332055
stocn_cano_nunique: 2.1747619067405384
cano_mean_conam: 2.135879708967521
cano_ratio_ecfg: 2.1017181258038398
mcc_cano_nunique: 2.050259590125583
bacno_mean_conam: 2.0235068103793044
mchno_cano_nunique: 1.9921302962517144
mcc_bacno_nunique: 1.9429015914110623
contp: 1.9033560877492783
cano_scale_conam: 1.7802032774777454
bacno_min_conam: 1.7778520368911992
flg_3dsmk: 1.6087974513599572
cano_scity_mode: 1.4906743774485973
bacno_scale_conam: 1.298078090689005
bacno_scity_mode: 1.275864201520908
loctm_hr: 1.2637731853552725
scity_bacno_nunique: 1.14408

In [10]:
## In theory, should th = 0.5 always be the best threshold?

th = 0.5

# Second column of predict_proba, i.e. the score for label 1.
y_test1_pred = model.predict_proba(X_test1, verbose=True)[:, 1]

# Binarise in place: scores above the threshold become 1, the rest 0.
is_positive = y_test1_pred > th
y_test1_pred[is_positive] = 1
is_negative = y_test1_pred <= th
y_test1_pred[is_negative] = 0

validation_f1 = f1_score(y_test1, y_test1_pred)
print(validation_f1)

0.5341311515447552


In [12]:
# Refit with the same parameters on every training row (locdt <= 90).
# No eval_set is given here, so the log only reports the 'learn' metric.
final_fit_options = dict(
    cat_features=categorical_features_indices,
    silent=False,
)
model = CatBoostClassifier(**param_cat)

model.fit(X_train_all, y_train_all, **final_fit_options)


0:	learn: 0.4876262	total: 408ms	remaining: 13m 36s
1:	learn: 0.4125456	total: 793ms	remaining: 13m 12s
2:	learn: 0.4349388	total: 1.22s	remaining: 13m 33s
3:	learn: 0.4641631	total: 1.61s	remaining: 13m 24s
4:	learn: 0.4773822	total: 2.01s	remaining: 13m 20s
5:	learn: 0.4968102	total: 2.42s	remaining: 13m 23s
6:	learn: 0.5086227	total: 2.81s	remaining: 13m 21s
7:	learn: 0.5171368	total: 3.2s	remaining: 13m 17s
8:	learn: 0.5095507	total: 3.57s	remaining: 13m 10s
9:	learn: 0.5218861	total: 3.96s	remaining: 13m 7s
10:	learn: 0.5292633	total: 4.34s	remaining: 13m 5s
11:	learn: 0.5226516	total: 4.71s	remaining: 13m
12:	learn: 0.5248246	total: 5.1s	remaining: 12m 59s
13:	learn: 0.5303172	total: 5.45s	remaining: 12m 53s
14:	learn: 0.5324615	total: 5.85s	remaining: 12m 54s
15:	learn: 0.5383366	total: 6.31s	remaining: 13m 2s
16:	learn: 0.5340272	total: 6.74s	remaining: 13m 6s
17:	learn: 0.5386787	total: 7.18s	remaining: 13m 10s
18:	learn: 0.5393776	total: 7.59s	remaining: 13m 11s
19:	learn: 0.

155:	learn: 0.6694285	total: 1m 2s	remaining: 12m 21s
156:	learn: 0.6698912	total: 1m 3s	remaining: 12m 20s
157:	learn: 0.6702321	total: 1m 3s	remaining: 12m 20s
158:	learn: 0.6710574	total: 1m 3s	remaining: 12m 20s
159:	learn: 0.6714191	total: 1m 4s	remaining: 12m 20s
160:	learn: 0.6710777	total: 1m 4s	remaining: 12m 19s
161:	learn: 0.6720589	total: 1m 5s	remaining: 12m 19s
162:	learn: 0.6721994	total: 1m 5s	remaining: 12m 18s
163:	learn: 0.6723024	total: 1m 5s	remaining: 12m 18s
164:	learn: 0.6725023	total: 1m 6s	remaining: 12m 17s
165:	learn: 0.6727618	total: 1m 6s	remaining: 12m 17s
166:	learn: 0.6734023	total: 1m 7s	remaining: 12m 16s
167:	learn: 0.6739058	total: 1m 7s	remaining: 12m 16s
168:	learn: 0.6740075	total: 1m 7s	remaining: 12m 15s
169:	learn: 0.6746087	total: 1m 8s	remaining: 12m 14s
170:	learn: 0.6752683	total: 1m 8s	remaining: 12m 14s
171:	learn: 0.6762014	total: 1m 9s	remaining: 12m 13s
172:	learn: 0.6765884	total: 1m 9s	remaining: 12m 13s
173:	learn: 0.6770018	total:

305:	learn: 0.7121431	total: 2m 2s	remaining: 11m 18s
306:	learn: 0.7123118	total: 2m 2s	remaining: 11m 18s
307:	learn: 0.7127121	total: 2m 3s	remaining: 11m 17s
308:	learn: 0.7125983	total: 2m 3s	remaining: 11m 17s
309:	learn: 0.7130286	total: 2m 4s	remaining: 11m 16s
310:	learn: 0.7131760	total: 2m 4s	remaining: 11m 16s
311:	learn: 0.7135838	total: 2m 4s	remaining: 11m 15s
312:	learn: 0.7137857	total: 2m 5s	remaining: 11m 15s
313:	learn: 0.7139580	total: 2m 5s	remaining: 11m 14s
314:	learn: 0.7146050	total: 2m 6s	remaining: 11m 14s
315:	learn: 0.7147770	total: 2m 6s	remaining: 11m 14s
316:	learn: 0.7148025	total: 2m 6s	remaining: 11m 13s
317:	learn: 0.7149327	total: 2m 7s	remaining: 11m 13s
318:	learn: 0.7149662	total: 2m 7s	remaining: 11m 12s
319:	learn: 0.7155920	total: 2m 8s	remaining: 11m 12s
320:	learn: 0.7152478	total: 2m 8s	remaining: 11m 12s
321:	learn: 0.7156131	total: 2m 8s	remaining: 11m 11s
322:	learn: 0.7157134	total: 2m 9s	remaining: 11m 11s
323:	learn: 0.7156632	total:

456:	learn: 0.7403501	total: 3m 3s	remaining: 10m 19s
457:	learn: 0.7404939	total: 3m 3s	remaining: 10m 19s
458:	learn: 0.7402899	total: 3m 4s	remaining: 10m 18s
459:	learn: 0.7405969	total: 3m 4s	remaining: 10m 18s
460:	learn: 0.7405261	total: 3m 5s	remaining: 10m 17s
461:	learn: 0.7409682	total: 3m 5s	remaining: 10m 17s
462:	learn: 0.7413264	total: 3m 5s	remaining: 10m 17s
463:	learn: 0.7415001	total: 3m 6s	remaining: 10m 16s
464:	learn: 0.7414572	total: 3m 6s	remaining: 10m 16s
465:	learn: 0.7414188	total: 3m 7s	remaining: 10m 15s
466:	learn: 0.7417019	total: 3m 7s	remaining: 10m 15s
467:	learn: 0.7423027	total: 3m 7s	remaining: 10m 15s
468:	learn: 0.7428505	total: 3m 8s	remaining: 10m 14s
469:	learn: 0.7430341	total: 3m 8s	remaining: 10m 14s
470:	learn: 0.7432342	total: 3m 9s	remaining: 10m 13s
471:	learn: 0.7434391	total: 3m 9s	remaining: 10m 13s
472:	learn: 0.7433367	total: 3m 9s	remaining: 10m 13s
473:	learn: 0.7433152	total: 3m 10s	remaining: 10m 12s
474:	learn: 0.7432143	total

609:	learn: 0.7616216	total: 4m 4s	remaining: 9m 18s
610:	learn: 0.7612401	total: 4m 5s	remaining: 9m 17s
611:	learn: 0.7614390	total: 4m 5s	remaining: 9m 17s
612:	learn: 0.7617795	total: 4m 6s	remaining: 9m 16s
613:	learn: 0.7619483	total: 4m 6s	remaining: 9m 16s
614:	learn: 0.7622642	total: 4m 6s	remaining: 9m 16s
615:	learn: 0.7623321	total: 4m 7s	remaining: 9m 15s
616:	learn: 0.7622613	total: 4m 7s	remaining: 9m 15s
617:	learn: 0.7624954	total: 4m 8s	remaining: 9m 14s
618:	learn: 0.7627375	total: 4m 8s	remaining: 9m 14s
619:	learn: 0.7629955	total: 4m 8s	remaining: 9m 14s
620:	learn: 0.7629467	total: 4m 9s	remaining: 9m 13s
621:	learn: 0.7631534	total: 4m 9s	remaining: 9m 13s
622:	learn: 0.7631534	total: 4m 10s	remaining: 9m 12s
623:	learn: 0.7630609	total: 4m 10s	remaining: 9m 12s
624:	learn: 0.7632864	total: 4m 10s	remaining: 9m 11s
625:	learn: 0.7631940	total: 4m 11s	remaining: 9m 11s
626:	learn: 0.7630061	total: 4m 11s	remaining: 9m 11s
627:	learn: 0.7634630	total: 4m 12s	remai

763:	learn: 0.7794213	total: 5m 7s	remaining: 8m 17s
764:	learn: 0.7794338	total: 5m 7s	remaining: 8m 17s
765:	learn: 0.7794962	total: 5m 8s	remaining: 8m 16s
766:	learn: 0.7793926	total: 5m 8s	remaining: 8m 16s
767:	learn: 0.7797971	total: 5m 9s	remaining: 8m 15s
768:	learn: 0.7797818	total: 5m 9s	remaining: 8m 15s
769:	learn: 0.7799322	total: 5m 9s	remaining: 8m 15s
770:	learn: 0.7801266	total: 5m 10s	remaining: 8m 14s
771:	learn: 0.7801707	total: 5m 10s	remaining: 8m 14s
772:	learn: 0.7803279	total: 5m 11s	remaining: 8m 13s
773:	learn: 0.7803844	total: 5m 11s	remaining: 8m 13s
774:	learn: 0.7803816	total: 5m 11s	remaining: 8m 13s
775:	learn: 0.7806601	total: 5m 12s	remaining: 8m 12s
776:	learn: 0.7808076	total: 5m 12s	remaining: 8m 12s
777:	learn: 0.7810143	total: 5m 13s	remaining: 8m 11s
778:	learn: 0.7817247	total: 5m 13s	remaining: 8m 11s
779:	learn: 0.7817812	total: 5m 13s	remaining: 8m 11s
780:	learn: 0.7817345	total: 5m 14s	remaining: 8m 10s
781:	learn: 0.7819188	total: 5m 14s

917:	learn: 0.7933393	total: 6m 10s	remaining: 7m 16s
918:	learn: 0.7931373	total: 6m 10s	remaining: 7m 16s
919:	learn: 0.7931479	total: 6m 11s	remaining: 7m 15s
920:	learn: 0.7931595	total: 6m 11s	remaining: 7m 15s
921:	learn: 0.7931266	total: 6m 11s	remaining: 7m 14s
922:	learn: 0.7935200	total: 6m 12s	remaining: 7m 14s
923:	learn: 0.7934185	total: 6m 12s	remaining: 7m 13s
924:	learn: 0.7935983	total: 6m 13s	remaining: 7m 13s
925:	learn: 0.7936446	total: 6m 13s	remaining: 7m 13s
926:	learn: 0.7936437	total: 6m 13s	remaining: 7m 12s
927:	learn: 0.7934755	total: 6m 14s	remaining: 7m 12s
928:	learn: 0.7935431	total: 6m 14s	remaining: 7m 11s
929:	learn: 0.7935325	total: 6m 15s	remaining: 7m 11s
930:	learn: 0.7940385	total: 6m 15s	remaining: 7m 11s
931:	learn: 0.7939603	total: 6m 15s	remaining: 7m 10s
932:	learn: 0.7939710	total: 6m 16s	remaining: 7m 10s
933:	learn: 0.7941605	total: 6m 16s	remaining: 7m 9s
934:	learn: 0.7944530	total: 6m 17s	remaining: 7m 9s
935:	learn: 0.7944753	total: 6

1069:	learn: 0.8037107	total: 7m 11s	remaining: 6m 15s
1070:	learn: 0.8036993	total: 7m 12s	remaining: 6m 14s
1071:	learn: 0.8038669	total: 7m 12s	remaining: 6m 14s
1072:	learn: 0.8039887	total: 7m 12s	remaining: 6m 13s
1073:	learn: 0.8040783	total: 7m 13s	remaining: 6m 13s
1074:	learn: 0.8038571	total: 7m 13s	remaining: 6m 13s
1075:	learn: 0.8038904	total: 7m 13s	remaining: 6m 12s
1076:	learn: 0.8038680	total: 7m 14s	remaining: 6m 12s
1077:	learn: 0.8041237	total: 7m 14s	remaining: 6m 11s
1078:	learn: 0.8042231	total: 7m 15s	remaining: 6m 11s
1079:	learn: 0.8045785	total: 7m 15s	remaining: 6m 11s
1080:	learn: 0.8045228	total: 7m 15s	remaining: 6m 10s
1081:	learn: 0.8046457	total: 7m 16s	remaining: 6m 10s
1082:	learn: 0.8050111	total: 7m 16s	remaining: 6m 9s
1083:	learn: 0.8051565	total: 7m 17s	remaining: 6m 9s
1084:	learn: 0.8050574	total: 7m 17s	remaining: 6m 8s
1085:	learn: 0.8054364	total: 7m 17s	remaining: 6m 8s
1086:	learn: 0.8056143	total: 7m 18s	remaining: 6m 8s
1087:	learn: 0.

1220:	learn: 0.8168999	total: 8m 12s	remaining: 5m 14s
1221:	learn: 0.8169022	total: 8m 12s	remaining: 5m 13s
1222:	learn: 0.8171335	total: 8m 12s	remaining: 5m 13s
1223:	learn: 0.8170658	total: 8m 13s	remaining: 5m 12s
1224:	learn: 0.8169123	total: 8m 13s	remaining: 5m 12s
1225:	learn: 0.8170003	total: 8m 14s	remaining: 5m 11s
1226:	learn: 0.8170330	total: 8m 14s	remaining: 5m 11s
1227:	learn: 0.8169575	total: 8m 14s	remaining: 5m 11s
1228:	learn: 0.8170985	total: 8m 15s	remaining: 5m 10s
1229:	learn: 0.8173726	total: 8m 15s	remaining: 5m 10s
1230:	learn: 0.8173726	total: 8m 16s	remaining: 5m 9s
1231:	learn: 0.8176365	total: 8m 16s	remaining: 5m 9s
1232:	learn: 0.8177471	total: 8m 16s	remaining: 5m 9s
1233:	learn: 0.8177697	total: 8m 17s	remaining: 5m 8s
1234:	learn: 0.8176841	total: 8m 17s	remaining: 5m 8s
1235:	learn: 0.8178048	total: 8m 18s	remaining: 5m 7s
1236:	learn: 0.8178702	total: 8m 18s	remaining: 5m 7s
1237:	learn: 0.8180360	total: 8m 19s	remaining: 5m 7s
1238:	learn: 0.818

1371:	learn: 0.8262508	total: 9m 12s	remaining: 4m 13s
1372:	learn: 0.8263799	total: 9m 13s	remaining: 4m 12s
1373:	learn: 0.8261444	total: 9m 13s	remaining: 4m 12s
1374:	learn: 0.8262544	total: 9m 14s	remaining: 4m 11s
1375:	learn: 0.8263381	total: 9m 14s	remaining: 4m 11s
1376:	learn: 0.8264995	total: 9m 14s	remaining: 4m 11s
1377:	learn: 0.8266036	total: 9m 15s	remaining: 4m 10s
1378:	learn: 0.8267745	total: 9m 15s	remaining: 4m 10s
1379:	learn: 0.8268390	total: 9m 16s	remaining: 4m 9s
1380:	learn: 0.8269585	total: 9m 16s	remaining: 4m 9s
1381:	learn: 0.8269490	total: 9m 16s	remaining: 4m 9s
1382:	learn: 0.8271880	total: 9m 17s	remaining: 4m 8s
1383:	learn: 0.8270135	total: 9m 17s	remaining: 4m 8s
1384:	learn: 0.8269167	total: 9m 18s	remaining: 4m 7s
1385:	learn: 0.8269490	total: 9m 18s	remaining: 4m 7s
1386:	learn: 0.8272905	total: 9m 18s	remaining: 4m 7s
1387:	learn: 0.8272487	total: 9m 19s	remaining: 4m 6s
1388:	learn: 0.8273132	total: 9m 19s	remaining: 4m 6s
1389:	learn: 0.82746

1521:	learn: 0.8341040	total: 10m 13s	remaining: 3m 12s
1522:	learn: 0.8343092	total: 10m 14s	remaining: 3m 12s
1523:	learn: 0.8343411	total: 10m 14s	remaining: 3m 11s
1524:	learn: 0.8342363	total: 10m 14s	remaining: 3m 11s
1525:	learn: 0.8345233	total: 10m 15s	remaining: 3m 11s
1526:	learn: 0.8343458	total: 10m 15s	remaining: 3m 10s
1527:	learn: 0.8345052	total: 10m 16s	remaining: 3m 10s
1528:	learn: 0.8346646	total: 10m 16s	remaining: 3m 9s
1529:	learn: 0.8346556	total: 10m 17s	remaining: 3m 9s
1530:	learn: 0.8348059	total: 10m 17s	remaining: 3m 9s
1531:	learn: 0.8348378	total: 10m 17s	remaining: 3m 8s
1532:	learn: 0.8347422	total: 10m 18s	remaining: 3m 8s
1533:	learn: 0.8351065	total: 10m 18s	remaining: 3m 7s
1534:	learn: 0.8351384	total: 10m 19s	remaining: 3m 7s
1535:	learn: 0.8350651	total: 10m 19s	remaining: 3m 7s
1536:	learn: 0.8351246	total: 10m 19s	remaining: 3m 6s
1537:	learn: 0.8351107	total: 10m 20s	remaining: 3m 6s
1538:	learn: 0.8351107	total: 10m 20s	remaining: 3m 5s
153

1669:	learn: 0.8423664	total: 11m 13s	remaining: 2m 13s
1670:	learn: 0.8426017	total: 11m 13s	remaining: 2m 12s
1671:	learn: 0.8425070	total: 11m 14s	remaining: 2m 12s
1672:	learn: 0.8426017	total: 11m 14s	remaining: 2m 11s
1673:	learn: 0.8426877	total: 11m 14s	remaining: 2m 11s
1674:	learn: 0.8423578	total: 11m 15s	remaining: 2m 11s
1675:	learn: 0.8426706	total: 11m 15s	remaining: 2m 10s
1676:	learn: 0.8425557	total: 11m 16s	remaining: 2m 10s
1677:	learn: 0.8427423	total: 11m 16s	remaining: 2m 9s
1678:	learn: 0.8428139	total: 11m 17s	remaining: 2m 9s
1679:	learn: 0.8429315	total: 11m 17s	remaining: 2m 9s
1680:	learn: 0.8430117	total: 11m 18s	remaining: 2m 8s
1681:	learn: 0.8431121	total: 11m 18s	remaining: 2m 8s
1682:	learn: 0.8431752	total: 11m 18s	remaining: 2m 7s
1683:	learn: 0.8432757	total: 11m 19s	remaining: 2m 7s
1684:	learn: 0.8432612	total: 11m 19s	remaining: 2m 7s
1685:	learn: 0.8435133	total: 11m 20s	remaining: 2m 6s
1686:	learn: 0.8436623	total: 11m 20s	remaining: 2m 6s
16

1817:	learn: 0.8497122	total: 12m 14s	remaining: 1m 13s
1818:	learn: 0.8497055	total: 12m 14s	remaining: 1m 13s
1819:	learn: 0.8497367	total: 12m 14s	remaining: 1m 12s
1820:	learn: 0.8497679	total: 12m 15s	remaining: 1m 12s
1821:	learn: 0.8498684	total: 12m 15s	remaining: 1m 11s
1822:	learn: 0.8498982	total: 12m 16s	remaining: 1m 11s
1823:	learn: 0.8499376	total: 12m 16s	remaining: 1m 11s
1824:	learn: 0.8499063	total: 12m 16s	remaining: 1m 10s
1825:	learn: 0.8498914	total: 12m 17s	remaining: 1m 10s
1826:	learn: 0.8499851	total: 12m 17s	remaining: 1m 9s
1827:	learn: 0.8500000	total: 12m 18s	remaining: 1m 9s
1828:	learn: 0.8499919	total: 12m 18s	remaining: 1m 9s
1829:	learn: 0.8500842	total: 12m 18s	remaining: 1m 8s
1830:	learn: 0.8499742	total: 12m 19s	remaining: 1m 8s
1831:	learn: 0.8500679	total: 12m 19s	remaining: 1m 7s
1832:	learn: 0.8499674	total: 12m 20s	remaining: 1m 7s
1833:	learn: 0.8499919	total: 12m 20s	remaining: 1m 7s
1834:	learn: 0.8501547	total: 12m 20s	remaining: 1m 6s
1

1967:	learn: 0.8561073	total: 13m 15s	remaining: 12.9s
1968:	learn: 0.8561305	total: 13m 15s	remaining: 12.5s
1969:	learn: 0.8561227	total: 13m 15s	remaining: 12.1s
1970:	learn: 0.8561924	total: 13m 16s	remaining: 11.7s
1971:	learn: 0.8561692	total: 13m 16s	remaining: 11.3s
1972:	learn: 0.8562001	total: 13m 17s	remaining: 10.9s
1973:	learn: 0.8562465	total: 13m 17s	remaining: 10.5s
1974:	learn: 0.8562233	total: 13m 17s	remaining: 10.1s
1975:	learn: 0.8562774	total: 13m 18s	remaining: 9.69s
1976:	learn: 0.8563856	total: 13m 18s	remaining: 9.29s
1977:	learn: 0.8563469	total: 13m 19s	remaining: 8.89s
1978:	learn: 0.8564706	total: 13m 19s	remaining: 8.48s
1979:	learn: 0.8564706	total: 13m 19s	remaining: 8.08s
1980:	learn: 0.8564165	total: 13m 20s	remaining: 7.67s
1981:	learn: 0.8564862	total: 13m 20s	remaining: 7.27s
1982:	learn: 0.8565403	total: 13m 21s	remaining: 6.87s
1983:	learn: 0.8564785	total: 13m 21s	remaining: 6.46s
1984:	learn: 0.8564861	total: 13m 21s	remaining: 6.06s
1985:	lear

In [22]:
# Score the test set; column 1 of predict_proba is kept
# (presumably the positive / fraud class probability - confirm against model.classes_).
y_test_pred_cat = model.predict_proba(X_test_all)[:,1]

print(X_test_all.index)

# Binarise in place: scores strictly above the threshold become 1, the rest 0.
# Both masks are taken from the raw scores before anything is overwritten.
th=0.37
above_th = y_test_pred_cat>th
at_or_below_th = y_test_pred_cat<=th
y_test_pred_cat[above_th]=1
y_test_pred_cat[at_or_below_th]=0

Int64Index([1521787, 1521788, 1521789, 1521790, 1521791, 1521792, 1521793,
            1521794, 1521795, 1521796,
            ...
            1943442, 1943443, 1943444, 1943445, 1943446, 1943447, 1943448,
            1943449, 1943450, 1943451],
           dtype='int64', length=421665)


## write csv

In [24]:
import csv

# Thresholded predictions produced by the previous cell.
result = y_test_pred_cat
# txkey of every row with locdt > 90.
# NOTE(review): presumably these are exactly the rows of X_test_all, in the same order - confirm.
test_data_txkey = all_data[all_data['locdt']>90]['txkey'].values

# Fail loudly instead of writing a misaligned or truncated submission.
if test_data_txkey.shape[0] != result.shape[0]:
    raise ValueError('txkey count {} does not match prediction count {}'.format(
        test_data_txkey.shape[0], result.shape[0]))

# Sanity check: the predicted positive rate should be close to the training one.
print('{}: prediction positive ratio'.format(result.sum()/result.shape[0]))
print('{}: training positive ratio'.format(y_train_all.sum()/y_train_all.shape[0]))


submit_file_name='submit_cat_FE_th{}.csv'.format(th)
# newline='' is required by the csv module; without it extra blank rows
# are written on platforms that translate '\n' to '\r\n'.
with open('../prediction/{}'.format(submit_file_name),'w',newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['txkey','fraud_ind'])
    writer.writerows(zip(test_data_txkey, result))

# Record which submission was written and which columns were dropped.
# Mode 'w' overwrites the log, so only the latest submission is kept.
with open('../prediction/log.txt','w') as f:
    print('{}'.format(submit_file_name),file=f)
    print('delete_list:\n{}'.format(delete_list),file=f)

0.013439578812564476: prediction positive ratio
0.013375722095142093: training positive ratio


In [None]:
def cat_train(bagging_temperature, reg_lambda, learning_rate):
    """Objective for the Bayesian search: fit CatBoost once and return its best validation AUC.

    Args:
        bagging_temperature: value forwarded as CatBoost's ``bagging_temperature``.
        reg_lambda: value forwarded as CatBoost's ``reg_lambda``.
        learning_rate: value forwarded as CatBoost's ``learning_rate``.

    Returns:
        The ``'AUC'`` entry of ``model.best_score_['validation']``.

    Relies on the notebook globals ``data_train_X``, ``data_train_y``,
    ``data_val_X`` and ``data_val_y``.
    """
    # Key order is kept as-is because the dict is printed below.
    params = dict(
        iterations=800,
        depth=3,
        bagging_temperature=bagging_temperature,
        reg_lambda=reg_lambda,
        learning_rate=learning_rate,
        loss_function='Logloss',
        eval_metric='AUC',
        random_seed=696,
        verbose=30,
    )

    model = CatBoost(params)
    # The validation split is the evaluation set and AUC is the evaluation metric.
    validation_set = (data_val_X, data_val_y)
    model.fit(data_train_X, data_train_y, eval_set=validation_set, plot=False, early_stopping_rounds=20)

    print(params)
    validation_scores = model.best_score_.get('validation')
    return validation_scores.get('AUC')
 
# Search bounds (low, high) for each hyper-parameter that cat_train accepts.
cat_search_bounds = {
    'bagging_temperature': (1, 50),
    'reg_lambda': (1, 200),
    'learning_rate': (0.05, 0.2),
}
cat_opt = BayesianOptimization(cat_train, cat_search_bounds)

# 5 initial probes followed by 15 optimisation steps.
# NOTE(review): no random_state is passed, so the search is presumably not reproducible - confirm.
cat_opt.maximize(n_iter=15, init_points=5)

## Anomaly detection
* One-class SVM
* Isolation Forest
* Replicator NN
* K-means?
* KNN (takes too much time)

## 製作特徵
XGB, LGB, PCA, Isolation Forest, Kmean距離？, oneclass SVM?
當作新feature

In [None]:
import xgboost as xgb


def xgb_f1_error(y_pred, dmatrix):
    """XGBoost-style custom metric returning ``1 - F1`` at a 0.5 cut-off.

    The callable passed to ``XGBClassifier.fit(eval_metric=...)`` is called as
    ``func(y_predicted, DMatrix)`` and must return a ``(name, value)`` pair.
    Custom metrics are minimised during early stopping, so the error form
    ``1 - F1`` is reported rather than F1 itself.

    Args:
        y_pred: predictions handed over by XGBoost
            (assumed to be probabilities for the built-in objective - TODO confirm
            for the installed XGBoost version).
        dmatrix: the evaluation ``DMatrix``; true labels are read via ``get_label()``.

    Returns:
        Tuple ``('f1_err', 1 - f1)``.
    """
    y_true = dmatrix.get_label()
    return 'f1_err', 1.0 - f1_score(y_true, np.round(y_pred))


param_dist_xgb = {'learning_rate':0.01, # default is 0.3
              'n_estimators':1000, # number of trees
#               'max_depth':5,
#               'min_child_weight':1,
#               'gamma':0.2,
#               'subsample':0.8,
#               'colsample_bytree':0.8,
#               'objective': 'binary:logistic', # logistic-regression loss
#               'nthread':4,  # number of CPU threads
#               'scale_pos_weight':1,
              'seed':random_seed}  # random seed

evals_result = {}

xgb_clf = xgb.XGBClassifier(**param_dist_xgb)
# The metric is defined in this cell on purpose: lgb_f1_score is only defined
# in a later cell and follows LightGBM's (y_true, y_pred) -> 3-tuple convention.
xgb_clf.fit(X_train, y_train,
        eval_set=[(X_train, y_train),(X_test, y_test)],
        eval_metric=xgb_f1_error,
        early_stopping_rounds=600,
        verbose=True,
#         callbacks=[xgb.record_evaluation(evals_result)]
        )

print('F1',f1_score(y_test, xgb_clf.predict(X_test)))
# Per-tree leaf indices, kept as candidate features for downstream models.
xgb_X_train = xgb_clf.apply(X_train)
xgb_X_test = xgb_clf.apply(X_test)

## Train on LGB(未調參數)(效果不好)

In [None]:
# Summary of the LightGBM experiment inputs: dropped columns, training-set
# shape and the share of positive labels in each split.
print(delete_list)
print('Training num',X_train1.shape)
for _split_name, _split_labels in (('train', y_train1), ('test', y_test1)):
    _positive_ratio = _split_labels.sum()/_split_labels.shape[0]
    print('positive label ratio-{}'.format(_split_name),_positive_ratio)

def lgb_f1_score(y_true, y_pred):
    """Custom LightGBM evaluation metric: F1 score at a 0.5 cut-off.

    Also prints the confusion-matrix counts each time it is called.

    Args:
        y_true: ground-truth binary labels.
        y_pred: predicted scores; they are rounded to 0/1 before scoring.

    Returns:
        Tuple ``('f1', f1_value, True)``; the trailing ``True`` marks the
        metric as higher-is-better.
    """
    y_pred = np.round(y_pred) # scikit-learn's f1_score needs hard labels, not probabilities
    # labels=[0, 1] pins the matrix to 2x2 so the 4-way unpacking cannot fail
    # when only one class shows up in y_true / y_pred.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    print()
    print('tn, fp, fn, tp')
    print(tn, fp, fn, tp)
    return 'f1', f1_score(y_true, y_pred), True

# LightGBM hyper-parameters; the commented-out entries are untuned candidates.
param_dist_lgb = {
#                   'num_leaves':45, 
#                   'max_depth':5, 
                  'learning_rate':0.1, 
                  'n_estimators':600,
                  'objective': 'binary',
#                   'subsample': 1, 
#                   'colsample_bytree': 0.5, 
#                   'lambda_l1': 0.1,
#                   'lambda_l2': 0,
#                   'min_child_weight': 1,
                  'random_state': random_seed,
                 }
# Filled by the record_evaluation callback during fit.
evals_result = {}

lgb_clf = LGBMClassifier(**param_dist_lgb)
lgb_clf.fit(X_train1, y_train1,
        eval_set=[(X_train1, y_train1),(X_test1, y_test1)],
        eval_metric=lgb_f1_score,
        early_stopping_rounds=50,
        verbose=True,
        callbacks=[lgb.record_evaluation(evals_result)]
        )

# Hold-out evaluation with the fitted classifier's hard predictions.
y_test_pred = lgb_clf.predict(X_test1)
print('F1',f1_score(y_test1, y_test_pred))
# labels=[0, 1] keeps the matrix 2x2 even if a class is missing from the predictions.
tn, fp, fn, tp = confusion_matrix(y_test1, y_test_pred, labels=[0, 1]).ravel()
print(tn, fp, fn, tp)

In [None]:
# Diagnostics for the fitted LightGBM classifier.
# tree_idx is zero-based, so index 3 is the 4th tree.
tree_idx = 3

# Curve of the custom 'f1' metric recorded in evals_result.
print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(booster=evals_result, metric='f1')
plt.show()

# Top 30 features by importance.
print('Plotting feature importances...')
ax = lgb.plot_importance(booster=lgb_clf, max_num_features=30)
plt.show()

# Matplotlib rendering of one tree, annotated with split gain.
print('Plotting 4th tree...')  # one tree use categorical feature to split
ax = lgb.plot_tree(
    booster=lgb_clf,
    tree_index=tree_idx,
    figsize=(15, 15),
    show_info=['split_gain'],
)
plt.show()

# Graphviz rendering of the same tree; view=True opens the rendered file.
print('Plotting 4th tree with graphviz...')
graph = lgb.create_tree_digraph(booster=lgb_clf, tree_index=tree_idx, name='Tree4')
graph.render(view=True)

In [None]:
# Feature-importance table for the LightGBM classifier, most important first.
# Built column-wise so that 'importance' keeps its numeric dtype (stacking names
# and numbers into one array would coerce everything to object), and sorted
# without in-place mutation so the cell can be re-run safely.
feature_importance = pd.DataFrame(
    {
        'feature_name': X_train1.columns.values,
        'importance': lgb_clf.feature_importances_,
    },
    columns=['feature_name','importance'],
).sort_values(by=['importance'],ascending=False)
print(feature_importance)

## PCA visualization in one person who has fraud data

In [None]:
from sklearn.decomposition import PCA
def PCA_plot(x,label):
    """Scatter-plot the first two principal components of ``x``, coloured by ``label``.

    The columns named in the notebook-level ``delete_list`` are dropped, the
    remaining columns are standardised, and a 2-component PCA is fitted on them.
    Rows with ``label == 1`` are drawn as large red points and rows with
    ``label == 0`` as small blue points; rows with any other label value
    (e.g. missing) still take part in the PCA fit but are not drawn.

    Args:
        x: DataFrame of transactions (must support ``drop(columns=...)``).
        label: per-row labels aligned with ``x``
            (assumed to be a pandas Series of 0/1 fraud flags - TODO confirm).

    Returns:
        None. Prints the standardised shape and the label sum, then shows the figure.
    """
    from sklearn.preprocessing import StandardScaler

    x = x.drop(columns=delete_list)

    ## TODO: categorical columns should be one-hot encoded before standardising and PCA
#     dummy_list=['contp','etymd','stscd','hcefg']
#     dummy_list2=['stocn','scity','csmcu']#'mchno','acqic','mcc',
#     x[dummy_list] = x[dummy_list].astype(object)
#     x[dummy_list2] = x[dummy_list2].astype(object)
#     x = pd.get_dummies(x)

    x = StandardScaler().fit_transform(x)
    print(x.shape,label.sum())

    train_data_pca = PCA(n_components=2).fit_transform(x)
    # Plain boolean arrays avoid any index alignment when masking the numpy result.
    is_fraud = np.asarray(label==1)
    is_normal = np.asarray(label==0)
    train_data_pca1 = train_data_pca[is_fraud]
    train_data_pca0 = train_data_pca[is_normal]

    # One explicit figure per call: calling plt.clf() before plt.figure() would
    # leave an extra empty figure behind every time.
    fig, ax = plt.subplots(figsize=(10,10))
    ax.scatter(train_data_pca1[:, 0], train_data_pca1[:, 1], c='r',label='fraud transaction',s=100)
    ax.scatter(train_data_pca0[:, 0], train_data_pca0[:, 1], c='b',label='normal transaction',s=3)
    ax.legend()
    plt.show()
    # Release the figure; this function is called in a loop over many accounts.
    plt.close(fig)
    
# Accounts (bacno) that have at least one row flagged as fraud.
fraud_rows = all_data[all_data['fraud_ind']==1]
bacno_hasfraud = fraud_rows['bacno'].unique()
print(bacno_hasfraud.shape[0])
print(fraud_rows.shape[0])

# Count rows per account once, instead of re-scanning all_data for every account.
bacno_row_counts = all_data['bacno'].value_counts()

# Only accounts with more than 300 rows are plotted.
for bacno in bacno_hasfraud:
    if bacno_row_counts.get(bacno, 0)>300:
        print('Ploting PCA on bacno-{}'.format(bacno))
        account_rows = all_data[all_data['bacno']==bacno]
        PCA_plot(account_rows,account_rows['fraud_ind'])

## TSNE, Kmeans作圖?

## Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score

# The share of positive labels in the training set is used as the contamination rate.
c_ratio = y_train.sum()/y_train.shape[0]

# Fit the model on the training features only (labels are not used).
# NOTE(review): max_features=1 is an int, which scikit-learn reads as "one feature
# per tree"; 1.0 would mean "all features" - confirm which one is intended.
clf = IsolationForest(
    behaviour='new',
    max_samples=0.8,
    max_features=1,
    random_state=random_seed,
    contamination=c_ratio,
)
clf.fit(X_train)

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# predict() gives +1 for inliers and -1 for outliers; recode to outlier=1, inlier=0.
y_pred_test2 = (y_pred_test == -1).astype(y_pred_test.dtype)
y_pred_train2 = (y_pred_train == -1).astype(y_pred_train.dtype)

print(f1_score(y_train, y_pred_train2))
print(f1_score(y_test, y_pred_test2))

# Continuous anomaly scores, kept as candidate features.
isolationtree_X_train = clf.score_samples(X_train)
isolationtree_X_test = clf.score_samples(X_test)

print(isolationtree_X_train)

## One class SVM

In [None]:
from sklearn import svm
from sklearn.metrics import f1_score

# Fit a one-class SVM on the training features only (labels are not used).
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma='scale',verbose=True, random_state=random_seed)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# predict() gives +1 for inliers and -1 for outliers; recode to outlier=1, inlier=0.
# Both splits are recoded from THIS model's predictions, so the test score no
# longer depends on a y_pred_test2 left behind by another cell.
y_pred_train2 = -y_pred_train
y_pred_train2[y_pred_train2==-1]=0

y_pred_test2 = -y_pred_test
y_pred_test2[y_pred_test2==-1]=0

print(f1_score(y_train, y_pred_train2))
print(f1_score(y_test, y_pred_test2))

# Continuous anomaly scores, kept as candidate features.
svm_X_train = clf.score_samples(X_train)
svm_X_test = clf.score_samples(X_test)

# Show the SVM scores (not the Isolation Forest ones).
print(svm_X_train)

## one class Kmeans

In [None]:
# 用hinge loss(當SVM)

In [None]:
# X_train['cents']
# encoding data

# GroupKfold
# vanilla KFold