### Task Schedule:
1. 訓練模型,調整參數(預計使用lgb，速度較快)(更:使用catboost,效果較好)
2. 嘗試使用不同模型,做Ensemble(blending, stacking)
3. Anomaly detection

### 注意事項:
1. 因為test data和train data時間不相關,在驗證時採取前60天訓練61~90天驗證,但仍需小心時間差異造成的影響
2. Anomaly detection: 看這類的模型能不能取代boosting(似乎是不行，盜刷數據並沒有那麼Anomaly),但可以嘗試將Anomaly結果當成新feature

### <font color=green>Results:</font>

#### Catboost:
    * FE1~4,catboost訓練 validation:0.5, LB:0.55

#### LGB:
    * 不做處理,直接丟lgb訓練 leaderboard score:0.45

## 讀取,轉換字串成可以訓練的資料

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import math

import lightgbm as lgb
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

%matplotlib inline
data_path = '../data'

random_seed = 2000

In [2]:
# Feature tables to load and concatenate column-wise.
# NOTE(review): assumes every file holds the same rows in the same order — confirm.
data_list=['raw_data.csv','FE_data1.csv','FE_data2.csv','FE_data4.csv']
# Only these tables get their missing values replaced by the -1 sentinel.
fill_na_files = ('FE_data1.csv', 'FE_data2.csv')

data=[]
for file_name in data_list:
    # Build the path from the notebook-level `data_path` rather than repeating the literal.
    frame = pd.read_csv(os.path.join(data_path, 'preprocess', file_name), index_col=False)
    print(file_name, frame.shape)
    null_counts = frame.isnull().sum()
    print("Null columns:\n", null_counts[null_counts>0])

    if file_name in fill_na_files:
        frame = frame.fillna(value=-1)

    data.append(frame)

# axis=1 places the tables side by side; rows are matched on the index.
all_data = pd.concat(data,axis=1)

# Null counts remaining after the per-table fill.
all_data_numsum = all_data.isnull().sum()
print('ALL data null:')
print(all_data_numsum[all_data_numsum>0])

raw_data.csv (1943452, 23)
Null columns:
 fraud_ind    421665
dtype: int64
FE_data1.csv (1943452, 54)
Null columns:
 cano_conam_skew       93942
cano_conam_kurt      156402
cano_conam_var        39970
bacno_locdt_skew      60927
bacno_locdt_kurt     104832
cano_locdt_skew       93942
cano_locdt_kurt      156402
mchno_fraud_mean    1204852
mcc_fraud_mean      1680560
acqic_fraud_mean    1643540
dtype: int64
FE_data2.csv (1943452, 8)
Null columns:
 Series([], dtype: int64)
FE_data4.csv (1943452, 6)
Null columns:
 Series([], dtype: int64)


In [4]:
## Drop features that may overfit, whose distribution differs, or that are strongly time-dependent.

# Identifier / time / target columns.
delete_list1 = [
    'bacno', 'locdt', 'loctm', 'cano', 'fraud_ind',
]
delete_list2 = [
    'mchno', 'acqic', 'mcc',
]
delete_list3 = [
    'stocn', 'scity', 'csmcu',
]
delete_list4 = [
    'iterm',
]
# Defined for experiments but not part of the active delete_list below.
delete_list5 = [
    'contp', 'etymd', 'hcefg', 'insfg', 'ovrlt', 'flbmk', 'flg_3dsmk',
]
delete_list6 = [
    'mchno_fraud_mean', 'mcc_fraud_mean', 'acqic_fraud_mean',
]
# Defined for experiments but not part of the active delete_list below.
delete_list7 = [
    'bacno_locdt_skew', 'bacno_locdt_kurt', 'cano_locdt_skew', 'cano_locdt_kurt',
]
delete_list8 = [
    'bacno_lastlocdt', 'cano_lastlocdt',
]

# Columns actually removed from the feature matrices (groups 1-4, 6, 'txkey', 8).
delete_list = [
    *delete_list1,
    *delete_list2,
    *delete_list3,
    *delete_list4,
    *delete_list6,
    'txkey',
    *delete_list8,
]

In [5]:
# Columns cast to pandas' `category` dtype before modelling.
category_list = [
    'csmcu', 'hcefg', 'stscd', 'scity', 'stocn', 'mcc', 'acqic', 'mchno', 'etymd', 'contp',
    'stocn_bin', 'scity_bin', 'csmcu_bin', 'txkey_bin',
]
for category_column in category_list:
    all_data[category_column] = all_data[category_column].astype('category')

In [6]:
## Build three time-based train/validation splits, plus the full-train / submission split.

def _split_by_day(frame, train_end, valid_end):
    """Split `frame` on its `locdt` day column.

    Rows with ``locdt <= train_end`` form the training part; rows with
    ``train_end < locdt <= valid_end`` form the validation part. The columns
    in the notebook-level `delete_list` are removed from both feature frames
    and `fraud_ind` is returned as the target.

    Returns
    -------
    tuple
        (X_train, y_train, X_valid, y_valid)
    """
    day = frame['locdt']
    train_rows = frame[day <= train_end]
    valid_rows = frame[(day > train_end) & (day <= valid_end)]
    return (
        train_rows.drop(columns=delete_list), train_rows['fraud_ind'],
        valid_rows.drop(columns=delete_list), valid_rows['fraud_ind'],
    )


X_train1, y_train1, X_test1, y_test1 = _split_by_day(all_data, 60, 90)
X_train2, y_train2, X_test2, y_test2 = _split_by_day(all_data, 45, 90)
X_train3, y_train3, X_test3, y_test3 = _split_by_day(all_data, 30, 90)

# Days <= 90 are used for the final fit; later days are the rows to be submitted.
rows_up_to_day90 = all_data[all_data['locdt'] <= 90]
rows_after_day90 = all_data[all_data['locdt'] > 90]

test_data_txkey = rows_after_day90['txkey'].copy().values
X_train_all = rows_up_to_day90.drop(columns=delete_list)
y_train_all = rows_up_to_day90['fraud_ind']
X_test_all = rows_after_day90.drop(columns=delete_list)

## Train on catboost
* https://catboost.ai/docs/concepts/python-reference_parameters-list.html
* 研究有哪些可以用的function

In [7]:
# Column positions, within the feature frame, of the columns listed in category_list.
categorical_features_indices = np.flatnonzero(X_train1.columns.isin(category_list))

# .iloc makes the positional lookup explicit: `dtypes` is indexed by column name, so
# plain [] with integer positions depends on pandas' deprecated positional fallback.
categorical_dtypes = X_train1.dtypes.iloc[categorical_features_indices]
print(categorical_dtypes)
category_list2 = categorical_dtypes.index

contp        category
etymd        category
hcefg        category
stscd        category
stocn_bin    category
scity_bin    category
csmcu_bin    category
txkey_bin    category
dtype: object


In [13]:
# CatBoost settings; unpacked into CatBoostClassifier(**param_cat) by the training cells.
param_cat = dict(
    # objective and the metric reported during training
    loss_function='Logloss',
    eval_metric='F1',

    # boosting schedule and regularisation
    iterations=2000,
    learning_rate=0.1,
    l2_leaf_reg=5,

    # tree depth and one-hot limit
    depth=5,
    one_hot_max_size=300,

    # remaining options
    rsm=1,
    scale_pos_weight=1,
    target_border=0.5,
    random_seed=random_seed,
    verbose=True,
)
# Options tried earlier and left disabled: sampling_frequency='PerTreeLevel',
# min_data_in_leaf=1, max_leaves=31, task_type="GPU", devices.

In [14]:
from catboost import CatBoostClassifier, Pool

print(categorical_features_indices)

# Validation run on split 1: fit on X_train1 / y_train1, evaluate on X_test1 / y_test1.
fit_options = dict(
    cat_features=categorical_features_indices,
    eval_set=(X_test1, y_test1),
    early_stopping_rounds=200,
    silent=False,
)

model = CatBoostClassifier(**param_cat)
model.fit(X_train1, y_train1, **fit_options)

print('Model is fitted: {}'.format(model.is_fitted()))
print('Model params:')
print(model.get_params())

[ 1  3  6  9 10 11 12 13]
0:	learn: 0.5868041	test: 0.3128907	best: 0.3128907 (0)	total: 246ms	remaining: 8m 11s
1:	learn: 0.5628067	test: 0.3038955	best: 0.3128907 (0)	total: 476ms	remaining: 7m 55s
2:	learn: 0.5588534	test: 0.3116074	best: 0.3128907 (0)	total: 710ms	remaining: 7m 52s
3:	learn: 0.5642382	test: 0.3800647	best: 0.3800647 (3)	total: 941ms	remaining: 7m 49s
4:	learn: 0.5947866	test: 0.3910798	best: 0.3910798 (4)	total: 1.17s	remaining: 7m 47s
5:	learn: 0.6002311	test: 0.3856691	best: 0.3910798 (4)	total: 1.4s	remaining: 7m 45s
6:	learn: 0.5993446	test: 0.4110791	best: 0.4110791 (6)	total: 1.62s	remaining: 7m 40s
7:	learn: 0.5872482	test: 0.3802261	best: 0.4110791 (6)	total: 1.82s	remaining: 7m 34s
8:	learn: 0.5896552	test: 0.3775497	best: 0.4110791 (6)	total: 2.06s	remaining: 7m 35s
9:	learn: 0.5960124	test: 0.3884117	best: 0.4110791 (6)	total: 2.29s	remaining: 7m 34s
10:	learn: 0.6019559	test: 0.4101920	best: 0.4110791 (6)	total: 2.51s	remaining: 7m 33s
11:	learn: 0.6080

93:	learn: 0.6688113	test: 0.5238095	best: 0.5238095 (93)	total: 21.2s	remaining: 7m 9s
94:	learn: 0.6686768	test: 0.5236891	best: 0.5238095 (93)	total: 21.4s	remaining: 7m 10s
95:	learn: 0.6692048	test: 0.5237165	best: 0.5238095 (93)	total: 21.7s	remaining: 7m 9s
96:	learn: 0.6697073	test: 0.5240064	best: 0.5240064 (96)	total: 21.9s	remaining: 7m 10s
97:	learn: 0.6694156	test: 0.5241268	best: 0.5241268 (97)	total: 22.2s	remaining: 7m 10s
98:	learn: 0.6695221	test: 0.5241982	best: 0.5241982 (98)	total: 22.4s	remaining: 7m 10s
99:	learn: 0.6698117	test: 0.5241268	best: 0.5241982 (98)	total: 22.6s	remaining: 7m 9s
100:	learn: 0.6700487	test: 0.5243566	best: 0.5243566 (100)	total: 22.9s	remaining: 7m 10s
101:	learn: 0.6697317	test: 0.5237165	best: 0.5243566 (100)	total: 23.1s	remaining: 7m 9s
102:	learn: 0.6697328	test: 0.5235963	best: 0.5243566 (100)	total: 23.3s	remaining: 7m 9s
103:	learn: 0.6703105	test: 0.5238423	best: 0.5243566 (100)	total: 23.6s	remaining: 7m 9s
104:	learn: 0.67091

184:	learn: 0.7028707	test: 0.5340745	best: 0.5349639 (176)	total: 41.8s	remaining: 6m 50s
185:	learn: 0.7032318	test: 0.5341767	best: 0.5349639 (176)	total: 42s	remaining: 6m 49s
186:	learn: 0.7031456	test: 0.5352394	best: 0.5352394 (186)	total: 42.3s	remaining: 6m 49s
187:	learn: 0.7033744	test: 0.5352394	best: 0.5352394 (186)	total: 42.5s	remaining: 6m 49s
188:	learn: 0.7038547	test: 0.5353423	best: 0.5353423 (188)	total: 42.7s	remaining: 6m 48s
189:	learn: 0.7039870	test: 0.5349712	best: 0.5353423 (188)	total: 42.9s	remaining: 6m 48s
190:	learn: 0.7045181	test: 0.5350867	best: 0.5353423 (188)	total: 43.2s	remaining: 6m 48s
191:	learn: 0.7049791	test: 0.5346600	best: 0.5353423 (188)	total: 43.4s	remaining: 6m 48s
192:	learn: 0.7048380	test: 0.5338927	best: 0.5353423 (188)	total: 43.6s	remaining: 6m 48s
193:	learn: 0.7049022	test: 0.5337159	best: 0.5353423 (188)	total: 43.8s	remaining: 6m 47s
194:	learn: 0.7054813	test: 0.5337898	best: 0.5353423 (188)	total: 44.1s	remaining: 6m 47s
1

275:	learn: 0.7272379	test: 0.5348913	best: 0.5371040 (226)	total: 1m 2s	remaining: 6m 30s
276:	learn: 0.7274466	test: 0.5357413	best: 0.5371040 (226)	total: 1m 2s	remaining: 6m 30s
277:	learn: 0.7268483	test: 0.5354246	best: 0.5371040 (226)	total: 1m 2s	remaining: 6m 30s
278:	learn: 0.7270293	test: 0.5350498	best: 0.5371040 (226)	total: 1m 3s	remaining: 6m 29s
279:	learn: 0.7267162	test: 0.5349768	best: 0.5371040 (226)	total: 1m 3s	remaining: 6m 29s
280:	learn: 0.7275230	test: 0.5352935	best: 0.5371040 (226)	total: 1m 3s	remaining: 6m 29s
281:	learn: 0.7273492	test: 0.5351076	best: 0.5371040 (226)	total: 1m 3s	remaining: 6m 29s
282:	learn: 0.7274396	test: 0.5350498	best: 0.5371040 (226)	total: 1m 4s	remaining: 6m 28s
283:	learn: 0.7283219	test: 0.5351930	best: 0.5371040 (226)	total: 1m 4s	remaining: 6m 28s
284:	learn: 0.7283219	test: 0.5358417	best: 0.5371040 (226)	total: 1m 4s	remaining: 6m 28s
285:	learn: 0.7287177	test: 0.5356255	best: 0.5371040 (226)	total: 1m 4s	remaining: 6m 28s

365:	learn: 0.7366457	test: 0.5362241	best: 0.5378332 (323)	total: 1m 23s	remaining: 6m 11s
366:	learn: 0.7365014	test: 0.5372974	best: 0.5378332 (323)	total: 1m 23s	remaining: 6m 10s
367:	learn: 0.7363972	test: 0.5370987	best: 0.5378332 (323)	total: 1m 23s	remaining: 6m 10s
368:	learn: 0.7380590	test: 0.5369934	best: 0.5378332 (323)	total: 1m 23s	remaining: 6m 10s
369:	learn: 0.7386627	test: 0.5367213	best: 0.5378332 (323)	total: 1m 24s	remaining: 6m 10s
370:	learn: 0.7389867	test: 0.5369358	best: 0.5378332 (323)	total: 1m 24s	remaining: 6m 10s
371:	learn: 0.7387504	test: 0.5368782	best: 0.5378332 (323)	total: 1m 24s	remaining: 6m 9s
372:	learn: 0.7387620	test: 0.5365018	best: 0.5378332 (323)	total: 1m 24s	remaining: 6m 9s
373:	learn: 0.7387785	test: 0.5377863	best: 0.5378332 (323)	total: 1m 24s	remaining: 6m 9s
374:	learn: 0.7389619	test: 0.5378277	best: 0.5378332 (323)	total: 1m 25s	remaining: 6m 9s
375:	learn: 0.7389106	test: 0.5370390	best: 0.5378332 (323)	total: 1m 25s	remaining:

455:	learn: 0.7470517	test: 0.5387470	best: 0.5388612 (453)	total: 1m 43s	remaining: 5m 51s
456:	learn: 0.7471373	test: 0.5383637	best: 0.5388612 (453)	total: 1m 44s	remaining: 5m 51s
457:	learn: 0.7471846	test: 0.5383637	best: 0.5388612 (453)	total: 1m 44s	remaining: 5m 51s
458:	learn: 0.7473831	test: 0.5387712	best: 0.5388612 (453)	total: 1m 44s	remaining: 5m 50s
459:	learn: 0.7475160	test: 0.5391710	best: 0.5391710 (459)	total: 1m 44s	remaining: 5m 50s
460:	learn: 0.7474205	test: 0.5386571	best: 0.5391710 (459)	total: 1m 44s	remaining: 5m 50s
461:	learn: 0.7474961	test: 0.5384942	best: 0.5391710 (459)	total: 1m 45s	remaining: 5m 50s
462:	learn: 0.7474961	test: 0.5387798	best: 0.5391710 (459)	total: 1m 45s	remaining: 5m 49s
463:	learn: 0.7475625	test: 0.5382575	best: 0.5391710 (459)	total: 1m 45s	remaining: 5m 49s
464:	learn: 0.7477610	test: 0.5387313	best: 0.5391710 (459)	total: 1m 45s	remaining: 5m 49s
465:	learn: 0.7478649	test: 0.5389845	best: 0.5391710 (459)	total: 1m 46s	remain

545:	learn: 0.7551005	test: 0.5389920	best: 0.5407816 (526)	total: 2m 4s	remaining: 5m 31s
546:	learn: 0.7550437	test: 0.5397095	best: 0.5407816 (526)	total: 2m 4s	remaining: 5m 31s
547:	learn: 0.7555405	test: 0.5396691	best: 0.5407816 (526)	total: 2m 5s	remaining: 5m 31s
548:	learn: 0.7577556	test: 0.5394569	best: 0.5407816 (526)	total: 2m 5s	remaining: 5m 31s
549:	learn: 0.7580276	test: 0.5383882	best: 0.5407816 (526)	total: 2m 5s	remaining: 5m 30s
550:	learn: 0.7580094	test: 0.5383312	best: 0.5407816 (526)	total: 2m 5s	remaining: 5m 30s
551:	learn: 0.7580379	test: 0.5381603	best: 0.5407816 (526)	total: 2m 5s	remaining: 5m 30s
552:	learn: 0.7582995	test: 0.5388273	best: 0.5407816 (526)	total: 2m 6s	remaining: 5m 30s
553:	learn: 0.7582814	test: 0.5387703	best: 0.5407816 (526)	total: 2m 6s	remaining: 5m 29s
554:	learn: 0.7584316	test: 0.5389652	best: 0.5407816 (526)	total: 2m 6s	remaining: 5m 29s
555:	learn: 0.7588330	test: 0.5385833	best: 0.5407816 (526)	total: 2m 6s	remaining: 5m 29s

635:	learn: 0.7631569	test: 0.5402963	best: 0.5420443 (598)	total: 2m 24s	remaining: 5m 10s
636:	learn: 0.7632387	test: 0.5404667	best: 0.5420443 (598)	total: 2m 25s	remaining: 5m 10s
637:	learn: 0.7632850	test: 0.5405235	best: 0.5420443 (598)	total: 2m 25s	remaining: 5m 10s
638:	learn: 0.7633885	test: 0.5407734	best: 0.5420443 (598)	total: 2m 25s	remaining: 5m 9s
639:	learn: 0.7636091	test: 0.5390994	best: 0.5420443 (598)	total: 2m 25s	remaining: 5m 9s
640:	learn: 0.7637017	test: 0.5389863	best: 0.5420443 (598)	total: 2m 25s	remaining: 5m 9s
641:	learn: 0.7641400	test: 0.5388003	best: 0.5420443 (598)	total: 2m 26s	remaining: 5m 9s
642:	learn: 0.7640828	test: 0.5383084	best: 0.5420443 (598)	total: 2m 26s	remaining: 5m 8s
643:	learn: 0.7641047	test: 0.5382519	best: 0.5420443 (598)	total: 2m 26s	remaining: 5m 8s
644:	learn: 0.7641047	test: 0.5385583	best: 0.5420443 (598)	total: 2m 26s	remaining: 5m 8s
645:	learn: 0.7642191	test: 0.5388568	best: 0.5420443 (598)	total: 2m 27s	remaining: 5m

725:	learn: 0.7731399	test: 0.5389034	best: 0.5420443 (598)	total: 2m 45s	remaining: 4m 49s
726:	learn: 0.7731449	test: 0.5391522	best: 0.5420443 (598)	total: 2m 45s	remaining: 4m 49s
727:	learn: 0.7732599	test: 0.5390397	best: 0.5420443 (598)	total: 2m 45s	remaining: 4m 49s
728:	learn: 0.7735744	test: 0.5389434	best: 0.5420443 (598)	total: 2m 45s	remaining: 4m 49s
729:	learn: 0.7736776	test: 0.5388634	best: 0.5420443 (598)	total: 2m 46s	remaining: 4m 48s
730:	learn: 0.7735000	test: 0.5391123	best: 0.5420443 (598)	total: 2m 46s	remaining: 4m 48s
731:	learn: 0.7733800	test: 0.5397555	best: 0.5420443 (598)	total: 2m 46s	remaining: 4m 48s
732:	learn: 0.7736032	test: 0.5397323	best: 0.5420443 (598)	total: 2m 46s	remaining: 4m 48s
733:	learn: 0.7736390	test: 0.5399017	best: 0.5420443 (598)	total: 2m 46s	remaining: 4m 47s
734:	learn: 0.7734276	test: 0.5394668	best: 0.5420443 (598)	total: 2m 47s	remaining: 4m 47s
735:	learn: 0.7733701	test: 0.5400544	best: 0.5420443 (598)	total: 2m 47s	remain

In [10]:
# Feature importances of the fitted model on the split-1 training pool, highest first.
train_pool = Pool(X_train1, y_train1, cat_features=categorical_features_indices)
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train1.columns

ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)
for importance, column in ranked_features:
    print('{}: {}'.format(column, importance))

stocn_bin: 17.711914163935763
ecfg: 12.310425090120397
stocn_value_counts: 8.635739927876106
etymd: 6.509586051205589
cano_conam_skew: 4.2699324773420235
csmcu_value_counts: 4.017651221365238
bacno_stocn_nunique: 3.7528797346476206
bacno_cano_not1: 3.4916503773053345
cano_lastlocdt2: 2.8402607442359287
mcc_cano_nunique: 2.831033250927126
stocn_bacno_nunique: 2.791795651997583
conam_log: 2.575575703552253
flg_3dsmk: 2.525510690146207
conam: 2.5220229922085107
cano_lastlocdt: 2.498042103226354
stocn_cano_nunique: 2.3111255101214683
hcefg: 1.4312686351590416
stscd: 1.0950217006461018
contp: 1.0071187491452644
mchno_cano_nunique: 0.9902948425357645
scity_bin: 0.9473951541134755
bacno_scale_conam: 0.9091435006187264
cano_csmcu_mode: 0.9045029260428326
cano_conam_mean: 0.867888418074275
cano_ratio_ecfg: 0.8676135248895516
cano_mean_conam: 0.8452648167774616
acqic_bacno_nunique: 0.7801957048620435
cano_scale_conam: 0.7396423437985798
bacno_max_conam: 0.6370788017304739
bacno_ratio_ecfg: 0.624

In [None]:
## Open question: is a threshold of 0.5 really always the best choice?

th = 0.5
fraud_probability = model.predict_proba(X_test1, verbose=True)[:, 1]

# Probabilities strictly above the threshold become 1.0, everything else 0.0.
y_test1_pred = np.where(fraud_probability > th, 1.0, 0.0)
print(f1_score(y_test1, y_test1_pred))

In [None]:
# Refit with the same settings on all rows up to day 90, then score the submission rows.
model = CatBoostClassifier(**param_cat)
model.fit(X_train_all, y_train_all,
          cat_features=categorical_features_indices,
          silent=False)
submission_probability = model.predict_proba(X_test_all)[:, 1]

print(X_test_all.index)

# Binarise at 0.5: strictly above the threshold -> 1.0, otherwise 0.0.
th = 0.5
y_test_pred_cat = np.where(submission_probability > th, 1.0, 0.0)

## write csv

In [None]:
import csv

result = y_test_pred_cat
# Sanity check: predicted positive rate next to the positive rate of the training labels.
print('{}: prediction positive ratio'.format(result.sum()/result.shape[0]))
print('{}: training positive ratio'.format(y_train_all.sum()/y_train_all.shape[0]))

# Every prediction must have a matching transaction key before anything is written.
assert len(test_data_txkey) == result.shape[0], 'txkey / prediction length mismatch'

submit_file_name='submit_cat.csv'
# newline='' lets the csv module control line endings itself; without it, platforms
# that translate '\n' on write end up with an extra blank line after every row.
with open('../prediction/{}'.format(submit_file_name),'w',newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['txkey','fraud_ind'])
    writer.writerows(zip(test_data_txkey, result))

# Record which submission file was produced and which columns were dropped (overwrites the log).
with open('../prediction/log.txt','w') as f:
    print('{}'.format(submit_file_name),file=f)
    print('delete_list:\n{}'.format(delete_list),file=f)

## Anomaly detection
* one class svm
* isolation tree
* replicator NN
* Kmeans?
* KNN(take too much time)

## 製作特徵
XGB, LGB, PCA, Isolation Forest, Kmean距離？, oneclass SVM?
當作新feature

In [None]:
import xgboost as xgb

# NOTE(review): this cell originally used X_train / y_train / X_test / y_test and
# lgb_f1_score. None of them exists at this point of a top-to-bottom run, and
# lgb_f1_score has LightGBM's metric signature. The cell now works on validation
# split 1 and defines its own XGBoost-style metric.


def _category_codes(frame):
    """Return a copy of `frame` with every `category` column replaced by its integer codes.

    XGBoost does not accept pandas `category` columns by default. The splits are
    slices of one frame, so the codes are consistent between them.
    """
    encoded = frame.copy()
    for column in encoded.select_dtypes(include='category').columns:
        encoded[column] = encoded[column].cat.codes
    return encoded


def xgb_f1_error(y_pred, dmatrix):
    """Custom XGBoost evaluation metric: 1 - F1 after rounding predictions to 0/1.

    Returned as an error because early stopping minimises custom metrics by default.
    NOTE(review): assumes the legacy callable signature func(y_pred, DMatrix) ->
    (name, value), used when eval_metric is passed to fit(); confirm against the
    installed xgboost version.
    """
    y_true = dmatrix.get_label()
    return 'f1_err', 1.0 - f1_score(y_true, np.round(y_pred))


param_dist_xgb = {'learning_rate':0.01,  # library default is 0.3
                  'n_estimators':1000,   # number of trees
                  'seed':random_seed}    # random seed
# Other knobs tried earlier and left at their defaults: max_depth, min_child_weight,
# gamma, subsample, colsample_bytree, objective='binary:logistic', nthread,
# scale_pos_weight.

X_train_xgb = _category_codes(X_train1)
X_test_xgb = _category_codes(X_test1)

xgb_clf = xgb.XGBClassifier(**param_dist_xgb)
xgb_clf.fit(X_train_xgb, y_train1,
        eval_set=[(X_train_xgb, y_train1),(X_test_xgb, y_test1)],
        eval_metric=xgb_f1_error,
        early_stopping_rounds=600,
        verbose=True,
        )

print('F1',f1_score(y_test1, xgb_clf.predict(X_test_xgb)))
# Per-tree leaf indices, kept as candidate features for a downstream model.
xgb_X_train = xgb_clf.apply(X_train_xgb)
xgb_X_test = xgb_clf.apply(X_test_xgb)

## Train on LGB(未調參數)(效果不好)

In [None]:
# Show the dropped columns, the training shape and the class balance of split 1.
print(delete_list)
print('Training num',X_train1.shape)
for split_name, split_labels in (('train', y_train1), ('test', y_test1)):
    print('positive label ratio-{}'.format(split_name), split_labels.sum()/split_labels.shape[0])

def lgb_f1_score(y_true, y_pred):
    """Custom LightGBM evaluation metric: F1 after rounding predictions to 0/1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth 0/1 labels.
    y_pred : array-like
        Predicted scores; rounded to hard labels before scoring.

    Returns
    -------
    tuple
        ``('f1', score, True)``. The trailing True is LightGBM's
        "higher is better" flag for custom metrics.

    Side effect: prints the confusion-matrix counts (tn, fp, fn, tp) on every call.
    """
    y_pred = np.round(y_pred)  # f1_score needs hard labels, not probabilities
    # labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking below also works
    # when only one class is present in y_true and y_pred.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    print()
    print('tn, fp, fn, tp')
    print(tn, fp, fn, tp)
    return 'f1', f1_score(y_true, y_pred), True

# Untuned LightGBM baseline; tree-shape and regularisation options are left at library defaults.
param_dist_lgb = dict(
    learning_rate=0.1,
    n_estimators=600,
    objective='binary',
    random_state=random_seed,
)

# Filled in by the record_evaluation callback during fit().
evals_result = {}
eval_sets = [(X_train1, y_train1), (X_test1, y_test1)]

lgb_clf = LGBMClassifier(**param_dist_lgb)
lgb_clf.fit(
    X_train1, y_train1,
    eval_set=eval_sets,
    eval_metric=lgb_f1_score,
    early_stopping_rounds=50,
    verbose=True,
    callbacks=[lgb.record_evaluation(evals_result)],
)

# Hard-label predictions on the validation part of split 1.
y_test_pred = lgb_clf.predict(X_test1)
print('F1', f1_score(y_test1, y_test_pred))
tn, fp, fn, tp = confusion_matrix(y_test1, y_test_pred).ravel()
print(tn, fp, fn, tp)

In [None]:
# Zero-based index of the tree to draw (index 3 is the 4th tree).
tree_to_plot = 3

print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='f1')
plt.show()

print('Plotting feature importances...')
ax = lgb.plot_importance(lgb_clf, max_num_features=30)
plt.show()

# Original note: one tree uses a categorical feature to split.
print('Plotting 4th tree...')
ax = lgb.plot_tree(lgb_clf, tree_index=tree_to_plot, figsize=(15, 15),
                   show_info=['split_gain'])
plt.show()

print('Plotting 4th tree with graphviz...')
graph = lgb.create_tree_digraph(lgb_clf, tree_index=tree_to_plot, name='Tree4')
graph.render(view=True)

In [None]:
# LightGBM feature importances as a table, most important first.
# Building the frame from a dict keeps `importance` numeric; stacking names and numbers
# into one array would coerce everything to object dtype.
feature_importance = pd.DataFrame({
    'feature_name': X_train1.columns.values,
    'importance': lgb_clf.feature_importances_,
}).sort_values(by=['importance'], ascending=False)
print(feature_importance)

## PCA visualization in one person who has fraud data

In [None]:
from sklearn.decomposition import PCA
def PCA_plot(x,label):
    """Scatter-plot the first two principal components of a set of transactions.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows to project. The columns named in the notebook-level `delete_list`
        are dropped, and the rest is standardised before PCA.
    label : pandas.Series or array-like, aligned with the rows of `x`
        1 marks a fraud transaction (large red dots), 0 a normal one (small
        blue dots). Rows with any other value are not drawn.

    Returns
    -------
    None
        Prints the scaled matrix shape and the label sum, then shows one figure.
    """
    from sklearn.preprocessing import StandardScaler

    features = x.drop(columns=delete_list)

    # TODO: one-hot encode the categorical columns before standardising and running PCA.
    scaled = StandardScaler().fit_transform(features)
    print(scaled.shape,label.sum())

    projected = PCA(n_components=2).fit_transform(scaled)
    label_values = np.asarray(label)
    fraud_points = projected[label_values==1]
    normal_points = projected[label_values==0]

    # One explicit figure per call. The former plt.clf() followed by plt.figure()
    # left an extra empty figure behind each time.
    fig, ax = plt.subplots(figsize=(10,10))
    ax.scatter(fraud_points[:, 0], fraud_points[:, 1], c='r',label='fraud transaction',s=100)
    ax.scatter(normal_points[:, 0], normal_points[:, 1], c='b',label='normal transaction',s=3)
    ax.legend()
    plt.show()
    # Release the figure so repeated calls in a loop do not accumulate open figures.
    plt.close(fig)
    
# Accounts (bacno) that have at least one transaction flagged as fraud.
fraud_rows = all_data[all_data['fraud_ind']==1]
bacno_hasfraud = fraud_rows['bacno'].unique()
print(bacno_hasfraud.shape[0])
print(fraud_rows.shape[0])

# Count transactions per account once instead of filtering the whole table for every account.
transactions_per_bacno = all_data['bacno'].value_counts()

for bacno in bacno_hasfraud:
    # Only plot accounts with more than 300 transactions.
    if transactions_per_bacno.get(bacno, 0) > 300:
        print('Ploting PCA on bacno-{}'.format(bacno))
        account_rows = all_data[all_data['bacno']==bacno]
        PCA_plot(account_rows, account_rows['fraud_ind'])

## TSNE, Kmeans作圖?

## Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

# NOTE(review): the original cell used X_train / y_train / X_test / y_test, which are never
# defined in this notebook; validation split 1 is used instead.

# Observed positive rate of the training labels, passed as the expected outlier share.
c_ratio = y_train1.sum()/y_train1.shape[0]

# NOTE(review): `behaviour` exists only in older scikit-learn releases — drop it if the
# installed version rejects it. max_features=1 is an int (a single feature per tree);
# confirm that 1.0 (all features) was not intended.
clf = IsolationForest(behaviour='new', max_samples=0.8, max_features=1,
                      random_state=random_seed, contamination=c_ratio)
clf.fit(X_train1)

# predict() returns -1 for outliers and 1 for inliers.
y_pred_train = clf.predict(X_train1)
y_pred_test = clf.predict(X_test1)

# Map outlier -> 1 (treated as fraud) and inlier -> 0 so the result is comparable to fraud_ind.
y_pred_train2 = (y_pred_train == -1).astype(int)
y_pred_test2 = (y_pred_test == -1).astype(int)

print(f1_score(y_train1, y_pred_train2))
print(f1_score(y_test1, y_pred_test2))

# Continuous anomaly scores, kept as candidate features.
isolationtree_X_train = clf.score_samples(X_train1)
isolationtree_X_test = clf.score_samples(X_test1)

print(isolationtree_X_train)

## One class SVM

In [None]:
from sklearn import svm

# NOTE(review): the original cell used X_train / y_train / X_test / y_test, which are never
# defined in this notebook; validation split 1 is used instead.

# random_state is omitted: OneClassSVM ignored it in the releases that accepted it,
# and later releases reject the argument.
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma='scale', verbose=True)
clf.fit(X_train1)

# predict() returns -1 for outliers and 1 for inliers.
y_pred_train = clf.predict(X_train1)
y_pred_test = clf.predict(X_test1)

# Map outlier -> 1 (treated as fraud) and inlier -> 0. The test labels are recomputed here;
# previously the test F1 silently reused y_pred_test2 from the Isolation Forest cell.
y_pred_train2 = (y_pred_train == -1).astype(int)
y_pred_test2 = (y_pred_test == -1).astype(int)

print(f1_score(y_train1, y_pred_train2))
print(f1_score(y_test1, y_pred_test2))

# Continuous scores, kept as candidate features.
svm_X_train = clf.score_samples(X_train1)
svm_X_test = clf.score_samples(X_test1)

# Show the SVM scores (the original printed the Isolation Forest scores by mistake).
print(svm_X_train)

## one class Kmeans

In [None]:
# 用hinge loss(當SVM)

In [None]:
# X_train['cents']
# encoding data

# GroupKfold
# vanilla KFold