### Task Schedule:
1. 訓練模型,調整參數(預計使用lgb，速度較快)(更:使用catboost,效果較好)
2. 嘗試使用不同模型,做Ensemble(blending, stacking)
3. Anomaly detection

### 注意事項:
1. 因為test data和train data時間不相關,在驗證時採取前60天訓練61~90天驗證,但仍需小心時間差異造成的影響
2. Anomaly detection: 看這類的模型能不能取代boosting(似乎是不行，盜刷數據並沒有那麼Anomaly),但可以嘗試將Anomaly結果當成新feature

### <font color=green>Results:</font>

#### Catboost:
    * FE1~4,catboost訓練 validation:0.5, LB:0.55

#### LGB:
    * 不做處理,直接丟lgb訓練 leaderboard score:0.45

## 讀取,轉換字串成可以訓練的資料

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import math

import lightgbm as lgb
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

%matplotlib inline
# Root directory of the project's data files.
data_path = '../data'

# Seed handed to the model constructors below (random_seed / seed / random_state).
random_seed = 2000

In [2]:
# Raw data plus the engineered-feature files; they are concatenated column-wise
# below, so every file must describe the same rows.
data_list=['raw_data.csv','FE_data1.csv','FE_data2.csv','FE_data4.csv']
# Only these files get their missing values replaced by the -1 sentinel.
# raw_data.csv is left alone: its missing fraud_ind values are kept as NaN.
fill_missing_files = ('FE_data1.csv', 'FE_data2.csv')

data=[]
for d in data_list:
    # Build the path from data_path so the data root is configured in one place.
    x = pd.read_csv(os.path.join(data_path, 'preprocess', d), index_col=False)
    print(d,x.shape)
    x_null = x.isnull().sum()
    print("Null columns:\n",x_null[x_null>0])

    if d in fill_missing_files:
        x=x.fillna(value=-1)

    data.append(x)

# pd.concat(axis=1) aligns on the index and silently pads with NaN when the
# frames differ in length, so fail loudly instead.
row_counts = {len(frame) for frame in data}
assert len(row_counts) == 1, 'feature files have different row counts: {}'.format(row_counts)

all_data = pd.concat(data,axis=1)

# Report the columns that still contain missing values after the fill above.
all_data_numsum = all_data.isnull().sum()
print('ALL data null:')
print(all_data_numsum[all_data_numsum>0])

raw_data.csv (1943452, 23)
Null columns:
 fraud_ind    421665
dtype: int64
FE_data1.csv (1943452, 54)
Null columns:
 cano_conam_skew       93942
cano_conam_kurt      156402
cano_conam_var        39970
bacno_locdt_skew      60927
bacno_locdt_kurt     104832
cano_locdt_skew       93942
cano_locdt_kurt      156402
mchno_fraud_mean    1204852
mcc_fraud_mean      1680560
acqic_fraud_mean    1643540
dtype: int64
FE_data2.csv (1943452, 8)
Null columns:
 Series([], dtype: int64)
FE_data4.csv (1943452, 6)
Null columns:
 Series([], dtype: int64)


In [15]:
## Drop features that may overfit, whose distribution differs, or that depend
## strongly on time.

# Candidate groups of columns to drop.
delete_list1 = ['bacno', 'locdt', 'loctm', 'cano', 'fraud_ind']
delete_list2 = ['mchno', 'acqic', 'mcc']
delete_list3 = ['stocn', 'scity', 'csmcu']
delete_list4 = ['iterm']
delete_list5 = ['contp', 'etymd', 'hcefg', 'insfg', 'ovrlt', 'flbmk', 'flg_3dsmk']
delete_list6 = ['mchno_fraud_mean', 'mcc_fraud_mean', 'acqic_fraud_mean']
delete_list7 = ['bacno_locdt_skew', 'bacno_locdt_kurt', 'cano_locdt_skew', 'cano_locdt_kurt']
delete_list8 = ['bacno_lastlocdt', 'cano_lastlocdt']

# Columns actually dropped: groups 1-4, 6 and 8 plus 'txkey'.
# Groups 5 and 7 are defined above but are not part of the drop list.
delete_list = [
    column
    for group in (delete_list1, delete_list2, delete_list3, delete_list4,
                  delete_list6, ['txkey'], delete_list8)
    for column in group
]

In [16]:
# Columns cast to the pandas 'category' dtype. Several of them are removed
# again by delete_list before training.
category_list = [
    'csmcu', 'hcefg', 'stscd', 'scity', 'stocn',
    'mcc', 'acqic', 'mchno', 'etymd', 'contp',
    'stocn_bin', 'scity_bin', 'csmcu_bin', 'txkey_bin',
]
all_data[category_list] = all_data[category_list].astype('category')

In [17]:
## Build three time-based train/validation splits, plus the final
## train-on-everything / predict-on-test split.

def split_by_locdt(frame, train_end, valid_end=90, drop_cols=None, target='fraud_ind'):
    """Split ``frame`` on its ``locdt`` (day) column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``locdt`` and ``target``.
    train_end : int
        Rows with ``locdt <= train_end`` form the training part.
    valid_end : int
        Rows with ``train_end < locdt <= valid_end`` form the validation part.
    drop_cols : list of str, optional
        Columns removed from both feature frames.
    target : str
        Name of the label column.

    Returns
    -------
    (X_train, y_train, X_valid, y_valid)
    """
    drop_cols = [] if drop_cols is None else list(drop_cols)
    train_mask = frame['locdt'] <= train_end
    valid_mask = (frame['locdt'] > train_end) & (frame['locdt'] <= valid_end)
    return (
        frame[train_mask].drop(columns=drop_cols),
        frame.loc[train_mask, target],
        frame[valid_mask].drop(columns=drop_cols),
        frame.loc[valid_mask, target],
    )


# Last day used for training and validation; later days are the rows to predict.
LAST_TRAIN_DAY = 90

# Train on days <=60 / <=45 / <=30 and validate on the remaining days up to 90.
X_train1, y_train1, X_test1, y_test1 = split_by_locdt(all_data, 60, LAST_TRAIN_DAY, delete_list)
X_train2, y_train2, X_test2, y_test2 = split_by_locdt(all_data, 45, LAST_TRAIN_DAY, delete_list)
X_train3, y_train3, X_test3, y_test3 = split_by_locdt(all_data, 30, LAST_TRAIN_DAY, delete_list)

# Final split: all days up to LAST_TRAIN_DAY for training, later days for the submission.
is_train_row = all_data['locdt'] <= LAST_TRAIN_DAY
is_submit_row = all_data['locdt'] > LAST_TRAIN_DAY

test_data_txkey = all_data.loc[is_submit_row, 'txkey'].copy().values
X_train_all = all_data[is_train_row].drop(columns=delete_list)
y_train_all = all_data.loc[is_train_row, 'fraud_ind']
X_test_all = all_data[is_submit_row].drop(columns=delete_list)

## Train on catboost
* https://catboost.ai/docs/concepts/python-reference_parameters-list.html
* 研究有哪些可以用的function

In [18]:
# Positions of the categorical columns that are still present after the column
# drop; these positions are handed to CatBoost as cat_features.
is_categorical = X_train1.columns.isin(category_list)
categorical_features_indices = np.where(is_categorical)[0]

categorical_dtypes = X_train1.dtypes[categorical_features_indices]
print(categorical_dtypes)
# Names of those same columns.
category_list2 = categorical_dtypes.index

contp        category
etymd        category
hcefg        category
stscd        category
stocn_bin    category
scity_bin    category
csmcu_bin    category
txkey_bin    category
dtype: object


In [19]:
# CatBoost settings shared by the validation fit and the final fit.
# Options tried earlier and currently disabled: sampling_frequency,
# min_data_in_leaf, max_leaves, task_type="GPU", devices.
param_cat = dict(
    # objective and reported metric
    loss_function='Logloss',
    eval_metric='F1',

    # boosting schedule and regularisation
    iterations=2000,
    learning_rate=0.1,
    l2_leaf_reg=5,

    # tree shape and categorical handling
    depth=5,
    one_hot_max_size=300,

    # sampling, class weighting and misc
    rsm=1,
    scale_pos_weight=1,
    target_border=0.5,
    random_seed=random_seed,
    verbose=True,
)

In [20]:
from catboost import CatBoostClassifier, Pool

print(categorical_features_indices)

# Fit on split 1 (days <= 60) and monitor the eval metric on days 61-90;
# training stops once the metric has not improved for 200 iterations.
# Disabled options from earlier experiments: use_best_model=True, plot=True.
fit_options = dict(
    cat_features=categorical_features_indices,
    eval_set=(X_test1, y_test1),
    early_stopping_rounds=200,
    silent=False,
)

model = CatBoostClassifier(**param_cat)
model.fit(X_train1, y_train1, **fit_options)

print('Model is fitted: {}'.format(model.is_fitted()))
print('Model params:')
print(model.get_params())

[ 1  3  6  9 10 11 12 13]
0:	learn: 0.5700970	test: 0.3176514	best: 0.3176514 (0)	total: 217ms	remaining: 7m 14s
1:	learn: 0.5062043	test: 0.2806570	best: 0.3176514 (0)	total: 429ms	remaining: 7m 8s
2:	learn: 0.5269852	test: 0.3002189	best: 0.3176514 (0)	total: 644ms	remaining: 7m 8s
3:	learn: 0.5573586	test: 0.3243243	best: 0.3243243 (3)	total: 861ms	remaining: 7m 9s
4:	learn: 0.5702641	test: 0.3316800	best: 0.3316800 (4)	total: 1.07s	remaining: 7m 6s
5:	learn: 0.5747932	test: 0.3335868	best: 0.3335868 (5)	total: 1.28s	remaining: 7m 5s
6:	learn: 0.5771954	test: 0.3304239	best: 0.3335868 (5)	total: 1.49s	remaining: 7m 4s
7:	learn: 0.5825288	test: 0.3377139	best: 0.3377139 (7)	total: 1.7s	remaining: 7m 4s
8:	learn: 0.5825561	test: 0.3363140	best: 0.3377139 (7)	total: 1.92s	remaining: 7m 4s
9:	learn: 0.5829401	test: 0.3396786	best: 0.3396786 (9)	total: 2.12s	remaining: 7m 2s
10:	learn: 0.5826834	test: 0.3440196	best: 0.3440196 (10)	total: 2.31s	remaining: 6m 58s
11:	learn: 0.5802716	test

94:	learn: 0.6800376	test: 0.4957760	best: 0.4957760 (94)	total: 19.7s	remaining: 6m 34s
95:	learn: 0.6804059	test: 0.4969829	best: 0.4969829 (95)	total: 19.9s	remaining: 6m 34s
96:	learn: 0.6807026	test: 0.4980113	best: 0.4980113 (96)	total: 20.1s	remaining: 6m 33s
97:	learn: 0.6806258	test: 0.4974093	best: 0.4980113 (96)	total: 20.3s	remaining: 6m 33s
98:	learn: 0.6808794	test: 0.4982541	best: 0.4982541 (98)	total: 20.5s	remaining: 6m 33s
99:	learn: 0.6806073	test: 0.4979543	best: 0.4982541 (98)	total: 20.7s	remaining: 6m 33s
100:	learn: 0.6821548	test: 0.4996402	best: 0.4996402 (100)	total: 20.9s	remaining: 6m 32s
101:	learn: 0.6826708	test: 0.4998801	best: 0.4998801 (101)	total: 21.1s	remaining: 6m 32s
102:	learn: 0.6831753	test: 0.5004792	best: 0.5004792 (102)	total: 21.3s	remaining: 6m 32s
103:	learn: 0.6836395	test: 0.5005396	best: 0.5005396 (103)	total: 21.5s	remaining: 6m 32s
104:	learn: 0.6836176	test: 0.4998801	best: 0.5005396 (103)	total: 21.7s	remaining: 6m 32s
105:	learn:

185:	learn: 0.7129476	test: 0.5168352	best: 0.5183333 (179)	total: 38.6s	remaining: 6m 16s
186:	learn: 0.7133007	test: 0.5170650	best: 0.5183333 (179)	total: 38.8s	remaining: 6m 16s
187:	learn: 0.7133557	test: 0.5182013	best: 0.5183333 (179)	total: 39.1s	remaining: 6m 16s
188:	learn: 0.7140767	test: 0.5179634	best: 0.5183333 (179)	total: 39.3s	remaining: 6m 16s
189:	learn: 0.7145277	test: 0.5177254	best: 0.5183333 (179)	total: 39.5s	remaining: 6m 15s
190:	learn: 0.7149677	test: 0.5181742	best: 0.5183333 (179)	total: 39.7s	remaining: 6m 15s
191:	learn: 0.7151874	test: 0.5181298	best: 0.5183333 (179)	total: 39.9s	remaining: 6m 15s
192:	learn: 0.7154784	test: 0.5178210	best: 0.5183333 (179)	total: 40.1s	remaining: 6m 15s
193:	learn: 0.7163077	test: 0.5184832	best: 0.5184832 (193)	total: 40.3s	remaining: 6m 15s
194:	learn: 0.7165385	test: 0.5193502	best: 0.5193502 (194)	total: 40.5s	remaining: 6m 14s
195:	learn: 0.7163846	test: 0.5189964	best: 0.5193502 (194)	total: 40.7s	remaining: 6m 14s

276:	learn: 0.7284804	test: 0.5253104	best: 0.5253104 (276)	total: 57.5s	remaining: 5m 57s
277:	learn: 0.7285709	test: 0.5253610	best: 0.5253610 (277)	total: 57.7s	remaining: 5m 57s
278:	learn: 0.7294172	test: 0.5264288	best: 0.5264288 (278)	total: 58s	remaining: 5m 57s
279:	learn: 0.7294306	test: 0.5272575	best: 0.5272575 (279)	total: 58.2s	remaining: 5m 57s
280:	learn: 0.7298050	test: 0.5271447	best: 0.5272575 (279)	total: 58.4s	remaining: 5m 57s
281:	learn: 0.7299019	test: 0.5264162	best: 0.5272575 (279)	total: 58.6s	remaining: 5m 56s
282:	learn: 0.7301309	test: 0.5261777	best: 0.5272575 (279)	total: 58.8s	remaining: 5m 56s
283:	learn: 0.7300679	test: 0.5261150	best: 0.5272575 (279)	total: 59s	remaining: 5m 56s
284:	learn: 0.7304978	test: 0.5255753	best: 0.5272575 (279)	total: 59.2s	remaining: 5m 56s
285:	learn: 0.7304905	test: 0.5253368	best: 0.5272575 (279)	total: 59.4s	remaining: 5m 56s
286:	learn: 0.7306709	test: 0.5246332	best: 0.5272575 (279)	total: 59.6s	remaining: 5m 55s
287

366:	learn: 0.7378264	test: 0.5246994	best: 0.5272575 (279)	total: 1m 16s	remaining: 5m 39s
367:	learn: 0.7379025	test: 0.5246994	best: 0.5272575 (279)	total: 1m 16s	remaining: 5m 38s
368:	learn: 0.7380427	test: 0.5246487	best: 0.5272575 (279)	total: 1m 16s	remaining: 5m 38s
369:	learn: 0.7380708	test: 0.5248244	best: 0.5272575 (279)	total: 1m 16s	remaining: 5m 38s
370:	learn: 0.7381667	test: 0.5249494	best: 0.5272575 (279)	total: 1m 17s	remaining: 5m 38s
371:	learn: 0.7386182	test: 0.5251250	best: 0.5272575 (279)	total: 1m 17s	remaining: 5m 38s
372:	learn: 0.7386744	test: 0.5250625	best: 0.5272575 (279)	total: 1m 17s	remaining: 5m 37s
373:	learn: 0.7390776	test: 0.5254136	best: 0.5272575 (279)	total: 1m 17s	remaining: 5m 37s
374:	learn: 0.7392972	test: 0.5251131	best: 0.5272575 (279)	total: 1m 17s	remaining: 5m 37s
375:	learn: 0.7396209	test: 0.5248126	best: 0.5272575 (279)	total: 1m 18s	remaining: 5m 37s
376:	learn: 0.7395169	test: 0.5251756	best: 0.5272575 (279)	total: 1m 18s	remain

456:	learn: 0.7502174	test: 0.5292219	best: 0.5292852 (455)	total: 1m 35s	remaining: 5m 21s
457:	learn: 0.7498014	test: 0.5298614	best: 0.5298614 (457)	total: 1m 35s	remaining: 5m 20s
458:	learn: 0.7500662	test: 0.5300370	best: 0.5300370 (458)	total: 1m 35s	remaining: 5m 20s
459:	learn: 0.7503970	test: 0.5297982	best: 0.5300370 (458)	total: 1m 35s	remaining: 5m 20s
460:	learn: 0.7508409	test: 0.5299737	best: 0.5300370 (458)	total: 1m 35s	remaining: 5m 20s
461:	learn: 0.7509639	test: 0.5296226	best: 0.5300370 (458)	total: 1m 36s	remaining: 5m 20s
462:	learn: 0.7506806	test: 0.5296716	best: 0.5300370 (458)	total: 1m 36s	remaining: 5m 19s
463:	learn: 0.7507750	test: 0.5302270	best: 0.5302270 (463)	total: 1m 36s	remaining: 5m 19s
464:	learn: 0.7505672	test: 0.5299104	best: 0.5302270 (463)	total: 1m 36s	remaining: 5m 19s
465:	learn: 0.7506333	test: 0.5299104	best: 0.5302270 (463)	total: 1m 37s	remaining: 5m 19s
466:	learn: 0.7508793	test: 0.5298472	best: 0.5302270 (463)	total: 1m 37s	remain

546:	learn: 0.7592335	test: 0.5315821	best: 0.5336831 (533)	total: 1m 54s	remaining: 5m 3s
547:	learn: 0.7593192	test: 0.5319962	best: 0.5336831 (533)	total: 1m 54s	remaining: 5m 2s
548:	learn: 0.7595137	test: 0.5319327	best: 0.5336831 (533)	total: 1m 54s	remaining: 5m 2s
549:	learn: 0.7595680	test: 0.5319327	best: 0.5336831 (533)	total: 1m 54s	remaining: 5m 2s
550:	learn: 0.7596328	test: 0.5319327	best: 0.5336831 (533)	total: 1m 54s	remaining: 5m 2s
551:	learn: 0.7599594	test: 0.5319327	best: 0.5336831 (533)	total: 1m 55s	remaining: 5m 2s
552:	learn: 0.7599413	test: 0.5313098	best: 0.5336831 (533)	total: 1m 55s	remaining: 5m 1s
553:	learn: 0.7600090	test: 0.5313098	best: 0.5336831 (533)	total: 1m 55s	remaining: 5m 1s
554:	learn: 0.7599338	test: 0.5313098	best: 0.5336831 (533)	total: 1m 55s	remaining: 5m 1s
555:	learn: 0.7601956	test: 0.5312463	best: 0.5336831 (533)	total: 1m 56s	remaining: 5m 1s
556:	learn: 0.7600376	test: 0.5313098	best: 0.5336831 (533)	total: 1m 56s	remaining: 5m 1s

637:	learn: 0.7662143	test: 0.5296226	best: 0.5336831 (533)	total: 2m 13s	remaining: 4m 44s
638:	learn: 0.7665142	test: 0.5296858	best: 0.5336831 (533)	total: 2m 13s	remaining: 4m 44s
639:	learn: 0.7668553	test: 0.5294961	best: 0.5336831 (533)	total: 2m 13s	remaining: 4m 44s
640:	learn: 0.7668091	test: 0.5294961	best: 0.5336831 (533)	total: 2m 13s	remaining: 4m 43s
641:	learn: 0.7671038	test: 0.5297349	best: 0.5336831 (533)	total: 2m 14s	remaining: 4m 43s
642:	learn: 0.7676004	test: 0.5294679	best: 0.5336831 (533)	total: 2m 14s	remaining: 4m 43s
643:	learn: 0.7678197	test: 0.5300084	best: 0.5336831 (533)	total: 2m 14s	remaining: 4m 43s
644:	learn: 0.7676352	test: 0.5299451	best: 0.5336831 (533)	total: 2m 14s	remaining: 4m 43s
645:	learn: 0.7675429	test: 0.5299451	best: 0.5336831 (533)	total: 2m 14s	remaining: 4m 42s
646:	learn: 0.7675943	test: 0.5301205	best: 0.5336831 (533)	total: 2m 15s	remaining: 4m 42s
647:	learn: 0.7678364	test: 0.5302958	best: 0.5336831 (533)	total: 2m 15s	remain

728:	learn: 0.7735504	test: 0.5328093	best: 0.5336831 (533)	total: 2m 32s	remaining: 4m 25s
729:	learn: 0.7734419	test: 0.5324289	best: 0.5336831 (533)	total: 2m 32s	remaining: 4m 25s
730:	learn: 0.7737286	test: 0.5324923	best: 0.5336831 (533)	total: 2m 32s	remaining: 4m 25s
731:	learn: 0.7737455	test: 0.5326190	best: 0.5336831 (533)	total: 2m 32s	remaining: 4m 24s
732:	learn: 0.7737455	test: 0.5326190	best: 0.5336831 (533)	total: 2m 33s	remaining: 4m 24s
733:	learn: 0.7734468	test: 0.5326190	best: 0.5336831 (533)	total: 2m 33s	remaining: 4m 24s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.533683081
bestIteration = 533

Shrink model to first 534 iterations.
Model is fitted: True
Model params:
{'l2_leaf_reg': 5, 'random_seed': 2000, 'depth': 5, 'iterations': 2000, 'eval_metric': 'F1', 'learning_rate': 0.1, 'one_hot_max_size': 300, 'scale_pos_weight': 1, 'rsm': 1, 'loss_function': 'Logloss', 'verbose': True, 'target_border': 0.5}


In [10]:
# Rank the features by the importance CatBoost reports for the training pool.
# NOTE(review): this uses whichever `model` was fitted last; the recorded output
# appears to come from an earlier run (it lists columns that delete_list now drops).
train_pool = Pool(X_train1, y_train1, cat_features=categorical_features_indices)
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train1.columns

ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)
for score, name in ranked_features:
    print('{}: {}'.format(name, score))

stocn_bin: 17.711914163935763
ecfg: 12.310425090120397
stocn_value_counts: 8.635739927876106
etymd: 6.509586051205589
cano_conam_skew: 4.2699324773420235
csmcu_value_counts: 4.017651221365238
bacno_stocn_nunique: 3.7528797346476206
bacno_cano_not1: 3.4916503773053345
cano_lastlocdt2: 2.8402607442359287
mcc_cano_nunique: 2.831033250927126
stocn_bacno_nunique: 2.791795651997583
conam_log: 2.575575703552253
flg_3dsmk: 2.525510690146207
conam: 2.5220229922085107
cano_lastlocdt: 2.498042103226354
stocn_cano_nunique: 2.3111255101214683
hcefg: 1.4312686351590416
stscd: 1.0950217006461018
contp: 1.0071187491452644
mchno_cano_nunique: 0.9902948425357645
scity_bin: 0.9473951541134755
bacno_scale_conam: 0.9091435006187264
cano_csmcu_mode: 0.9045029260428326
cano_conam_mean: 0.867888418074275
cano_ratio_ecfg: 0.8676135248895516
cano_mean_conam: 0.8452648167774616
acqic_bacno_nunique: 0.7801957048620435
cano_scale_conam: 0.7396423437985798
bacno_max_conam: 0.6370788017304739
bacno_ratio_ecfg: 0.624

In [None]:
## Is th=0.5 always the best threshold in theory?
## NOTE(review): not necessarily when the metric is F1; compare a few thresholds
## on this validation split before fixing the value.

# Keep the probabilities in their own variable so another threshold can be
# tried without calling predict_proba again.
y_test1_proba = model.predict_proba(X_test1,verbose=True)[:,1]
th=0.5

# 1.0 where the fraud probability exceeds th, 0.0 otherwise.
y_test1_pred = np.where(y_test1_proba > th, 1.0, 0.0)
print(f1_score(y_test1, y_test1_pred))

In [None]:
# Refit on every labelled day and score the rows to be submitted.
# NOTE(review): there is no eval_set here, so the fit runs for the full
# `iterations` from param_cat, while the validation run above stopped much
# earlier. Confirm this is intended.
model = CatBoostClassifier(**param_cat)

model.fit(
    X_train_all, y_train_all,
    cat_features=categorical_features_indices,
    silent=False
)
# Keep the raw probabilities; the labels are derived from them below.
y_test_proba_cat = model.predict_proba(X_test_all)[:,1]

print(X_test_all.index)

th=0.5
# 1.0 where the fraud probability exceeds th, 0.0 otherwise.
y_test_pred_cat = np.where(y_test_proba_cat > th, 1.0, 0.0)

## write csv

In [None]:
import csv

result = y_test_pred_cat
print('{}: prediction positive ratio'.format(result.sum()/result.shape[0]))
print('{}: training positive ratio'.format(y_train_all.sum()/y_train_all.shape[0]))

# Every prediction must pair with exactly one transaction key.
assert len(test_data_txkey) == result.shape[0], 'txkey / prediction length mismatch'

submit_file_name='submit_cat.csv'
prediction_dir = '../prediction'
# Create the output directory if needed so a long run does not fail at the last step.
os.makedirs(prediction_dir, exist_ok=True)

# newline='' is required by the csv module; without it extra blank lines
# appear on platforms that translate line endings.
with open(os.path.join(prediction_dir, submit_file_name), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['txkey','fraud_ind'])
    writer.writerows(zip(test_data_txkey, result))

# Record which submission file was written and which columns were dropped.
# The log is overwritten on every run.
with open(os.path.join(prediction_dir, 'log.txt'),'w') as f:
    print('{}'.format(submit_file_name),file=f)
    print('delete_list:\n{}'.format(delete_list),file=f)

## Anomaly detection
* one class svm
* isolation tree
* replicator NN
* Kmeans?
* KNN(take too much time)

## 製作特徵
XGB, LGB, PCA, Isolation Forest, Kmean距離？, oneclass SVM?
當作新feature

In [None]:
# Fit an XGBoost classifier, report its F1 on the test split, and keep the
# output of apply() on both splits for use as new features.
# NOTE(review): X_train / y_train / X_test / y_test are not defined in any
# visible cell, and lgb_f1_score is only defined in a later cell, so this cell
# fails on a fresh kernel. Confirm which split is intended.
# NOTE(review): lgb_f1_score takes (y_true, y_pred); verify that this matches
# the signature XGBoost expects from a custom eval_metric.
import xgboost as xgb
param_dist_xgb = {'learning_rate':0.01, # library default is 0.3
              'n_estimators':1000, # number of trees
#               'max_depth':5,
#               'min_child_weight':1,
#               'gamma':0.2,
#               'subsample':0.8,
#               'colsample_bytree':0.8,
#               'objective': 'binary:logistic', # logistic-regression loss
#               'nthread':4,  # number of CPU threads
#               'scale_pos_weight':1,
              'seed':random_seed}  # random seed

# Only used by the commented-out record_evaluation callback below.
evals_result = {}

xgb_clf = xgb.XGBClassifier(**param_dist_xgb)
xgb_clf.fit(X_train, y_train,
        eval_set=[(X_train, y_train),(X_test, y_test)],
        eval_metric=lgb_f1_score,
        early_stopping_rounds=600,
        verbose=True,
#         callbacks=[xgb.record_evaluation(evals_result)]
        )

print('F1',f1_score(y_test, xgb_clf.predict(X_test)))
# Output of apply() on each split, kept as candidate new features.
xgb_X_train = xgb_clf.apply(X_train)
xgb_X_test = xgb_clf.apply(X_test)

## Train on LGB(未調參數)(效果不好)

In [None]:
# Summary of split 1: dropped columns, training shape and class balance.
print(delete_list)
print('Training num',X_train1.shape)
for ratio_label, labels in (('positive label ratio-train', y_train1),
                            ('positive label ratio-test', y_test1)):
    print(ratio_label, labels.sum()/labels.shape[0])

def lgb_f1_score(y_true, y_pred):
    """Custom F1 evaluation metric, passed as eval_metric to the fit call below.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Predicted scores; they are rounded to 0/1 before scoring.
        NOTE(review): assumes the scores are probabilities in [0, 1] - confirm
        for the LightGBM version in use.

    Returns
    -------
    tuple
        ('f1', f1 value, True); the final True marks the metric as
        higher-is-better.

    Also prints the confusion-matrix counts on every call.
    """
    y_pred = np.round(y_pred) # scikit-learn's f1_score needs hard labels, not probabilities
    # labels=[0, 1] forces a 2x2 matrix even when only one class is present;
    # otherwise the four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    print()
    print('tn, fp, fn, tp')
    print(tn, fp, fn, tp)
    return 'f1', f1_score(y_true, y_pred), True

# LightGBM baseline with mostly default settings. Options tried earlier and
# currently disabled: num_leaves, max_depth, subsample, colsample_bytree,
# lambda_l1, lambda_l2, min_child_weight.
param_dist_lgb = dict(
    learning_rate=0.1,
    n_estimators=600,
    objective='binary',
    random_state=random_seed,
)
# Filled by the record_evaluation callback with the per-iteration metrics.
evals_result = {}

lgb_clf = LGBMClassifier(**param_dist_lgb)
lgb_clf.fit(
    X_train1, y_train1,
    eval_set=[(X_train1, y_train1), (X_test1, y_test1)],
    eval_metric=lgb_f1_score,
    early_stopping_rounds=50,
    verbose=True,
    callbacks=[lgb.record_evaluation(evals_result)],
)

# Score the fitted model on the validation part of split 1.
y_test_pred = lgb_clf.predict(X_test1)
print('F1',f1_score(y_test1, y_test_pred))
tn, fp, fn, tp = confusion_matrix(y_test1, y_test_pred).ravel()
print(tn, fp, fn, tp)

In [None]:
# Diagnostic plots for the LightGBM model fitted above.

# Per-iteration 'f1' values recorded in evals_result during training.
print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='f1')
plt.show()

# Feature importances, limited to 30 features.
print('Plotting feature importances...')
ax = lgb.plot_importance(lgb_clf, max_num_features=30)
plt.show()

# tree_index is zero-based, so index 3 is the 4th tree.
print('Plotting 4th tree...')  # one tree use categorical feature to split
ax = lgb.plot_tree(lgb_clf, tree_index=3, figsize=(15, 15), show_info=['split_gain'])
plt.show()

# Same tree as a graphviz digraph; render(view=True) also asks graphviz to open
# the rendered file, so this needs graphviz installed.
print('Plotting 4th tree with graphviz...')
graph = lgb.create_tree_digraph(lgb_clf, tree_index=3, name='Tree4')
graph.render(view=True)

In [None]:
# Table of LightGBM feature importances, most important first.
# Building the frame from a dict keeps 'importance' numeric; stacking the names
# and the scores into one array would coerce both columns to object dtype.
feature_importance = (
    pd.DataFrame({
        'feature_name': X_train1.columns.values,
        'importance': lgb_clf.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
)
print(feature_importance)

## PCA visualization in one person who has fraud data

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def PCA_plot(x,label):
    """Scatter-plot the first two principal components of ``x``, coloured by ``label``.

    Parameters
    ----------
    x : pandas.DataFrame
        Transactions to plot; the columns in the global ``delete_list`` are
        dropped before scaling.
    label : pandas.Series
        Fraud indicator aligned with the rows of ``x``; rows equal to 1 are
        drawn as large red points, rows equal to 0 as small blue points.
        Rows with any other value (e.g. NaN) are not drawn.

    Prints the shape of the scaled matrix and the number of fraud rows, then
    shows the figure.
    """
    x = x.drop(columns=delete_list)

    # TODO: ideally one-hot encode the categorical columns first, then
    # standardise, then run PCA.
    x = StandardScaler().fit_transform(x)
    print(x.shape,label.sum())

    train_data_pca = PCA(n_components=2).fit_transform(x)
    # Convert to plain boolean arrays so the masks index the numpy result by position.
    is_fraud = np.asarray(label == 1)
    is_normal = np.asarray(label == 0)
    train_data_pca1 = train_data_pca[is_fraud]
    train_data_pca0 = train_data_pca[is_normal]

    # One explicit figure per call; the previous plt.clf() + plt.figure() pair
    # left an extra empty figure behind.
    fig, ax = plt.subplots(figsize=(10,10))
    ax.scatter(train_data_pca1[:, 0], train_data_pca1[:, 1], c='r',label='fraud transaction',s=100)
    ax.scatter(train_data_pca0[:, 0], train_data_pca0[:, 1], c='b',label='normal transaction',s=3)
    ax.legend()
    plt.show()
    # Release the figure; this function is called in a loop.
    plt.close(fig)
    
# Accounts (bacno) that have at least one fraudulent transaction.
fraud_rows = all_data[all_data['fraud_ind']==1]
bacno_hasfraud = fraud_rows['bacno'].unique()
print(bacno_hasfraud.shape[0])
print(fraud_rows.shape[0])

# Only accounts with more than this many transactions are plotted.
MIN_TRANSACTIONS_TO_PLOT = 300
# Count the transactions per account once, instead of scanning the whole frame
# for every account.
bacno_counts = all_data['bacno'].value_counts()

for bacno in bacno_hasfraud:
    if bacno_counts.loc[bacno] > MIN_TRANSACTIONS_TO_PLOT:
        print('Ploting PCA on bacno-{}'.format(bacno))
        bacno_rows = all_data[all_data['bacno']==bacno]
        PCA_plot(bacno_rows, bacno_rows['fraud_ind'])

## TSNE, Kmeans作圖?

## Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score

# NOTE(review): X_train / y_train / X_test / y_test are not defined in any
# visible cell - confirm which split this is meant to use.
# NOTE(review): max_features=1 is an integer; check whether the float 1.0 was
# intended.

# Use the observed fraud rate as the expected share of outliers.
c_ratio = y_train.sum()/y_train.shape[0]
clf = IsolationForest(behaviour='new', max_samples=0.8, max_features=1,
                      random_state=random_seed, contamination=c_ratio)
clf.fit(X_train)

y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# Map the -1 predictions to 1 (flagged) and everything else to 0.
y_pred_test2 = np.where(y_pred_test == -1, 1, 0)
y_pred_train2 = np.where(y_pred_train == -1, 1, 0)

print(f1_score(y_train, y_pred_train2))
print(f1_score(y_test, y_pred_test2))

# Per-sample scores, kept as candidate new features.
isolationtree_X_train = clf.score_samples(X_train)
isolationtree_X_test = clf.score_samples(X_test)

print(isolationtree_X_train)

## One class SVM

In [None]:
from sklearn import svm
from sklearn.metrics import f1_score

# NOTE(review): X_train / y_train / X_test / y_test are not defined in any
# visible cell - confirm which split this is meant to use.
# random_state is not passed: OneClassSVM ignored it in older scikit-learn
# releases and rejects it in newer ones.
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma='scale',verbose=True)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# Map the -1 predictions to 1 (flagged) and everything else to 0.
# The test labels are recomputed from this model's predictions; previously the
# test F1 silently reused y_pred_test2 left over from the Isolation Forest cell.
y_pred_train2 = np.where(y_pred_train == -1, 1, 0)
y_pred_test2 = np.where(y_pred_test == -1, 1, 0)

print(f1_score(y_train, y_pred_train2))
print(f1_score(y_test, y_pred_test2))

# Per-sample scores, kept as candidate new features.
svm_X_train = clf.score_samples(X_train)
svm_X_test = clf.score_samples(X_test)

# Show this model's scores (the previous version printed the Isolation Forest scores).
print(svm_X_train)

## one class Kmeans

In [None]:
# 用hinge loss(當SVM)

In [None]:
# X_train['cents']
# encoding data

# GroupKfold
# vanilla KFold