In [1]:
import pandas as pd
import joblib
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from catboost import CatBoostClassifier, Pool, sum_models
import sys
sys.path.append('..')
import utils
from sklearn.metrics import f1_score

In [2]:
def _load_pretrained(cbm_path):
    """Create a CatBoostClassifier and populate it from the ``.cbm`` file at ``cbm_path``."""
    clf = CatBoostClassifier()
    clf.load_model(cbm_path, format='cbm')
    return clf


# One pre-trained classifier per scenario (FirstUse / AfterFraud / UsedBeforeFraud).
model_FU = _load_pretrained('./pretrain_models/FirstUse/db-v4-FirstUse-1_2_0.75_31_0.8_7(allin).cbm')
model_AF = _load_pretrained('./pretrain_models/AfterFraud/db-v4-AfterFraud-1_3_0.8_31_0.8_8(allin).cbm')
model_UBF = _load_pretrained('./pretrain_models/UsedBeforeFraud/db-v4-UsedBeforeFraud-1_3.5_0.66_31_0.8_8(allin).cbm')

# The bare name `model` is kept bound to the last classifier created (the UBF one),
# so the notebook namespace ends up the same as before.
# NOTE(review): nothing visible in this notebook reads `model`; consider dropping the alias.
model = model_UBF

# Last expression: echo the UBF classifier so the cell still displays it.
model_UBF

<catboost.core.CatBoostClassifier at 0x7f98825b36d0>

In [3]:
# Load the FirstUse dataset bundle; it is indexed below with the keys 'train' and 'pred'.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
D_FU = joblib.load('./dataset_3rd/db-v5-FirstUse.joblib')

# Project helper; its return value is ignored, so it presumably fills NaNs in place -- TODO confirm.
utils.catboost_fillna(D_FU['train'], D_FU['pred'])

# Store the label column as uint8 in both splits.
for _split in ('train', 'pred'):
    D_FU[_split]["label"] = D_FU[_split]["label"].astype(np.uint8)

# Feature frames: every column except the label.
train_X_FU = D_FU['train'].drop(columns=["label"])
val_X_FU = D_FU['pred'].drop(columns=["label"])

# Targets: the train target is a NumPy array, the validation target stays a pandas Series.
train_Y_FU = D_FU['train']["label"].values
val_Y_FU = D_FU['pred']["label"]

In [4]:
# Positional indices of the categorical columns (bool or pandas 'category' dtype),
# computed on the TRAIN feature frame. Kept as a notebook-level name because later
# cells index `train_X_FU.columns` with it.
cat_features_list = np.nonzero((train_X_FU.dtypes==bool).to_numpy() | (train_X_FU.dtypes=='category').to_numpy())[0]

# Resolve the indices to column NAMES once and hand the same names to both pools.
# Previously the train-derived positions were re-applied to `val_X_FU.columns`; if the
# validation frame's column order ever differed, the wrong columns would have been
# declared categorical without any error.
cat_feature_names_FU = list(train_X_FU.columns[cat_features_list])

train_data_FU = Pool(train_X_FU,
                     train_Y_FU,
                     cat_features=cat_feature_names_FU)

val_data_FU = Pool(val_X_FU,
                   val_Y_FU,
                   cat_features=cat_feature_names_FU)

In [22]:
# CatBoost hyper-parameters for the FirstUse model, grouped by concern.
params_FU = dict(
    # --- objective and reported metric ---
    loss_function='Logloss',
    eval_metric='F1',

    # --- boosting schedule ---
    iterations=1000,
    learning_rate=0.8,
    boosting_type='Ordered',  # 'Plain' or 'Ordered' (author's note: Ordered is better but slow)

    # --- tree shape ---
    depth=2,
    max_leaves=4,  # author's note: can be used only with the Lossguide growing policy; should be < 64

    # --- regularisation / randomness ---
    l2_leaf_reg=25,  # author's note: "2*"
    random_strength=10,
    random_seed=8,
    # rsm=0.5 is left disabled (author's note: "1*, 0~1", CPU only)

    # --- categorical feature handling ---
    one_hot_max_size=10,  # author's note: "2*"
    max_ctr_complexity=4,  # maximum number of features allowed in one feature combination

    # --- overfitting detector / model selection ---
    od_type='Iter',
    od_wait=999,
    use_best_model=True,

    # --- runtime ---
    task_type='CPU',
    thread_count=16,
    verbose=50,
)

# params_FU =  {
#             'max_depth': 7,
#             'learning_rate': 0.8,
#             'max_leaves': 31,
#             'subsample': 0.75,
#             'reg_lambda': 2,
#             'min_data_in_leaf':1
#         } 

In [9]:
# Score both pools with the pre-trained FirstUse model, asking for 'RawFormulaVal' output.
base_FU = model_FU.predict(train_data_FU, prediction_type='RawFormulaVal')
val_base_FU = model_FU.predict(val_data_FU, prediction_type='RawFormulaVal')

# Attach those scores to the pools as their baseline.
train_data_FU.set_baseline(base_FU)
val_data_FU.set_baseline(val_base_FU)

<catboost.core.Pool at 0x7f98265838e0>

In [23]:
# Train a new classifier configured by params_FU on the FirstUse train pool,
# evaluating on the validation pool and drawing the live metric plot.
model_FU_final = CatBoostClassifier(**params_FU)
model_FU_final.fit(
    X=train_data_FU,
    eval_set=val_data_FU,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.8422998	test: 0.8547215	best: 0.8547215 (0)	total: 148ms	remaining: 2m 27s
50:	learn: 0.8441217	test: 0.8491811	best: 0.8547215 (0)	total: 14.7s	remaining: 4m 33s
100:	learn: 0.8455521	test: 0.8510846	best: 0.8547215 (0)	total: 27.7s	remaining: 4m 6s
150:	learn: 0.8455857	test: 0.8482624	best: 0.8547215 (0)	total: 40.3s	remaining: 3m 46s
200:	learn: 0.8460051	test: 0.8478314	best: 0.8547215 (0)	total: 53.5s	remaining: 3m 32s
250:	learn: 0.8466652	test: 0.8474743	best: 0.8547215 (0)	total: 1m 5s	remaining: 3m 16s
300:	learn: 0.8467714	test: 0.8460594	best: 0.8547215 (0)	total: 1m 17s	remaining: 3m 1s
350:	learn: 0.8473302	test: 0.8459081	best: 0.8547215 (0)	total: 1m 30s	remaining: 2m 47s
400:	learn: 0.8477293	test: 0.8476821	best: 0.8547215 (0)	total: 1m 43s	remaining: 2m 35s
450:	learn: 0.8479903	test: 0.8476724	best: 0.8547215 (0)	total: 1m 56s	remaining: 2m 21s
500:	learn: 0.8479271	test: 0.8503517	best: 0.8547215 (0)	total: 2m 8s	remaining: 2m 8s
550:	learn: 0.8484717	t

<catboost.core.CatBoostClassifier at 0x7f9851b84820>

In [25]:
# Load the AfterFraud dataset bundle; it is indexed below with the keys 'train' and 'pred'.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
D_AF = joblib.load('./dataset_3rd/db-v5-AfterFraud.joblib')

# Project helper; its return value is ignored, so it presumably fills NaNs in place -- TODO confirm.
utils.catboost_fillna(D_AF['train'], D_AF['pred'])

# Store the label column as uint8 in both splits.
for _split in ('train', 'pred'):
    D_AF[_split]["label"] = D_AF[_split]["label"].astype(np.uint8)

# Feature frames: every column except the label.
train_X_AF = D_AF['train'].drop(columns=["label"])
val_X_AF = D_AF['pred'].drop(columns=["label"])

# Targets: the train target is a NumPy array, the validation target stays a pandas Series.
train_Y_AF = D_AF['train']["label"].values
val_Y_AF = D_AF['pred']["label"]

In [26]:
# Positional indices of the categorical columns (bool or pandas 'category' dtype),
# computed on the AfterFraud TRAIN feature frame.
cat_features_list = np.nonzero((train_X_AF.dtypes==bool).to_numpy() | (train_X_AF.dtypes=='category').to_numpy())[0]

# Resolve the indices to column NAMES once and hand the same names to both pools.
# Previously the train-derived positions were re-applied to `val_X_AF.columns`; if the
# validation frame's column order ever differed, the wrong columns would have been
# declared categorical without any error.
cat_feature_names_AF = list(train_X_AF.columns[cat_features_list])

train_data_AF = Pool(train_X_AF,
                     train_Y_AF,
                     cat_features=cat_feature_names_AF)

val_data_AF = Pool(val_X_AF,
                   val_Y_AF,
                   cat_features=cat_feature_names_AF)

In [27]:
# CatBoost hyper-parameters for the AfterFraud model, grouped by concern.
params_AF = dict(
    # --- objective and reported metric ---
    loss_function='Logloss',
    eval_metric='F1',

    # --- boosting schedule ---
    iterations=1000,
    learning_rate=0.8,
    boosting_type='Ordered',  # 'Plain' or 'Ordered' (author's note: Ordered is better but slow)

    # --- tree shape ---
    depth=2,
    max_leaves=4,  # author's note: can be used only with the Lossguide growing policy; should be < 64

    # --- regularisation / randomness ---
    l2_leaf_reg=25,  # author's note: "2*"
    random_strength=10,
    random_seed=8,
    # rsm=0.5 is left disabled (author's note: "1*, 0~1", CPU only)

    # --- categorical feature handling ---
    one_hot_max_size=10,  # author's note: "2*"
    max_ctr_complexity=4,  # maximum number of features allowed in one feature combination

    # --- overfitting detector / model selection ---
    od_type='Iter',
    od_wait=999,
    use_best_model=True,

    # --- runtime ---
    task_type='CPU',
    thread_count=16,
    verbose=50,
)

In [28]:
# Score both pools with the pre-trained AfterFraud model, asking for 'RawFormulaVal' output.
base_AF = model_AF.predict(train_data_AF, prediction_type='RawFormulaVal')
val_base_AF = model_AF.predict(val_data_AF, prediction_type='RawFormulaVal')

# Attach those scores to the pools as their baseline.
train_data_AF.set_baseline(base_AF)
val_data_AF.set_baseline(val_base_AF)

<catboost.core.Pool at 0x7f97b60d5160>

In [30]:
# F1 of the pre-trained AfterFraud model on the validation features (passed as a DataFrame).
f1_score(y_true=val_Y_AF, y_pred=model_AF.predict(val_X_AF))

0.9473684210526316

In [29]:
# Train a new classifier configured by params_AF on the AfterFraud train pool,
# evaluating on the validation pool and drawing the live metric plot.
model_AF_final = CatBoostClassifier(**params_AF)
model_AF_final.fit(
    X=train_data_AF,
    eval_set=val_data_AF,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.9988485	test: 0.9465318	best: 0.9465318 (0)	total: 34.5ms	remaining: 34.5s
50:	learn: 0.9990196	test: 0.9457701	best: 0.9485880 (1)	total: 1.16s	remaining: 21.6s
100:	learn: 0.9991424	test: 0.9456915	best: 0.9485880 (1)	total: 2.2s	remaining: 19.6s
150:	learn: 0.9991424	test: 0.9456915	best: 0.9485880 (1)	total: 3.18s	remaining: 17.9s
200:	learn: 0.9991669	test: 0.9456915	best: 0.9485880 (1)	total: 4.19s	remaining: 16.6s
250:	learn: 0.9991669	test: 0.9456915	best: 0.9485880 (1)	total: 5.25s	remaining: 15.7s
300:	learn: 0.9991669	test: 0.9456915	best: 0.9485880 (1)	total: 6.32s	remaining: 14.7s
350:	learn: 0.9991669	test: 0.9456915	best: 0.9485880 (1)	total: 7.36s	remaining: 13.6s
400:	learn: 0.9991669	test: 0.9456915	best: 0.9485880 (1)	total: 8.41s	remaining: 12.6s
450:	learn: 0.9992404	test: 0.9456915	best: 0.9485880 (1)	total: 9.49s	remaining: 11.6s
500:	learn: 0.9992404	test: 0.9456915	best: 0.9485880 (1)	total: 10.6s	remaining: 10.5s
550:	learn: 0.9992404	test: 0.94645

<catboost.core.CatBoostClassifier at 0x7f986813b670>

In [55]:
# Validation F1 of model_AF_final_2, predicting from the AfterFraud validation pool.
# NOTE(review): model_AF_final_2 is only created in a cell listed further down this
# notebook, so this cell fails with a NameError on a fresh top-to-bottom run.
f1_score(y_true=val_Y_AF, y_pred=model_AF_final_2.predict(val_data_AF))

0.948587979724837

In [91]:
# Validation F1 of model_AF_final_2 (same expression as the preceding cell, re-executed later).
# NOTE(review): model_AF_final_2 is only created in a cell listed further down this
# notebook, so this cell fails with a NameError on a fresh top-to-bottom run.
f1_score(y_true=val_Y_AF, y_pred=model_AF_final_2.predict(val_data_AF))

0.9523809523809523

In [87]:
# Second AfterFraud configuration. Compared with params_AF it uses 5000 iterations,
# one_hot_max_size 20, learning_rate 0.66 and verbose 100.
params_AF_2 = dict(
    # --- objective and reported metric ---
    loss_function='Logloss',
    eval_metric='F1',

    # --- boosting schedule ---
    iterations=5000,
    learning_rate=0.66,
    boosting_type='Ordered',  # 'Plain' or 'Ordered' (author's note: Ordered is better but slow)

    # --- tree shape ---
    depth=2,
    max_leaves=4,  # author's note: can be used only with the Lossguide growing policy; should be < 64

    # --- regularisation / randomness ---
    l2_leaf_reg=25,  # author's note: "2*"
    random_strength=10,
    random_seed=8,
    # rsm=0.5 is left disabled (author's note: "1*, 0~1", CPU only)

    # --- categorical feature handling ---
    one_hot_max_size=20,  # author's note: "2*"
    max_ctr_complexity=4,  # maximum number of features allowed in one feature combination

    # --- overfitting detector / model selection ---
    od_type='Iter',
    od_wait=999,
    use_best_model=True,

    # --- runtime ---
    task_type='CPU',
    thread_count=16,
    verbose=100,
)

In [88]:
# Train a classifier configured by params_AF_2 on the AfterFraud train pool,
# evaluating on the validation pool and drawing the live metric plot.
model_AF_final_2 = CatBoostClassifier(**params_AF_2)
model_AF_final_2.fit(
    X=train_data_AF,
    eval_set=val_data_AF,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.9989461	test: 0.9523121	best: 0.9523121 (0)	total: 30.6ms	remaining: 2m 33s
100:	learn: 0.9990689	test: 0.9523810	best: 0.9523810 (23)	total: 2.03s	remaining: 1m 38s
200:	learn: 0.9990689	test: 0.9523810	best: 0.9523810 (23)	total: 3.88s	remaining: 1m 32s
300:	learn: 0.9990689	test: 0.9523810	best: 0.9523810 (23)	total: 5.73s	remaining: 1m 29s
400:	learn: 0.9990689	test: 0.9516245	best: 0.9523810 (23)	total: 7.56s	remaining: 1m 26s
500:	learn: 0.9990689	test: 0.9516245	best: 0.9523810 (23)	total: 9.45s	remaining: 1m 24s
600:	learn: 0.9990933	test: 0.9516245	best: 0.9523810 (23)	total: 11.3s	remaining: 1m 22s
700:	learn: 0.9991179	test: 0.9508671	best: 0.9523810 (23)	total: 13.1s	remaining: 1m 20s
800:	learn: 0.9991179	test: 0.9508671	best: 0.9523810 (23)	total: 15s	remaining: 1m 18s
900:	learn: 0.9991914	test: 0.9508671	best: 0.9523810 (23)	total: 16.8s	remaining: 1m 16s
1000:	learn: 0.9991914	test: 0.9508671	best: 0.9523810 (23)	total: 18.7s	remaining: 1m 14s
Stopped by ov

<catboost.core.CatBoostClassifier at 0x7f983ae1f310>

In [90]:
# Author's experiment log (presumably validation F1 per setting -- TODO confirm):
#   0.66                      -> 0.9486623283
#   'one_hot_max_size': 20    -> 0.9523809524
# Persist the trained AfterFraud model in CatBoost's binary 'cbm' format.
model_AF_final_2.save_model(fname='AF_model_final', format='cbm')

In [14]:
# Load the AfterFraud dataset bundle; it is indexed below with the keys 'train' and 'pred'.
# NOTE(review): this cell repeats the earlier AfterFraud loading cell verbatim.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
D_AF = joblib.load('./dataset_3rd/db-v5-AfterFraud.joblib')

# Project helper; its return value is ignored, so it presumably fills NaNs in place -- TODO confirm.
utils.catboost_fillna(D_AF['train'], D_AF['pred'])

# Store the label column as uint8 in both splits.
for _split in ('train', 'pred'):
    D_AF[_split]["label"] = D_AF[_split]["label"].astype(np.uint8)

# Feature frames: every column except the label.
train_X_AF = D_AF['train'].drop(columns=["label"])
val_X_AF = D_AF['pred'].drop(columns=["label"])

# Targets: the train target is a NumPy array, the validation target stays a pandas Series.
train_Y_AF = D_AF['train']["label"].values
val_Y_AF = D_AF['pred']["label"]

In [15]:
# Load the UsedBeforeFraud dataset bundle; it is indexed below with the keys 'train' and 'pred'.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
D_UBF = joblib.load('./dataset_3rd/db-v5-UsedBeforeFraud.joblib')

# Project helper; its return value is ignored, so it presumably fills NaNs in place -- TODO confirm.
utils.catboost_fillna(D_UBF['train'], D_UBF['pred'])

# Store the label column as uint8 in both splits.
for _split in ('train', 'pred'):
    D_UBF[_split]["label"] = D_UBF[_split]["label"].astype(np.uint8)

# Feature frames: every column except the label.
train_X_UBF = D_UBF['train'].drop(columns=["label"])
val_X_UBF = D_UBF['pred'].drop(columns=["label"])

# Targets: the train target is a NumPy array, the validation target stays a pandas Series.
train_Y_UBF = D_UBF['train']["label"].values
val_Y_UBF = D_UBF['pred']["label"]

# First Use

In [40]:
# F1 of the pre-trained FirstUse model on the validation pool.
f1_score(y_true=val_Y_FU, y_pred=model_FU.predict(val_data_FU))

0.8550689571739656

In [39]:
# F1 of the pre-trained FirstUse model on the training pool.
f1_score(y_true=train_Y_FU, y_pred=model_FU.predict(train_data_FU))

0.8443308348858369

In [19]:
# Positional indices of the categorical columns (bool or pandas 'category' dtype),
# computed on the TRAIN feature frame. Kept as a notebook-level name because a later
# cell indexes `train_X_FU.columns` with it.
cat_features_list = np.nonzero((train_X_FU.dtypes==bool).to_numpy() | (train_X_FU.dtypes=='category').to_numpy())[0]

# Resolve the indices to column NAMES once and hand the same names to both pools.
# Previously the train-derived positions were re-applied to `val_X_FU.columns`; if the
# validation frame's column order ever differed, the wrong columns would have been
# declared categorical without any error.
cat_feature_names_FU = list(train_X_FU.columns[cat_features_list])

train_data_FU = Pool(train_X_FU,
                     train_Y_FU,
                     cat_features=cat_feature_names_FU)

val_data_FU = Pool(val_X_FU,
                   val_Y_FU,
                   cat_features=cat_feature_names_FU)

In [44]:
# CatBoost hyper-parameters for the FirstUse model (500 iterations, od_wait 168),
# grouped by concern.
params_FU = dict(
    # --- objective and reported metric ---
    loss_function='Logloss',
    eval_metric='F1',

    # --- boosting schedule ---
    iterations=500,
    learning_rate=0.8,
    boosting_type='Ordered',  # 'Plain' or 'Ordered' (author's note: Ordered is better but slow)

    # --- tree shape ---
    depth=2,
    max_leaves=4,  # author's note: can be used only with the Lossguide growing policy; should be < 64

    # --- regularisation / randomness ---
    l2_leaf_reg=25,  # author's note: "2*"
    random_strength=10,
    random_seed=8,
    # rsm=0.5 is left disabled (author's note: "1*, 0~1", CPU only)

    # --- categorical feature handling ---
    one_hot_max_size=10,  # author's note: "2*"
    max_ctr_complexity=4,  # maximum number of features allowed in one feature combination

    # --- overfitting detector / model selection ---
    od_type='Iter',
    od_wait=168,
    use_best_model=True,

    # --- runtime ---
    task_type='CPU',
    thread_count=16,
    verbose=50,
)

In [34]:
# Score both pools with the pre-trained FirstUse model, asking for 'RawFormulaVal' output.
base_FU = model_FU.predict(train_data_FU, prediction_type='RawFormulaVal')
val_base_FU = model_FU.predict(val_data_FU, prediction_type='RawFormulaVal')

# Attach those scores to the pools as their baseline.
train_data_FU.set_baseline(base_FU)
val_data_FU.set_baseline(val_base_FU)

<catboost.core.Pool at 0x7fabd504d100>

In [48]:
# Same configuration as params_FU, but fitted from the raw train frame rather than the
# train pool: labels, categorical column names and the baseline are passed explicitly.
model_FU_try = CatBoostClassifier(**params_FU)
model_FU_try.fit(
    train_X_FU,
    y=train_Y_FU,
    cat_features=list(train_X_FU.columns[cat_features_list]),
    baseline=base_FU,
    eval_set=val_data_FU,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.8450879	test: 0.7352328	best: 0.7352328 (0)	total: 334ms	remaining: 2m 46s
50:	learn: 0.8387945	test: 0.2589139	best: 0.7352328 (0)	total: 13.8s	remaining: 2m 1s
100:	learn: 0.8416775	test: 0.2192842	best: 0.7352328 (0)	total: 27.8s	remaining: 1m 49s
150:	learn: 0.8433573	test: 0.2211955	best: 0.7352328 (0)	total: 40.9s	remaining: 1m 34s
Stopped by overfitting detector  (168 iterations wait)

bestTest = 0.7352328006
bestIteration = 0

Shrink model to first 1 iterations.


<catboost.core.CatBoostClassifier at 0x7fa72f18ab80>

In [45]:
# Train a new classifier configured by params_FU on the FirstUse train pool,
# evaluating on the validation pool and drawing the live metric plot.
model_FU_final = CatBoostClassifier(**params_FU)
model_FU_final.fit(
    X=train_data_FU,
    eval_set=val_data_FU,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.8450879	test: 0.7352328	best: 0.7352328 (0)	total: 329ms	remaining: 2m 44s
50:	learn: 0.8387945	test: 0.2589139	best: 0.7352328 (0)	total: 14.1s	remaining: 2m 3s
100:	learn: 0.8416775	test: 0.2192842	best: 0.7352328 (0)	total: 28.1s	remaining: 1m 51s
150:	learn: 0.8433573	test: 0.2211955	best: 0.7352328 (0)	total: 41.8s	remaining: 1m 36s
Stopped by overfitting detector  (168 iterations wait)

bestTest = 0.7352328006
bestIteration = 0

Shrink model to first 1 iterations.


<catboost.core.CatBoostClassifier at 0x7fabeebc80d0>

In [41]:
# GPU run: only task_type is specified, every other hyper-parameter is left at
# CatBoost's defaults. Fitted on the FirstUse train pool with the validation pool as eval set.
model_FU_final_G = CatBoostClassifier(task_type='GPU')
model_FU_final_G.fit(
    X=train_data_FU,
    eval_set=val_data_FU,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.038316
0:	learn: 0.0305836	test: 0.0877908	best: 0.0877908 (0)	total: 389ms	remaining: 6m 28s
1:	learn: 0.0301494	test: 0.0870545	best: 0.0870545 (1)	total: 738ms	remaining: 6m 8s
2:	learn: 0.0297268	test: 0.0866878	best: 0.0866878 (2)	total: 1.08s	remaining: 5m 58s
3:	learn: 0.0293491	test: 0.0866357	best: 0.0866357 (3)	total: 1.41s	remaining: 5m 51s
4:	learn: 0.0289808	test: 0.0871500	best: 0.0866357 (3)	total: 1.75s	remaining: 5m 48s
5:	learn: 0.0286583	test: 0.0880765	best: 0.0866357 (3)	total: 2.1s	remaining: 5m 47s
6:	learn: 0.0283144	test: 0.0892229	best: 0.0866357 (3)	total: 2.38s	remaining: 5m 37s
7:	learn: 0.0279983	test: 0.0911762	best: 0.0866357 (3)	total: 2.65s	remaining: 5m 28s
8:	learn: 0.0277071	test: 0.0933238	best: 0.0866357 (3)	total: 2.98s	remaining: 5m 28s
9:	learn: 0.0274124	test: 0.0960445	best: 0.0866357 (3)	total: 3.33s	remaining: 5m 29s
10:	learn: 0.0271396	test: 0.0995381	best: 0.0866357 (3)	total: 3.66s	remaining: 5m 28s
11:	learn: 0.0

<catboost.core.CatBoostClassifier at 0x7fac70fb35e0>

In [42]:
# F1 of the GPU-trained FirstUse model, measured on the TRAINING pool.
f1_score(y_true=train_Y_FU, y_pred=model_FU_final_G.predict(train_data_FU))

0.8452781371280724

In [43]:
# F1 of model_FU_final, measured on the TRAINING pool.
f1_score(y_true=train_Y_FU, y_pred=model_FU_final.predict(train_data_FU))

0.8453535750480877

In [10]:
# F1 of the pre-trained AfterFraud model on the validation features (passed as a DataFrame).
f1_score(y_true=val_Y_AF, y_pred=model_AF.predict(val_X_AF))

0.9473684210526316

In [None]:
# F1 of the pre-trained AfterFraud model on the validation features.
# NOTE(review): same expression as the preceding cell; this copy has no recorded output.
f1_score(y_true=val_Y_AF, y_pred=model_AF.predict(val_X_AF))

In [16]:
# F1 of the pre-trained UsedBeforeFraud model on the validation features (passed as a DataFrame).
f1_score(y_true=val_Y_UBF, y_pred=model_UBF.predict(val_X_UBF))

0.1594533029612756