In [1]:
import pandas as pd
import joblib
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from catboost import CatBoostClassifier, Pool, sum_models
import sys
sys.path.append('..')
import utils
from sklearn.metrics import f1_score

In [2]:
def _load_pretrained(cbm_path):
    """Return a CatBoostClassifier restored from the `.cbm` file at `cbm_path`."""
    clf = CatBoostClassifier()
    clf.load_model(cbm_path, format='cbm')
    return clf


# One pretrained classifier per scenario; the scenario name is part of each path
# (FirstUse / AfterFraud / UsedBeforeFraud).
model_FU = _load_pretrained('./pretrain_models/FirstUse/db-v4-FirstUse-1_2_0.75_31_0.8_7(allin).cbm')
model_AF = _load_pretrained('./pretrain_models/AfterFraud/db-v4-AfterFraud-1_3_0.8_31_0.8_8(allin).cbm')
model_UBF = _load_pretrained('./pretrain_models/UsedBeforeFraud/db-v4-UsedBeforeFraud-1_3.5_0.66_31_0.8_8(allin).cbm')

# `model` ends up as an alias of the UsedBeforeFraud classifier, as before.
model = model_UBF

# Bare expression so the cell still displays the last loaded classifier.
model_UBF

<catboost.core.CatBoostClassifier at 0x7fcc7a0fadd0>

In [8]:
# Load the AfterFraud dataset bundle; it is indexed below by the keys 'train' and 'pred'.
# NOTE(review): joblib.load unpickles the file, so only load bundles from a trusted source.
D_AF = joblib.load('./dataset_3rd/db-v5-AfterFraud.joblib')

# The return value is discarded, so this presumably fills missing values in place
# on both frames -- TODO confirm against utils.catboost_fillna.
utils.catboost_fillna(D_AF['train'], D_AF['pred'])

# Store the target column as unsigned 8-bit integers in both splits.
for _split in ('train', 'pred'):
    D_AF[_split]["label"] = D_AF[_split]["label"].astype(np.uint8)

# Training split: feature frame plus the target as a plain NumPy array.
train_X_AF = D_AF['train'].drop(columns=["label"])
train_Y_AF = D_AF['train']["label"].values

# 'pred' split, used as the validation set; here the target stays a pandas Series.
val_X_AF = D_AF['pred'].drop(columns=["label"])
val_Y_AF = D_AF['pred']["label"]

In [9]:
# Positional indices of the training columns whose dtype is bool or pandas 'category';
# these are the columns handed to CatBoost as categorical features.
_is_bool_col = (train_X_AF.dtypes == bool).to_numpy()
_is_category_col = (train_X_AF.dtypes == 'category').to_numpy()
cat_features_list = np.flatnonzero(_is_bool_col | _is_category_col)

# Training pool: categorical columns are passed by name.
train_data_AF = Pool(
    data=train_X_AF,
    label=train_Y_AF,
    cat_features=list(train_X_AF.columns[cat_features_list]),
)

# Validation pool: the same positional indices are applied to the validation columns.
# NOTE(review): this relies on val_X_AF having the same column order as train_X_AF -- confirm.
val_data_AF = Pool(
    data=val_X_AF,
    label=val_Y_AF,
    cat_features=list(val_X_AF.columns[cat_features_list]),
)

In [13]:
# Hyper-parameters for the AfterFraud CatBoost classifier, grouped by purpose.
params_AF = dict(
    # --- objective and reporting ---
    loss_function='Logloss',
    eval_metric='F1',
    verbose=50,

    # --- boosting schedule ---
    iterations=1000,
    learning_rate=0.8,
    boosting_type='Ordered',  # alternative is 'Plain'; the original note calls Ordered better but slow
    use_best_model=True,
    od_type='Iter',
    od_wait=999,

    # --- tree shape ---
    depth=2,
    # The original note says max_leaves applies only to the Lossguide growing policy (and should be < 64).
    # NOTE(review): no grow_policy is set in this dict -- confirm whether this value has any effect.
    max_leaves=4,

    # --- regularisation and categorical handling ---
    l2_leaf_reg=25,
    random_strength=10,
    one_hot_max_size=10,
    max_ctr_complexity=4,  # maximum number of features allowed in one feature combination
    # 'rsm' (range 0~1, CPU only per the original note) is intentionally left unset.

    # --- runtime and reproducibility ---
    task_type='CPU',
    thread_count=16,
    random_seed=8,
)

In [12]:
# Baseline: F1 of the pretrained AfterFraud model on the validation split.
_pretrained_AF_pred = model_AF.predict(val_X_AF)
f1_score(val_Y_AF, _pretrained_AF_pred)

0.9473684210526316

In [14]:
# Train a fresh AfterFraud classifier from params_AF. The validation pool is the eval set,
# so with use_best_model=True the kept iteration is chosen on that same data.
model_AF_final = CatBoostClassifier(**params_AF)
model_AF_final.fit(
    X=train_data_AF,
    eval_set=val_data_AF,
)

0:	learn: 0.9047533	test: 0.8831563	best: 0.8831563 (0)	total: 63.6ms	remaining: 1m 3s
50:	learn: 0.9831949	test: 0.9418521	best: 0.9439655 (31)	total: 2.96s	remaining: 55.1s
100:	learn: 0.9860449	test: 0.9427754	best: 0.9439655 (31)	total: 5.73s	remaining: 51s
150:	learn: 0.9872441	test: 0.9297994	best: 0.9444845 (107)	total: 8.45s	remaining: 47.5s
200:	learn: 0.9877665	test: 0.9310345	best: 0.9444845 (107)	total: 11s	remaining: 43.6s
250:	learn: 0.9877162	test: 0.9317038	best: 0.9444845 (107)	total: 13.3s	remaining: 39.8s
300:	learn: 0.9889031	test: 0.9436117	best: 0.9444845 (107)	total: 15.8s	remaining: 36.6s
350:	learn: 0.9887365	test: 0.9437722	best: 0.9444845 (107)	total: 18.3s	remaining: 33.9s
400:	learn: 0.9895946	test: 0.9459459	best: 0.9459459 (395)	total: 21s	remaining: 31.4s
450:	learn: 0.9899134	test: 0.9452736	best: 0.9459459 (395)	total: 23.7s	remaining: 28.8s
500:	learn: 0.9898668	test: 0.9354376	best: 0.9466951 (456)	total: 26.1s	remaining: 26s
550:	learn: 0.9901384	te

<catboost.core.CatBoostClassifier at 0x7fcb9b747e80>

In [None]:
# Persist the AfterFraud classifier trained above (`model_AF_final`) in CatBoost's
# binary 'cbm' format, written to the file 'AF_model_final' in the working directory.
model_AF_final.save_model('AF_model_final', format='cbm')