In [222]:
import joblib
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from pickle import load
from vecstack import stacking
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [223]:
X_train = pd.read_pickle("data/05_model_input/X_train.pkl")
X_test = pd.read_pickle("data/05_model_input/X_test.pkl")
y_train = pd.read_pickle("data/05_model_input/y_train.pkl")
y_test = pd.read_pickle("data/05_model_input/y_test.pkl")

In [229]:
params = {
    "iterations": 2500,
    "learning_rate": 0.02,
    "loss_function": 'Logloss',
    "random_seed": 1,
    "od_wait": 30,
    "od_type": "Iter",
    "thread_count": 8,
}
catboost = CatBoostClassifier(**params)

In [230]:
xgboost = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, n_estimators=100, max_depth=20)
lightgbm = LGBMClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, n_estimators=100, max_depth=20)
extra_trees = ExtraTreesClassifier(random_state=0, n_jobs=-1,  n_estimators=100, max_depth=20)
random_forest = RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=100, max_depth=20)

In [231]:
S_train, S_test = stacking([catboost, lightgbm, xgboost, random_forest, extra_trees],
                           X_train, y_train, X_test,   # data
                           regression=False,           # classification task (if you need regression - set to True)
                           mode='oof_pred_bag',        # mode: oof for train set, predict test set in each fold and vote
                           needs_proba=False,          # predict class labels (if you need probabilities - set to True) 
                           save_dir=None,              # do not save result and log (to save in current dir - set to '.')
                           metric=accuracy_score,      # metric: callable
                           n_folds=4,                  # number of folds
                           stratified=True,            # stratified split for folds
                           shuffle=True,               # shuffle the data
                           random_state=0,             # ensure reproduci
                           verbose=2)                  # print all info

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [5]

model  0:     [CatBoostClassifier]
0:	learn: 0.6712708	total: 14.4ms	remaining: 36s
1:	learn: 0.6522062	total: 42.6ms	remaining: 53.3s
2:	learn: 0.6335680	total: 56.4ms	remaining: 47s
3:	learn: 0.6160561	total: 76ms	remaining: 47.4s
4:	learn: 0.5985796	total: 92.2ms	remaining: 46s
5:	learn: 0.5821059	total: 110ms	remaining: 45.7s
6:	learn: 0.5656136	total: 127ms	remaining: 45.3s
7:	learn: 0.5517590	total: 143ms	remaining: 44.7s
8:	learn: 0.5383438	total: 160ms	remaining: 44.4s
9:	learn: 0.5252715	total: 179ms	remaining: 44.5s
10:	learn: 0.5122451	total: 197ms	remaining: 44.5s
11:	learn: 0.5003891	total: 213ms	remaining: 44.1s
12:	learn: 0.4892061	total: 235ms	remaining: 45s
13:	learn: 0.4789218	total: 258ms	remaining: 45.8s
14:	learn: 0.4701065	total: 276ms	remaining: 45.7s
15:	learn: 0.4606638	total: 294ms	remaining: 45.7s
16:	learn: 0.4513883	total: 311ms	re

In [232]:
catboost.fit(
    S_train, y_train,
    verbose=200,
    plot=False
)

# Predict
y_pred = catboost.predict(S_test)
print(classification_report(y_test, y_pred, digits=5))
# Final prediction score
# print('Final prediction score: [%.8f]' % accuracy_score(y_test, y_pred))

0:	learn: 0.6688123	total: 15.3ms	remaining: 38.3s
200:	learn: 0.2281311	total: 1.35s	remaining: 15.5s
400:	learn: 0.2279070	total: 2.59s	remaining: 13.5s
600:	learn: 0.2279001	total: 3.72s	remaining: 11.7s
800:	learn: 0.2279001	total: 4.87s	remaining: 10.3s
1000:	learn: 0.2279001	total: 6.06s	remaining: 9.08s
1200:	learn: 0.2279001	total: 7.3s	remaining: 7.89s
1400:	learn: 0.2279001	total: 8.54s	remaining: 6.7s
1600:	learn: 0.2279001	total: 9.79s	remaining: 5.5s
1800:	learn: 0.2279001	total: 11.2s	remaining: 4.34s
2000:	learn: 0.2279001	total: 12.5s	remaining: 3.13s
2200:	learn: 0.2279001	total: 13.7s	remaining: 1.86s
2400:	learn: 0.2279001	total: 15s	remaining: 617ms
2499:	learn: 0.2279001	total: 15.6s	remaining: 0us
              precision    recall  f1-score   support

           0    0.92222   0.97940   0.94995      4903
           1    0.95200   0.83181   0.88785      2408

    accuracy                        0.93079      7311
   macro avg    0.93711   0.90561   0.91890      7311