In [222]:
import joblib
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from pickle import load
from vecstack import stacking
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [223]:
# Load the four pre-split model-input frames in one pass; the tuple order
# below must match the order of the names being unpacked.
# NOTE(review): unpickling can execute arbitrary code — only read files
# produced by a trusted pipeline.
X_train, X_test, y_train, y_test = map(
    pd.read_pickle,
    (
        "data/05_model_input/X_train.pkl",
        "data/05_model_input/X_test.pkl",
        "data/05_model_input/y_train.pkl",
        "data/05_model_input/y_test.pkl",
    ),
)

In [224]:
# Hyperparameters for the CatBoost classifier instantiated below.
params = dict(
    iterations=2500,
    learning_rate=0.02,
    loss_function='Logloss',
    random_seed=1,          # fixed seed for repeatable training
    # Overfitting-detector settings.
    # NOTE(review): presumably these only take effect when an evaluation set
    # is passed to fit(); the fit call in this notebook passes none — confirm.
    od_wait=30,
    od_type="Iter",
    thread_count=8,
)
catboost = CatBoostClassifier(**params)

In [186]:
# Settings common to all four tree ensembles: fixed seed, n_jobs=-1,
# 100 estimators, maximum depth 20.
_tree_kwargs = dict(random_state=0, n_jobs=-1, n_estimators=100, max_depth=20)
# The two gradient-boosting models additionally take a learning rate.
_boosting_kwargs = dict(_tree_kwargs, learning_rate=0.1)

xgboost = XGBClassifier(**_boosting_kwargs)
lightgbm = LGBMClassifier(**_boosting_kwargs)
extra_trees = ExtraTreesClassifier(**_tree_kwargs)
random_forest = RandomForestClassifier(**_tree_kwargs)

In [198]:
# First-level stacking: build out-of-fold predictions for the training set
# and fold-aggregated predictions for the test set from every base model.
# NOTE(review): the output recorded under this cell reports n_models: [4]
# and lists no CatBoostClassifier, so it predates `catboost` being added to
# this list — re-run from a fresh kernel to refresh it.
_base_models = [catboost, lightgbm, xgboost, random_forest, extra_trees]

S_train, S_test = stacking(
    _base_models,
    X_train, y_train, X_test,   # training data, training target, test data
    regression=False,           # classification task (True would mean regression)
    mode='oof_pred_bag',        # oof for the train set; predict the test set in each fold and vote
    needs_proba=False,          # emit class labels rather than probabilities
    save_dir=None,              # do not write the result or the log to disk
    metric=accuracy_score,      # callable used to score each fold
    n_folds=4,                  # number of folds
    stratified=True,            # stratified split for folds
    shuffle=True,               # shuffle the data before splitting
    random_state=0,             # ensure reproducibility of the shuffle
    verbose=2,                  # print all info
)

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [4]

model  0:     [LGBMClassifier]
    fold  0:  [0.92805362]
    fold  1:  [0.92805362]
    fold  2:  [0.92012037]
    fold  3:  [0.92641226]
    ----
    MEAN:     [0.92565996] + [0.00326773]
    FULL:     [0.92565996]

model  1:     [XGBClassifier]
    fold  0:  [0.93174668]
    fold  1:  [0.92627548]
    fold  2:  [0.92531801]
    fold  3:  [0.92736972]
    ----
    MEAN:     [0.92767747] + [0.00245895]
    FULL:     [0.92767747]

model  2:     [RandomForestClassifier]
    fold  0:  [0.92559157]
    fold  1:  [0.92695938]
    fold  2:  [0.92189851]
    fold  3:  [0.92490767]
    ----
    MEAN:     [0.92483928] + [0.00185159]
    FULL:     [0.92483928]

model  3:     [ExtraTreesClassifier]
    fold  0:  [0.91615374]
    fold  1:  [0.91464916]
    fold  2:  [0.91259746]
    fold  3:  [0.91328136]
    ----
    MEAN:     [0.91417043] + [0.00136266]
    FULL:     [

In [228]:
# Second-level model: train on the stacked features produced by `stacking`.
# NOTE(review): this is the same `catboost` object that is listed among the
# base models in the stacking cell; this call refits that instance on S_train.
catboost.fit(S_train, y_train, verbose=200, plot=False)  # log every 200 iterations

# Predict on the stacked test features and print per-class
# precision / recall / F1 against the test labels.
y_pred = catboost.predict(S_test)
_report = classification_report(y_test, y_pred, digits=5)
print(_report)

0:	learn: 0.6703931	total: 25.7ms	remaining: 1m 4s
200:	learn: 0.2337598	total: 2.37s	remaining: 27.2s
400:	learn: 0.2335558	total: 4.39s	remaining: 23s
600:	learn: 0.2335526	total: 6.26s	remaining: 19.8s
800:	learn: 0.2335526	total: 8.22s	remaining: 17.4s
1000:	learn: 0.2335526	total: 10.2s	remaining: 15.3s
1200:	learn: 0.2335526	total: 12.4s	remaining: 13.4s
1400:	learn: 0.2335526	total: 14.3s	remaining: 11.3s
1600:	learn: 0.2335526	total: 16.4s	remaining: 9.19s
1800:	learn: 0.2335526	total: 18.4s	remaining: 7.13s
2000:	learn: 0.2335526	total: 20.4s	remaining: 5.09s
2200:	learn: 0.2335526	total: 22.4s	remaining: 3.04s
2400:	learn: 0.2335526	total: 24.4s	remaining: 1s
2499:	learn: 0.2335526	total: 25.4s	remaining: 0us
              precision    recall  f1-score   support

           0    0.91982   0.97797   0.94800      4903
           1    0.94852   0.82641   0.88327      2408

    accuracy                        0.92805      7311
   macro avg    0.93417   0.90219   0.91563      7311