In [None]:
import joblib
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from vecstack import stacking
from xgboost import XGBClassifier

In [22]:
# Directory holding the pickled train/test splits; kept in one place so the
# four loads below cannot drift apart.
MODEL_INPUT_DIR = "data/05_model_input"

# NOTE(review): unpickling can execute arbitrary code — only load files from a
# trusted source.
X_train = pd.read_pickle(f"{MODEL_INPUT_DIR}/X_train.pkl")
X_test = pd.read_pickle(f"{MODEL_INPUT_DIR}/X_test.pkl")
y_train = pd.read_pickle(f"{MODEL_INPUT_DIR}/y_train.pkl")
y_test = pd.read_pickle(f"{MODEL_INPUT_DIR}/y_test.pkl")

In [53]:
# Hyperparameters shared by the base learners defined in later cells:
# `depth` caps tree depth for every model, `estimators` is the tree count for
# the XGBoost / LightGBM / ExtraTrees / RandomForest models.
depth, estimators = 10, 100

In [54]:
# CatBoost base learner configuration.
# The original dict literal listed "random_seed" twice; Python silently keeps
# the last value for a repeated key, so the redundant entry is removed here.
params = {
    "iterations": 2500,
    "learning_rate": 0.02,
    "loss_function": 'Logloss',
    "random_seed": 1,
    # Overfitting-detector settings.
    # NOTE(review): the detector needs a validation set to act on; none is
    # supplied in this cell — confirm whether these have any effect when the
    # model is fitted through vecstack.
    "od_wait": 30,
    "od_type": "Iter",
    "thread_count": 8,
    "verbose": False,
    "depth": depth,  # same depth cap as the other base learners
}
catboost = CatBoostClassifier(**params)

In [55]:
# Keyword arguments common to all four estimators below.
_tree_kwargs = {
    "random_state": 0,
    "n_jobs": -1,
    "n_estimators": estimators,
    "max_depth": depth,
}
# The two gradient-boosting models additionally pin the learning rate.
_boosting_kwargs = {**_tree_kwargs, "learning_rate": 0.1}

xgboost = XGBClassifier(**_boosting_kwargs)
lightgbm = LGBMClassifier(**_boosting_kwargs)
extra_trees = ExtraTreesClassifier(**_tree_kwargs)
random_forest = RandomForestClassifier(**_tree_kwargs)

In [56]:
# Level-0 stacking: produce out-of-fold predictions of the five base learners
# for the training set (S_train) and bagged predictions for the test set
# (S_test). These become the features of the meta-learner.
S_train, S_test = stacking(
    [catboost, lightgbm, xgboost, random_forest, extra_trees],
    X_train,
    y_train,
    X_test,
    regression=False,     # classification task
    mode='oof_pred_bag',  # OOF for train; predict test in every fold, then vote
    needs_proba=False,    # use predicted class labels, not probabilities
    save_dir=None,        # do not write results or logs to disk
    metric=f1_score,      # per-fold score reported in the log
    n_folds=4,            # number of cross-validation folds
    stratified=True,      # stratified fold split
    shuffle=True,         # shuffle rows before splitting
    random_state=0,       # fixed seed to ensure reproducibility
    verbose=2,            # print per-fold scores for every model
)

task:         [classification]
n_classes:    [2]
metric:       [f1_score]
mode:         [oof_pred_bag]
n_models:     [5]

model  0:     [CatBoostClassifier]
    fold  0:  [0.88991430]
    fold  1:  [0.88704754]
    fold  2:  [0.87772926]
    fold  3:  [0.88830255]
    ----
    MEAN:     [0.88574841] + [0.00474006]
    FULL:     [0.88573775]

model  1:     [LGBMClassifier]
    fold  0:  [0.88645331]
    fold  1:  [0.88412389]
    fold  2:  [0.87500000]
    fold  3:  [0.88093146]
    ----
    MEAN:     [0.88162716] + [0.00429909]
    FULL:     [0.88161841]

model  2:     [XGBClassifier]
    fold  0:  [0.89035088]
    fold  1:  [0.88128948]
    fold  2:  [0.88208469]
    fold  3:  [0.88699080]
    ----
    MEAN:     [0.88517896] + [0.00369912]
    FULL:     [0.88516537]

model  3:     [RandomForestClassifier]
    fold  0:  [0.87390029]
    fold  1:  [0.86985840]
    fold  2:  [0.86477987]
    fold  3:  [0.86459515]
    ----
    MEAN:     [0.86828343] + [0.00387001]
    FULL:     [0.868286

In [57]:
# Level-1 (meta) model: logistic regression trained on the stacked predictions
# of the base learners, then evaluated on the held-out test labels.
logreg = LogisticRegression(random_state=0)
logreg.fit(S_train, y_train)
y_pred = logreg.predict(S_test)

print(classification_report(y_test, y_pred, digits=5))

              precision    recall  f1-score   support

           0    0.91966   0.98062   0.94917      4903
           1    0.95439   0.82558   0.88533      2408

    accuracy                        0.92956      7311
   macro avg    0.93703   0.90310   0.91725      7311
weighted avg    0.93110   0.92956   0.92814      7311

