# Libraries

In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier, early_stopping, Dataset
import lightgbm as lgb
from xgboost import XGBClassifier

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# Вспомогательные блоки организации для пайплайна
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn import set_config
import warnings
warnings.filterwarnings('ignore')
set_config(transform_output="pandas")

import optuna
from optuna.integration import CatBoostPruningCallback
from optuna.samplers import RandomSampler



# Data

In [6]:
train = pd.read_csv('../data/train.csv').drop(columns=['id'])
train

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,15667085,Meng,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0
165030,15665521,Okechukwu,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,15664752,Hsia,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


## Features

In [7]:
train_f = train.drop_duplicates(subset=['CustomerId', 'Surname']).drop(columns=['CustomerId', 'Surname'])
train_f

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0
1,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0
2,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0
...,...,...,...,...,...,...,...,...,...,...,...
165028,630,France,Male,50.0,8,0.00,2,1.0,1.0,5962.50,0
165030,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0
165031,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0
165032,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0


In [8]:
cat_features = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
num_features = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary', 'Tenure', 'NumOfProducts']

In [9]:
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("onehot", OneHotEncoder(handle_unknown="ignore", drop='first', sparse=False))])
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer()),
    # ("scaler", StandardScaler())
])
preprocessor = ColumnTransformer(transformers=[
    ("numerical", numerical_transformer, num_features),
    ("categorical", categorical_transformer, cat_features)])

preprocessor

In [10]:
X = preprocessor.fit_transform(train_f).rename(columns={column: column.split('__')[1] for column in preprocessor.fit_transform(train_f).columns})
y = train_f['Exited']

In [11]:
X

Unnamed: 0,CreditScore,Age,Balance,EstimatedSalary,Tenure,NumOfProducts,Geography_Germany,Geography_Spain,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0
0,668.0,33.0,0.00,181449.97,3.0,2.0,0.0,0.0,1.0,1.0,0.0
1,627.0,33.0,0.00,49503.50,1.0,2.0,0.0,0.0,1.0,1.0,1.0
2,678.0,40.0,0.00,184866.69,10.0,2.0,0.0,0.0,1.0,1.0,0.0
3,581.0,34.0,148882.54,84560.88,2.0,1.0,0.0,0.0,1.0,1.0,1.0
4,716.0,33.0,0.00,15068.83,5.0,2.0,0.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
165028,630.0,50.0,0.00,5962.50,8.0,2.0,0.0,0.0,1.0,1.0,1.0
165030,792.0,35.0,0.00,131834.45,3.0,1.0,0.0,0.0,1.0,0.0,0.0
165031,565.0,31.0,0.00,127429.56,5.0,1.0,0.0,0.0,1.0,1.0,1.0
165032,554.0,30.0,161533.00,71173.03,7.0,1.0,0.0,1.0,0.0,0.0,1.0


# ML

## CatBoost

In [14]:
def fit_catboost(trial, train, val):
    X_train, y_train = train
    X_val, y_val = val

    param = {
        "iterations": 2000, # Можно не перебирать, есть Early-Stopping
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 50),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.8),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100), 
        "auto_class_weights": trial.suggest_categorical("auto_class_weights", ["SqrtBalanced", "Balanced"]),
        "depth": trial.suggest_int("depth", 3, 9),
        
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "eval_metric": "AUC", # Тоже стоит заранее определиться

        "objective": trial.suggest_categorical("objective", ["Logloss"]),
    }

    
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 20)
        
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
        

    clf = CatBoostClassifier(
        **param,
        thread_count=-1,
        random_seed=2024,
        # cat_features=cat_features,
    )

    pruning_callback = CatBoostPruningCallback(trial, "AUC")

    clf.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        verbose=0,
        plot=False,
        early_stopping_rounds=50,
        callbacks=[pruning_callback],
    )

    # evoke pruning manually.
    pruning_callback.check_pruned()

    y_pred = clf.predict_proba(X_val)[:, 1]
    return clf, y_pred

In [15]:
def objective(trial, return_models=False):
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # X_train = train.drop(columns=['outcome'])
    # y_train = train["outcome"]

    scores, models = [], []
    
    for train_idx, valid_idx in kf.split(X, y.astype(str)):
        train_data = X.iloc[train_idx, :], y.iloc[train_idx]
        valid_data = X.iloc[valid_idx, :], y.iloc[valid_idx]

        # Подаем trials для перебора
        model, y_pred = fit_catboost(trial, train_data, valid_data) # Определили выше
        scores.append(roc_auc_score(valid_data[1], y_pred))
        models.append(model)

    result = np.mean(scores) - np.std(scores)

    if trial.should_prune():
            raise optuna.TrialPruned()
    
    if return_models:
        return result, models
    return result

In [16]:
study = optuna.create_study(
    study_name="CatBoost",
    direction="maximize",
    pruner=optuna.pruners.HyperbandPruner(
        min_resource=10, max_resource=5000, reduction_factor=10
    ),
    sampler=RandomSampler(2024),
)
study.optimize(objective,
    n_trials=1000,
    n_jobs = -1,
    show_progress_bar=True,
)

[I 2024-01-26 13:41:30,204] A new study created in memory with name: CatBoost


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-01-26 13:41:41,892] Trial 29 pruned. Trial was pruned at iteration 10.
[I 2024-01-26 13:41:43,306] Trial 22 pruned. Trial was pruned at iteration 10.
[I 2024-01-26 13:41:43,437] Trial 1 pruned. Trial was pruned at iteration 10.
[I 2024-01-26 13:41:43,674] Trial 28 pruned. Trial was pruned at iteration 10.
[I 2024-01-26 13:41:43,957] Trial 19 pruned. Trial was pruned at iteration 10.
[I 2024-01-26 13:41:43,960] Trial 4 pruned. Trial was pruned at iteration 10.
[I 2024-01-26 13:41:44,410] Trial 0 pruned. Trial was pruned at iteration 10.
[I 2024-01-26 13:41:44,664] Trial 11 pruned. Trial was pruned at iteration 10.
[I 2024-01-26 13:41:44,672] Trial 3 pruned. Trial was pruned at iteration 10.
[I 2024-01-26 13:41:44,678] Trial 23 pruned. Trial was pruned at iteration 10.
[I 2024-01-26 13:41:44,732] Trial 26 pruned. Trial was pruned at iteration 10.
[I 2024-01-26 13:41:45,143] Trial 10 pruned. Trial was pruned at iteration 10.
[I 2024-01-26 13:41:45,169] Trial 17 pruned. Trial was p

In [None]:
{'learning_rate': 0.28032105310060784, 'l2_leaf_reg': 43, 'colsample_bylevel': 0.19416882891868145, 'min_data_in_leaf': 6, 'auto_class_weights': 'SqrtBalanced', 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'objective': 'Logloss'} # 0.9000956693941863.

In [17]:
study.best_value

0.9014349425685045

In [18]:
study.best_params

{'learning_rate': 0.1696310500969717,
 'l2_leaf_reg': 48,
 'colsample_bylevel': 0.5868476873585279,
 'min_data_in_leaf': 25,
 'auto_class_weights': 'SqrtBalanced',
 'depth': 5,
 'boosting_type': 'Ordered',
 'bootstrap_type': 'Bernoulli',
 'objective': 'Logloss',
 'subsample': 0.7402107645449161}

## LightGBM

In [19]:
def fit_lgbm(trial, train, val):
    X_train, y_train = train
    X_val, y_val = val


    param = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 90),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }


    # Add a callback for pruning.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")
    clf = LGBMClassifier(**param, n_jobs=-1, n_estimators=1000, verbose=0, callbacks=[
        pruning_callback,
        early_stopping(100, verbose=0),
    ])
    clf.fit(X_train, y_train, eval_set=[(X_val, y_val)])

    preds = clf.predict_proba(X_val)[:, 1]
    return clf, preds

In [20]:
def objective(trial, return_models=False):
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # X_train = train.drop(columns=['outcome'])
    # y_train = train["outcome"]

    scores, models = [], []
    
    for train_idx, valid_idx in kf.split(X, y.astype(str)):
        train_data = X.iloc[train_idx, :], y.iloc[train_idx]
        valid_data = X.iloc[valid_idx, :], y.iloc[valid_idx]

        # Подаем trials для перебора
        model, y_pred = fit_lgbm(trial, train_data, valid_data) # Определили выше
        scores.append(roc_auc_score(valid_data[1], y_pred))
        models.append(model)

    result = np.mean(scores) - np.std(scores)

    if trial.should_prune():
            raise optuna.TrialPruned()
    
    if return_models:
        return result, models
    return result

In [None]:
study = optuna.create_study(
    study_name="LGBM",
    direction="maximize",
    pruner=optuna.pruners.HyperbandPruner(
        min_resource=10, max_resource=5000, reduction_factor=10
    ),
    sampler=RandomSampler(2024),
)
study.optimize(objective,
    n_trials=1000,
    n_jobs = -1,
    show_progress_bar=True,
)

In [22]:
study.best_value

0.9011865697241652

In [23]:
study.best_params

{'lambda_l1': 2.3772843324101983e-05,
 'lambda_l2': 3.669720725756965e-07,
 'num_leaves': 4,
 'feature_fraction': 0.7574838929544363,
 'bagging_fraction': 0.7330639363496431,
 'bagging_freq': 7,
 'min_child_samples': 51}

## XGB

In [21]:
def fit_xgboost(trial, train, val):
    X_train, y_train = train
    X_val, y_val = val

    # dtrain = xgb.DMatrix(X_train, label=y_train)
    # dvalid = xgb.DMatrix(X_val, label=y_val)

    param = {
        "verbosity": 0,
        "verbose_eval": False,
        "eval_metric": "auc",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "n_estimators": 500,
        "early_stopping_rounds": 50,
    }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
    
    param["objective"] = "binary:hinge"
    

    # Add a callback for pruning.
    clf = XGBClassifier(**param)
    clf.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    # pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
    # clf = xgb.train(param, dtrain, evals=[(dvalid, "validation")], callbacks=[pruning_callback])

    y_pred = clf.predict_proba(X_val)[:, 1]
    return clf, y_pred

In [22]:
def objective(trial, return_models=False):
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # X_train = train.drop(columns=['outcome'])
    # y_train = train["outcome"]

    scores, models = [], []
    
    for train_idx, valid_idx in kf.split(X, y.astype(str)):
        train_data = X.iloc[train_idx, :], y.iloc[train_idx]
        valid_data = X.iloc[valid_idx, :], y.iloc[valid_idx]

        # Подаем trials для перебора
        model, y_pred = fit_xgboost(trial, train_data, valid_data) # Определили выше
        scores.append(roc_auc_score(valid_data[1], y_pred))
        models.append(model)

    result = np.mean(scores) - np.std(scores)

    if trial.should_prune():
        raise optuna.TrialPruned()
    
    if return_models:
        return result, models
    return result

In [None]:
study = optuna.create_study(
    study_name="XGB",
    direction="maximize",
    pruner=optuna.pruners.HyperbandPruner(
        min_resource=10, max_resource=5000, reduction_factor=10
    ),
    sampler=RandomSampler(2024),
)
study.optimize(objective,
    n_trials=1000,
    n_jobs = -1,
    show_progress_bar=True,
)

[I 2024-01-26 16:00:16,718] A new study created in memory with name: XGB


  0%|          | 0/1000 [00:00<?, ?it/s]

[0]	validation_0-auc:0.50000
[1]	validation_0-auc:0.50000
[0]	validation_0-auc:0.50000
[2]	validation_0-auc:0.62365
[1]	validation_0-auc:0.50000[3]	validation_0-auc:0.50000

[2]	validation_0-auc:0.65297
[4]	validation_0-auc:0.50000
[3]	validation_0-auc:0.50000
[0]	validation_0-auc:0.50000
[5]	validation_0-auc:0.50000
[4]	validation_0-auc:0.50000
[6]	validation_0-auc:0.50000
[5]	validation_0-auc:0.50000
[1]	validation_0-auc:0.50000
[0]	validation_0-auc:0.50000
[7]	validation_0-auc:0.50000
[0]	validation_0-auc:0.49991
[1]	validation_0-auc:0.50000
[0]	validation_0-auc:0.64452
[0]	validation_0-auc:0.63718
[2]	validation_0-auc:0.66672[1]	validation_0-auc:0.50000

[6]	validation_0-auc:0.57161
[1]	validation_0-auc:0.50000
[1]	validation_0-auc:0.56210
[8]	validation_0-auc:0.50000
[3]	validation_0-auc:0.50233
[2]	validation_0-auc:0.49912
[7]	validation_0-auc:0.63737
[2]	validation_0-auc:0.61471
[2]	validation_0-auc:0.50000
[3]	validation_0-auc:0.62419
[8]	validation_0-auc:0.50000
[9]	validation

[11]	validation_0-auc:0.50000
[34]	validation_0-auc:0.64504
[35]	validation_0-auc:0.54118
[28]	validation_0-auc:0.50000
[65]	validation_0-auc:0.57356
[36]	validation_0-auc:0.54196
[29]	validation_0-auc:0.50000
[37]	validation_0-auc:0.54205
[47]	validation_0-auc:0.50000
[38]	validation_0-auc:0.53802
[66]	validation_0-auc:0.59612
[39]	validation_0-auc:0.54053
[67]	validation_0-auc:0.58022
[45]	validation_0-auc:0.50000
[40]	validation_0-auc:0.53415
[68]	validation_0-auc:0.59698
[41]	validation_0-auc:0.54496
[45]	validation_0-auc:0.50000
[12]	validation_0-auc:0.50000
[10]	validation_0-auc:0.50000
[69]	validation_0-auc:0.59077
[42]	validation_0-auc:0.53198
[70]	validation_0-auc:0.59549
[48]	validation_0-auc:0.50000
[43]	validation_0-auc:0.54956
[50]	validation_0-auc:0.50000
[47]	validation_0-auc:0.50000
[71]	validation_0-auc:0.59460
[5]	validation_0-auc:0.50000
[44]	validation_0-auc:0.53059
[72]	validation_0-auc:0.59778
[33]	validation_0-auc:0.50000
[40]	validation_0-auc:0.50000
[11]	valida