In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import optuna

seeds = 1234
def set_seeds(seed=seeds):
    """Set seeds for reproducibility."""
    np.random.seed(seed)
    random.seed(seed)
# Set seeds
set_seeds()

# Import train set
df = pd.read_pickle(r"train_enc.pkl")

# Separate labels and target
X, y =  df.drop(columns="left", axis = 1), df["left"]


In [2]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RepeatedStratifiedKFold

from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN
from lightgbm import LGBMClassifier

from sklearn.metrics import fbeta_score, make_scorer, precision_score, recall_score

def f2_measure(y_true, y_pred):
    return fbeta_score(y_true, y_pred, beta=2)

f2_score = make_scorer(f2_measure)    

# Get variable names
bi_vars = [col for col in X.columns if X[col].nunique() == 2]
num_vars = [col for col in X.columns if (X[col].dtype in ["int8", "float32"]) & (col not in bi_vars)]
cat_vars = [col for col in X.columns if (col not in num_vars) & (col not in bi_vars)]

# Preprocessing recipe
preprocess = ColumnTransformer(
    transformers=[
        ("scaler", MinMaxScaler(), num_vars),
        ("ohe", OneHotEncoder(), cat_vars)
    ], remainder = "passthrough"
)

# Resampling
smoteen = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='majority'), random_state=seeds)

clf = LGBMClassifier(random_state=seeds)
pipe_final = make_pipeline(preprocess, smoteen, clf)

## Tune 1

In [103]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,f1_score
import optuna


def objective(trial,data=X,target=y):
    
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=seeds)
    params = {
                "verbosity": -1,
                "boosting_type": "gbdt",
                "num_leaves": trial.suggest_int("num_leaves", 2, 256),
                "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 0.9),
                "colsample_bynode": trial.suggest_float("colsample_bynode", 0.2, 0.9),
    }
    clf = LGBMClassifier(random_state=seeds,scale_pos_weight=2, **params)
    model = make_pipeline(preprocess, clf)
    model.fit(train_x, train_y)
    predictions = model.predict_proba(test_x)[:,1]
    f1 = f1_score(test_y, predictions, average='macro')
    return f1

In [104]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2022-02-17 16:51:21,740][0m A new study created in memory with name: no-name-a8bfa4e4-ed9c-4617-a9aa-1e33cb23c833[0m
[33m[W 2022-02-17 16:51:21,748][0m Trial 0 failed because of the following error: ValueError('Pipeline.fit does not accept the eval_set parameter. You can pass parameters to specific steps of your pipeline using the stepname__parameter format, e.g. `Pipeline.fit(X, y, logisticregression__sample_weight=sample_weight)`.')[0m
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\envs\env_full\lib\site-packages\optuna\study\_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Eric\AppData\Local\Temp/ipykernel_4756/3310503327.py", line 22, in objective
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)],
  File "C:\ProgramData\Anaconda3\envs\env_full\lib\site-packages\imblearn\pipeline.py", line 267, in fit
    fit_params_steps = self._check_fit_params(**fit_params)
  File "C:\ProgramData\Anaconda3\envs\e

ValueError: Pipeline.fit does not accept the eval_set parameter. You can pass parameters to specific steps of your pipeline using the stepname__parameter format, e.g. `Pipeline.fit(X, y, logisticregression__sample_weight=sample_weight)`.

## Tune

In [88]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import optuna


def objective(trial,data=X,target=y):
    
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=seeds)
    params = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    clf = LGBMClassifier(random_state=seeds,scale_pos_weight=2, **params)
    model = make_pipeline(preprocess, clf)
    model.fit(train_x, train_y)
    predictions = model.predict_proba(test_x)[:,1]
    auc = roc_auc_score(test_y, predictions)
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2022-02-17 16:33:16,291][0m A new study created in memory with name: no-name-8b3d7cf2-c870-4fd0-82fa-9a41e08807ab[0m




[32m[I 2022-02-17 16:33:16,722][0m Trial 0 finished with value: 0.924957720277355 and parameters: {'lambda_l1': 0.06514575898316952, 'lambda_l2': 1.9751838457968587e-07, 'num_leaves': 131, 'feature_fraction': 0.8267236727110189, 'bagging_fraction': 0.8349946090451172, 'bagging_freq': 3, 'min_child_samples': 98}. Best is trial 0 with value: 0.924957720277355.[0m




[32m[I 2022-02-17 16:33:17,063][0m Trial 1 finished with value: 0.9248414510400812 and parameters: {'lambda_l1': 3.810155143368929, 'lambda_l2': 0.0001065975764698808, 'num_leaves': 174, 'feature_fraction': 0.43021982955569393, 'bagging_fraction': 0.6903749522871129, 'bagging_freq': 1, 'min_child_samples': 53}. Best is trial 0 with value: 0.924957720277355.[0m




[32m[I 2022-02-17 16:33:17,412][0m Trial 2 finished with value: 0.9246744461356334 and parameters: {'lambda_l1': 2.875084347888479e-06, 'lambda_l2': 0.0008296990793777786, 'num_leaves': 24, 'feature_fraction': 0.9226360357239631, 'bagging_fraction': 0.8410514148934715, 'bagging_freq': 4, 'min_child_samples': 10}. Best is trial 0 with value: 0.924957720277355.[0m




[32m[I 2022-02-17 16:33:18,081][0m Trial 3 finished with value: 0.9220192795535261 and parameters: {'lambda_l1': 0.8666771901824198, 'lambda_l2': 0.011669457674086586, 'num_leaves': 165, 'feature_fraction': 0.7179184585630725, 'bagging_fraction': 0.6064307599490495, 'bagging_freq': 4, 'min_child_samples': 41}. Best is trial 0 with value: 0.924957720277355.[0m




[32m[I 2022-02-17 16:33:18,745][0m Trial 4 finished with value: 0.9207403179435142 and parameters: {'lambda_l1': 1.3865362811719462e-07, 'lambda_l2': 1.9412669432127464e-08, 'num_leaves': 71, 'feature_fraction': 0.765048024266324, 'bagging_fraction': 0.5577479296576575, 'bagging_freq': 2, 'min_child_samples': 43}. Best is trial 0 with value: 0.924957720277355.[0m




[32m[I 2022-02-17 16:33:19,478][0m Trial 5 finished with value: 0.9182267884322678 and parameters: {'lambda_l1': 0.010757547451431993, 'lambda_l2': 1.295063670929102e-07, 'num_leaves': 84, 'feature_fraction': 0.6223465537149334, 'bagging_fraction': 0.5568052479114087, 'bagging_freq': 2, 'min_child_samples': 22}. Best is trial 0 with value: 0.924957720277355.[0m




[32m[I 2022-02-17 16:33:19,915][0m Trial 6 finished with value: 0.9243869440216472 and parameters: {'lambda_l1': 1.716384430704023e-05, 'lambda_l2': 0.0026153274935564, 'num_leaves': 143, 'feature_fraction': 0.7100939014021541, 'bagging_fraction': 0.8023386983748484, 'bagging_freq': 1, 'min_child_samples': 87}. Best is trial 0 with value: 0.924957720277355.[0m
[32m[I 2022-02-17 16:33:20,122][0m Trial 7 finished with value: 0.9227137239979706 and parameters: {'lambda_l1': 5.259143279359682e-07, 'lambda_l2': 0.0022500765414208155, 'num_leaves': 7, 'feature_fraction': 0.8250287885123336, 'bagging_fraction': 0.6042804576686013, 'bagging_freq': 6, 'min_child_samples': 99}. Best is trial 0 with value: 0.924957720277355.[0m




[32m[I 2022-02-17 16:33:20,697][0m Trial 8 finished with value: 0.9173833079654998 and parameters: {'lambda_l1': 5.110046933076805e-05, 'lambda_l2': 7.833349914288318e-05, 'num_leaves': 60, 'feature_fraction': 0.6593260237562765, 'bagging_fraction': 0.45011086592236194, 'bagging_freq': 1, 'min_child_samples': 12}. Best is trial 0 with value: 0.924957720277355.[0m




[32m[I 2022-02-17 16:33:21,022][0m Trial 9 finished with value: 0.9234377642482666 and parameters: {'lambda_l1': 1.764133692866698, 'lambda_l2': 1.0959429616886919e-08, 'num_leaves': 128, 'feature_fraction': 0.4912218958447454, 'bagging_fraction': 0.5314913347348074, 'bagging_freq': 6, 'min_child_samples': 64}. Best is trial 0 with value: 0.924957720277355.[0m




[32m[I 2022-02-17 16:33:21,599][0m Trial 10 finished with value: 0.9230572467444614 and parameters: {'lambda_l1': 0.0043173258111230016, 'lambda_l2': 2.6525514029301274, 'num_leaves': 252, 'feature_fraction': 0.9626452311062837, 'bagging_fraction': 0.9949948980612656, 'bagging_freq': 3, 'min_child_samples': 76}. Best is trial 0 with value: 0.924957720277355.[0m
[32m[I 2022-02-17 16:33:21,883][0m Trial 11 finished with value: 0.9218522746490783 and parameters: {'lambda_l1': 9.380393489691542, 'lambda_l2': 3.851189668210794e-06, 'num_leaves': 205, 'feature_fraction': 0.41617300708721805, 'bagging_fraction': 0.7531881166906867, 'bagging_freq': 3, 'min_child_samples': 61}. Best is trial 0 with value: 0.924957720277355.[0m




[32m[I 2022-02-17 16:33:22,322][0m Trial 12 finished with value: 0.9222708439032641 and parameters: {'lambda_l1': 0.08326712756683323, 'lambda_l2': 4.908453219063226e-06, 'num_leaves': 186, 'feature_fraction': 0.5432258002994697, 'bagging_fraction': 0.8926329682026746, 'bagging_freq': 2, 'min_child_samples': 100}. Best is trial 0 with value: 0.924957720277355.[0m




[32m[I 2022-02-17 16:33:23,049][0m Trial 13 finished with value: 0.9191992220531033 and parameters: {'lambda_l1': 0.0011176249881078306, 'lambda_l2': 1.842507654171393e-06, 'num_leaves': 116, 'feature_fraction': 0.864747077512856, 'bagging_fraction': 0.6831738615407343, 'bagging_freq': 1, 'min_child_samples': 32}. Best is trial 0 with value: 0.924957720277355.[0m




[32m[I 2022-02-17 16:33:23,478][0m Trial 14 finished with value: 0.9229177236597329 and parameters: {'lambda_l1': 0.06589577154562977, 'lambda_l2': 0.11757483302935162, 'num_leaves': 224, 'feature_fraction': 0.5724350359865022, 'bagging_fraction': 0.7136181837358507, 'bagging_freq': 7, 'min_child_samples': 74}. Best is trial 0 with value: 0.924957720277355.[0m




[32m[I 2022-02-17 16:33:24,143][0m Trial 15 finished with value: 0.9191632842888551 and parameters: {'lambda_l1': 0.1062412795809559, 'lambda_l2': 3.2737876869919545e-05, 'num_leaves': 162, 'feature_fraction': 0.418753941599994, 'bagging_fraction': 0.9172676865899382, 'bagging_freq': 3, 'min_child_samples': 83}. Best is trial 0 with value: 0.924957720277355.[0m




[32m[I 2022-02-17 16:33:24,469][0m Trial 16 finished with value: 0.9263888888888889 and parameters: {'lambda_l1': 7.99318629235835, 'lambda_l2': 2.6802800031291535e-07, 'num_leaves': 94, 'feature_fraction': 0.8121696257341431, 'bagging_fraction': 0.6669633218261335, 'bagging_freq': 5, 'min_child_samples': 54}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:24,998][0m Trial 17 finished with value: 0.9224188229325214 and parameters: {'lambda_l1': 0.39762917038122936, 'lambda_l2': 3.0935994778473736e-07, 'num_leaves': 93, 'feature_fraction': 0.8091371406610275, 'bagging_fraction': 0.7981627166989003, 'bagging_freq': 5, 'min_child_samples': 64}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:25,493][0m Trial 18 finished with value: 0.9229113817013361 and parameters: {'lambda_l1': 0.0003580389485221061, 'lambda_l2': 1.3923343952125512e-07, 'num_leaves': 42, 'feature_fraction': 0.8816148456882613, 'bagging_fraction': 0.9523210645253557, 'bagging_freq': 5, 'min_child_samples': 29}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:25,887][0m Trial 19 finished with value: 0.9239451209200067 and parameters: {'lambda_l1': 1.2760874876654142e-08, 'lambda_l2': 7.244853618057045e-07, 'num_leaves': 105, 'feature_fraction': 0.9991878368914763, 'bagging_fraction': 0.4205469945917383, 'bagging_freq': 5, 'min_child_samples': 87}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:26,594][0m Trial 20 finished with value: 0.9207191780821918 and parameters: {'lambda_l1': 0.01584334060542916, 'lambda_l2': 1.2657283480119349e-05, 'num_leaves': 138, 'feature_fraction': 0.7728876087415602, 'bagging_fraction': 0.8612498646405227, 'bagging_freq': 7, 'min_child_samples': 52}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:27,019][0m Trial 21 finished with value: 0.9259893455098935 and parameters: {'lambda_l1': 5.292604822313879, 'lambda_l2': 0.0001469539310381654, 'num_leaves': 167, 'feature_fraction': 0.8736773058694078, 'bagging_fraction': 0.6732426639152443, 'bagging_freq': 4, 'min_child_samples': 51}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:27,414][0m Trial 22 finished with value: 0.9258540503974294 and parameters: {'lambda_l1': 9.160082077217977, 'lambda_l2': 6.46979337874981e-08, 'num_leaves': 144, 'feature_fraction': 0.886740191855645, 'bagging_fraction': 0.6577348220775909, 'bagging_freq': 4, 'min_child_samples': 46}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:27,798][0m Trial 23 finished with value: 0.9257525790630813 and parameters: {'lambda_l1': 8.3048794431, 'lambda_l2': 2.4571577718075772e-08, 'num_leaves': 195, 'feature_fraction': 0.9132168622144227, 'bagging_fraction': 0.6410751474524452, 'bagging_freq': 4, 'min_child_samples': 46}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:28,559][0m Trial 24 finished with value: 0.9216472179942501 and parameters: {'lambda_l1': 0.565708502818209, 'lambda_l2': 0.06055340154105779, 'num_leaves': 218, 'feature_fraction': 0.8733336056135544, 'bagging_fraction': 0.7436263889548929, 'bagging_freq': 5, 'min_child_samples': 35}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:28,986][0m Trial 25 finished with value: 0.9249978860138678 and parameters: {'lambda_l1': 1.6375955251657004, 'lambda_l2': 5.148854258858307, 'num_leaves': 150, 'feature_fraction': 0.7648079634424911, 'bagging_fraction': 0.6552110458907683, 'bagging_freq': 6, 'min_child_samples': 55}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:29,302][0m Trial 26 finished with value: 0.9262155420260442 and parameters: {'lambda_l1': 9.559069626044598, 'lambda_l2': 9.675248930818588e-07, 'num_leaves': 112, 'feature_fraction': 0.9487536235606865, 'bagging_fraction': 0.49236846478294194, 'bagging_freq': 4, 'min_child_samples': 47}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:29,942][0m Trial 27 finished with value: 0.9196389311686116 and parameters: {'lambda_l1': 0.30284355755015196, 'lambda_l2': 0.0001815679189921115, 'num_leaves': 56, 'feature_fraction': 0.9489041725077405, 'bagging_fraction': 0.49271585477835944, 'bagging_freq': 5, 'min_child_samples': 22}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:30,346][0m Trial 28 finished with value: 0.9226513614070693 and parameters: {'lambda_l1': 0.9635214789344579, 'lambda_l2': 9.155051716601741e-07, 'num_leaves': 108, 'feature_fraction': 0.955799557099781, 'bagging_fraction': 0.49301585001193815, 'bagging_freq': 6, 'min_child_samples': 72}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:30,872][0m Trial 29 finished with value: 0.9222053103331642 and parameters: {'lambda_l1': 0.02278568020771971, 'lambda_l2': 1.3427699590054155e-05, 'num_leaves': 85, 'feature_fraction': 0.8307273091319747, 'bagging_fraction': 0.6094571379717232, 'bagging_freq': 4, 'min_child_samples': 58}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:31,666][0m Trial 30 finished with value: 0.9166603247082701 and parameters: {'lambda_l1': 0.0023960147274640184, 'lambda_l2': 2.288066874053234e-05, 'num_leaves': 126, 'feature_fraction': 0.8099083214163064, 'bagging_fraction': 0.7613127310558935, 'bagging_freq': 3, 'min_child_samples': 37}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:32,071][0m Trial 31 finished with value: 0.9260633350245222 and parameters: {'lambda_l1': 7.797739126685135, 'lambda_l2': 9.684100497040538e-08, 'num_leaves': 155, 'feature_fraction': 0.9107815793606897, 'bagging_fraction': 0.6464316836704277, 'bagging_freq': 4, 'min_child_samples': 49}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:32,525][0m Trial 32 finished with value: 0.9240191104346356 and parameters: {'lambda_l1': 3.1113827269923564, 'lambda_l2': 6.708139618151407e-07, 'num_leaves': 176, 'feature_fraction': 0.9895477555605499, 'bagging_fraction': 0.7121282157961886, 'bagging_freq': 4, 'min_child_samples': 51}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:32,981][0m Trial 33 finished with value: 0.9246131405377981 and parameters: {'lambda_l1': 0.23830613829173147, 'lambda_l2': 5.597519256915071e-08, 'num_leaves': 157, 'feature_fraction': 0.9359902308054923, 'bagging_fraction': 0.561778416595143, 'bagging_freq': 4, 'min_child_samples': 68}. Best is trial 16 with value: 0.9263888888888889.[0m




[32m[I 2022-02-17 16:33:33,417][0m Trial 34 finished with value: 0.9268180280737358 and parameters: {'lambda_l1': 2.791303532323693, 'lambda_l2': 3.1179893153736994e-07, 'num_leaves': 118, 'feature_fraction': 0.9119325814149439, 'bagging_fraction': 0.5989821428812343, 'bagging_freq': 5, 'min_child_samples': 49}. Best is trial 34 with value: 0.9268180280737358.[0m




[32m[I 2022-02-17 16:33:33,955][0m Trial 35 finished with value: 0.9235307796380856 and parameters: {'lambda_l1': 2.583982335257004, 'lambda_l2': 2.8440770108907665e-07, 'num_leaves': 95, 'feature_fraction': 0.9103581494533473, 'bagging_fraction': 0.6204127970725002, 'bagging_freq': 5, 'min_child_samples': 40}. Best is trial 34 with value: 0.9268180280737358.[0m




[32m[I 2022-02-17 16:33:34,517][0m Trial 36 finished with value: 0.924217825131067 and parameters: {'lambda_l1': 1.2450344827059645, 'lambda_l2': 4.516090325101916e-08, 'num_leaves': 120, 'feature_fraction': 0.8431012673613083, 'bagging_fraction': 0.5124630033814175, 'bagging_freq': 5, 'min_child_samples': 45}. Best is trial 34 with value: 0.9268180280737358.[0m




[32m[I 2022-02-17 16:33:35,119][0m Trial 37 finished with value: 0.9227464907830204 and parameters: {'lambda_l1': 0.20863976428923556, 'lambda_l2': 3.4499202530875937e-06, 'num_leaves': 75, 'feature_fraction': 0.7459634374430636, 'bagging_fraction': 0.5956640440868152, 'bagging_freq': 3, 'min_child_samples': 56}. Best is trial 34 with value: 0.9268180280737358.[0m




[32m[I 2022-02-17 16:33:35,783][0m Trial 38 finished with value: 0.9226492474209369 and parameters: {'lambda_l1': 2.8050506050477666, 'lambda_l2': 1.8603533585967535e-07, 'num_leaves': 102, 'feature_fraction': 0.9115798365425524, 'bagging_fraction': 0.5664300725084459, 'bagging_freq': 6, 'min_child_samples': 25}. Best is trial 34 with value: 0.9268180280737358.[0m




[32m[I 2022-02-17 16:33:36,212][0m Trial 39 finished with value: 0.9242389649923897 and parameters: {'lambda_l1': 0.662543318964859, 'lambda_l2': 1.225581433075917e-08, 'num_leaves': 40, 'feature_fraction': 0.9768038449341334, 'bagging_fraction': 0.4668739063260409, 'bagging_freq': 5, 'min_child_samples': 48}. Best is trial 34 with value: 0.9268180280737358.[0m




[32m[I 2022-02-17 16:33:36,733][0m Trial 40 finished with value: 0.92083967529173 and parameters: {'lambda_l1': 3.76788937070107e-05, 'lambda_l2': 1.411757419120422e-06, 'num_leaves': 134, 'feature_fraction': 0.6733302469847937, 'bagging_fraction': 0.4025703656655561, 'bagging_freq': 4, 'min_child_samples': 41}. Best is trial 34 with value: 0.9268180280737358.[0m




[32m[I 2022-02-17 16:33:37,066][0m Trial 41 finished with value: 0.9268581938102486 and parameters: {'lambda_l1': 9.727166980299499, 'lambda_l2': 0.0006138586173851652, 'num_leaves': 164, 'feature_fraction': 0.795344377122718, 'bagging_fraction': 0.6938541349510962, 'bagging_freq': 4, 'min_child_samples': 50}. Best is trial 41 with value: 0.9268581938102486.[0m




[32m[I 2022-02-17 16:33:37,413][0m Trial 42 finished with value: 0.928006088280061 and parameters: {'lambda_l1': 9.693520077339764, 'lambda_l2': 0.0011718408699389177, 'num_leaves': 115, 'feature_fraction': 0.7966576759605296, 'bagging_fraction': 0.6991081699843358, 'bagging_freq': 4, 'min_child_samples': 61}. Best is trial 42 with value: 0.928006088280061.[0m




[32m[I 2022-02-17 16:33:37,845][0m Trial 43 finished with value: 0.9239493488922713 and parameters: {'lambda_l1': 2.558095193388561, 'lambda_l2': 0.0009748413623417207, 'num_leaves': 115, 'feature_fraction': 0.733130311909747, 'bagging_fraction': 0.7184581558407153, 'bagging_freq': 5, 'min_child_samples': 60}. Best is trial 42 with value: 0.928006088280061.[0m




[32m[I 2022-02-17 16:33:38,305][0m Trial 44 finished with value: 0.9235603754439372 and parameters: {'lambda_l1': 1.0594386256361172, 'lambda_l2': 0.005367031803284334, 'num_leaves': 73, 'feature_fraction': 0.796537610495237, 'bagging_fraction': 0.5829031659669597, 'bagging_freq': 4, 'min_child_samples': 66}. Best is trial 42 with value: 0.928006088280061.[0m




[32m[I 2022-02-17 16:33:38,831][0m Trial 45 finished with value: 0.9261309825807542 and parameters: {'lambda_l1': 3.842217798323652, 'lambda_l2': 0.0006550721394729027, 'num_leaves': 85, 'feature_fraction': 0.6812875163915644, 'bagging_fraction': 0.8077721224069628, 'bagging_freq': 3, 'min_child_samples': 55}. Best is trial 42 with value: 0.928006088280061.[0m




[32m[I 2022-02-17 16:33:39,178][0m Trial 46 finished with value: 0.927071706409606 and parameters: {'lambda_l1': 9.979459038290237, 'lambda_l2': 0.04937217831828219, 'num_leaves': 128, 'feature_fraction': 0.7843311752534018, 'bagging_fraction': 0.7787835711145241, 'bagging_freq': 4, 'min_child_samples': 70}. Best is trial 42 with value: 0.928006088280061.[0m




[32m[I 2022-02-17 16:33:39,648][0m Trial 47 finished with value: 0.921613394216134 and parameters: {'lambda_l1': 2.1303386801028504e-06, 'lambda_l2': 0.02716918922248016, 'num_leaves': 127, 'feature_fraction': 0.7830530573469431, 'bagging_fraction': 0.6939240549117988, 'bagging_freq': 2, 'min_child_samples': 80}. Best is trial 42 with value: 0.928006088280061.[0m




[32m[I 2022-02-17 16:33:41,094][0m Trial 48 finished with value: 0.9201357179096905 and parameters: {'lambda_l1': 0.031535301069265324, 'lambda_l2': 0.30243372528938245, 'num_leaves': 179, 'feature_fraction': 0.7068932769586695, 'bagging_fraction': 0.774802932280211, 'bagging_freq': 6, 'min_child_samples': 5}. Best is trial 42 with value: 0.928006088280061.[0m




[32m[I 2022-02-17 16:33:41,575][0m Trial 49 finished with value: 0.9246871300524269 and parameters: {'lambda_l1': 0.12366898578824966, 'lambda_l2': 0.48330904379176376, 'num_leaves': 95, 'feature_fraction': 0.8408951636735417, 'bagging_fraction': 0.7345990203069114, 'bagging_freq': 5, 'min_child_samples': 69}. Best is trial 42 with value: 0.928006088280061.[0m


Number of finished trials: 50
Best trial: {'lambda_l1': 9.693520077339764, 'lambda_l2': 0.0011718408699389177, 'num_leaves': 115, 'feature_fraction': 0.7966576759605296, 'bagging_fraction': 0.6991081699843358, 'bagging_freq': 4, 'min_child_samples': 61}


In [90]:
study.best_trial.params

{'lambda_l1': 9.693520077339764,
 'lambda_l2': 0.0011718408699389177,
 'num_leaves': 115,
 'feature_fraction': 0.7966576759605296,
 'bagging_fraction': 0.6991081699843358,
 'bagging_freq': 4,
 'min_child_samples': 61}

In [108]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import optuna


def objective(trial,data=X,target=y):
    
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.25,random_state=seeds)
    params = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    clf = LGBMClassifier(random_state=seeds,scale_pos_weight=2, **params)
    model = make_pipeline(preprocess,smoteen, clf)
    metric = make_scorer(f2_measure)
    # Evaluate model
    scores = cross_val_score(model,test_x, test_y, scoring=metric, cv=3, n_jobs=-1).mean()
    return scores

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2022-02-17 17:03:34,505][0m A new study created in memory with name: no-name-85c79d04-9b94-40ef-a297-62d851684703[0m
[32m[I 2022-02-17 17:03:34,732][0m Trial 0 finished with value: 0.7843229505356429 and parameters: {'lambda_l1': 1.0718804276943297e-08, 'lambda_l2': 6.313719211217128e-05, 'num_leaves': 13, 'feature_fraction': 0.835105825861003, 'bagging_fraction': 0.6690570960165985, 'bagging_freq': 3, 'min_child_samples': 48}. Best is trial 0 with value: 0.7843229505356429.[0m
[32m[I 2022-02-17 17:03:34,961][0m Trial 1 finished with value: 0.7830201351672801 and parameters: {'lambda_l1': 0.013826865927299446, 'lambda_l2': 0.00021482568347568632, 'num_leaves': 232, 'feature_fraction': 0.41284107871574893, 'bagging_fraction': 0.4152344107212189, 'bagging_freq': 4, 'min_child_samples': 42}. Best is trial 0 with value: 0.7843229505356429.[0m
[32m[I 2022-02-17 17:03:35,436][0m Trial 2 finished with value: 0.7812428822608769 and parameters: {'lambda_l1': 3.3036489348816736

Number of finished trials: 50
Best trial: {'lambda_l1': 9.372189667936706e-07, 'lambda_l2': 5.006422292802079, 'num_leaves': 207, 'feature_fraction': 0.5726199420672202, 'bagging_fraction': 0.9838013199012343, 'bagging_freq': 3, 'min_child_samples': 17}


## Results

In [113]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


test = pd.read_pickle(r"test_enc.pkl")
X_test, y_test =  test.drop(columns="left", axis = 1), test["left"]

params= {'lambda_l1': 9.693520077339764,
 'lambda_l2': 0.0011718408699389177,
 'num_leaves': 115,
 'feature_fraction': 0.7966576759605296,
 'bagging_fraction': 0.6991081699843358,
 'bagging_freq': 4,
 'min_child_samples': 61}

clf2 = LGBMClassifier(random_state=seeds,**params)

pipeline_final = make_pipeline(preprocess, smoteen, clf2)

pipeline_final.fit(X, y)
    # Obtain the predictions from our random forest model 
predicted = pipeline_final.predict(X_test)
    # Predict probabilities
probs = pipeline_final.predict_proba(X_test)
    # Print the ROC curve, classification report and confusion matrix
print(fbeta_score(y_test, predicted, beta=2))
print(classification_report(y_test, predicted))
print(confusion_matrix(y_test, predicted))

0.8283558629361835
              precision    recall  f1-score   support

           0       0.97      0.68      0.80      1351
           1       0.55      0.95      0.70       557

    accuracy                           0.76      1908
   macro avg       0.76      0.82      0.75      1908
weighted avg       0.85      0.76      0.77      1908

[[925 426]
 [ 30 527]]
