In [1]:
import numpy as np
import pandas as pd
import math
import sys, os
#import seaborn as sns
#from scipy import stats
#from pathlib import Path
#import matplotlib.pyplot as plt
#from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
#from sklearn.metrics import mean_squared_error
#from sklearn.metrics import mean_squared_error
#from sklearn.preprocessing import MinMaxScaler
#from sklearn.datasets import fetch_california_housing
#import math
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
import optuna
from optuna.samplers import TPESampler
import lightgbm as lgbm
#from xgboost import XGBRegressor
#import xgboost as xgb
#from catboost import CatBoostRegressor
#from lightgbm.sklearn import LGBMRegressor
#from sklearn.ensemble import RandomForestRegressor
import pickle
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

In [2]:
train_time = 1 * 60 * 60
objective = 'lgbmregressor'

In [3]:
sys.path.append(os.path.realpath('..'))

In [4]:
train_df = pd.read_csv(r"..\data\train.csv", index_col=0)
test_df = pd.read_csv(r"..\data\test.csv", index_col=0)
sample_sub = pd.read_csv(r"..\data\sample_submission.csv", index_col=0)

In [5]:
del train_df['Over18']
del train_df['EmployeeCount']
del train_df['StandardHours']

In [6]:
columns_to_vectorize = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']
for vector_target in columns_to_vectorize:
    print(vector_target)
    vectorizer = CountVectorizer()
    vectorizer.fit_transform(train_df[vector_target])
    train_df[f'{vector_target}_v'] = vectorizer.transform(train_df[vector_target]).toarray().argmax(axis=1)[:,None]
    vectorizer.fit_transform(test_df[vector_target])
    test_df[f'{vector_target}_v'] = vectorizer.transform(test_df[vector_target]).toarray().argmax(axis=1)[:,None]

BusinessTravel
Department
EducationField
Gender
JobRole
MaritalStatus
OverTime


In [7]:
features = ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction','HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
        'BusinessTravel_v', 'Department_v', 'EducationField_v','Gender_v', 'JobRole_v', 'MaritalStatus_v', 'OverTime_v']
target = ['Attrition']

In [8]:
scaler = MinMaxScaler().fit(train_df[features])
X = scaler.transform(train_df[features])
X_test = scaler.transform(test_df[features])

scaler = MinMaxScaler().fit(train_df[target])
Y = scaler.transform(train_df[target])

In [9]:
RANDOM_STATE = 12 
FOLDS = 5
param_grid_history = {}

def render_model(param_grid):
    reg = lgbm.LGBMRegressor(**param_grid, metric='AUC', random_state=1)
    return reg
    
def train_model(reg, X_train, y_train, X_valid, y_valid):
    reg.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], callbacks=[lgbm.early_stopping(100, verbose=True)], verbose=False)

    preds = reg.predict(train_df[features])
    fpr, tpr, _ = roc_curve(train_df[target], preds)
    roc_auc = auc(fpr, tpr)
    return reg, roc_auc

def objective_v2(trial):
    scores = []
    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5),
        'max_depth': trial.suggest_int('max_depth', 3, 1000), 
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 2, 100),
        'num_leaves': trial.suggest_int('num_leaves', 100, 10000),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.1, 1),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.1, 1),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.1, 5),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 10), 
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 1000), 
        'reg_lambda': trial.suggest_int('reg_lambda', 1, 100), 
        'n_estimators': trial.suggest_int('n_estimators', 10, 100000), 
    } 
        
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):
        print(10*"=", f"Fold={fold+1}", 10*"=")
        X_train, X_valid = train_df.iloc[train_idx][features], train_df.iloc[valid_idx][features]
        y_train , y_valid = train_df[target].iloc[train_idx] , train_df[target].iloc[valid_idx] 
        
        reg = render_model(param_grid)
        reg, roc_auc = train_model(reg, X_train, y_train, X_valid, y_valid)

        scores.append(roc_auc)
    mean_scores = np.mean(scores)
    param_grid_history[mean_scores] = param_grid

    return mean_scores


In [10]:
study = optuna.create_study(direction='maximize', sampler=TPESampler(), study_name=objective)
study.optimize(objective_v2, timeout=train_time)

[32m[I 2023-01-17 22:33:53,911][0m A new study created in memory with name: lgbmregressor[0m


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.727407




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	valid_0's auc: 0.792905




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[7]	valid_0's auc: 0.826695




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	valid_0's auc: 0.793475




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.786398


[32m[I 2023-01-17 22:33:55,332][0m Trial 0 finished with value: 0.820596817874069 and parameters: {'learning_rate': 0.403337995025499, 'max_depth': 918, 'min_data_in_leaf': 6, 'num_leaves': 9648, 'colsample_bytree': 0.16222481148852086, 'lambda_l1': 0.536601065081237, 'lambda_l2': 0.7649328175185188, 'subsample': 0.741313044938092, 'min_gain_to_split': 0.8351399385880928, 'bagging_fraction': 0.8143801152823923, 'feature_fraction': 0.8160289228991924, 'subsample_freq': 10, 'min_child_samples': 659, 'reg_lambda': 8, 'n_estimators': 94807}. Best is trial 0 with value: 0.820596817874069.[0m


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[11]	valid_0's auc: 0.753378
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[5]	valid_0's auc: 0.625169
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.69928
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[6]	valid_0's auc: 0.763475


[32m[I 2023-01-17 22:33:55,611][0m Trial 1 finished with value: 0.6937467840216656 and parameters: {'learning_rate': 0.20615048455148427, 'max_depth': 651, 'min_data_in_leaf': 94, 'num_leaves': 3464, 'colsample_bytree': 0.1531005273012884, 'lambda_l1': 0.7336374217288605, 'lambda_l2': 0.22790435816630503, 'subsample': 0.6297524223759186, 'min_gain_to_split': 3.450275358956495, 'bagging_fraction': 0.4402514717517191, 'feature_fraction': 0.3556281173319158, 'subsample_freq': 6, 'min_child_samples': 783, 'reg_lambda': 8, 'n_estimators': 27496}. Best is trial 0 with value: 0.820596817874069.[0m


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.5625
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.661064




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.770101
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	valid_0's auc: 0.759576




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	valid_0's auc: 0.755254
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	valid_0's auc: 0.754025


[32m[I 2023-01-17 22:33:56,301][0m Trial 2 finished with value: 0.7538307379823967 and parameters: {'learning_rate': 0.14735545125469934, 'max_depth': 762, 'min_data_in_leaf': 74, 'num_leaves': 7444, 'colsample_bytree': 0.53106043275576, 'lambda_l1': 0.6896256380261555, 'lambda_l2': 0.925471877926584, 'subsample': 0.32259859236610294, 'min_gain_to_split': 4.242177786056084, 'bagging_fraction': 0.9571063757698471, 'feature_fraction': 0.9180663144194419, 'subsample_freq': 4, 'min_child_samples': 591, 'reg_lambda': 71, 'n_estimators': 70910}. Best is trial 0 with value: 0.820596817874069.[0m


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.675422
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.731799
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.587288
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.683475


[32m[I 2023-01-17 22:33:56,545][0m Trial 3 finished with value: 0.6683077183480026 and parameters: {'learning_rate': 0.314940357655767, 'max_depth': 167, 'min_data_in_leaf': 41, 'num_leaves': 3091, 'colsample_bytree': 0.5344993394677735, 'lambda_l1': 0.6437272876753731, 'lambda_l2': 0.37313695934129854, 'subsample': 0.9488002764803154, 'min_gain_to_split': 1.9940659354116737, 'bagging_fraction': 0.2679861230284695, 'feature_fraction': 0.4117283197247613, 'subsample_freq': 10, 'min_child_samples': 953, 'reg_lambda': 88, 'n_estimators': 76546}. Best is trial 0 with value: 0.820596817874069.[0m


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.5625
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[18]	valid_0's auc: 0.734291
Training until validation scores don't improve for 100 rounds




Early stopping, best iteration is:
[25]	valid_0's auc: 0.826858
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[19]	valid_0's auc: 0.816695




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[22]	valid_0's auc: 0.769915
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[35]	valid_0's auc: 0.760678


[32m[I 2023-01-17 22:33:57,185][0m Trial 4 finished with value: 0.794373730534868 and parameters: {'learning_rate': 0.04746643464356633, 'max_depth': 248, 'min_data_in_leaf': 23, 'num_leaves': 6658, 'colsample_bytree': 0.41907782717666864, 'lambda_l1': 0.9315345917879286, 'lambda_l2': 0.36657965032437967, 'subsample': 0.43284458822821503, 'min_gain_to_split': 3.208898422406649, 'bagging_fraction': 0.8550048099710111, 'feature_fraction': 0.32705227379965884, 'subsample_freq': 5, 'min_child_samples': 813, 'reg_lambda': 98, 'n_estimators': 11111}. Best is trial 0 with value: 0.820596817874069.[0m


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[31]	valid_0's auc: 0.783361
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[6]	valid_0's auc: 0.795735
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[9]	valid_0's auc: 0.781144


[32m[I 2023-01-17 22:33:57,518][0m Trial 5 finished with value: 0.8127586323628977 and parameters: {'learning_rate': 0.2700630007062995, 'max_depth': 868, 'min_data_in_leaf': 12, 'num_leaves': 4621, 'colsample_bytree': 0.4373221141262026, 'lambda_l1': 0.2756255612971849, 'lambda_l2': 0.6442336167048393, 'subsample': 0.7421318115491543, 'min_gain_to_split': 0.7600680168369415, 'bagging_fraction': 0.4193524352815561, 'feature_fraction': 0.37835144802569454, 'subsample_freq': 6, 'min_child_samples': 443, 'reg_lambda': 43, 'n_estimators': 96633}. Best is trial 0 with value: 0.820596817874069.[0m


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[20]	valid_0's auc: 0.804534
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[22]	valid_0's auc: 0.79
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[10]	valid_0's auc: 0.743581
Training until validation scores don't improve for 100 rounds




Early stopping, best iteration is:
[6]	valid_0's auc: 0.807095
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[7]	valid_0's auc: 0.776398
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[5]	valid_0's auc: 0.799068
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[7]	valid_0's auc: 0.742246


[32m[I 2023-01-17 22:33:57,821][0m Trial 6 finished with value: 0.793436018957346 and parameters: {'learning_rate': 0.1472179002420975, 'max_depth': 981, 'min_data_in_leaf': 10, 'num_leaves': 4815, 'colsample_bytree': 0.5386676145850501, 'lambda_l1': 0.43477064172999436, 'lambda_l2': 0.6906045727047873, 'subsample': 0.6770717741427911, 'min_gain_to_split': 1.276796515297926, 'bagging_fraction': 0.6373247893474818, 'feature_fraction': 0.6511529614341841, 'subsample_freq': 6, 'min_child_samples': 557, 'reg_lambda': 18, 'n_estimators': 94243}. Best is trial 0 with value: 0.820596817874069.[0m


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[104]	valid_0's auc: 0.778843
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[47]	valid_0's auc: 0.84848
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[98]	valid_0's auc: 0.818475
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[63]	valid_0's auc: 0.816144


[32m[I 2023-01-17 22:33:58,528][0m Trial 7 finished with value: 0.8095118483412322 and parameters: {'learning_rate': 0.07575371870242624, 'max_depth': 156, 'min_data_in_leaf': 90, 'num_leaves': 6525, 'colsample_bytree': 0.5444129581258106, 'lambda_l1': 0.38989822330977764, 'lambda_l2': 0.7381898254451607, 'subsample': 0.9743395191043096, 'min_gain_to_split': 2.8122644732787156, 'bagging_fraction': 0.5243712743858001, 'feature_fraction': 0.6855543844873826, 'subsample_freq': 1, 'min_child_samples': 241, 'reg_lambda': 8, 'n_estimators': 44411}. Best is trial 0 with value: 0.820596817874069.[0m


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[196]	valid_0's auc: 0.766186
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[86]	valid_0's auc: 0.768919
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[41]	valid_0's auc: 0.817061
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[72]	valid_0's auc: 0.819703


[32m[I 2023-01-17 22:33:58,884][0m Trial 8 finished with value: 0.8175958023019634 and parameters: {'learning_rate': 0.015290550860772294, 'max_depth': 69, 'min_data_in_leaf': 44, 'num_leaves': 3096, 'colsample_bytree': 0.7732145456012458, 'lambda_l1': 0.9166327984709999, 'lambda_l2': 0.4642599878265511, 'subsample': 0.9857988393820357, 'min_gain_to_split': 1.6788703681177428, 'bagging_fraction': 0.7280330363001867, 'feature_fraction': 0.7461891729151183, 'subsample_freq': 8, 'min_child_samples': 408, 'reg_lambda': 40, 'n_estimators': 50368}. Best is trial 0 with value: 0.820596817874069.[0m


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[69]	valid_0's auc: 0.81
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[90]	valid_0's auc: 0.789831
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	valid_0's auc: 0.618919




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 0.76723
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.711483


[32m[I 2023-01-17 22:33:59,282][0m Trial 9 finished with value: 0.7111899119837508 and parameters: {'learning_rate': 0.20508979183801904, 'max_depth': 962, 'min_data_in_leaf': 76, 'num_leaves': 5230, 'colsample_bytree': 0.8510898972960744, 'lambda_l1': 0.36615444247589646, 'lambda_l2': 0.3289040107532989, 'subsample': 0.5443770112770804, 'min_gain_to_split': 3.388582485137221, 'bagging_fraction': 0.30513792749133123, 'feature_fraction': 0.9334305301109618, 'subsample_freq': 9, 'min_child_samples': 421, 'reg_lambda': 28, 'n_estimators': 65967}. Best is trial 0 with value: 0.820596817874069.[0m


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	valid_0's auc: 0.683475
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[9]	valid_0's auc: 0.676695
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[34]	valid_0's auc: 0.797889
Training until validation scores don't improve for 100 rounds




Early stopping, best iteration is:
[95]	valid_0's auc: 0.795101
Training until validation scores don't improve for 100 rounds




Early stopping, best iteration is:
[82]	valid_0's auc: 0.785508
Training until validation scores don't improve for 100 rounds




Early stopping, best iteration is:
[35]	valid_0's auc: 0.831356




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[106]	valid_0's auc: 0.783644


[32m[I 2023-01-17 22:34:01,002][0m Trial 10 finished with value: 0.8114177386594449 and parameters: {'learning_rate': 0.4678574505979868, 'max_depth': 445, 'min_data_in_leaf': 29, 'num_leaves': 9696, 'colsample_bytree': 0.1230344574982768, 'lambda_l1': 0.1013035435019573, 'lambda_l2': 0.9160452685532956, 'subsample': 0.23863903263921454, 'min_gain_to_split': 0.12713258982461384, 'bagging_fraction': 0.12219968295171779, 'feature_fraction': 0.1027441612719508, 'subsample_freq': 2, 'min_child_samples': 17, 'reg_lambda': 63, 'n_estimators': 1514}. Best is trial 0 with value: 0.820596817874069.[0m
[32m[I 2023-01-17 22:34:01,169][0m Trial 11 finished with value: 0.7917342586323629 and parameters: {'learning_rate': 0.40209287962810436, 'max_depth': 431, 'min_data_in_leaf': 55, 'num_leaves': 538, 'colsample_bytree': 0.8318429907319655, 'lambda_l1': 0.9931538664311187, 'lambda_l2': 0.531662280503818, 'subsample': 0.845937290964041, 'min_gain_to_split': 1.7497516990105777, 'bagging_fraction

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.747002
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.839611
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.79339
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	valid_0's auc: 0.803983
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.734195




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[5]	valid_0's auc: 0.752492
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.7856




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[6]	valid_0's auc: 0.842119
Training until validation scores don't improve for 100 rounds




Early stopping, best iteration is:
[4]	valid_0's auc: 0.798644


[32m[I 2023-01-17 22:34:02,348][0m Trial 12 finished with value: 0.8136692620176031 and parameters: {'learning_rate': 0.3563782656687756, 'max_depth': 621, 'min_data_in_leaf': 50, 'num_leaves': 9517, 'colsample_bytree': 0.7553153096952147, 'lambda_l1': 0.8164408641745393, 'lambda_l2': 0.5558657336917696, 'subsample': 0.7957156901826581, 'min_gain_to_split': 0.8331840041895371, 'bagging_fraction': 0.7382735184994076, 'feature_fraction': 0.8244278110685708, 'subsample_freq': 8, 'min_child_samples': 649, 'reg_lambda': 1, 'n_estimators': 34491}. Best is trial 0 with value: 0.820596817874069.[0m


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[5]	valid_0's auc: 0.771483


[32m[I 2023-01-17 22:34:02,503][0m Trial 13 finished with value: 0.7968561272850372 and parameters: {'learning_rate': 0.4969499059683727, 'max_depth': 339, 'min_data_in_leaf': 2, 'num_leaves': 932, 'colsample_bytree': 0.6939526131054922, 'lambda_l1': 0.5248255560434976, 'lambda_l2': 0.10314994189298393, 'subsample': 0.8665727619929918, 'min_gain_to_split': 1.9554139141252083, 'bagging_fraction': 0.9691236197053126, 'feature_fraction': 0.6029033082145354, 'subsample_freq': 10, 'min_child_samples': 356, 'reg_lambda': 28, 'n_estimators': 59453}. Best is trial 0 with value: 0.820596817874069.[0m


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[5]	valid_0's auc: 0.745608
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.787035
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.772331
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.803432
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[5]	valid_0's auc: 0.778136
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[14]	valid_0's auc: 0.810726




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[14]	valid_0's auc: 0.816723
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[8]	valid_0's auc: 0.848305
Training until validation scores don't improve for 100 rounds


[32m[I 2023-01-17 22:34:02,852][0m Trial 14 finished with value: 0.8919895057549085 and parameters: {'learning_rate': 0.4303955902285228, 'max_depth': 55, 'min_data_in_leaf': 30, 'num_leaves': 3112, 'colsample_bytree': 0.23792639243326333, 'lambda_l1': 0.5678327464283103, 'lambda_l2': 0.8009892426952581, 'subsample': 0.5195621188794695, 'min_gain_to_split': 0.18395419436907856, 'bagging_fraction': 0.7897654408711418, 'feature_fraction': 0.773773012267108, 'subsample_freq': 8, 'min_child_samples': 727, 'reg_lambda': 54, 'n_estimators': 85092}. Best is trial 14 with value: 0.8919895057549085.[0m


Early stopping, best iteration is:
[6]	valid_0's auc: 0.834958
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[12]	valid_0's auc: 0.801271
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[22]	valid_0's auc: 0.801014
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.786655
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[8]	valid_0's auc: 0.827076
Training until validation scores don't improve for 100 rounds


[32m[I 2023-01-17 22:34:03,050][0m Trial 15 finished with value: 0.8449996614759648 and parameters: {'learning_rate': 0.41663704075820424, 'max_depth': 594, 'min_data_in_leaf': 26, 'num_leaves': 1781, 'colsample_bytree': 0.24668275443232263, 'lambda_l1': 0.5781091410183, 'lambda_l2': 0.8161074636810143, 'subsample': 0.10223430367775788, 'min_gain_to_split': 0.4980431688212577, 'bagging_fraction': 0.8317622014050856, 'feature_fraction': 0.5346398202583971, 'subsample_freq': 9, 'min_child_samples': 730, 'reg_lambda': 59, 'n_estimators': 83035}. Best is trial 14 with value: 0.8919895057549085.[0m


Early stopping, best iteration is:
[17]	valid_0's auc: 0.833263
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.778856
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[7]	valid_0's auc: 0.766807
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.804181
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[17]	valid_0's auc: 0.830254
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[13]	valid_0's auc: 0.833644


[32m[I 2023-01-17 22:34:03,243][0m Trial 16 finished with value: 0.8302234258632364 and parameters: {'learning_rate': 0.42319381549827234, 'max_depth': 568, 'min_data_in_leaf': 29, 'num_leaves': 1783, 'colsample_bytree': 0.2900258039104026, 'lambda_l1': 0.5961671971529413, 'lambda_l2': 0.8398197350937658, 'subsample': 0.10425434319977325, 'min_gain_to_split': 0.5342360034180098, 'bagging_fraction': 0.6061152674907859, 'feature_fraction': 0.5079008229495393, 'subsample_freq': 8, 'min_child_samples': 978, 'reg_lambda': 63, 'n_estimators': 82091}. Best is trial 14 with value: 0.8919895057549085.[0m


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[5]	valid_0's auc: 0.801356
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[75]	valid_0's auc: 0.850929
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[81]	valid_0's auc: 0.86402
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[84]	valid_0's auc: 0.824915


[33m[W 2023-01-17 22:34:03,443][0m Trial 17 failed because of the following error: KeyboardInterrupt()[0m
Traceback (most recent call last):
  File "D:\source\repos\venv\Python310\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\ryans\AppData\Local\Temp\ipykernel_19772\2665844800.py", line 44, in objective_v2
    reg, roc_auc = train_model(reg, X_train, y_train, X_valid, y_valid)
  File "C:\Users\ryans\AppData\Local\Temp\ipykernel_19772\2665844800.py", line 10, in train_model
    reg.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], callbacks=[lgbm.early_stopping(100, verbose=True)], verbose=False)
  File "D:\source\repos\venv\Python310\lib\site-packages\lightgbm\sklearn.py", line 895, in fit
    super().fit(X, y, sample_weight=sample_weight, init_score=init_score,
  File "D:\source\repos\venv\Python310\lib\site-packages\lightgbm\sklearn.py", line 748, in fit
    self._Booster = train(
  File "D:\source\repos

In [None]:
parm_grid_bytes = pickle.dumps(param_grid_history)
with open(f"rendered_data/{objective}_bytes.hex", "wb") as binary_file:
    binary_file.write(parm_grid_bytes)

In [None]:
def train(param_grid):
    reg = render_model(param_grid)
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train_df[features], train_df[target])):
        print(10*"=", f"FINAL TRAINING Fold={fold+1}", 10*"=")
        X_train, X_valid = train_df.iloc[train_idx][features], train_df.iloc[valid_idx][features]
        y_train , y_valid = train_df[target].iloc[train_idx] , train_df[target].iloc[valid_idx] 
        reg, roc_auc = train_model(reg, X_train, y_train, X_valid, y_valid)
    return reg

percent = math.ceil(len(param_grid_history.keys()) * .1)
top = sorted(list(param_grid_history.keys()))[-percent:]

train_preds = []
test_preds = []
for key in tqdm(top):
    model = train(param_grid_history[key])
    train_preds.append(model.predict(X))
    test_preds.append(model.predict(X_test))

train_final_preds = np.stack(train_preds).mean(0)
test_final_preds = np.stack(test_preds).mean(0)

In [None]:
train_submission = pd.DataFrame(data={'id': train_df.index, target[0]: train_final_preds})
train_submission.to_csv(fr'rendered_data/{objective}_train_submission.csv', index=False)

test_submission = pd.DataFrame(data={'id': test_df.index, target[0]: test_final_preds})
test_submission.to_csv(fr'rendered_data/{objective}_test_submission.csv', index=False)