In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Imports

In [None]:
# @title
!pip install iterative-stratification
!pip install optuna
!pip install --upgrade category_encoders
!pip install catboost

# Library & Data Load

In [None]:
# @title
import sys
# sys.path.append('../input/iterativestratification')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import os
import random
from tqdm.auto import tqdm
from copy import deepcopy
from functools import partial
import random
import gc

# ignore warning
import warnings
warnings.filterwarnings('ignore')

# ML
from sklearn.ensemble import RandomForestClassifier  # Bagging
from xgboost.sklearn import XGBClassifier            # GBM
from sklearn.linear_model import LogisticRegression  # LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA


# DL
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, ReLU, Softmax, Dropout
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# for checking multi-collinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# KFold(CV), partial : for optuna
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from functools import partial
from imblearn.over_sampling import SMOTE

# AutoML framework
import optuna
from optuna.samplers import TPESampler

# Import sklearn classes for model selection, cross validation, and performance evaluation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import seaborn as sns
from category_encoders import OneHotEncoder, OrdinalEncoder, CountEncoder
from imblearn.under_sampling import RandomUnderSampler

# Import libraries for Hypertuning
import optuna

# Import libraries for gradient boosting
import xgboost as xgb
import lightgbm as lgb
import xgboost as xgb
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
# from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import NuSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from catboost import CatBoost, CatBoostRegressor, CatBoostClassifier
from catboost import Pool

# Suppress warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# @title
train = pd.read_csv('/content/drive/MyDrive/Kaggle/SC_PJ/icr-identify-age-related-conditions (1).zip (Unzipped Files)/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Kaggle/SC_PJ/icr-identify-age-related-conditions (1).zip (Unzipped Files)/test.csv')
greeks = pd.read_csv('/content/drive/MyDrive/Kaggle/SC_PJ/icr-identify-age-related-conditions (1).zip (Unzipped Files)/greeks.csv')

# Preprocessing

## Variable

In [None]:
# @title
train.columns = train.columns.str.strip()
test.columns = test.columns.str.strip()

num_cols = train.columns.tolist()[1:-1]
cat_cols = 'EJ'
target_col = 'Class'

In [None]:
# @title
X_train = train.drop([f'{target_col}', 'Id'],axis=1).reset_index(drop=True)
y_train = train[f'{target_col}'].reset_index(drop=True)

X_test = test.drop(['Id'],axis=1).reset_index(drop=True)

## Label Encoding

In [None]:
# @title
for col in num_cols:
    if col == cat_cols:
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col])
        X_test[col] = le.transform(X_test[col])

## Null

In [None]:
# @title
# 결측치 KNNImputer 사용
imp = KNNImputer()
X_train_imp =imp.fit_transform(X_train)
X_test_imp = imp.transform(X_test)

X_train_tmp = pd.DataFrame(columns = X_train.columns, data = X_train_imp)
X_train = pd.concat([X_train_tmp], axis=1)

X_test_tmp = pd.DataFrame(columns = X_test.columns, data = X_test_imp)
X_test = pd.concat([X_test_tmp], axis=1)

## Scale

In [None]:
# @title
# 'EJ'만 제외하고 standarscaler 적용
scale_cols = X_train.select_dtypes(include = ['float64', 'int64']).columns

scale_cols = [_ for _ in scale_cols if _ not in ['EJ']]

for col in scale_cols:
    scaler = StandardScaler()
    X_train[col] = scaler.fit_transform(X_train[[col]])
    X_test[col] = scaler.transform(X_test[[col]])

X_train

## handle the dataset imbalance - Oversampling
- Using "greeks.csv" data to stratify observations

In [None]:
# @title
greeks_id = greeks.pop('Id')

In [None]:
# @title
train = pd.concat([X_train, y_train, greeks],axis=1)
train

In [None]:
# @title
greeks.columns

In [None]:
# @title
greeks['Epsilon'] = 0

cols = greeks.columns

for col in cols:
    le = LabelEncoder()
    greeks[col] = le.fit_transform(greeks[col])


In [None]:
# @title
train = pd.concat([X_train, y_train, greeks],axis=1)
train

In [None]:
# @title
# 클래스 불균형 해결
train = pd.concat([X_train, y_train, greeks],axis=1)
over_columns = train.columns

# oversampling
from imblearn.over_sampling import SMOTE

# greeks 테이블의 'Alpha' feature 기준
X_over, y_over = SMOTE(random_state=42).fit_resample(train, greeks.Alpha)

# 오버샘플링 전후 Output특성 클래스 비교 (각각 다른 코드 블럭에서 실행)
plt.figure(figsize=(8,4))
plt.subplot(1, 2, 1)
plt.title("Before Oversampling")
greeks['Alpha'].value_counts().plot(kind = 'bar')
plt.subplot(1, 2, 2)
plt.title("After Oversampling")
y_over.value_counts().plot(kind = 'bar')
plt.show()

print(f'오버샘플링 전 shape 체크: ', greeks.Alpha.shape)
print(f'오버샘플링 후 shape 체크: ', y_over.shape)

In [None]:
# @title
over_columns

In [None]:
# @title
X_over

In [None]:
# @title
X_train = X_over.iloc[:,:-6]
y_train = X_over.iloc[:,-6:-5]
greeks = X_over.iloc[:, -5:]
print(X_train.shape, y_train.shape, greeks.shape)

In [None]:
# @title
X_train

In [None]:
# @title
y_train

In [None]:
# @title
# 모델 학습을 위해 array 형태로 변환
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values

# Base model hyperparameter optimization with Optuna

In [None]:
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner

## LGBMClassifier

In [None]:
def lgbm_objective(trial, X, y):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 14),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        'random_state': 42,
        'class_weight': 'balanced'
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    log_loss_scores = []



    for train_index, val_index in skf.split(X, y):
        if type(X_train) is np.ndarray:
            X_tr = X[train_index]
            y_tr = y[train_index]
            X_val = X[val_index]
            y_val = y[val_index]
        else:
            X_tr = X.iloc[train_index]
            y_tr = y.iloc[train_index]
            X_val = X.iloc[val_index]
            y_val = X.iloc[val_index]

        classifier = LGBMClassifier(**params)
        classifier.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_val, y_val)], verbose=0, early_stopping_rounds=25)

        lgb_pred = classifier.predict_proba(X_val)
        log_loss_scores.append(log_loss(y_val, lgb_pred))

    return np.mean(log_loss_scores)


# Create the Optuna study and optimize the objective function
lgbm_study = optuna.create_study(direction='minimize')  # Since log loss should be minimized
lgbm_study.optimize(lambda trial: lgbm_objective(trial, X_train, y_train), n_trials=50)

# Print the best hyperparameters and corresponding log loss for LGBMClassifier
lgbm_best_params = lgbm_study.best_params
lgbm_best_score = lgbm_study.best_value
print("LightGBM Best Hyperparameters: ", lgbm_best_params)
print("LightGBM Best Log Loss: ", lgbm_best_score)


[I 2023-07-28 15:28:31,825] A new study created in memory with name: no-name-e986e87d-0b45-496e-b24c-0f23f46d63a5
[I 2023-07-28 15:28:34,558] Trial 0 finished with value: 0.2109327143444947 and parameters: {'n_estimators': 390, 'learning_rate': 0.0042372116530247535, 'max_depth': 5, 'subsample': 0.6130226666995232, 'colsample_bytree': 0.5577573784219801}. Best is trial 0 with value: 0.2109327143444947.
[I 2023-07-28 15:28:44,359] Trial 1 finished with value: 0.034371228797388555 and parameters: {'n_estimators': 639, 'learning_rate': 0.02308014145692365, 'max_depth': 14, 'subsample': 0.36649272268589184, 'colsample_bytree': 0.9412818593340355}. Best is trial 1 with value: 0.034371228797388555.
[I 2023-07-28 15:28:45,964] Trial 2 finished with value: 0.027342509005498023 and parameters: {'n_estimators': 909, 'learning_rate': 0.08140851578214872, 'max_depth': 9, 'subsample': 0.632322998989933, 'colsample_bytree': 0.5058435060831932}. Best is trial 2 with value: 0.027342509005498023.
[I 20

LightGBM Best Hyperparameters:  {'n_estimators': 764, 'learning_rate': 0.05228728192855567, 'max_depth': 12, 'subsample': 0.7560254463896311, 'colsample_bytree': 0.23453028993348088}
LightGBM Best Log Loss:  0.02138338026489591


In [None]:
# 시각화
optuna.visualization.plot_optimization_history(lgbm_study)

In [None]:
# 파라미터들 간의 관계
optuna.visualization.plot_parallel_coordinate(lgbm_study)

In [None]:
# 하이퍼파라미터 중요도
optuna.visualization.plot_param_importances(lgbm_study)

## XGBClassifier

In [None]:
def xgb_objective(trial, X, y):
    params = {
        'objective': 'binary:logistic', # 분류
        'eval_metric': 'logloss',
        'max_depth': trial.suggest_int('max_depth',4, 16),
        'learning_rate': trial.suggest_loguniform("learning_rate", 1e-8, 1e-2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'gamma': trial.suggest_int('gamma', 1, 6),
        'max_depth': trial.suggest_int('max_depth', 2, 9),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 6),
        'eta': trial.suggest_float('eta', 1e-4, 1e-1, log=True),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 2, 4.71),
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    log_loss_scores = []

    for train_index, val_index in skf.split(X, y):
        if type(X_train) is np.ndarray:
            X_tr = X[train_index]
            y_tr = y[train_index]
            X_val = X[val_index]
            y_val = y[val_index]
        else:
            X_tr = X.iloc[train_index]
            y_tr = y.iloc[train_index]
            X_val = X.iloc[val_index]
            y_val = X.iloc[val_index]

        classifier = XGBClassifier(**params)
        classifier.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_val, y_val)], verbose=0, early_stopping_rounds=25)

        xgb_pred = classifier.predict_proba(X_val)
        log_loss_scores.append(log_loss(y_val, xgb_pred))

    return np.mean(log_loss_scores)

# Create the Optuna study and optimize the objective function
xgb_study = optuna.create_study(direction='minimize')  # Since log loss should be minimized
xgb_study.optimize(lambda trial: xgb_objective(trial, X_train, y_train), n_trials=30)

# Print the best hyperparameters and corresponding log loss for XGBClassifier
xgb_best_params = xgb_study.best_params
xgb_best_score = xgb_study.best_value
print("XGBoost Best Hyperparameters: ", xgb_best_params)
print("XGBoost Best Log Loss: ", xgb_best_score)

[I 2023-07-28 16:09:32,949] A new study created in memory with name: no-name-358ef1c3-dcfa-43d9-b2e8-83a68d26659f
[I 2023-07-28 16:10:02,830] Trial 0 finished with value: 0.6931186482884357 and parameters: {'max_depth': 10, 'learning_rate': 5.2952664425068036e-08, 'n_estimators': 683, 'lambda': 1.097285844115476e-08, 'gamma': 5, 'min_child_weight': 2, 'eta': 0.010369721598925526, 'subsample': 0.5483297441753493, 'colsample_bytree': 0.7546697591783917, 'scale_pos_weight': 4.309010752658876}. Best is trial 0 with value: 0.6931186482884357.
[I 2023-07-28 16:12:51,559] Trial 1 finished with value: 0.6930948479171244 and parameters: {'max_depth': 13, 'learning_rate': 2.393331113448259e-08, 'n_estimators': 2711, 'lambda': 0.060932790917846764, 'gamma': 4, 'min_child_weight': 3, 'eta': 0.0017040581162149802, 'subsample': 0.8320387488439092, 'colsample_bytree': 0.7272916210656366, 'scale_pos_weight': 4.19460159104907}. Best is trial 1 with value: 0.6930948479171244.
[I 2023-07-28 16:13:50,228]

XGBoost Best Hyperparameters:  {'max_depth': 14, 'learning_rate': 0.0028155272809576247, 'n_estimators': 2812, 'lambda': 0.02343943429696876, 'gamma': 1, 'min_child_weight': 2, 'eta': 0.0455883586610819, 'subsample': 0.6140102374972644, 'colsample_bytree': 0.839995125353164, 'scale_pos_weight': 3.487828692949581}
XGBoost Best Log Loss:  0.06453310481287096


## CatBoostClassifier

In [None]:
def cat_objective(trial, X, y):
    params = {
        'eval_metric':'Logloss',
        'iterations':trial.suggest_int("iterations", 1000, 3000),
        'od_wait':trial.suggest_int('od_wait', 500, 2300),
        'learning_rate' : trial.suggest_uniform('learning_rate',0.01, 1),
        'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
        'subsample': trial.suggest_uniform('subsample',0,1),
        'random_strength': trial.suggest_uniform('random_strength',10,50),
        'depth': trial.suggest_int('depth',1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,15),
        'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
        'colsample_bylevel':trial.suggest_float('colsample_bylevel', 0.4, 1.0)
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    log_loss_scores = []

    for train_index, val_index in skf.split(X, y):
        if type(X_train) is np.ndarray:
            X_tr = X[train_index]
            y_tr = y[train_index]
            X_val = X[val_index]
            y_val = y[val_index]
        else:
            X_tr = X.iloc[train_index]
            y_tr = y.iloc[train_index]
            X_val = X.iloc[val_index]
            y_val = X.iloc[val_index]

        classifier = CatBoostClassifier(**params)
        classifier.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_val, y_val)], verbose=0, early_stopping_rounds=25)

        cat_pred = classifier.predict_proba(X_val)
        log_loss_scores.append(log_loss(y_val, cat_pred))

    return np.mean(log_loss_scores)

# Create the Optuna study and optimize the objective function
cat_study = optuna.create_study(direction='minimize')  # Since log loss should be minimized
cat_study.optimize(lambda trial: cat_objective(trial, X_train, y_train), n_trials=2)

# Print the best hyperparameters and corresponding log loss for CatBoostClassifier
cat_best_params = cat_study.best_params
cat_best_score = cat_study.best_value
print("CATBoost Best Hyperparameters: ", cat_best_params)
print("CATBoost Best Log Loss: ", cat_best_score)

[I 2023-07-28 15:41:56,306] A new study created in memory with name: no-name-c0d5f987-3863-4c59-a40e-e0a405874b0e
[I 2023-07-28 15:49:20,691] Trial 0 finished with value: 0.04277890395976157 and parameters: {'iterations': 2443, 'od_wait': 1020, 'learning_rate': 0.11650900213290195, 'reg_lambda': 54.92442937510624, 'subsample': 0.7413254845918719, 'random_strength': 19.86576787076967, 'depth': 9, 'min_data_in_leaf': 22, 'leaf_estimation_iterations': 15, 'bagging_temperature': 26.58464727804052, 'colsample_bylevel': 0.46882440988152946}. Best is trial 0 with value: 0.04277890395976157.
[I 2023-07-28 15:50:59,661] Trial 1 finished with value: 0.042160859883973015 and parameters: {'iterations': 2678, 'od_wait': 2121, 'learning_rate': 0.01687687391964352, 'reg_lambda': 16.7785205340028, 'subsample': 0.7828139135810908, 'random_strength': 36.64168060819137, 'depth': 6, 'min_data_in_leaf': 28, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.03457278556761656, 'colsample_bylevel': 0

CATBoost Best Hyperparameters:  {'iterations': 2678, 'od_wait': 2121, 'learning_rate': 0.01687687391964352, 'reg_lambda': 16.7785205340028, 'subsample': 0.7828139135810908, 'random_strength': 36.64168060819137, 'depth': 6, 'min_data_in_leaf': 28, 'leaf_estimation_iterations': 14, 'bagging_temperature': 0.03457278556761656, 'colsample_bylevel': 0.96354199984017}
CATBoost Best Log Loss:  0.042160859883973015


## HistGradientBoostingClassifier

In [None]:
def hist_objective(trial, X, y):
    params = {
        'loss': 'log_loss',
        'class_weight': 'balanced',
        'l2_regularization': trial.suggest_loguniform('l2_regularization',1e-10,10.0),
        'early_stopping': 'auto',
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001,0.1),
        'max_iter': trial.suggest_categorical('max_iter', [1000]),
        'max_depth': trial.suggest_int('max_depth', 2,30),
        'max_bins': trial.suggest_int('max_bins', 100,255),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 20,100000),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 20,80),
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    log_loss_scores = []

    for train_index, val_index in skf.split(X, y):
        if type(X_train) is np.ndarray:
            X_tr = X[train_index]
            y_tr = y[train_index]
            X_val = X[val_index]
            y_val = y[val_index]
        else:
            X_tr = X.iloc[train_index]
            y_tr = y.iloc[train_index]
            X_val = X.iloc[val_index]
            y_val = X.iloc[val_index]

        classifier = HistGradientBoostingClassifier(**params)
        classifier.fit(X_tr, y_tr)

        hist_pred = classifier.predict_proba(X_val)
        log_loss_scores.append(log_loss(y_val, hist_pred))

    return np.mean(log_loss_scores)

# Create the Optuna study and optimize the objective function
hist_study = optuna.create_study(direction='minimize')  # Since log loss should be minimized
hist_study.optimize(lambda trial: hist_objective(trial, X_train, y_train), n_trials=30)

# Print the best hyperparameters and corresponding log loss for HistGradientBoostingClassifier
hist_best_params = hist_study.best_params
hist_best_score = hist_study.best_value
print("HistGradientBoosting Best Hyperparameters: ", hist_best_params)
print("HistGradientBoosting Best Log Loss: ", hist_best_score)

[I 2023-07-28 15:50:59,677] A new study created in memory with name: no-name-f4f73bee-4f20-495a-9570-9d40fc6613f5
[I 2023-07-28 15:51:01,485] Trial 0 finished with value: 0.6931471805599452 and parameters: {'l2_regularization': 1.0488851947582157e-07, 'learning_rate': 0.012971815961777317, 'max_iter': 1000, 'max_depth': 25, 'max_bins': 117, 'min_samples_leaf': 66814, 'max_leaf_nodes': 58}. Best is trial 0 with value: 0.6931471805599452.
[I 2023-07-28 15:51:03,280] Trial 1 finished with value: 0.6931471805599452 and parameters: {'l2_regularization': 4.5462210943026475, 'learning_rate': 0.04298871769709397, 'max_iter': 1000, 'max_depth': 25, 'max_bins': 238, 'min_samples_leaf': 1492, 'max_leaf_nodes': 55}. Best is trial 0 with value: 0.6931471805599452.
[I 2023-07-28 15:51:05,088] Trial 2 finished with value: 0.6931471805599452 and parameters: {'l2_regularization': 2.0373982828948464e-07, 'learning_rate': 0.014985340149699532, 'max_iter': 1000, 'max_depth': 26, 'max_bins': 185, 'min_samp

HistGradientBoosting Best Hyperparameters:  {'l2_regularization': 1.0488851947582157e-07, 'learning_rate': 0.012971815961777317, 'max_iter': 1000, 'max_depth': 25, 'max_bins': 117, 'min_samples_leaf': 66814, 'max_leaf_nodes': 58}
HistGradientBoosting Best Log Loss:  0.6931471805599452


## RandomForestClassifier

In [None]:
def rf_objective(trial, X, y):
    params = {
        'criterion': 'log_loss',
        'class_weight': 'balanced',
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 4, 16),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 150),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 60)
    }


    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    log_loss_scores = []

    for train_index, val_index in skf.split(X, y):
        if type(X_train) is np.ndarray:
            X_tr = X[train_index]
            y_tr = y[train_index]
            X_val = X[val_index]
            y_val = y[val_index]
        else:
            X_tr = X.iloc[train_index]
            y_tr = y.iloc[train_index]
            X_val = X.iloc[val_index]
            y_val = X.iloc[val_index]

        classifier = RandomForestClassifier(**params)
        classifier.fit(X_tr, y_tr)

        rf_pred = classifier.predict_proba(X_val)
        log_loss_scores.append(log_loss(y_val, rf_pred))

    return np.mean(log_loss_scores)

# Create the Optuna study and optimize the objective function
rf_study = optuna.create_study(direction='minimize')  # Since log loss should be minimized
rf_study.optimize(lambda trial: rf_objective(trial, X_train, y_train), n_trials=20)

# Print the best hyperparameters and corresponding log loss for RandomForestClassifier
rf_best_params = rf_study.best_params
rf_best_score = rf_study.best_value
print("RandomForest Best Hyperparameters: ", rf_best_params)
print("RandomForest Best Log Loss: ", rf_best_score)

[I 2023-07-28 16:00:49,864] A new study created in memory with name: no-name-2120d084-2c30-462e-8375-2367e5301b53
[I 2023-07-28 16:01:08,161] Trial 0 finished with value: 0.27873781610452497 and parameters: {'n_estimators': 631, 'max_depth': 9, 'min_samples_split': 125, 'min_samples_leaf': 23}. Best is trial 0 with value: 0.27873781610452497.
[I 2023-07-28 16:01:11,023] Trial 1 finished with value: 0.3085357314620687 and parameters: {'n_estimators': 110, 'max_depth': 12, 'min_samples_split': 99, 'min_samples_leaf': 60}. Best is trial 0 with value: 0.27873781610452497.
[I 2023-07-28 16:01:13,250] Trial 2 finished with value: 0.25997295268540377 and parameters: {'n_estimators': 77, 'max_depth': 9, 'min_samples_split': 26, 'min_samples_leaf': 36}. Best is trial 2 with value: 0.25997295268540377.
[I 2023-07-28 16:01:18,888] Trial 3 finished with value: 0.24711304133894568 and parameters: {'n_estimators': 178, 'max_depth': 8, 'min_samples_split': 108, 'min_samples_leaf': 1}. Best is trial 3

RandomForest Best Hyperparameters:  {'n_estimators': 462, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 1}
RandomForest Best Log Loss:  0.09807076088126394


## Base model develop

In [None]:
xgb_clf  = XGBClassifier(**xgb_best_params)
lgb_clf = LGBMClassifier(**lgbm_best_params)
cat_clf = CatBoostClassifier(**cat_best_params)
hist_clf = HistGradientBoostingClassifier(**hist_best_params)
rf_clf = RandomForestClassifier(**rf_best_params)

# Stacked Ensemble model hyperparameter optimization with Optuna

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# 개별 기반 모델에서 최종 메타 모델이 사용할 학습 및 테스트용 데이터를 생성하기 위한 함수.
def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds):
# 지정된 n_folds값으로 KFold 생성.
#     kf = KFold(n_splits=n_folds, shuffle=True, random_state=721)

    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=721)
    #추후에 메타 모델이 사용할 학습 데이터 반환을 위한 넘파이 배열 초기화
    train_fold_pred = np.zeros((X_train_n.shape[0] ,1 ))
#     train_fold_pred_c = np.zeros((X_train.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0],n_folds))
    print(model.__class__.__name__ , ' model 시작 ')

    for folder_counter , (train_index, valid_index) in enumerate(kf.split(X_train_n, y_train_n)):
    #입력된 학습 데이터에서 기반 모델이 학습/예측할 폴드 데이터 셋 추출
        print('\t 폴드 세트: ',folder_counter,' 시작 ')

        if type(X_train_n) is np.ndarray:
            X_tr = X_train_n[train_index]
            y_tr = y_train_n[train_index]
            X_te = X_train_n[valid_index]
            y_te = y_train_n[valid_index]
        else:
            X_tr = X_train_n.iloc[train_index]
            y_tr = y_train_n.iloc[train_index]
            X_te = X_train_n.iloc[valid_index]
            y_te = y_train_n.iloc[valid_index]

        #폴드 세트 내부에서 다시 만들어진 학습 데이터로 기반 모델의 학습 수행.
        model.fit(X_tr, y_tr)
        #폴드 세트 내부에서 다시 만들어진 검증 데이터로 기반 모델 예측 후 데이터 저장.
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1,1)

        #입력된 원본 테스트 데이터를 폴드 세트내 학습된 기반 모델에서 예측 후 데이터 저장.
        test_pred[:, folder_counter] = model.predict(X_test_n)


    print(model.__class__.__name__, 'log_loss: ', log_loss(y_train_n, train_fold_pred))
    # 폴드 세트 내에서 원본 테스트 데이터를 예측한 데이터를 평균하여 테스트 데이터로 생성
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1,1)

#train_fold_pred는 최종 메타 모델이 사용하는 학습 데이터, test_pred_mean은 테스트 데이터
    return train_fold_pred , test_pred_mean

## stacking ensemble

In [None]:
xgb_train, xgb_test = get_stacking_base_datasets(xgb_clf, X_train, y_train, X_test, 5)
lgb_train, lgb_test = get_stacking_base_datasets(lgb_clf, X_train, y_train, X_test, 5)
cat_train, cat_test = get_stacking_base_datasets(cat_clf, X_train, y_train, X_test, 5)
hist_train, hist_test = get_stacking_base_datasets(hist_clf, X_train, y_train, X_test, 5)
rf_train, rf_test = get_stacking_base_datasets(rf_clf, X_train, y_train, X_test, 5)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
372:	learn: 0.1146955	total: 3.13s	remaining: 19.3s
373:	learn: 0.1145122	total: 3.14s	remaining: 19.3s
374:	learn: 0.1142418	total: 3.15s	remaining: 19.3s
375:	learn: 0.1139091	total: 3.16s	remaining: 19.3s
376:	learn: 0.1136528	total: 3.16s	remaining: 19.3s
377:	learn: 0.1133171	total: 3.17s	remaining: 19.3s
378:	learn: 0.1126429	total: 3.18s	remaining: 19.3s
379:	learn: 0.1124527	total: 3.19s	remaining: 19.3s
380:	learn: 0.1121522	total: 3.2s	remaining: 19.3s
381:	learn: 0.1118459	total: 3.21s	remaining: 19.3s
382:	learn: 0.1115865	total: 3.21s	remaining: 19.3s
383:	learn: 0.1114306	total: 3.22s	remaining: 19.2s
384:	learn: 0.1109229	total: 3.23s	remaining: 19.2s
385:	learn: 0.1106372	total: 3.24s	remaining: 19.2s
386:	learn: 0.1101873	total: 3.24s	remaining: 19.2s
387:	learn: 0.1094712	total: 3.25s	remaining: 19.2s
388:	learn: 0.1092423	total: 3.26s	remaining: 19.2s
389:	learn: 0.1088006	total: 3.27s	remaining: 19.2s
390:	learn: 0.1

In [None]:
Stack_final_X_train = np.concatenate((xgb_train, lgb_train, cat_train, hist_train, rf_train), axis=1)
Stack_final_X_test = np.concatenate((xgb_test, lgb_test, cat_test, hist_test, rf_test), axis=1)
print('원본 학습 피처 데이터 Shape:',X_train.shape, '원본 테스트 피처 Shape:',X_test.shape)
print('스태킹 학습 피처 데이터 Shape:', Stack_final_X_train.shape,
      '스태킹 테스트 피처 데이터 Shape:',Stack_final_X_test.shape)

원본 학습 피처 데이터 Shape: (2036, 56) 원본 테스트 피처 Shape: (5, 56)
스태킹 학습 피처 데이터 Shape: (2036, 5) 스태킹 테스트 피처 데이터 Shape: (5, 5)


## Final LGBMClassifier

In [None]:
def final_lgbm_objective(trial, X, y):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 14),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        'random_state': 42,
        'class_weight': 'balanced'
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    log_loss_scores = []

    for train_index, val_index in skf.split(X, y):
        if type(X_train) is np.ndarray:
            X_tr = X[train_index]
            y_tr = y[train_index]
            X_val = X[val_index]
            y_val = y[val_index]
        else:
            X_tr = X.iloc[train_index]
            y_tr = y.iloc[train_index]
            X_val = X.iloc[val_index]
            y_val = X.iloc[val_index]

        classifier = LGBMClassifier(**params)
        classifier.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_val, y_val)], verbose=0, early_stopping_rounds=25)

        final_lgb_pred = classifier.predict_proba(X_val)
        log_loss_scores.append(log_loss(y_val, final_lgb_pred))

    return np.mean(log_loss_scores)

# Create the Optuna study and optimize the objective function
lgbm_study = optuna.create_study(direction='minimize')  # Since log loss should be minimized
lgbm_study.optimize(lambda trial: final_lgbm_objective(trial, Stack_final_X_train, y_train), n_trials=30)

# Print the best hyperparameters and corresponding log loss for LGBMClassifier
final_lgbm_best_params = lgbm_study.best_params
final_lgbm_best_score = lgbm_study.best_value
print("LightGBM Best Hyperparameters: ", final_lgbm_best_params)
print("LightGBM Best Log Loss: ", final_lgbm_best_score)

[I 2023-07-28 16:47:55,770] A new study created in memory with name: no-name-283a53f0-ab09-42c7-8ddd-e5696861f7f1
[I 2023-07-28 16:47:56,508] Trial 0 finished with value: 0.051875619349381584 and parameters: {'n_estimators': 947, 'learning_rate': 0.021042482875557655, 'max_depth': 5, 'subsample': 0.400441285445601, 'colsample_bytree': 0.5417276020514414}. Best is trial 0 with value: 0.051875619349381584.
[I 2023-07-28 16:47:57,247] Trial 1 finished with value: 0.05286366114289429 and parameters: {'n_estimators': 852, 'learning_rate': 0.011662520857189156, 'max_depth': 14, 'subsample': 0.24198113624185727, 'colsample_bytree': 0.9462241358186692}. Best is trial 0 with value: 0.051875619349381584.
[I 2023-07-28 16:47:57,826] Trial 2 finished with value: 0.3163225457550609 and parameters: {'n_estimators': 608, 'learning_rate': 0.0011414906955249013, 'max_depth': 9, 'subsample': 0.25533568696098574, 'colsample_bytree': 0.6549194755047427}. Best is trial 0 with value: 0.051875619349381584.
[

LightGBM Best Hyperparameters:  {'n_estimators': 273, 'learning_rate': 0.099934713315333, 'max_depth': 7, 'subsample': 0.8317800178189619, 'colsample_bytree': 0.47825455292721986}
LightGBM Best Log Loss:  0.051671337331099146


In [None]:
# 최종 Stacking 모델을 위한 Classifier생성.
lgb_final = LGBMClassifier(**final_lgbm_best_params)

In [None]:
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(Stack_final_X_train, y_train, test_size = 0.3, stratify = y_train, random_state = 721)

lgb_final.fit(X_tr, y_tr)
pred = lgb_final.predict_proba(X_val)
print('log_loss: ', log_loss(y_val, pred))

log_loss:  0.055807597122171496
