In [3]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

from catboost import CatBoostClassifier
from sklearn.utils.class_weight import compute_class_weight
from LG_Aimers_6th.cal_auc import calculate_auc

# Which pre-generated train/test split the loading cell below reads.
data_seed = 9
train_path, test_path = (
    f'../data/custom_train_{data_seed}.csv',
    f'../data/custom_test_{data_seed}.csv',
)

In [4]:
from LG_Aimers_6th.lgbm_process import lgbm_process

# Read both CSVs for the configured split, drop the "ID" column from each, and
# run them through the project's LightGBM preprocessing in a single call.
train, test = lgbm_process(
    pd.read_csv(train_path).drop(columns=["ID"]),
    pd.read_csv(test_path).drop(columns=["ID"]),
)

# Sanity check: report the shapes of the processed frames.
print(train.shape, test.shape)

(205080, 67) (51271, 66)


In [None]:
# Stratified 10-fold evaluation of the tuned LightGBM model, repeated for each
# pre-generated data split listed in `data_seeds`.
#
# For every split: one model is trained per fold, scored (ROC AUC) on its
# validation fold, and used to predict the split's test file. The per-fold test
# predictions are averaged and scored with `calculate_auc`. Per-split and
# overall averages are printed.
seed = 333
data_seeds = [1, 7]
target_col = '임신 성공 여부'

# Hyperparameters found with Optuna. They are the same for every fold and every
# split, so they are defined once here rather than inside the innermost loop.
lgbm_params = {
    'n_estimators': 1134,
    'learning_rate': 0.009183378614268902,
    'max_depth': 15,
    'num_leaves': 59,
    'min_child_samples': 56,
    'subsample': 0.5894604069264655,
    'colsample_bytree': 0.6305670256882752,
    'reg_alpha': 7.47936987466662,
    'reg_lambda': 0.0010986427203281623,
}

valid_aucs = []
test_aucs = []
for data_seed in data_seeds:
    train_path = f'../data/custom_train_{data_seed}.csv'
    test_path = f'../data/custom_test_{data_seed}.csv'

    train = pd.read_csv(train_path).drop(columns=['ID'])
    test = pd.read_csv(test_path).drop(columns=['ID'])

    test_preds_lgbm = []
    auc_scores = []

    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

    for train_idx, valid_idx in skf.split(train.drop(columns=[target_col]), train[target_col]):
        # Two independent copies of the training fold: one is preprocessed
        # together with the validation fold, the other with the test split.
        fold_train = train.iloc[train_idx].copy().reset_index(drop=True)
        fold_train2 = train.iloc[train_idx].copy().reset_index(drop=True)
        fold_valid = train.iloc[valid_idx].copy().reset_index(drop=True)
        fold_test = test.copy()

        fold_train, fold_valid = lgbm_process(fold_train, fold_valid, seed=seed)
        _, fold_test = lgbm_process(fold_train2, fold_test, seed=seed)

        X_train = fold_train.drop(columns=[target_col])
        X_valid = fold_valid.drop(columns=[target_col])
        y_train = fold_train[target_col]
        y_valid = fold_valid[target_col]

        model_lgb = LGBMClassifier(
            **lgbm_params,
            verbosity=-1,
            n_jobs=10,
            random_state=seed,
            early_stopping_rounds=100,
        )

        # The validation fold doubles as the early-stopping set.
        # NOTE(review): because the same fold also produces the reported
        # validation AUC, that AUC may be optimistic.
        model_lgb.fit(
            X_train, y_train,
            eval_set=(X_valid, y_valid),
        )

        valid_preds_proba = model_lgb.predict_proba(X_valid)[:, 1]
        auc_scores.append(roc_auc_score(y_valid, valid_preds_proba))

        # Select the test columns in the training feature order. LightGBM does
        # not validate DataFrame feature names at predict time by default, so a
        # different column order would otherwise go unnoticed.
        test_pred = model_lgb.predict_proba(fold_test[X_train.columns])[:, 1]
        test_preds_lgbm.append(test_pred)

    valid_auc = np.mean(auc_scores)
    valid_aucs.append(valid_auc)

    # Score the fold-averaged test prediction for this split.
    test_auc = calculate_auc(np.mean(test_preds_lgbm, axis=0), seed=data_seed)
    test_aucs.append(test_auc)

    print(f'[Data Seed {data_seed}] Valid AUC:{valid_auc:.5f} Test AUC: {test_auc}')

avg_valid_auc = np.mean(valid_aucs)
avg_test_auc = np.mean(test_aucs)

# Build the summary label from the configured splits so it cannot drift from
# `data_seeds` (prints "1,7" for the current configuration).
seed_label = ','.join(str(s) for s in data_seeds)

print('-' * 60)
print(f'[Data Seed {seed_label}] AVG Valid AUC: {avg_valid_auc:.5f}, Test AUC: {avg_test_auc}')
print('-' * 60)