##
- train: 학습용
- val: 검증용
> 기존 데이터 셋, 8:2 split (train, val)

- test: 최종 테스트용

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pytorch_tabular import TabularModel
from pytorch_tabular.models import (
CategoryEmbeddingModelConfig,
FTTransformerConfig,
TabNetModelConfig,
GANDALFConfig,
)
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models.stacking import StackingModelConfig
# from pytorch_tabular.utils import make_mixed_dataset

from sklearn.preprocessing import LabelEncoder, FunctionTransformer, QuantileTransformer, MultiLabelBinarizer

from sklearn.impute import SimpleImputer

from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import roc_auc_score

import random

import preprocessing

from pytorch_lightning.loggers import WandbLogger

In [2]:
# `data_seed` picks which custom train/test split files are read;
# `seed` is the seed handed to the model configs later in the notebook.
data_seed, seed = 1, 333

# CSV locations for the selected data seed.
train_path = f'../../data/custom_train_{data_seed}.csv'
test_path = f'../../data/custom_test_{data_seed}.csv'

# Read the train split first, then the test split, dropping the 'ID' column from each.
train, test = (
    pd.read_csv(csv_path).drop(columns=['ID'])
    for csv_path in (train_path, test_path)
)

# Shape check for the freshly loaded frames.
print(train.shape, test.shape)

(205080, 68) (51271, 67)


## preprocessing

In [3]:
# Run both frames through the `all_process` function of the `preprocessing`
# module and rebind the notebook-level `train` / `test` to its outputs.
train, test = preprocessing.all_process(train, test)

# Shape check after preprocessing.
print(train.shape, test.shape)

(205080, 66) (51271, 65)


In [4]:
# Split the feature columns by dtype: columns carrying a pandas categorical
# dtype become categorical features; every remaining column except the target
# ('임신 성공 여부') is treated as a continuous feature.
#
# `pd.api.types.is_categorical_dtype` is deprecated in recent pandas releases
# (and this notebook silences warnings globally, so the deprecation would go
# unnoticed); an isinstance check against `pd.CategoricalDtype` is used instead.
cat_cols = [col for col, dtype in train.dtypes.items() if isinstance(dtype, pd.CategoricalDtype)]
numeric_cols = [col for col in train.columns if col not in cat_cols and col != '임신 성공 여부']

# Report how many columns landed in each group, plus the frame shapes.
print(f'수치형 변수: {len(numeric_cols)}개 \n{numeric_cols}')
print(f'범주형 변수: {len(cat_cols)}개 \n{cat_cols}')
print(train.shape, test.shape)

수치형 변수: 57개 
['임신 시도 또는 마지막 임신 경과 연수', '배란 자극 여부', '단일 배아 이식 여부', '착상 전 유전 검사 사용 여부', '착상 전 유전 진단 사용 여부', '남성 주 불임 원인', '남성 부 불임 원인', '여성 주 불임 원인', '여성 부 불임 원인', '부부 주 불임 원인', '부부 부 불임 원인', '불명확 불임 원인', '불임 원인 - 난관 질환', '불임 원인 - 남성 요인', '불임 원인 - 배란 장애', '불임 원인 - 자궁경부 문제', '불임 원인 - 자궁내막증', '불임 원인 - 정자 농도', '불임 원인 - 정자 운동성', '불임 원인 - 정자 형태', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수', 'DI 시술 횟수', '총 임신 횟수', 'IVF 임신 횟수', 'DI 임신 횟수', '총 출산 횟수', 'IVF 출산 횟수', 'DI 출산 횟수', '총 생성 배아 수', '미세주입된 난자 수', '미세주입에서 생성된 배아 수', '이식된 배아 수', '미세주입 배아 이식 수', '저장된 배아 수', '미세주입 후 저장된 배아 수', '해동된 배아 수', '해동 난자 수', '수집된 신선 난자 수', '저장된 신선 난자 수', '혼합된 난자 수', '파트너 정자와 혼합된 난자 수', '기증자 정자와 혼합된 난자 수', '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부', '대리모 여부', 'PGD 시술 여부', 'PGS 시술 여부', '난자 혼합 경과일', '배아 이식 경과일', '배아 해동 경과일', '시술_임신', '배아생성이유_기증용', '배아생성이유_난자 저장용', '배아생성이유_배아 저장용', '배아생성이유_현재 시술용']
범주형 변수: 8개 
['시술 시기 코드', '시술 당시 나이', '배란 유도 유형', '난자 출처', '정자 출처', '난자 기증자 나이', '정자 기증자 나이', '시술유형_통합']
(205080, 66) (51271, 65)


## config
- continuous_cols 기본 설정 빼기 @@@@@@@@@@@

In [5]:
## Shared training configuration

# Column roles come from the dtype split computed earlier in the notebook.
data_config = DataConfig(
    target=["임신 성공 여부"],
    continuous_cols=numeric_cols,
    categorical_cols=cat_cols,
    # Normalization of continuous features is switched off here.
    normalize_continuous_features=False,
)

# Optimizer is left at the library defaults (original note: Adam, 1e-3).
optimizer_config = OptimizerConfig()

# Early stopping and checkpointing both monitor "valid_loss" (lower is better);
# the best checkpoint is loaded back after training.
trainer_config = TrainerConfig(
    batch_size=4096,
    max_epochs=20,
    auto_lr_find=False,
    early_stopping="valid_loss",
    early_stopping_mode="min",
    early_stopping_patience=3,
    checkpoints="valid_loss",
    load_best=True,
)

## Base-learner configurations (also used as the members of the stack)

# Settings every base learner has in common.
_base_model_kwargs = dict(task="classification", learning_rate=1e-3, seed=seed)

model_config_1 = CategoryEmbeddingModelConfig(
    layers="128-64-32",
    activation="ReLU",
    **_base_model_kwargs,
)

model_config_2 = FTTransformerConfig(
    input_embed_dim=32,
    num_attn_blocks=2,
    num_heads=4,
    **_base_model_kwargs,
)

model_config_3 = TabNetModelConfig(
    n_d=16,
    n_a=16,
    n_steps=5,
    **_base_model_kwargs,
)

model_config_4 = GANDALFConfig(
    gflu_stages=6,
    gflu_dropout=0.1,
    # Fraction of features initially selected in each GFLU stage.
    gflu_feature_init_sparsity=0.3,
    # Whether the feature-selection sparsity is updated during training.
    learnable_sparsity=True,
    embedding_dropout=0.05,
    # No batch-norm normalization of the continuous inputs.
    batch_norm_continuous_input=False,
    **_base_model_kwargs,
)

## Stacking configuration

# Head placed on top of the stacked base learners.
_stacking_head_config = {
    "layers": "64",
    "activation": "ReLU",
    "dropout": 0.1
}

# The stack combines the four base configs above, in this order.
# No explicit seed is passed here, unlike the base learners.
stacking_config = StackingModelConfig(
    task="classification",
    model_configs=[model_config_1, model_config_2, model_config_3, model_config_4],
    head="LinearHead",
    head_config=_stacking_head_config,
    learning_rate=1e-3
)

## CategoryEmbeddingModelConfig

In [None]:
# Category-embedding model: model_config_1 combined with the data, trainer and
# optimizer configs defined in the config cell.
cat_emb_model = TabularModel(
    model_config=model_config_1,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)

In [7]:
# 5-fold stratified cross-validation of `cat_emb_model`: records the validation
# ROC AUC of every fold and averages the per-fold test-set predictions.

# ROC AUC
def _roc_auc_scoer(y_true, y_pred):
    """Score `y_true` against the '임신 성공 여부_1_probability' column of `y_pred`."""
    return roc_auc_score(y_true, y_pred['임신 성공 여부_1_probability'])

# Seed shared by the fold splitter and the datamodule preparation below.
seed = 333
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Per-fold validation scores and per-fold test predictions.
roc_metrics = []
test_preds = []

# Reload the raw training / evaluation data (ID column dropped); preprocessing
# is applied again inside every fold.
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

# StratifiedKFold, stratified on the target column
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train["임신 성공 여부"])):

    # Split out the current fold's train / validation rows
    train_fold = train.iloc[train_idx].copy().reset_index(drop=True)
    val_fold = train.iloc[val_idx].copy().reset_index(drop=True)

    # Separate copies so the test set goes through its own all_process call
    # paired with this fold's training rows.
    train2_fold = train_fold.copy()
    test_fold = test.copy()

    # preprocessing: (train, validation) pair, then (train copy, test) pair
    train_fold, val_fold = preprocessing.all_process(train_fold, val_fold)
    train2_fold, test_fold = preprocessing.all_process(train2_fold, test_fold)

    # On the first fold, build the datamodule and the model; on later folds the
    # datamodule is re-created via copy() with the new fold's data and the same
    # model object is reused.
    if fold == 0:
        datamodule = cat_emb_model.prepare_dataloader(train=train_fold, validation=val_fold, seed=seed)
        model = cat_emb_model.prepare_model(datamodule)
    else:
        datamodule = datamodule.copy(train=train_fold, validation=val_fold)

    # Train the model (one training run per fold)
    cat_emb_model.train(model, datamodule)

    # Predict on the validation data
    pred_df = cat_emb_model.predict(val_fold)

    # Predict on the test data and keep the result for averaging
    pred_test = cat_emb_model.predict(test_fold)
    test_preds.append(pred_test)

    # Compute this fold's ROC AUC
    fold_roc = _roc_auc_scoer(val_fold["임신 성공 여부"], pred_df)
    roc_metrics.append(fold_roc)

    print(f"Fold {fold+1} ROC AUC: {fold_roc:.8f}")

    # Reset the model weights before the next fold
    cat_emb_model.model.reset_weights()

# Final predictions: element-wise mean of the per-fold prediction outputs.
# NOTE(review): a later cell reads column index 1 of this array positionally —
# presumably the class-1 probability; confirm against predict()'s column order.
final_test_preds = np.mean(test_preds, axis=0)

# Print the mean ROC AUC over all folds
average_roc_auc = np.mean(roc_metrics)
print(f"Average ROC AUC over {skf.n_splits} folds: {average_roc_auc:.8f}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


You are using a CUDA device ('NVIDIA A100 80GB PCIe MIG 1g.10gb') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 1 ROC AUC: 0.73223226


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 2 ROC AUC: 0.73496620


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 3 ROC AUC: 0.73385512


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 4 ROC AUC: 0.73323385


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 5 ROC AUC: 0.73403010
Average ROC AUC over 5 folds: 0.73366351


In [None]:
# Single-column frame holding column 1 of the fold-averaged test predictions
# (presumably the class-1 probability — confirm against predict() output order).
tmp_submission = pd.DataFrame({f'cat_emb_model_{data_seed}': final_test_preds[:,1]})

# Accumulator that collects one prediction column per model.
# Take an independent copy: a plain assignment would make both names point at
# the same DataFrame, so columns added to the accumulator would also appear in
# `tmp_submission` (and vice versa).
tmp_submission_csv = tmp_submission.copy()

In [11]:
import sys

# Add the directory two levels up to the import path so that `cal_auc` can be
# imported (presumably the module lives there — verify). The membership check
# keeps repeated executions of this cell from appending duplicate entries.
if "../../" not in sys.path:
    sys.path.append("../../")
from cal_auc import calculate_auc

# Score this model's test predictions for the current data seed and report it.
score = calculate_auc(tmp_submission, seed=data_seed)
print(f'[Seed: {data_seed}]: {score}')

[Seed: 1]: 0.7366768328607607


## FTTransformer

In [12]:
# FT-Transformer model: model_config_2 combined with the data, trainer and
# optimizer configs defined in the config cell.
ftt_model = TabularModel(
    model_config=model_config_2,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)

In [13]:
# 5-fold stratified cross-validation of `ftt_model`: records the validation
# ROC AUC of every fold and averages the per-fold test-set predictions.

# ROC AUC
def _roc_auc_scoer(y_true, y_pred):
    """Score `y_true` against the '임신 성공 여부_1_probability' column of `y_pred`."""
    return roc_auc_score(y_true, y_pred['임신 성공 여부_1_probability'])

# Seed shared by the fold splitter and the datamodule preparation below.
seed = 333
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Per-fold validation scores and per-fold test predictions.
roc_metrics = []
test_preds = []

# Reload the raw training / evaluation data (ID column dropped); preprocessing
# is applied again inside every fold.
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

# StratifiedKFold, stratified on the target column
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train["임신 성공 여부"])):

    # Split out the current fold's train / validation rows
    train_fold = train.iloc[train_idx].copy().reset_index(drop=True)
    val_fold = train.iloc[val_idx].copy().reset_index(drop=True)

    # Separate copies so the test set goes through its own all_process call
    # paired with this fold's training rows.
    train2_fold = train_fold.copy()
    test_fold = test.copy()

    # preprocessing: (train, validation) pair, then (train copy, test) pair
    train_fold, val_fold = preprocessing.all_process(train_fold, val_fold)
    train2_fold, test_fold = preprocessing.all_process(train2_fold, test_fold)

    # On the first fold, build the datamodule and the model; on later folds the
    # datamodule is re-created via copy() with the new fold's data and the same
    # model object is reused.
    if fold == 0:
        datamodule = ftt_model.prepare_dataloader(train=train_fold, validation=val_fold, seed=seed)
        model = ftt_model.prepare_model(datamodule)
    else:
        datamodule = datamodule.copy(train=train_fold, validation=val_fold)

    # Train the model (one training run per fold)
    ftt_model.train(model, datamodule)

    # Predict on the validation data
    pred_df = ftt_model.predict(val_fold)

    # Predict on the test data and keep the result for averaging
    pred_test = ftt_model.predict(test_fold)
    test_preds.append(pred_test)

    # Compute this fold's ROC AUC
    fold_roc = _roc_auc_scoer(val_fold["임신 성공 여부"], pred_df)
    roc_metrics.append(fold_roc)

    print(f"Fold {fold+1} ROC AUC: {fold_roc:.8f}")

    # Reset the model weights before the next fold
    ftt_model.model.reset_weights()

# Final predictions: element-wise mean of the per-fold prediction outputs.
# NOTE(review): the following cell reads column index 1 of this array
# positionally — presumably the class-1 probability; confirm column order.
final_test_preds = np.mean(test_preds, axis=0)

# Print the mean ROC AUC over all folds
average_roc_auc = np.mean(roc_metrics)
print(f"Average ROC AUC over {skf.n_splits} folds: {average_roc_auc:.8f}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 1 ROC AUC: 0.73729094


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 2 ROC AUC: 0.73260450


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 3 ROC AUC: 0.73602420


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 4 ROC AUC: 0.73485214


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 5 ROC AUC: 0.73410831
Average ROC AUC over 5 folds: 0.73497602


In [14]:
# Add this model's column to the per-model accumulator. The column name is
# derived from `data_seed` (rather than a hard-coded "_1") so it stays in sync
# with the single-model frame built on the next line when the seed changes.
tmp_submission_csv[f'ftt_model_{data_seed}'] = final_test_preds[:, 1]
# Single-model frame used for scoring this model on its own.
tmp_submission = pd.DataFrame({f'ftt_model_{data_seed}': final_test_preds[:,1]})

In [15]:
# Score the current single-model prediction frame for this data seed and print it.
score = calculate_auc(tmp_submission, seed=data_seed)
print(f'[Seed: {data_seed}]: {score}')

[Seed: 1]: 0.7362884020511868


## TabNetModel

In [16]:
# TabNet model: model_config_3 combined with the data, trainer and optimizer
# configs defined in the config cell.
tabn_model = TabularModel(
    model_config=model_config_3,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)

In [17]:
# 5-fold stratified cross-validation of `tabn_model`: records the validation
# ROC AUC of every fold and averages the per-fold test-set predictions.

# ROC AUC
def _roc_auc_scoer(y_true, y_pred):
    """Score `y_true` against the '임신 성공 여부_1_probability' column of `y_pred`."""
    return roc_auc_score(y_true, y_pred['임신 성공 여부_1_probability'])

# Seed shared by the fold splitter and the datamodule preparation below.
seed = 333
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Per-fold validation scores and per-fold test predictions.
roc_metrics = []
test_preds = []

# Reload the raw training / evaluation data (ID column dropped); preprocessing
# is applied again inside every fold.
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

# StratifiedKFold, stratified on the target column
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train["임신 성공 여부"])):

    # Split out the current fold's train / validation rows
    train_fold = train.iloc[train_idx].copy().reset_index(drop=True)
    val_fold = train.iloc[val_idx].copy().reset_index(drop=True)

    # Separate copies so the test set goes through its own all_process call
    # paired with this fold's training rows.
    train2_fold = train_fold.copy()
    test_fold = test.copy()

    # preprocessing: (train, validation) pair, then (train copy, test) pair
    train_fold, val_fold = preprocessing.all_process(train_fold, val_fold)
    train2_fold, test_fold = preprocessing.all_process(train2_fold, test_fold)

    # On the first fold, build the datamodule and the model; on later folds the
    # datamodule is re-created via copy() with the new fold's data and the same
    # model object is reused.
    if fold == 0:
        datamodule = tabn_model.prepare_dataloader(train=train_fold, validation=val_fold, seed=seed)
        model = tabn_model.prepare_model(datamodule)
    else:
        datamodule = datamodule.copy(train=train_fold, validation=val_fold)

    # Train the model (one training run per fold)
    tabn_model.train(model, datamodule)

    # Predict on the validation data
    pred_df = tabn_model.predict(val_fold)

    # Predict on the test data and keep the result for averaging
    pred_test = tabn_model.predict(test_fold)
    test_preds.append(pred_test)

    # Compute this fold's ROC AUC
    fold_roc = _roc_auc_scoer(val_fold["임신 성공 여부"], pred_df)
    roc_metrics.append(fold_roc)

    print(f"Fold {fold+1} ROC AUC: {fold_roc:.8f}")

    # Reset the model weights before the next fold
    tabn_model.model.reset_weights()

# Final predictions: element-wise mean of the per-fold prediction outputs.
# NOTE(review): the following cell reads column index 1 of this array
# positionally — presumably the class-1 probability; confirm column order.
final_test_preds = np.mean(test_preds, axis=0)

# Print the mean ROC AUC over all folds
average_roc_auc = np.mean(roc_metrics)
print(f"Average ROC AUC over {skf.n_splits} folds: {average_roc_auc:.8f}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

`Trainer.fit` stopped: `max_epochs=20` reached.


Fold 1 ROC AUC: 0.67664421


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 2 ROC AUC: 0.68825440


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 3 ROC AUC: 0.68979583


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 4 ROC AUC: 0.69633903


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

`Trainer.fit` stopped: `max_epochs=20` reached.


Fold 5 ROC AUC: 0.69013270
Average ROC AUC over 5 folds: 0.68823323


In [18]:
# Add this model's column to the per-model accumulator. The column name is
# derived from `data_seed` (rather than a hard-coded "_1") so it stays in sync
# with the single-model frame built on the next line when the seed changes.
tmp_submission_csv[f'tabn_model_{data_seed}'] = final_test_preds[:, 1]
# Single-model frame used for scoring this model on its own.
tmp_submission = pd.DataFrame({f'tabn_model_{data_seed}': final_test_preds[:,1]})

In [19]:
# Score the current single-model prediction frame for this data seed and print it.
score = calculate_auc(tmp_submission, seed=data_seed)
print(f'[Seed: {data_seed}]: {score}')

[Seed: 1]: 0.7064222680251903


## GandalfModel

In [20]:
# GANDALF model: model_config_4 combined with the data, trainer and optimizer
# configs defined in the config cell.
gandalf_model = TabularModel(
    model_config=model_config_4,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)

In [21]:
# 5-fold stratified cross-validation of `gandalf_model`: records the validation
# ROC AUC of every fold and averages the per-fold test-set predictions.

# ROC AUC
def _roc_auc_scoer(y_true, y_pred):
    """Score `y_true` against the '임신 성공 여부_1_probability' column of `y_pred`."""
    return roc_auc_score(y_true, y_pred['임신 성공 여부_1_probability'])

# Seed shared by the fold splitter and the datamodule preparation below.
seed = 333
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Per-fold validation scores and per-fold test predictions.
roc_metrics = []
test_preds = []

# Reload the raw training / evaluation data (ID column dropped); preprocessing
# is applied again inside every fold.
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

# StratifiedKFold, stratified on the target column
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train["임신 성공 여부"])):

    # Split out the current fold's train / validation rows
    train_fold = train.iloc[train_idx].copy().reset_index(drop=True)
    val_fold = train.iloc[val_idx].copy().reset_index(drop=True)

    # Separate copies so the test set goes through its own all_process call
    # paired with this fold's training rows.
    train2_fold = train_fold.copy()
    test_fold = test.copy()

    # preprocessing: (train, validation) pair, then (train copy, test) pair
    train_fold, val_fold = preprocessing.all_process(train_fold, val_fold)
    train2_fold, test_fold = preprocessing.all_process(train2_fold, test_fold)

    # On the first fold, build the datamodule and the model; on later folds the
    # datamodule is re-created via copy() with the new fold's data and the same
    # model object is reused.
    if fold == 0:
        datamodule = gandalf_model.prepare_dataloader(train=train_fold, validation=val_fold, seed=seed)
        model = gandalf_model.prepare_model(datamodule)
    else:
        datamodule = datamodule.copy(train=train_fold, validation=val_fold)

    # Train the model (one training run per fold)
    gandalf_model.train(model, datamodule)

    # Predict on the validation data
    pred_df = gandalf_model.predict(val_fold)

    # Predict on the test data and keep the result for averaging
    pred_test = gandalf_model.predict(test_fold)
    test_preds.append(pred_test)

    # Compute this fold's ROC AUC
    fold_roc = _roc_auc_scoer(val_fold["임신 성공 여부"], pred_df)
    roc_metrics.append(fold_roc)

    print(f"Fold {fold+1} ROC AUC: {fold_roc:.8f}")

    # Reset the model weights before the next fold
    gandalf_model.model.reset_weights()

# Final predictions: element-wise mean of the per-fold prediction outputs.
# NOTE(review): the following cell reads column index 1 of this array
# positionally — presumably the class-1 probability; confirm column order.
final_test_preds = np.mean(test_preds, axis=0)

# Print the mean ROC AUC over all folds
average_roc_auc = np.mean(roc_metrics)
print(f"Average ROC AUC over {skf.n_splits} folds: {average_roc_auc:.8f}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 1 ROC AUC: 0.73662149


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 2 ROC AUC: 0.73376360


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 3 ROC AUC: 0.73415166


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 4 ROC AUC: 0.73250897


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 5 ROC AUC: 0.73506473
Average ROC AUC over 5 folds: 0.73442209


In [22]:
# Add this model's column to the per-model accumulator. The column name is
# derived from `data_seed` (rather than a hard-coded "_1") so it stays in sync
# with the single-model frame built on the next line when the seed changes.
tmp_submission_csv[f'gandalf_model_{data_seed}'] = final_test_preds[:, 1]
# Single-model frame used for scoring this model on its own. The name now
# includes the underscore before the seed, matching the accumulator column and
# the "<model>_<seed>" pattern used for the other models in this notebook.
tmp_submission = pd.DataFrame({f'gandalf_model_{data_seed}': final_test_preds[:,1]})

In [23]:
# Score the current single-model prediction frame for this data seed and print it.
score = calculate_auc(tmp_submission, seed=data_seed)
print(f'[Seed: {data_seed}]: {score}')

[Seed: 1]: 0.7361565065776497


## Stacking

In [24]:
# Stacking model: stacking_config (which lists the four base configs) combined
# with the data, trainer and optimizer configs defined in the config cell.
stacking_model = TabularModel(
    model_config=stacking_config,
    data_config=data_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
)

In [26]:
# 5-fold stratified cross-validation of `stacking_model`: records the validation
# ROC AUC of every fold and averages the per-fold test-set predictions.

# ROC AUC
def _roc_auc_scoer(y_true, y_pred):
    """Score `y_true` against the '임신 성공 여부_1_probability' column of `y_pred`."""
    return roc_auc_score(y_true, y_pred['임신 성공 여부_1_probability'])

# Seed shared by the fold splitter and the datamodule preparation below.
seed = 333
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Per-fold validation scores and per-fold test predictions.
roc_metrics = []
test_preds = []

# Reload the raw training / evaluation data (ID column dropped); preprocessing
# is applied again inside every fold.
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

# StratifiedKFold, stratified on the target column
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train["임신 성공 여부"])):

    # Split out the current fold's train / validation rows
    train_fold = train.iloc[train_idx].copy().reset_index(drop=True)
    val_fold = train.iloc[val_idx].copy().reset_index(drop=True)

    # Separate copies so the test set goes through its own all_process call
    # paired with this fold's training rows.
    train2_fold = train_fold.copy()
    test_fold = test.copy()

    # preprocessing: (train, validation) pair, then (train copy, test) pair
    train_fold, val_fold = preprocessing.all_process(train_fold, val_fold)
    train2_fold, test_fold = preprocessing.all_process(train2_fold, test_fold)

    # On the first fold, build the datamodule and the model; on later folds the
    # datamodule is re-created via copy() with the new fold's data and the same
    # model object is reused.
    if fold == 0:
        datamodule = stacking_model.prepare_dataloader(train=train_fold, validation=val_fold, seed=seed)
        model = stacking_model.prepare_model(datamodule)
    else:
        datamodule = datamodule.copy(train=train_fold, validation=val_fold)

    # Train the model (one training run per fold)
    stacking_model.train(model, datamodule)

    # Predict on the validation data
    pred_df = stacking_model.predict(val_fold)

    # Predict on the test data and keep the result for averaging
    pred_test = stacking_model.predict(test_fold)
    test_preds.append(pred_test)

    # Compute this fold's ROC AUC
    fold_roc = _roc_auc_scoer(val_fold["임신 성공 여부"], pred_df)
    roc_metrics.append(fold_roc)

    print(f"Fold {fold+1} ROC AUC: {fold_roc:.8f}")

    # Reset the model weights before the next fold
    stacking_model.model.reset_weights()

# Final predictions: element-wise mean of the per-fold prediction outputs.
# NOTE(review): the following cell reads column index 1 of this array
# positionally — presumably the class-1 probability; confirm column order.
final_test_preds = np.mean(test_preds, axis=0)

# Print the mean ROC AUC over all folds
average_roc_auc = np.mean(roc_metrics)
print(f"Average ROC AUC over {skf.n_splits} folds: {average_roc_auc:.8f}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 1 ROC AUC: 0.73847651


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 2 ROC AUC: 0.73504577


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 3 ROC AUC: 0.73563085


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 4 ROC AUC: 0.73534097


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Fold 5 ROC AUC: 0.73564421
Average ROC AUC over 5 folds: 0.73602766


In [27]:
# Add this model's column to the per-model accumulator. The column name is
# derived from `data_seed` (rather than a hard-coded "_1") so it stays in sync
# with the single-model frame built on the next line when the seed changes.
tmp_submission_csv[f'stacking_model_{data_seed}'] = final_test_preds[:, 1]
# Single-model frame used for scoring this model on its own.
tmp_submission = pd.DataFrame({f'stacking_model_{data_seed}': final_test_preds[:,1]})

In [28]:
# Score the current single-model prediction frame for this data seed and print it.
score = calculate_auc(tmp_submission, seed=data_seed)
print(f'[Seed: {data_seed}]: {score}')

[Seed: 1]: 0.7375005576466684


In [30]:
# Display the accumulated prediction frame (one column per model).
tmp_submission_csv

Unnamed: 0,cat_emb_model_1,ftt_model_1,tabn_model_1,gandalf_model_1,stacking_model_1
0,0.168441,0.165553,0.252536,0.153588,0.178953
1,0.191748,0.219913,0.210454,0.206622,0.216393
2,0.003227,0.002903,0.026940,0.001807,0.001580
3,0.215366,0.212969,0.291090,0.218407,0.230265
4,0.374211,0.396720,0.244180,0.389833,0.369396
...,...,...,...,...,...
51266,0.001855,0.002921,0.108385,0.001485,0.000498
51267,0.246433,0.194121,0.261618,0.245292,0.231216
51268,0.095030,0.095089,0.178024,0.105684,0.105637
51269,0.000024,0.002762,0.032066,0.000782,0.000032


In [None]:
# tmp_submission_csv.to_csv('predictions_stacking.csv', index=False, encoding='utf-8-sig')