In [1]:
import pandas as pd
import numpy as np
import random
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, QuantileTransformer, MultiLabelBinarizer
from sklearn.impute import SimpleImputer

from pytorch_tabular import TabularModel
from pytorch_tabular.models import GANDALFConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)

from LG_Aimers_6th.Eunhak.NODE import valid_auc_scores
from LG_Aimers_6th.cal_auc import calculate_auc
import warnings
warnings.filterwarnings(action='ignore')

## 2. Data Load

In [2]:
data_seed = 1

# CSV locations of the custom train/test split identified by `data_seed`.
train_path = f'../data/custom_train_{data_seed}.csv'
test_path = f'../data/custom_test_{data_seed}.csv'

# Load the training and evaluation data, discarding the 'ID' column from both.
train, test = (
    pd.read_csv(csv_path).drop(columns=['ID'])
    for csv_path in (train_path, test_path)
)

print(train.shape, test.shape)

(205080, 68) (51271, 67)


In [3]:
from preprocess_DL import all_process

# Start again from the raw CSVs: `train`/`test` are overwritten with the
# processed frames below, so re-reading keeps this cell safe to re-run.
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

# NOTE(review): all_process is a project helper whose internals are not visible
# here; it returns the processed (train, test) pair.
train, test = all_process(train, test)

# Columns carrying a pandas categorical dtype are the categorical features.
# An isinstance check on the dtype replaces pd.api.types.is_categorical_dtype,
# which is deprecated since pandas 2.1 (the warning is hidden by the global
# warnings filter set in the import cell).
cat_cols = [col for col, dtype in train.dtypes.items() if isinstance(dtype, pd.CategoricalDtype)]
# Everything else, apart from the target column, is treated as numeric.
numeric_cols = [col for col in train.columns if col not in cat_cols and col != '임신 성공 여부']

print(f'수치형 변수: {len(numeric_cols)}개 \n{numeric_cols}')
print(f'범주형 변수: {len(cat_cols)}개 \n{cat_cols}')
print(train.shape, test.shape)

수치형 변수: 57개 
['임신 시도 또는 마지막 임신 경과 연수', '배란 자극 여부', '단일 배아 이식 여부', '착상 전 유전 검사 사용 여부', '착상 전 유전 진단 사용 여부', '남성 주 불임 원인', '남성 부 불임 원인', '여성 주 불임 원인', '여성 부 불임 원인', '부부 주 불임 원인', '부부 부 불임 원인', '불명확 불임 원인', '불임 원인 - 난관 질환', '불임 원인 - 남성 요인', '불임 원인 - 배란 장애', '불임 원인 - 자궁경부 문제', '불임 원인 - 자궁내막증', '불임 원인 - 정자 농도', '불임 원인 - 정자 운동성', '불임 원인 - 정자 형태', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수', 'DI 시술 횟수', '총 임신 횟수', 'IVF 임신 횟수', 'DI 임신 횟수', '총 출산 횟수', 'IVF 출산 횟수', 'DI 출산 횟수', '총 생성 배아 수', '미세주입된 난자 수', '미세주입에서 생성된 배아 수', '이식된 배아 수', '미세주입 배아 이식 수', '저장된 배아 수', '미세주입 후 저장된 배아 수', '해동된 배아 수', '해동 난자 수', '수집된 신선 난자 수', '저장된 신선 난자 수', '혼합된 난자 수', '파트너 정자와 혼합된 난자 수', '기증자 정자와 혼합된 난자 수', '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부', '대리모 여부', 'PGD 시술 여부', 'PGS 시술 여부', '난자 혼합 경과일', '배아 이식 경과일', '배아 해동 경과일', '시술_임신', '배아생성이유_기증용', '배아생성이유_난자 저장용', '배아생성이유_배아 저장용', '배아생성이유_현재 시술용']
범주형 변수: 8개 
['시술 시기 코드', '시술 당시 나이', '배란 유도 유형', '난자 출처', '정자 출처', '난자 기증자 나이', '정자 기증자 나이', '시술유형_통합']
(205080, 66) (51271, 65)


In [14]:
# Trainer settings; the same config object is passed to every TabularModel
# built in the cross-validation cell.
trainer_config = TrainerConfig(
    # Hardware and console output.
    accelerator='gpu',
    progress_bar='simple',
    # Batch sizes, epoch budget, and no automatic learning-rate search.
    batch_size=2048,
    data_aware_init_batch_size=2000,
    max_epochs=100,
    auto_lr_find=False,
    # Early stopping keyed on 'valid_loss' in 'min' mode.
    early_stopping='valid_loss',
    early_stopping_mode='min',
    early_stopping_min_delta=0.001,
    early_stopping_patience=3,
    # Checkpointing keyed on 'valid_loss', keeping the top 1 under 'saved_models'.
    checkpoints='valid_loss',
    checkpoints_save_top_k=1,
    checkpoints_path='saved_models',
)

# Optimizer configuration left entirely at the library defaults.
optimizer_config = OptimizerConfig()

In [15]:
# 5-fold stratified cross-validation of a GANDALF classifier.
# For each fold: preprocess, fit, score the held-out fold, and predict the test
# set. The per-fold test predictions are averaged at the end.
seed = 333
target_col = '임신 성공 여부'

valid_scores = []  # validation ROC-AUC of each fold
test_preds = []    # test-set prediction Series of each fold

# Reload the raw data: earlier cells rebind `train`/`test` to processed frames,
# while preprocessing here is redone inside every fold.
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train[target_col])):
    fold_train = train.iloc[train_idx].copy().reset_index(drop=True)
    fold_valid = train.iloc[valid_idx].copy().reset_index(drop=True)
    # Untouched copy of the fold's training rows for the second all_process call
    # (presumably because all_process may modify its inputs — TODO confirm).
    fold_train2 = fold_train.copy()
    fold_test = test.copy()

    # NOTE(review): all_process is an opaque project helper taking and returning
    # a (train, other) pair; it is called once for the validation fold and once
    # for the test set, both times with this fold's training rows.
    fold_train, fold_valid = all_process(fold_train, fold_valid)
    _, fold_test = all_process(fold_train2, fold_test)

    # Categorical features are the columns with a pandas categorical dtype.
    # isinstance on the dtype replaces pd.api.types.is_categorical_dtype, which
    # is deprecated since pandas 2.1.
    cat_cols = [col for col, dtype in fold_train.dtypes.items() if isinstance(dtype, pd.CategoricalDtype)]
    numeric_cols = [col for col in fold_train.columns if col not in cat_cols and col != target_col]
    data_config = DataConfig(
        target=[target_col],
        continuous_cols=numeric_cols,
        categorical_cols=cat_cols,
    )

    model_config = GANDALFConfig(
        task="classification",
        gflu_stages=6,
        gflu_feature_init_sparsity=0.3,
        gflu_dropout=0.0,
        learning_rate=1e-3,
        metrics=["auroc"],
        metrics_prob_input=[True],
        seed=seed,
    )

    # A fresh model per fold; trainer_config and optimizer_config come from the
    # configuration cell above.
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
        verbose=False,
    )

    tabular_model.fit(train=fold_train, validation=fold_valid, seed=seed)

    # NOTE(review): .iloc[:, 1] assumes the second column of predict()'s output
    # is the positive-class probability — verify against the installed
    # pytorch_tabular version.
    valid_preds = tabular_model.predict(fold_valid).iloc[:, 1]
    valid_score = roc_auc_score(fold_valid[target_col], valid_preds)
    valid_scores.append(valid_score)

    test_pred = tabular_model.predict(fold_test).iloc[:, 1]
    test_preds.append(test_pred)
    # calculate_auc is a project helper that scores the test predictions for
    # the split selected by data_seed.
    test_score = calculate_auc(test_pred, seed=data_seed)

    print(f'[Data {data_seed} Fold {fold + 1}] Valid AUC: {valid_score:.5f}, Test AUC: {test_score:.6f}')

# Ensemble: element-wise mean of the per-fold test predictions.
final_test_score = calculate_auc(np.mean(test_preds, axis=0), seed=data_seed)

print('=' * 60)
print(f"[Data {data_seed}] Valid AUC: {np.mean(valid_scores, axis=0):.5f}, Test AUC: {final_test_score}")

Seed set to 333
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | _backbone        | GANDALFBackbone  | 256 K  | train
1 | _embedding_layer | Embedding1dLayer | 307    | train
2 | _head            | Sequential       | 172    | train
3 | loss             | CrossEntropyLoss | 0      | train
--------------------------------------------------------------
256 K     Trainable params
0         Non-trainable params
256 K     Total params
1.026     Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[Data 1 Fold 1] Valid AUC: 0.73559, Test AUC: 0.733179


Seed set to 333
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | _backbone        | GANDALFBackbone  | 256 K  | train
1 | _embedding_layer | Embedding1dLayer | 307    | train
2 | _head            | Sequential       | 172    | train
3 | loss             | CrossEntropyLoss | 0      | train
--------------------------------------------------------------
256 K     Trainable params
0         Non-trainable params
256 K     Total params
1.026     Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[Data 1 Fold 2] Valid AUC: 0.73384, Test AUC: 0.733970


Seed set to 333
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | _backbone        | GANDALFBackbone  | 256 K  | train
1 | _embedding_layer | Embedding1dLayer | 307    | train
2 | _head            | Sequential       | 172    | train
3 | loss             | CrossEntropyLoss | 0      | train
--------------------------------------------------------------
256 K     Trainable params
0         Non-trainable params
256 K     Total params
1.026     Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[Data 1 Fold 3] Valid AUC: 0.73548, Test AUC: 0.734515


Seed set to 333
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | _backbone        | GANDALFBackbone  | 256 K  | train
1 | _embedding_layer | Embedding1dLayer | 307    | train
2 | _head            | Sequential       | 172    | train
3 | loss             | CrossEntropyLoss | 0      | train
--------------------------------------------------------------
256 K     Trainable params
0         Non-trainable params
256 K     Total params
1.026     Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[Data 1 Fold 4] Valid AUC: 0.73314, Test AUC: 0.733946


Seed set to 333
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | _backbone        | GANDALFBackbone  | 256 K  | train
1 | _embedding_layer | Embedding1dLayer | 307    | train
2 | _head            | Sequential       | 172    | train
3 | loss             | CrossEntropyLoss | 0      | train
--------------------------------------------------------------
256 K     Trainable params
0         Non-trainable params
256 K     Total params
1.026     Total estimated model params size (MB)
35        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[Data 1 Fold 5] Valid AUC: 0.73457, Test AUC: 0.733765
[Data 1] Valid AUC: 0.73452, Test AUC: 0.7362658222005021


In [16]:
# One-column frame holding the fold-averaged test predictions; the column is
# named after the data split (`data_seed`).
submission = pd.Series(
    np.mean(test_preds, axis=0),
    name=f'gandalf_{data_seed}',
).to_frame()
submission

Unnamed: 0,gandalf_1
0,0.184353
1,0.218783
2,0.001795
3,0.218861
4,0.379053
...,...
51266,0.001396
51267,0.237536
51268,0.091678
51269,0.000281


In [17]:
# Derive the file name from data_seed, as the column name already is, so that
# changing the seed does not overwrite another split's predictions.
# For data_seed = 1 this still writes 'GANDALF_1.csv'.
submission.to_csv(f'GANDALF_{data_seed}.csv', index=False)