In [1]:
import numpy as np
from sklearn import metrics

def f1_score(true_prob, pred_prob):
    """F1 score of the binarised probabilities.

    Both inputs are converted to NumPy arrays and thresholded with a strict
    ``> 0.5`` comparison (a value of exactly 0.5 becomes class 0). The two
    0/1 label vectors are then scored with ``sklearn.metrics.f1_score``.
    """
    threshold = 0.5
    labels_true, labels_pred = (
        (np.array(probs) > threshold).astype(int)
        for probs in (true_prob, pred_prob)
    )
    return metrics.f1_score(labels_true, labels_pred)

def weighted_brier_score(true_prob, pred_prob, alpha=4):
    """Weighted Brier-style score, reported as ``max(0, 1 - weighted MSE)``.

    Every sample is weighted by ``1 + alpha * t + |0.5 - t| ** 2`` where ``t``
    is its target probability; the squared errors are averaged with those
    weights and the result is subtracted from 1 and floored at 0.

    Args:
        true_prob: Target probabilities (any array-like).
        pred_prob: Predicted probabilities (any array-like, same shape).
        alpha: Multiplier of the target-proportional part of the weight.

    Returns:
        The adjusted score (higher is better, never below 0).
    """
    # Coerce to arrays so plain Python lists work too; with a list,
    # `alpha * true_prob` would repeat the list instead of scaling it.
    true_prob = np.asarray(true_prob)
    pred_prob = np.asarray(pred_prob)

    weights = 1 + alpha * true_prob + np.abs(0.5 - true_prob) ** 2
    brier = np.sum(weights * (true_prob - pred_prob) ** 2) / np.sum(weights)
    adjusted_brier = max(0, 1 - brier)
    return adjusted_brier

def competition_metric(true_prob, pred_prob):
    """Competition score: ``0.5 * weighted_brier_score + 0.5 * f1_score``.

    Args:
        true_prob: Target probabilities (array-like).
        pred_prob: Predicted probabilities (array-like, same shape).

    Returns:
        The blended score.

    Raises:
        ValueError: If the shapes differ, or if the predictions contain NaN,
            contain +/-inf, or fall outside the closed interval [0, 1].
    """
    true_prob = np.array(true_prob)
    pred_prob = np.array(pred_prob)

    if true_prob.shape != pred_prob.shape:
        raise ValueError("예측값과 정답값의 shape이 일치하지 않습니다.")
    if np.isnan(pred_prob).any():
        raise ValueError("예측값에 NaN이 포함되어 있습니다.")
    # The infinity check has to run before the range check: +/-inf also fails
    # `0 <= p <= 1`, so with the opposite order this branch is unreachable and
    # infinite predictions are reported with the "out of range" message.
    if not np.isfinite(pred_prob).all():
        raise ValueError("예측값에 inf 또는 -inf가 포함되어 있습니다.")
    if not ((0 <= pred_prob) & (pred_prob <= 1)).all():
        raise ValueError("예측값이 0~1 범위를 벗어났습니다.")

    wbs = weighted_brier_score(true_prob, pred_prob)
    f1 = f1_score(true_prob, pred_prob)
    score = 0.5 * wbs + 0.5 * f1
    return score

In [2]:
# conda install -c conda-forge faiss-gpu

# conda 가상환경 상에서 설치

In [3]:
import sys
import os
from pathlib import Path

# Put <current working directory>/tabular_dl_tabr at the front of sys.path
# (presumably so the `bin.tabr` import below resolves - confirm).
project_path = os.path.join(os.getcwd(), "tabular_dl_tabr")
if sys.path.count(project_path) == 0:
    sys.path.insert(0, project_path)


# Export the project directory through the PROJECT_DIR environment variable.
# NOTE(review): this is a machine-specific absolute path; on any other machine
# the directory will simply be created empty by the guard below.
project_dir = Path(r"C:\workspace\LG_Aimers_6th\Eunhak\tabular_dl_tabr")
os.environ['PROJECT_DIR'] = str(project_dir)

# Create the directory (including parents) when it does not exist yet.
if not project_dir.exists():
    project_dir.mkdir(parents=True, exist_ok=True)

# Restrict CUDA to the first GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import pandas as pd
import numpy as np
import random
import math
import scipy
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

import delu
import torch
import torch.nn as nn
import torch.optim as optim
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from bin.tabr import Model
from LG_Aimers_6th.cal_auc import calculate_auc

In [4]:
# Locations of the offline train / test / sample-submission CSV files.
train_path = '../offline_data/train_aimers_6th_offline.csv'
test_path = '../offline_data/test_aimers_6th_offline.csv'
sample_path = '../offline_data/sample_submission_aimers_6th_offline.csv'

# Read both tables (BOM-tolerant encoding) and discard the 'ID' column.
train, test = (
    pd.read_csv(csv_path, encoding='utf-8-sig').drop(columns=['ID'])
    for csv_path in (train_path, test_path)
)

task_type = 'regression'

print(train.shape, test.shape)

(126244, 34) (54412, 33)


In [5]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, QuantileTransformer, MultiLabelBinarizer
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
import random
import warnings
warnings.filterwarnings(action='ignore')

def drop_cols_with_na(train_df, val_df):
    """Remove the columns that contain missing values from both frames.

    The original author's note says these columns are meant to be brought
    back later once missing-value imputation is implemented.

    Returns:
        ``(train_df, val_df)`` as new frames without the listed columns.
    """
    # Categorical columns with missing values.
    cat_cols_with_na = [
        '이전 총 임신 횟수',
        '이전 총 임신 성공 횟수',
        # Author's note: from here on the missing rows are 100% DI procedures.
        '총 생성 배아 수',
        '저장된 배아 수',
        '채취된 신선 난자 수',
        '수정 시도된 난자 수',
    ]
    # Numeric columns with missing values (author's note: missing only for DI).
    numeric_cols_with_na = [
        '이식된 배아 수',
        '미세주입(ICSI) 배아 이식 수',
        '배아 이식 후 경과일',
    ]

    column_groups = (cat_cols_with_na, numeric_cols_with_na)
    for group in column_groups:
        train_df = train_df.drop(columns=group)
    for group in column_groups:
        val_df = val_df.drop(columns=group)
    return train_df, val_df


def 시술유형(train, test):
    """Merge the two procedure-type columns into one feature, '시술유형_통합'.

    Steps (rare-value filtering is learned from ``train`` only):
      1. Missing '세부 시술 유형' values become "Unknown".
      2. The separators " / " and ":" are replaced by "," and spaces removed.
      3. Normalised values seen fewer than 100 times in ``train`` become
         "Unknown" in both frames.
      4. Each value is mapped to a coarse label by ``categorize_procedure``.
      5. '시술유형_통합' is built as '<시술 유형>_<coarse label>' and the two
         source columns are dropped.

    Note: steps 1-5 assign columns on the frames that were passed in; only the
    final ``drop`` produces new frames.

    Returns:
        ``(train, test)`` with the two source columns replaced by '시술유형_통합'.
    """
    train['세부 시술 유형'] = train['세부 시술 유형'].fillna("Unknown")
    test['세부 시술 유형'] = test['세부 시술 유형'].fillna("Unknown")

    def categorize_procedure(proc):
        """Map a comma-separated procedure string to a coarse label."""
        # Keep non-empty tokens that are not purely numeric.
        tokens = [token.strip() for token in proc.split(",") if token.strip() and not token.strip().isdigit()]
        # Categories are assigned in priority order.
        if tokens.count("Unknown") >= 1:
            return "Unknown"
        if tokens.count("AH") >= 1:
            return "AH"
        if tokens.count("BLASTOCYST") >= 1:
            return "BLASTOCYST"
        if tokens.count("ICSI") >= 2 or tokens.count("IVF") >= 2:
            return "2ICSI_2IVF"
        if tokens.count("IVF") >= 1 and tokens.count("ICSI") >= 1:
            return "IVF_ICSI"
        # `tokens` is a list, so it has to be compared with a list; comparing
        # it with a bare string was always False (dead branches). The result
        # is unchanged because the fallback join returns the same label.
        if tokens == ["ICSI"]:
            return "ICSI"
        if tokens == ["IVF"]:
            return "IVF"
        return ",".join(tokens) if tokens else None

    for df in [train, test]:
        # Literal (non-regex) replacements, independent of the pandas default.
        df['세부 시술 유형'] = df['세부 시술 유형'].str.replace(" / ", ",", regex=False)
        df['세부 시술 유형'] = df['세부 시술 유형'].str.replace(":", ",", regex=False)
        df['세부 시술 유형'] = df['세부 시술 유형'].str.replace(" ", "", regex=False)

    counts = train['세부 시술 유형'].value_counts()
    allowed_categories = counts[counts >= 100].index.tolist()

    # Values outside allowed_categories are replaced with "Unknown".
    train.loc[~train['세부 시술 유형'].isin(allowed_categories), '세부 시술 유형'] = "Unknown"
    test.loc[~test['세부 시술 유형'].isin(allowed_categories), '세부 시술 유형'] = "Unknown"

    train['세부 시술 유형'] = train['세부 시술 유형'].apply(categorize_procedure)
    test['세부 시술 유형'] = test['세부 시술 유형'].apply(categorize_procedure)

    train['시술유형_통합'] = train['시술 유형'].astype(str) + '_' + train['세부 시술 유형'].astype(str)
    test['시술유형_통합'] = test['시술 유형'].astype(str) + '_' + test['세부 시술 유형'].astype(str)

    drop_cols = ['시술 유형', '세부 시술 유형']
    train = train.drop(drop_cols, axis=1)
    test = test.drop(drop_cols, axis=1)

    return train, test

def 횟수_to_int(df_train, df_val):
    """Turn every count column (name contains '횟수') into integers.

    The columns are selected from ``df_train`` and converted in both frames:
    '6회 이상' is first rewritten to '6회', then the first character of each
    string is parsed as an int. Both frames are modified and returned.
    """
    frames = (df_train, df_val)
    count_cols = [name for name in df_train.columns if '횟수' in name]
    for name in count_cols:
        for frame in frames:
            frame[name] = frame[name].replace({'6회 이상': '6회'})
        for frame in frames:
            # The leading character of e.g. '3회' is the count itself.
            frame[name] = frame[name].str[0].astype(int)

    return df_train, df_val

def 임신_IVF(df_train, df_val):
    """Integer-encode the count columns and add the '임신_IVF' difference.

    For every column whose name contains '횟수' (selected from ``df_train``):
    '6회 이상' is rewritten to '6회', missing values are filled with the mode
    of the training column, and the first character is parsed as an int.
    Finally '임신_IVF' = '이전 총 임신 횟수' - '이전 IVF 시술 횟수' is added to
    both frames, which are modified and returned.
    """
    frames = (df_train, df_val)
    for name in [c for c in df_train.columns if '횟수' in c]:
        for frame in frames:
            frame[name] = frame[name].replace({'6회 이상': '6회'})

        # The fill value always comes from the training frame.
        most_common = df_train[name].mode()[0]
        for frame in frames:
            frame[name] = frame[name].fillna(most_common)

        # Take the first character of the string and cast it to int.
        for frame in frames:
            frame[name] = frame[name].str[0].astype(int)

    for frame in frames:
        frame['임신_IVF'] = frame['이전 총 임신 횟수'] - frame['이전 IVF 시술 횟수']
    return df_train, df_val


def 난자기증자나이(df_train, df_test):
    """Replace the egg-donor age bands with a representative integer age.

    Each band is mapped to its upper bound; '알 수 없음' (unknown) is treated
    like the '만20세 이하' band. Values without a mapping are left unchanged.
    Both frames are modified and returned.
    """
    age_by_band = {
        '만20세 이하': 20,
        '만21-25세': 25,
        '만26-30세': 30,
        '만31-35세': 35,
        '알 수 없음': 20,  # handled the same way as '만20세 이하'
    }
    for frame in (df_train, df_test):
        frame['난자 기증자 나이'] = frame['난자 기증자 나이'].replace(age_by_band)
    return df_train, df_test

def 단일배아이식여부(df_train, df_val):
    """Fill missing '단일 배아 이식 여부' flags with 0 in both frames.

    Both frames are modified and returned.
    """
    flag_col = '단일 배아 이식 여부'
    for frame in (df_train, df_val):
        frame[flag_col] = frame[flag_col].fillna(0)
    return df_train, df_val


def 기증자정자와혼합된난자수(df_train, df_test):
    """Fill missing '기증자 정자와 혼합된 난자 수' values with 2 in both frames.

    Both frames are modified and returned.
    """
    target = "기증자 정자와 혼합된 난자 수"
    for frame in (df_train, df_test):
        frame[target] = frame[target].fillna(2)
    return df_train, df_test

def label_encoding(train, test, cols):
    """Label-encode the given columns in both frames.

    One ``LabelEncoder`` instance is re-fitted on the training values of each
    column and then applied to the matching test column. Both frames are
    modified and returned.
    """
    label_encoder = LabelEncoder()
    for name in cols:
        encoded_train = label_encoder.fit_transform(train[name])
        train[name] = encoded_train
        test[name] = label_encoder.transform(test[name])
    return train, test

def type_to_category(train, test, cols):
    """Cast the given columns to pandas ``category`` dtype in both frames.

    Both frames are modified and returned.
    """
    for frame in (train, test):
        frame[cols] = frame[cols].astype('category')
    return train, test

def impute_nan(train, test, cols=None):
    """Fill missing values with 0 in the selected columns of both frames.

    Args:
        train: Training frame (modified and returned).
        test: Test/validation frame (modified and returned).
        cols: Columns to impute. When omitted, the legacy module-level list
            ``cols_to_impute`` is used if it has been defined.

    Returns:
        ``(train, test)``.

    Raises:
        ValueError: If ``cols`` is omitted and no module-level
            ``cols_to_impute`` exists (previously an opaque NameError).
    """
    if cols is None:
        try:
            # Backward compatibility: the function used to depend on this global.
            cols = cols_to_impute
        except NameError:
            raise ValueError(
                "impute_nan needs the columns to impute: pass `cols=[...]` "
                "or define a module-level `cols_to_impute` list."
            ) from None

    for col in cols:
        train[col] = train[col].fillna(0)
        test[col] = test[col].fillna(0)

    return train, test

def num_feature_scailing(train, test, seed=777):
    """Quantile-transform every non-categorical feature to a normal distribution.

    All columns that are not of ``category`` dtype and are not the target
    '임신 성공 확률' are transformed, so binary columns are scaled as well.
    The transformer is fitted on the training values plus a tiny Gaussian
    noise (std 1e-5) and then applied to the noise-free train and test values.

    Args:
        train: Training frame (modified and returned).
        test: Test/validation frame (modified and returned).
        seed: Value used to reseed the *global* NumPy and ``random`` generators.
            Note that the noise itself comes from ``default_rng(0)`` and does
            not depend on ``seed``.

    Returns:
        ``(train, test)`` with the numeric columns replaced by transformed values.
    """
    # isinstance check instead of the deprecated pd.api.types.is_categorical_dtype.
    cat_cols = [col for col in train.columns if isinstance(train[col].dtype, pd.CategoricalDtype)]
    numeric_cols = [col for col in train.columns if col not in cat_cols and col != '임신 성공 확률']
    # Binary columns are scaled in the same way as the other numeric columns.

    arr_train = train[numeric_cols].to_numpy()  # DataFrame -> NumPy
    arr_train = arr_train.astype(np.float32)
    arr_test = test[numeric_cols].to_numpy()
    arr_test = arr_test.astype(np.float32)

    # Side effect kept from the original: the global RNGs are reseeded here.
    np.random.seed(seed)
    random.seed(seed)
    noise = (
        np.random.default_rng(0)
        .normal(0.0, 1e-5, arr_train.shape)
        .astype(arr_train.dtype)
    )
    preprocessing = QuantileTransformer(
        # One quantile per 30 training rows, clamped to the range [10, 1000].
        n_quantiles=max(min(len(train[numeric_cols]) // 30, 1000), 10),
        output_distribution='normal',
        subsample=10**9,
    ).fit(arr_train + noise)

    train[numeric_cols] = preprocessing.transform(arr_train)
    test[numeric_cols] = preprocessing.transform(arr_test)
    return train, test

def drop_single_value_columns(df_train, df_test):
    """Drop the columns that hold a single distinct value in the training frame.

    Constant columns are detected on ``df_train`` (``nunique() == 1``, which
    ignores missing values) and removed from both frames.

    Returns:
        ``(df_train, df_test)`` as new frames without the constant columns.
    """
    cols_to_drop = [col for col in df_train.columns if df_train[col].nunique() == 1]
    # A constant column may exist only in the training frame (for example the
    # target); dropping it from the test frame would raise KeyError.
    test_cols_to_drop = [col for col in cols_to_drop if col in df_test.columns]
    return df_train.drop(columns=cols_to_drop), df_test.drop(columns=test_cols_to_drop)

def all_process(train, val):
    """Run the whole preprocessing pipeline on a train / validation pair.

    Order of the steps:
      1. drop the columns that contain missing values,
      2. convert the count ('횟수') columns to integers,
      3. merge the procedure-type columns into '시술유형_통합',
      4. fill the single-embryo-transfer flag,
      5. label-encode the categorical columns and cast them to ``category``,
      6. quantile-scale the remaining (non-categorical) features,
      7. drop the columns that are constant in the training frame.

    Returns:
        ``(train, val)`` after all steps.
    """
    # Columns treated as categorical. The count ('횟수') columns are not in
    # this list; they are converted to integers by 횟수_to_int instead.
    cols_to_encoding = [
        "환자 시술 당시 나이",
        "난자 출처",
        "정자 출처",
        "난자 기증자 나이",
        "정자 기증자 나이",
        '시술유형_통합',
        '해동된 배아 수',  # originally an int column, handled as categorical here
    ]

    # Basic cleaning steps, applied in order.
    for step in (drop_cols_with_na, 횟수_to_int, 시술유형, 단일배아이식여부):
        train, val = step(train, val)

    # Encode the categorical columns, then mark them with the category dtype.
    for step in (label_encoding, type_to_category):
        train, val = step(train, val, cols=cols_to_encoding)

    train, val = num_feature_scailing(train, val)

    return drop_single_value_columns(train, val)


In [6]:
# from preprocess_DL import all_process

# Reload the raw tables (without 'ID') and preprocess them in one go.
train, test = all_process(
    pd.read_csv(train_path).drop(columns=['ID']),
    pd.read_csv(test_path).drop(columns=['ID']),
)

print(train.shape, test.shape)

(126244, 24) (54412, 23)


In [7]:
def get_cols(df_train, target_col='임신 성공 확률'):
    """Split the feature columns into numeric, binary and categorical groups.

    Args:
        df_train: Preprocessed training frame.
        target_col: Name of the target column, which is excluded from every group.

    Returns:
        ``(num_cols, bin_cols, cat_cols)``: columns of ``category`` dtype are
        categorical; of the remaining columns, those with exactly two distinct
        values are binary and the rest are numeric.
    """
    # isinstance check instead of the deprecated pd.api.types.is_categorical_dtype.
    cat_cols = [col for col in df_train.columns if isinstance(df_train[col].dtype, pd.CategoricalDtype)]
    # Honour the `target_col` argument (it used to be ignored in favour of a
    # hard-coded column name).
    numeric_cols = [col for col in df_train.columns if col not in cat_cols and col != target_col]

    num_cols = []
    bin_cols = []
    for col in numeric_cols:
        if df_train[col].nunique() == 2:
            bin_cols.append(col)
        else:
            num_cols.append(col)

    return num_cols, bin_cols, cat_cols

num_cols, bin_cols, cat_cols = get_cols(train)
# Number of distinct values of each categorical column, in cat_cols order.
cat_cardinalities = train[cat_cols].nunique().tolist()

# Print the numeric, binary and categorical feature groups.
print(
    f'수치형 변수: {len(num_cols)}개 \n{num_cols}',
    f'이진형 변수: {len(bin_cols)}개 \n{bin_cols}',
    f'범주형 변수: {len(cat_cols)}개 \n{cat_cols}',
    sep='\n',
)

수치형 변수: 2개 
['이전 IVF 시술 횟수', '이전 DI 시술 횟수']
이진형 변수: 14개 
['배란 자극 시술 여부', '단일 배아 이식 여부', '불임 원인 - 난관 질환', '불임 원인 - 배란 장애', '불임 원인 - 남성 요인', '불임 원인 - 자궁내막증', '불임 원인 - 불명확', '해동 난자 사용 여부', '신선 난자 사용 여부', '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부', '착상 전 PGD 시행 여부', '착상 전 PGS 시행 여부']
범주형 변수: 7개 
['환자 시술 당시 나이', '해동된 배아 수', '난자 출처', '정자 출처', '난자 기증자 나이', '정자 기증자 나이', '시술유형_통합']


In [8]:
def build_dataset_from_dfs(train_df, valid_df, test_df, num_cols, bin_cols, cat_cols, target_col='임신 성공 확률'):
    """Convert the three frames into a nested dict of tensors.

    Returns:
        A dict with the keys 'X_num', 'X_bin', 'X_cat' and 'Y'. Each feature
        entry maps 'train' / 'val' / 'test' to a tensor (float32 for numeric
        and binary features, long for categorical ones); 'X_cat' is ``None``
        when ``cat_cols`` is empty. 'Y' only has 'train' and 'val' because the
        test frame may not contain the target.
    """
    frames = {'train': train_df, 'val': valid_df, 'test': test_df}

    def _as_tensors(columns, dtype, parts=('train', 'val', 'test')):
        # One tensor per requested split, built from the selected column(s).
        return {
            part: torch.tensor(frames[part][columns].values, dtype=dtype)
            for part in parts
        }

    return {
        'X_num': _as_tensors(num_cols, torch.float32),
        'X_bin': _as_tensors(bin_cols, torch.float32),
        'X_cat': _as_tensors(cat_cols, torch.long) if cat_cols else None,
        'Y': _as_tensors(target_col, torch.float, parts=('train', 'val')),
    }

def move_data_to_device(data, device):
    """Move every tensor of the dataset dict to ``device`` (in place).

    ``data`` looks like ``{'X_num': {'train': tensor, 'val': tensor, ...}, ...}``.
    Entries that are ``None`` are skipped, nested dicts are processed split by
    split, and any other value is moved directly. The same dict is returned.
    """
    for key, value in data.items():
        if value is None:
            continue
        if not isinstance(value, dict):
            data[key] = value.to(device)
            continue
        for part in value:
            value[part] = value[part].to(device)
    return data


class MyDataset:
    """Lightweight container around the tensor dict plus feature metadata.

    ``data`` is expected to hold at least the 'Y' and 'X_num' entries, each
    mapping a split name to an object with a ``shape`` attribute.
    """

    def __init__(self, data, n_num_features, n_bin_features, cat_cardinalities, is_regression=False, is_multiclass=True):
        self.data = data
        self.is_regression = is_regression
        self.is_multiclass = is_multiclass
        # Feature metadata, exposed through the accessors below.
        self._n_num_features = n_num_features
        self._n_bin_features = n_bin_features
        self._cat_cardinalities = cat_cardinalities

    @property
    def n_num_features(self):
        """Number of numeric features."""
        return self._n_num_features

    @property
    def n_bin_features(self):
        """Number of binary features."""
        return self._n_bin_features

    def cat_cardinalities(self):
        """Cardinalities of the categorical features (a method, not a property)."""
        return self._cat_cardinalities

    @property
    def Y(self):
        """The per-split target dict stored under ``data['Y']``."""
        return self.data['Y']

    def size(self, part: str) -> int:
        """Number of rows in a split.

        The target is used when the split has one; otherwise the row count is
        taken from the numeric feature matrix.
        """
        source = self.data['Y'] if part in self.data['Y'] else self.data['X_num']
        return source[part].shape[0]

# data_dict = build_dataset_from_dfs(train, valid, test, num_cols, bin_cols, cat_cols, target_col='임신 성공 확률')
# data_dict = move_data_to_device(data_dict, device)
#
# dataset = MyDataset(data_dict, n_num_features=len(num_cols), n_bin_features=len(bin_cols), cat_cardinalities=cat_cardinalities)

In [9]:
def get_Xy(part: str, idx: torch.Tensor = None) -> tuple[dict, torch.Tensor]:
    """Return the features and target of one split of the global ``dataset``.

    Args:
        part: Split name, e.g. 'train', 'val' or 'test'.
        idx: Optional row selector applied to every feature tensor and to the
            target; ``None`` returns the whole split.

    Returns:
        ``(x, y)`` where ``x`` maps the feature kind (the dataset key without
        its ``'X_'`` prefix) to a tensor and ``y`` is the target tensor, or
        ``None`` when the split has no target.
    """
    # Feature groups stored as None (e.g. 'X_cat' when there are no categorical
    # columns) are skipped; indexing them would raise TypeError.
    features = {
        key[2:]: parts[part]
        for key, parts in dataset.data.items()
        if key.startswith('X_') and parts is not None
    }
    has_target = 'Y' in dataset.data and part in dataset.data['Y']
    target = dataset.data['Y'][part] if has_target else None

    if idx is None:
        return features, target
    return (
        {kind: values[idx] for kind, values in features.items()},
        target[idx] if target is not None else None,
    )

# train_size = dataset.size('train')
# train_indices = torch.arange(train_size, device=device)

In [10]:
# Stand-alone model / optimizer / loss setup. The cross-validation cell below
# builds a fresh model for every fold with the same hyper-parameters.
model = Model(
    n_num_features=len(num_cols),
    n_bin_features=len(bin_cols),
    cat_cardinalities=cat_cardinalities,
    # Kept consistent with the training loop, which passes None and trains
    # with MSELoss on a continuous target.
    # NOTE(review): in upstream TabR, None appears to select the regression
    # label encoder while an integer selects a class embedding - confirm.
    n_classes=None,
    num_embeddings=None,      # None when no numeric-feature embeddings are used
    d_main=64,
    d_multiplier=2.0,
    encoder_n_blocks=2,
    predictor_n_blocks=2,
    mixer_normalization=True,
    context_dropout=0.1,
    dropout0=0.1,
    dropout1='dropout0',      # the string 'dropout0' makes the model reuse the dropout0 value
    normalization="BatchNorm1d",
    activation="ReLU",
    memory_efficient=False,
    candidate_encoding_batch_size=None,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

In [11]:
def apply_model(part: str, idx: torch.Tensor, is_train: bool) -> torch.Tensor:
    """Run the global ``model`` on the rows ``idx`` of split ``part``.

    The candidates always come from the training split (global
    ``train_indices``). During training the rows of the current batch are
    removed from the candidates, and the batch targets are passed to the model.

    Returns:
        The model output with its last dimension squeezed.
    """
    x, y = get_Xy(part, idx)

    if is_train:
        # Training step: the current batch must not serve as its own candidates.
        in_batch = torch.isin(train_indices, idx)
        candidate_indices = train_indices[~in_batch]
    else:
        candidate_indices = train_indices

    # When nothing was removed, request the whole training split (idx=None)
    # instead of indexing it row by row.
    use_full_train = candidate_indices.equal(train_indices)
    candidate_x, candidate_y = get_Xy('train', None if use_full_train else candidate_indices)

    prediction = model(
        x_=x,
        y=y if is_train else None,
        candidate_x_=candidate_x,
        candidate_y=candidate_y,
        context_size=5,
        is_train=is_train,
    )
    return prediction.squeeze(-1)


In [16]:
seed = 333

# Seed every random number generator in use.
torch.manual_seed(seed)
delu.random.seed(seed)
np.random.seed(seed)
random.seed(seed)

train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

test_preds = []  # one test-prediction array per fold
val_scores = []  # competition metric of the restored (best) checkpoint, one entry per fold

skf = KFold(n_splits=5, shuffle=True, random_state=seed)
for fold, (train_idx, valid_idx) in enumerate(skf.split(train)):
    fold_train = train.iloc[train_idx].copy().reset_index(drop=True)
    fold_valid = train.iloc[valid_idx].copy().reset_index(drop=True)
    # Untouched copy of the fold's training rows, used to preprocess the test set.
    fold_train2 = fold_train.copy()
    fold_test = test.copy()

    # Preprocessing is fitted on the fold's training rows in both calls.
    fold_train, fold_valid = all_process(fold_train, fold_valid)
    _, fold_test = all_process(fold_train2, fold_test)

    num_cols, bin_cols, cat_cols = get_cols(fold_train)
    cat_cardinalities = [fold_train[col].nunique() for col in cat_cols]

    data_dict = build_dataset_from_dfs(
        fold_train, fold_valid, fold_test,
        num_cols, bin_cols, cat_cols, target_col='임신 성공 확률'
    )
    data_dict = move_data_to_device(data_dict, device)
    dataset = MyDataset(data_dict, n_num_features=len(num_cols), n_bin_features=len(bin_cols), cat_cardinalities=cat_cardinalities)

    train_size = dataset.size('train')
    train_indices = torch.arange(train_size, device=device)

    model = Model(
        n_num_features=len(num_cols),
        n_bin_features=len(bin_cols),
        cat_cardinalities=cat_cardinalities,
        n_classes=None,
        num_embeddings=None,      # None when no numeric-feature embeddings are used
        d_main=64,
        d_multiplier=2.0,
        encoder_n_blocks=2,
        predictor_n_blocks=2,
        mixer_normalization=True,
        context_dropout=0.1,
        dropout0=0.1,
        dropout1='dropout0',      # the string 'dropout0' makes the model reuse the dropout0 value
        normalization="BatchNorm1d",
        activation="ReLU",
        memory_efficient=False,
        candidate_encoding_batch_size=None,
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    # Effectively "train until early stopping".
    num_epochs = 100000
    batch_size = 2048

    patience = 10
    best_val_loss = float('inf')
    early_stop_counter = 0
    # Reset per fold so a fold can never report or reuse another fold's result.
    best = None

    checkpoint_path = "best_model_TabR.pth"

    for epoch in range(num_epochs):
        model.train()
        shuffled_indices = train_indices[torch.randperm(train_size)]
        num_batches = math.ceil(train_size / batch_size)
        epoch_loss = 0.0
        for i in range(num_batches):
            idx = shuffled_indices[i * batch_size : (i + 1) * batch_size]
            outputs = apply_model('train', idx, is_train=True)

            # Targets of the rows in this batch.
            _, y_batch = get_Xy('train', idx)

            y_batch = y_batch.float()
            loss = criterion(outputs.squeeze(), y_batch.squeeze())  # squeeze both so the shapes match, e.g. (batch_size,)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Weight by batch size so avg_loss is a per-sample mean.
            epoch_loss += loss.item() * idx.numel()

        avg_loss = epoch_loss / train_size

        model.eval()
        with torch.no_grad():
            val_indices = torch.arange(dataset.size('val'), device=device)
            outputs_val = apply_model('val', val_indices, is_train=False)
            _, y_val = get_Xy('val', val_indices)

            val_loss = criterion(outputs_val.squeeze(), y_val.float().squeeze()).item()  # validation loss

            outputs_val_np = outputs_val.detach().cpu().numpy().squeeze()
            # The metric rejects predictions outside [0, 1].
            outputs_val_np = np.clip(outputs_val_np, 0, 1)
            y_val_np = y_val.detach().cpu().numpy().squeeze()

            val_score = competition_metric(y_val_np, outputs_val_np)
        print(f"[Epoch {epoch+1}] Train Loss: {avg_loss:.4f}, Valid Loss: {val_loss:.4f}, Valid Score: {val_score:.6f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stop_counter = 0
            best = {'epoch': epoch+1, 'train_loss':avg_loss, 'val_loss': val_loss, 'val_score': val_score}
            torch.save(model.state_dict(), checkpoint_path)
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                break

    if best is None:
        # No epoch produced a finite validation loss, so no checkpoint was
        # written for this fold; loading the file would restore stale weights.
        raise RuntimeError(f"Fold {fold+1}: validation loss never improved; no checkpoint to restore.")

    # Record only the score of the checkpoint that is actually restored.
    # Appending every epoch's score made the final mean an average over all
    # epochs of all folds instead of a cross-validation score.
    val_scores.append(best['val_score'])

    model.load_state_dict(torch.load(checkpoint_path))
    torch.cuda.empty_cache()
    print(f'\n[Fold{fold+1} Result]')
    print(best)

    model.eval()
    with torch.no_grad():
        test_indices = torch.arange(dataset.size('test'), device=device)
        test_pred = apply_model('test', test_indices, is_train=False)
        test_pred_np = test_pred.detach().cpu().numpy().squeeze()
        test_preds.append(test_pred_np)


# Mean of the per-fold best-checkpoint scores.
avg_valid_score = np.mean(val_scores, axis=0)

print(f'[Seed {seed}] Final Valid Score: {avg_valid_score}')

[Epoch 1] Train Loss: 0.2258, Valid Loss: 0.1680, Valid AUC: 0.396055
[Epoch 2] Train Loss: 0.1702, Valid Loss: 0.1641, Valid AUC: 0.380258
[Epoch 3] Train Loss: 0.1670, Valid Loss: 0.1629, Valid AUC: 0.365977
[Epoch 4] Train Loss: 0.1656, Valid Loss: 0.1628, Valid AUC: 0.359220
[Epoch 5] Train Loss: 0.1644, Valid Loss: 0.1633, Valid AUC: 0.386130
[Epoch 6] Train Loss: 0.1640, Valid Loss: 0.1627, Valid AUC: 0.361295
[Epoch 7] Train Loss: 0.1639, Valid Loss: 0.1626, Valid AUC: 0.364489
[Epoch 8] Train Loss: 0.1635, Valid Loss: 0.1626, Valid AUC: 0.358539
[Epoch 9] Train Loss: 0.1628, Valid Loss: 0.1627, Valid AUC: 0.370576
[Epoch 10] Train Loss: 0.1631, Valid Loss: 0.1628, Valid AUC: 0.371270
[Epoch 11] Train Loss: 0.1629, Valid Loss: 0.1625, Valid AUC: 0.361044
[Epoch 12] Train Loss: 0.1627, Valid Loss: 0.1630, Valid AUC: 0.380371
[Epoch 13] Train Loss: 0.1625, Valid Loss: 0.1637, Valid AUC: 0.374815
[Epoch 14] Train Loss: 0.1624, Valid Loss: 0.1623, Valid AUC: 0.359373
[Epoch 15] Trai

In [18]:
# Average the per-fold test predictions, clip them to [0, 1] and store them in
# the sample-submission frame.
submission = pd.read_csv(sample_path).assign(
    **{'임신 성공 확률': np.clip(np.mean(test_preds, axis=0), 0, 1)}
)
submission

Unnamed: 0,ID,임신 성공 확률
0,TEST_00000,0.073774
1,TEST_00001,0.183785
2,TEST_00002,0.219433
3,TEST_00003,0.283400
4,TEST_00004,0.254783
...,...,...
54407,TEST_54407,0.287154
54408,TEST_54408,0.014109
54409,TEST_54409,0.293662
54410,TEST_54410,0.270870


In [19]:
# Make sure the output folder exists first: to_csv does not create missing
# directories and would fail only after the whole training run has finished.
os.makedirs('./Submission', exist_ok=True)
submission.to_csv(f'./Submission/TabR_{seed}.csv', index=False)