In [None]:
# !pip install numpy==1.26.4
# !pip install pandas==2.2.2
# !pip install scikit-learn==1.5.1
# !pip install scipy==1.14.1
# !pip install statsmodels==0.14.2
# !pip install joblib==1.4.2
# !pip install threadpoolctl==3.5.0
# !pip install lightgbm==4.6.0
# !pip install catboost==1.2.3

In [7]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pytorch_tabular import TabularModel
from pytorch_tabular.models import (
CategoryEmbeddingModelConfig,
FTTransformerConfig,
TabNetModelConfig,
GANDALFConfig,
)
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models.stacking import StackingModelConfig
# from pytorch_tabular.utils import make_mixed_dataset

from sklearn.preprocessing import LabelEncoder, FunctionTransformer, QuantileTransformer, MultiLabelBinarizer

from sklearn.impute import SimpleImputer

from sklearn.model_selection import KFold

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

import random

import preprocessing

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.utils.class_weight import compute_class_weight

from embedding import TabularPipeline
from eval_metric import competition_metric, f1_score, weighted_brier_score

import re

## CategoryEmbedding Model

In [8]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer
from pytorch_tabular.models.common.heads import LinearHeadConfig

In [9]:
import sys
sys.path.append("../../")
from new_lgbm_process import all_process

seed = 333

train_path = "../../data/train.csv"
test_path = "../../data/test.csv"

train = pd.read_csv(train_path).drop(columns=["ID"])
test = pd.read_csv(test_path).drop(columns=["ID"])
train, test = all_process(train, test)
print(train.shape, test.shape)

(126244, 33) (54412, 32)


In [10]:
cat_cols = [col for col in train.columns if pd.api.types.is_object_dtype(train[col])]
numeric_cols = [col for col in train.columns if col not in cat_cols and col != '임신 성공 확률']

print(f'수치형 변수: {len(numeric_cols)}개 \n{numeric_cols}')
print(f'범주형 변수: {len(cat_cols)}개 \n{cat_cols}')
print(train.shape, test.shape)

수치형 변수: 21개 
['배란 자극 시술 여부', '단일 배아 이식 여부', '불임 원인 - 난관 질환', '불임 원인 - 배란 장애', '불임 원인 - 남성 요인', '불임 원인 - 자궁내막증', '불임 원인 - 불명확', '이전 IVF 시술 횟수', '이전 DI 시술 횟수', '이전 총 임신 횟수', '이전 총 임신 성공 횟수', '이식된 배아 수', '미세주입(ICSI) 배아 이식 수', '해동 난자 사용 여부', '신선 난자 사용 여부', '동결 배아 사용 여부', '신선 배아 사용 여부', '기증 배아 사용 여부', '착상 전 PGD 시행 여부', '착상 전 PGS 시행 여부', '배아 이식 후 경과일']
범주형 변수: 11개 
['환자 시술 당시 나이', '총 생성 배아 수', '저장된 배아 수', '해동된 배아 수', '채취된 신선 난자 수', '수정 시도된 난자 수', '난자 출처', '정자 출처', '난자 기증자 나이', '정자 기증자 나이', '시술유형_통합']
(126244, 33) (54412, 32)


## Categorical Embedding Transformer
- embedding_dim = min(50, (num_categories + 1) // 2)
    - 범주가 4개 → 임베딩 dim = 2
    - 범주가 10개 → 임베딩 dim = 5
    - 범주가 200개 → 임베딩 dim = 50 (최대값 제한)
    
## Embedding+LGBM
- nsplit=5 : 0.739812
- nsplit=10 : 

In [None]:
num_samples = 10

seed = 333
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

# 학습/평가 데이터 로드 (ID 칼럼 제거)
train = pd.read_csv(train_path).drop(columns=['ID'])
test = pd.read_csv(test_path).drop(columns=['ID'])

# fold별 예측값 저장 리스트 (각 fold별로 num_samples의 평균 예측값)
fold_valid_preds = []
fold_test_preds = []

for fold, (train_idx, val_idx) in enumerate(kf.split(train, train['임신 성공 확률'])):
    
    # fold 고유의 데이터 분할 (고정)ㄴ
    train_fold = train.iloc[train_idx].copy().reset_index(drop=True)
    val_fold   = train.iloc[val_idx].copy().reset_index(drop=True)
    
    train2_fold = train_fold.copy()
    test_fold = test.copy() 
    
    # 전처리: 모든 sample에 대해 동일하게 적용할 전처리 (all_process)
    # 각 fold 내에서는 train, val, test에 대해 먼저 전처리하여 TabularPipeline에 넘깁니다.
    train_fold_proc, val_fold_proc = all_process(train_fold, val_fold)
    train2_fold_proc, test_fold_proc = all_process(train2_fold, test_fold)
    
    # TabularPipeline을 한 번 학습(fit)시킵니다.
    pipeline = TabularPipeline(train_fold_proc, val_fold_proc, test_fold_proc, seed, numeric_cols, cat_cols)
    pipeline.prepare_data()
    pipeline.train_model()
    
    # sample for문에서는 transform만 수행합니다.
    valid_preds_samples = []
    test_preds_samples = []
    
    for sample in range(num_samples):
        print(f"Fold {fold+1} - Sample {sample+1}/{num_samples}")
        
        # 여기서 transform_data를 호출하여 embedding 추출
        # 주의: 만약 transform 과정이 결정적이라면, 매 sample마다 동일한 결과가 나올 수 있습니다.
        fold_train_trans, fold_valid_trans, fold_test_trans = pipeline.transform_data()
        
        # LGBM용 데이터 분리 (임베딩된 데이터 사용)
        X_train = fold_train_trans.drop(columns=['임신 성공 확률'])
        y_train = fold_train_trans['임신 성공 확률']
        X_valid = fold_valid_trans.drop(columns=['임신 성공 확률'])
        y_valid = fold_valid_trans['임신 성공 확률']
        
        # 불균형 처리: 타겟이 0(majority)와 0이 아닌 경우(minority) 분리
        train_with_target = X_train.copy()
        train_with_target['임신 성공 여부'] = y_train.reset_index(drop=True)
        majority = train_with_target[train_with_target['임신 성공 여부'] <= 0]
        minority = train_with_target[train_with_target['임신 성공 여부'] > 0]
        
        print(len(majority), len(minority))
        
        n_minority = len(minority)
        if n_minority == 0:
            balanced_train = train_with_target
        else:
            # fold 및 sample마다 다른 random_state 사용
            resampled_majority = majority.sample(n=n_minority, replace=True, random_state=seed + fold + sample)
            balanced_train = pd.concat([minority, resampled_majority]).sample(frac=1, random_state=seed + fold + sample).reset_index(drop=True)
        
        X_train_bal = balanced_train.drop(columns=['임신 성공 여부'])
        y_train_bal = balanced_train['임신 성공 여부']
        
        # LGBM 모델 파라미터 설정 및 학습 (회귀 모델 사용)
        lgbm_params = {
            'n_estimators': 1134,
            'learning_rate': 0.009183378614268902,
            'max_depth': 15,
            'num_leaves': 59,
            'min_child_samples': 56,
            'subsample': 0.5894604069264655,
            'colsample_bytree': 0.6305670256882752,
            'reg_alpha': 7.47936987466662,
            'reg_lambda': 0.0010986427203281623
        }
        
        model_lgb = LGBMRegressor(
            **lgbm_params,
            verbosity=-1,
            n_jobs=-1,
            random_state=seed,
        )
        
        model_lgb.fit(X_train_bal, y_train_bal)
        
        # Validation 예측 및 클리핑
        valid_preds_proba = model_lgb.predict(X_valid)
        valid_preds_proba = np.clip(valid_preds_proba, 0, 1)
        valid_preds_samples.append(valid_preds_proba)
        
        # 평가 지표 출력 (원하는 경우)
        custom_f1 = f1_score(y_valid, valid_preds_proba)
        wbs = weighted_brier_score(y_valid, valid_preds_proba)
        comp_metric = competition_metric(y_valid, valid_preds_proba)
        print(f"Fold {fold+1} Sample {sample+1} | Custom F1: {custom_f1:.7f} | WBS: {wbs:.7f} | Combined: {comp_metric:.7f}")
        
        # Test 데이터 예측 및 클리핑
        test_pred = model_lgb.predict(fold_test_trans)
        test_pred = np.clip(test_pred, 0, 1)
        test_preds_samples.append(test_pred)
    
    # 해당 fold 내에서 num_samples개의 예측값을 평균
    fold_valid_pred_mean = np.mean(valid_preds_samples, axis=0)
    fold_test_pred_mean = np.mean(test_preds_samples, axis=0)
    
    fold_valid_preds.append(fold_valid_pred_mean)
    fold_test_preds.append(fold_test_pred_mean)

# 모든 fold의 예측값 평균 (최종 예측)
final_valid_preds = np.mean(fold_valid_preds, axis=0)
final_test_preds = np.mean(fold_test_preds, axis=0)

print("최종 Validation 예측값 shape:", final_valid_preds.shape)
print("최종 Test 예측값 shape:", final_test_preds.shape)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

LR finder stopped early after 96 steps due to diverging loss.
Learning rate set to 0.0002511886431509582
Restoring states from the checkpoint path at /home/elicer/LG_Aimers_6th/Eunho/pytorch_stacking/.lr_find_6b181955-124e-41ad-9034-51acfbea250f.ckpt
Restored all states from the checkpoint at /home/elicer/LG_Aimers_6th/Eunho/pytorch_stacking/.lr_find_6b181955-124e-41ad-9034-51acfbea250f.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Output()

Fold 1 - Sample 1/10


Output()

Output()

68177 32818
Fold 1 Sample 1 | Custom F1: 0.4715958 | WBS: 0.8099914 | Combined: 0.6407936


Output()

Fold 1 - Sample 2/10


Output()

Output()

68177 32818
Fold 1 Sample 2 | Custom F1: 0.4705714 | WBS: 0.8101965 | Combined: 0.6403839


Output()

Fold 1 - Sample 3/10


Output()

Output()

68177 32818
Fold 1 Sample 3 | Custom F1: 0.4700401 | WBS: 0.8098518 | Combined: 0.6399460


Output()

Fold 1 - Sample 4/10


Output()

Output()

68177 32818
Fold 1 Sample 4 | Custom F1: 0.4660500 | WBS: 0.8099419 | Combined: 0.6379960


Output()

Fold 1 - Sample 5/10


Output()

Output()

68177 32818
Fold 1 Sample 5 | Custom F1: 0.4720069 | WBS: 0.8102770 | Combined: 0.6411419


Output()

Fold 1 - Sample 6/10


Output()

Output()

68177 32818
Fold 1 Sample 6 | Custom F1: 0.4677327 | WBS: 0.8096312 | Combined: 0.6386820


Output()

Fold 1 - Sample 7/10


Output()

Output()

68177 32818
Fold 1 Sample 7 | Custom F1: 0.4695980 | WBS: 0.8099230 | Combined: 0.6397605


Output()

Fold 1 - Sample 8/10


Output()

Output()

68177 32818
Fold 1 Sample 8 | Custom F1: 0.4744920 | WBS: 0.8101061 | Combined: 0.6422991


Output()

Fold 1 - Sample 9/10


Output()

Output()

68177 32818
Fold 1 Sample 9 | Custom F1: 0.4693921 | WBS: 0.8098462 | Combined: 0.6396192


Output()

Fold 1 - Sample 10/10


Output()

Output()

68177 32818
Fold 1 Sample 10 | Custom F1: 0.4692979 | WBS: 0.8099311 | Combined: 0.6396145


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

LR finder stopped early after 97 steps due to diverging loss.
Learning rate set to 0.0009120108393559097
Restoring states from the checkpoint path at /home/elicer/LG_Aimers_6th/Eunho/pytorch_stacking/.lr_find_ed1c5105-653e-46b0-b898-091b05759ef5.ckpt
Restored all states from the checkpoint at /home/elicer/LG_Aimers_6th/Eunho/pytorch_stacking/.lr_find_ed1c5105-653e-46b0-b898-091b05759ef5.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Output()

Fold 2 - Sample 1/10


Output()

Output()

68112 32883
Fold 2 Sample 1 | Custom F1: 0.4703230 | WBS: 0.8101788 | Combined: 0.6402509


Output()

Fold 2 - Sample 2/10


Output()

Output()

68112 32883
Fold 2 Sample 2 | Custom F1: 0.4708452 | WBS: 0.8097775 | Combined: 0.6403114


Output()

Fold 2 - Sample 3/10


Output()

Output()

68112 32883
Fold 2 Sample 3 | Custom F1: 0.4653473 | WBS: 0.8092396 | Combined: 0.6372934


Output()

Fold 2 - Sample 4/10


Output()

Output()

68112 32883
Fold 2 Sample 4 | Custom F1: 0.4723552 | WBS: 0.8098414 | Combined: 0.6410983


Output()

Fold 2 - Sample 5/10


Output()

Output()

68112 32883
Fold 2 Sample 5 | Custom F1: 0.4679702 | WBS: 0.8093822 | Combined: 0.6386762


Output()

Fold 2 - Sample 6/10


In [None]:
tmp_submission = pd.DataFrame({'sampling10_embedding_nsplit_5': final_test_preds})
tmp_submission

Unnamed: 0,embed_lgbm_nsplit_10
0,0.089317
1,0.559448
2,0.507163
3,0.582950
4,0.375256
...,...
54407,0.735168
54408,0.000000
54409,0.651741
54410,0.816366


## 제출

In [18]:
submission = pd.read_csv('../../data/sample_submission.csv')
submission

Unnamed: 0,ID,임신 성공 확률
0,TEST_00000,0
1,TEST_00001,0
2,TEST_00002,0
3,TEST_00003,0
4,TEST_00004,0
...,...,...
54407,TEST_54407,0
54408,TEST_54408,0
54409,TEST_54409,0
54410,TEST_54410,0


In [19]:
submission['임신 성공 확률'] = final_test_preds
submission

Unnamed: 0,ID,임신 성공 확률
0,TEST_00000,0.089317
1,TEST_00001,0.559448
2,TEST_00002,0.507163
3,TEST_00003,0.582950
4,TEST_00004,0.375256
...,...,...
54407,TEST_54407,0.735168
54408,TEST_54408,0.000000
54409,TEST_54409,0.651741
54410,TEST_54410,0.816366


In [20]:
submission.to_csv('./submissions/임베딩_sampleweight_lgbm_결측값완_fold10_min.csv', index=False)