In [None]:
# Standard library first, then third-party.
import os

from dotenv import load_dotenv

# Load variables from a .env file into the process environment so that the
# os.getenv() lookups in the following cells can resolve the data paths.
load_dotenv()

### Data Load

In [None]:
# Explicit imports for names this notebook uses directly (`pd` below,
# `roc_auc_score` in the evaluation cells). They are placed BEFORE the wildcard
# import so that anything utils.public re-exports under the same name still
# takes precedence, exactly as it did before.
import pandas as pd
from sklearn.metrics import roc_auc_score

# NOTE(review): the wildcard import hides which names come from utils.public;
# consider replacing it with an explicit list once its contents are confirmed.
from utils.public import *
from utils.feature import preprocess

# Dataset locations are configured through environment variables.
TRAIN_PATH = os.getenv('TRAIN_DATA_PATH')
TEST_PATH = os.getenv('TEST_DATA_PATH')
SUBMISSION_PATH = os.getenv('SUBMISSION_DATA_PATH')

# Fail fast with a readable message instead of the opaque error pd.read_csv
# raises when it is handed None (os.getenv returns None for unset variables).
missing_vars = [
    var_name
    for var_name, var_value in (
        ('TRAIN_DATA_PATH', TRAIN_PATH),
        ('TEST_DATA_PATH', TEST_PATH),
    )
    if not var_value
]
if missing_vars:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(missing_vars)}"
    )

train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# Row counts before preprocessing.
print("#------------Load Data Length------------#")
print(f"Train Data Length : {len(train)}")
print(f"Test Data Length : {len(test)}")

# Both frames are re-read from disk above, so re-running this cell is
# idempotent even though `train` / `test` are rebound here.
# NOTE(review): the meaning of the `validation` flag is defined in
# utils.feature.preprocess and is not visible from this notebook.
train = preprocess(train, validation=False)
test = preprocess(test, validation=True)

# Row counts after preprocessing, to make any dropped rows visible.
print("#------------After Pre Data Length------------#")
print(f"Train Data Length : {len(train)}")
print(f"Test Data Length : {len(test)}")

# Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Name of the label column (Korean for "pregnancy success or not").
target_col = '임신 성공 여부'

# Hold out 10% of the preprocessed training rows for validation; the fixed
# random_state keeps the split reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(target_col, axis=1),
    train[target_col],
    test_size=0.1,
    random_state=42,
)



### Train - Voting

In [None]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

# Base learners: three gradient-boosting implementations configured with the
# same budget (500 boosting rounds, learning rate 0.05, tree depth 8).
cat_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=8,
    verbose=100,
)
xgb_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    verbosity=1,
)
lgbm_model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    verbose=1,
)

# Ensemble definition. Soft voting averages the class probabilities predicted
# by the individual models.
base_estimators = [
    ('catboost', cat_model),
    ('xgboost', xgb_model),
    ('lightgbm', lgbm_model),
]
ensemble_model = VotingClassifier(estimators=base_estimators, voting='soft')

# Train every base learner on the training split.
ensemble_model.fit(X_train, y_train)

# Keep the second predict_proba column (presumably the positive class) as the
# validation score.
valid_proba = ensemble_model.predict_proba(X_valid)
y_valid_pred = valid_proba[:, 1]

# Validation ROC AUC.
roc_score = roc_auc_score(y_valid, y_valid_pred)
print(f"ROC AUC Score : {roc_score}")


### Train - Bagging

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 1 * 4 * 3 * 3 = 36 candidate forests.
params = dict(
    n_estimators=[100],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Exhaustive 2-fold search over the grid. No `scoring` is passed, so candidates
# are ranked by the estimator's default score (accuracy for classifiers), which
# is what the printed label below reports. Note this differs from the ROC AUC
# used to evaluate the other models in this notebook.
rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(
    rf_clf,
    param_grid=params,
    cv=2,
    n_jobs=-1,
)
grid_cv.fit(X_train, y_train)

# Report the best parameter combination and its mean cross-validated score.
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('\n최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

### Train - TabPFN

In [None]:
import torch
from tabpfn import TabPFNClassifier

# TabPFN is fitted on a random subsample of the training split. The sample size
# is capped at the number of available rows so DataFrame.sample() cannot raise
# when the training split holds fewer than 5000 rows.
TABPFN_MAX_SAMPLES = 5000
X_train_sample = X_train.sample(
    n=min(TABPFN_MAX_SAMPLES, len(X_train)),
    random_state=42,
)
# Pick the labels that belong to the sampled rows (matched by index label).
y_train_sample = y_train.loc[X_train_sample.index]

# Use the GPU when one is available; otherwise fall back to CPU instead of
# crashing on machines without CUDA.
tabpfn_device = 'cuda' if torch.cuda.is_available() else 'cpu'

clf = TabPFNClassifier(
    device=tabpfn_device,
    seed=42,
    N_ensemble_configurations=1,
)
# NOTE(review): overwrite_warning=True presumably bypasses TabPFN's guard on
# large training sets — confirm against the installed tabpfn version.
clf.fit(X_train_sample, y_train_sample, overwrite_warning=True)

# Second predict_proba column (presumably the positive class) on the validation split.
y_pred = clf.predict_proba(X_valid)[:, 1]

# Validation ROC AUC.
roc_score = roc_auc_score(y_valid, y_pred)
print(f"ROC AUC Score : {roc_score}")

### Train - TabNet

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from pytorch_tabnet.augmentations import ClassificationSMOTE

# Model. Note that rebinding `clf` replaces the TabPFN classifier created in
# the previous section.
clf = TabNetClassifier(
    optimizer_params={'lr': 0.05},
    verbose=0,
)

# Training-time augmentation; pass None to `augmentations` below to disable it.
# NOTE(review): see the pytorch_tabnet docs for the exact meaning of `p`.
aug = ClassificationSMOTE(p=0.2)

# TabNet is fed NumPy arrays here, hence the `.values` conversions. The
# validation split is monitored with AUC for up to 20 epochs.
clf.fit(
    X_train.values,
    y_train.values,
    eval_set=[(X_valid.values, y_valid.values)],
    eval_metric=["auc"],
    max_epochs=20,
    augmentations=aug,
)

# Second predict_proba column (presumably the positive class) on the validation split.
y_pred = clf.predict_proba(X_valid.values)[:, 1]

# Validation ROC AUC.
roc_score = roc_auc_score(y_valid, y_pred)
print(f"ROC AUC Score : {roc_score}")


### Final Predict

In [None]:
# `X_test` is not defined by any earlier cell, so this cell failed with a
# NameError on a fresh kernel. Build it from the preprocessed test frame,
# selecting the columns the ensemble was trained on, in the same order.
# NOTE(review): assumes preprocess() leaves every training feature column in
# `test`; a missing column raises KeyError here rather than silently producing
# misaligned predictions.
X_test = test[X_train.columns]

# Second predict_proba column (presumably the positive class) for every test row.
pred_proba = ensemble_model.predict_proba(X_test)[:, 1]

# Sanity check: one probability per test row is expected.
print(pred_proba.shape)

### Submission

In [None]:
# Reuse the path resolved once in the data-load cell instead of querying the
# environment a second time, and fail with a readable message if it is unset.
if not SUBMISSION_PATH:
    raise EnvironmentError(
        "SUBMISSION_DATA_PATH is not set; cannot load the sample submission file"
    )

sample_submission = pd.read_csv(SUBMISSION_PATH)

# Fill in the predicted probabilities; pandas raises if the number of
# predictions does not match the number of submission rows.
sample_submission['probability'] = pred_proba
sample_submission

In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing the submission file.
submit_file = 'log/baseline_submit.csv'
os.makedirs(os.path.dirname(submit_file), exist_ok=True)
sample_submission.to_csv(submit_file, index=False)