In [21]:
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Data Load

In [None]:
# data_path = "/workspace/data/0_Subtask/[DICCD 분석] 251107.csv" # ~251110
data_path = "/workspace/data/0_Subtask/[DICCD 분석] 251111_SRD 추가.csv" # 251111
raw_df = pd.read_csv(data_path)

# y = 타겟 변수 / x = 예측 변수
# except_cols = ['DICCD_CP', 'ODD_Cur', 'ODD_Pas', 'DICCD_C', 'DICCD_P', 'MentD']
except_cols = ['Z_KDBDRS', 'K_ODD', 'KDBDRS', 'wt_s', 'DICCD_CP', 'ODD_Cur', 'ODD_Pas', 'DICCD_C', 'DICCD_P', 'MentD']
standard_cols = ['ZAI_Incom', 'Z_K_ODD', 'Z_K_CD', 'Z_K_IA', 'Z_K_HI', 'Z_GAD', 'Z_PHQ', 'Z_SAS']
nonstandard_cols = ['Incom', 'K_ODD', 'K_CD', 'K_IA', 'K_HI', 'GAD', 'PHQ', 'SAS']


drop_cols = except_cols + nonstandard_cols
# drop_cols = except_cols + standard_cols

print(f"기존 변수 수 : {len(raw_df.columns)}")
raw_df = raw_df.drop(columns=drop_cols) # 입력 변수 
print(f"제거된 변수: {except_cols + standard_cols}")
print(f"남은 변수 수: {len(raw_df.columns)}")

기존 변수 수 : 60
제거된 변수: ['Z_KDBDRS', 'K_ODD', 'KDBDRS', 'wt_s', 'DICCD_CP', 'ODD_Cur', 'ODD_Pas', 'DICCD_C', 'DICCD_P', 'MentD', 'ZAI_Incom', 'Z_K_ODD', 'Z_K_CD', 'Z_K_IA', 'Z_K_HI', 'Z_GAD', 'Z_PHQ', 'Z_SAS']
남은 변수 수: 43


## Missing value imputation


In [23]:
from utils.data_imputation import filter_by_missing_ratio
# 사용 예시
df = filter_by_missing_ratio(raw_df, threshold=0.25, visualize = False)
X = df.drop(columns=['ODD_CP'])
y = df['ODD_CP']

삭제 칼럼들 : ['GEdu', 'GAlc', 'GTob']

==== 결측치가 25% 미만인 컬럼만 남김 ====
남은 변수 수: 40
남은 컬럼 리스트: ['Sex', 'Answ', 'Z_K_IA', 'Z_K_HI', 'Z_K_ODD', 'Z_K_CD', 'ODD_CP', 'Age_Grp', 'MEdu', 'FEdu', 'P_Marr', 'MJob', 'FJob', 'ZAI_Incom', 'MAlc', 'FAlc', 'MTob', 'FTob', 'PSleep', 'Z_GAD', 'Z_PHQ', 'PAF', 'SBV', 'SBP', 'CBV', 'CBP', 'Avg_G', 'GDec', 'BF', 'RFG', 'AdolSlp', 'MoodD', 'AnxD', 'ST1', 'ST2', 'ST3', 'ST4', 'IGD_P', 'Z_SAS', 'SRD_CP']

==== 남아있는 컬럼 중 결측치가 있는 컬럼의 결측치 개수 ====
MEdu      56
FEdu     207
MJob      56
FJob     207
MAlc      56
FAlc     207
MTob      56
FTob     207
IGD_P    743
dtype: int64

결측치가 남아있는 변수 수: 9 / 전체 변수 수: 40


# Data Preprocess

### 데이터 전처리 검토

1. 범주형

    1-1 Ordinal : Imputation(Median) => OrdinalEncoder

    1-2 Nominal : Imputation(Unknown) => OneHotEncoder

2. 수치형

    2-1 결측치 존재 시, Imputation(Median) 적용 => Z-표준화(StandardScaler)

In [24]:
from utils.data_preprocessor import check_preprocessing_needs, preprocess_dataframe
from utils.data_preprocessor import data_preprocess_pipeline
from utils.data_analyzer import analyze_correlation_matrix
# 전처리 필요사항 검토
# recommendations = check_preprocessing_needs(X_train, target_col='ODD_CP')

# # 권장사항에 따라 전처리 (선택사항)
X = preprocess_dataframe(
    X, 
    target_col='ODD_CP',
    drop_weight=True,  # 가중치 변수 제거
    convert_categorical=['Answ', 'IGD_P', 'FEdu', 'MEdu', 'FJob', 'MJob', 'Age_Grp', 'P_Marr'],
    convert_ordinal=['ST1', 'ST2', 'ST3', 'ST4', 'PAF', 'MAlc', 'FAlc', "MTob", "FTob", "MAlc", "FAlc", "GAlc", "MTob", "FTob", "GTob"], 
    convert_binary=['SRD_CP', 'IGD_P', 'Sex', 'PSleep', 'SBV', 'SBP', 'CBV', 'CBP', 'GDec', 'BF', 'RFG', 'MentD', 'AdolSlp', 'MoodD', 'AnxD'],
    drop_low_variance=False,  # 분산이 낮은 변수 제거
    drop_leakage=True  # 데이터 누수 위험 변수 제거
)

X = data_preprocess_pipeline(X) # 전처리된 데이터 

# X_preprocessed.info()
# analyze_correlation_matrix(X, y)

▶ integer 전처리 중...
▶ float 전처리 중...
▶ 순서 있는 category(ordinal) 전처리 중...
▶ object 전처리 중...
▶ 순서 없는 category 전처리 중...
✅ 데이터 전처리 완료


### Startified Splitting 

In [25]:
from utils.data_splitter import oversample_train_test_split
# 함수 사용
df = pd.concat([X, y], axis=1)
X_train, X_test, y_train, y_test = oversample_train_test_split(
    X, y, 
    target_col='ODD_CP',
    test_size_per_class=60,
    train_size_per_class=240,
    random_state=42,
    verbose=True,
    method = "SMOTEENN"
)


데이터 분할 및 샘플링

전체 데이터:
  클래스 0: 3205개
  클래스 1: 177개

1단계: Test set 구성 (각 클래스 60개씩)
  Test set: 클래스 0=60개, 클래스 1=60개 (총 120개)

2단계: Train set용 원본 데이터 (Test set 제외)
  Train 원본: 클래스 0=3145개, 클래스 1=117개

3단계: Train set 샘플링
  클래스 0 언더샘플링: 3145개 → 240개
  클래스 1 오버샘플링: 117개 → 147개 (SMOTEENN 사용)

4단계: Train set 최종 구성
  Train set: 클래스 0=240개, 클래스 1=147개 (총 387개)
  Train 비율: 1:1.6

최종 데이터 분포:
  Train: 387개 (클래스 0=240, 클래스 1=147)
  Test:  120개 (클래스 0=60, 클래스 1=60)


# Train & Eval

In [None]:
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from scipy.stats import mode
import numpy as np
import shap

class CatBoostFoldTrainer:
    def __init__(self, model_params=None, n_splits=5, random_state=123, verbose=100, T=0.01):
        if model_params is None:
            model_params = dict(
                iterations=1000, learning_rate=0.38577, depth=8, l2_leaf_reg=9.587765, subsample=0.748324, random_strength=0.0, class_weights = [1, 10],
                min_data_in_leaf=59, leaf_estimation_iterations=1, loss_function='Logloss', eval_metric='AUC', verbose=verbose, random_seed=random_state
            )
        self.model_params = dict(model_params)
        self.model_params['verbose'] = False
        self.n_splits = n_splits
        self.random_state = random_state
        self.T = T

        self.metrics = { 'CatBoost': [] }  # validation metrics (per fold)
        self.test_metrics = { 'CatBoost': [] }  # NEW: test set metrics (per fold)
        self.feature_importances = { 'CatBoost': [] }
        self.test_proba = { 'CatBoost': [] }
        self.test_preds = { 'CatBoost': [] }
        self.fold_thresholds = { 'CatBoost': [] }
        self.shap_values_train = { 'CatBoost': [] }  # SHAP values for train set (per fold)
        self.shap_values_test = { 'CatBoost': [] }  # SHAP values for test set (per fold)
        self.fold_weights = None
        self.weighted_avg_metrics = None
        self.weighted_avg_test_metrics = None  # NEW: weighted test set metrics

    def fit(self, X, y, X_test, y_test=None):
        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        self.metrics['CatBoost'].clear()
        self.test_metrics['CatBoost'].clear()
        self.feature_importances['CatBoost'].clear()
        self.test_proba['CatBoost'].clear()
        self.test_preds['CatBoost'].clear()
        self.fold_thresholds['CatBoost'].clear()
        self.shap_values_train['CatBoost'].clear()
        self.shap_values_test['CatBoost'].clear()

        for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
            print(f"===== Fold {fold} =====")
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            cat_model = CatBoostClassifier(**self.model_params)
            cat_model.fit(X_train, y_train)
            
            val_proba = cat_model.predict_proba(X_val)[:, 1]
            thresholds = np.linspace(0, 1, 200)
            f1s = [f1_score(y_val, (val_proba >= t).astype(int)) for t in thresholds]
            best_idx = np.argmax(f1s)
            best_threshold = thresholds[best_idx]
            val_pred_best = (val_proba >= best_threshold).astype(int)
            val_metrics = {
                'Accuracy': accuracy_score(y_val, val_pred_best),
                'Precision': precision_score(y_val, val_pred_best),
                'Recall': recall_score(y_val, val_pred_best),
                'F1 Score': f1_score(y_val, val_pred_best),
                'ROC AUC Score': roc_auc_score(y_val, val_proba),
                'Best_Threshold': best_threshold
            }
            self.metrics['CatBoost'].append(val_metrics)
            self.feature_importances['CatBoost'].append(cat_model.feature_importances_)
            
            # Calculate SHAP values for train and test sets
            explainer = shap.TreeExplainer(cat_model)
            shap_values_train_fold = explainer.shap_values(X_train)
            shap_values_test_fold = explainer.shap_values(X_test)
            # For binary classification, shap_values is a list [values_for_class_0, values_for_class_1]
            # We take values_for_class_1 (positive class)
            if isinstance(shap_values_train_fold, list):
                shap_values_train_fold = shap_values_train_fold[1]
            if isinstance(shap_values_test_fold, list):
                shap_values_test_fold = shap_values_test_fold[1]
            self.shap_values_train['CatBoost'].append(shap_values_train_fold)
            self.shap_values_test['CatBoost'].append(shap_values_test_fold)
            
            test_proba = cat_model.predict_proba(X_test)[:, 1]
            test_pred = (test_proba >= best_threshold).astype(int)
            self.test_proba['CatBoost'].append(test_proba)
            self.test_preds['CatBoost'].append(test_pred)
            self.fold_thresholds['CatBoost'].append(best_threshold)

            # Compute and store test set metrics for each fold if y_test is provided
            if y_test is not None:
                try:
                    test_metric_this_fold = {
                        'Accuracy': accuracy_score(y_test, test_pred),
                        'Precision': precision_score(y_test, test_pred),
                        'Recall': recall_score(y_test, test_pred),
                        'F1 Score': f1_score(y_test, test_pred),
                        'ROC AUC Score': roc_auc_score(y_test, test_proba),
                        'Best_Threshold': best_threshold
                    }
                except Exception as e:
                    # In case of error, e.g. only one class in y_test, fill with np.nan
                    test_metric_this_fold = {
                        'Accuracy': np.nan,
                        'Precision': np.nan,
                        'Recall': np.nan,
                        'F1 Score': np.nan,
                        'ROC AUC Score': np.nan,
                        'Best_Threshold': best_threshold
                    }
                self.test_metrics['CatBoost'].append(test_metric_this_fold)
            
            print(f"Fold {fold} Validation Metrics")
            display(pd.DataFrame([val_metrics]))
            if y_test is not None:
                print(f"Fold {fold} Test Metrics")
                display(pd.DataFrame([test_metric_this_fold]))

        self.y_test = y_test if y_test is not None else None
        self.print_fold_results(y_test = y_test)
        return self

    @staticmethod
    def evaluate_with_threshold(y_true, proba, threshold):
        y_pred = (proba >= threshold).astype(int)
        return {
            'Accuracy': accuracy_score(y_true, y_pred),
            'Precision': precision_score(y_true, y_pred),
            'Recall': recall_score(y_true, y_pred),
            'F1 Score': f1_score(y_true, y_pred),
            'ROC AUC Score': roc_auc_score(y_true, proba),
        }

    def calc_softmax_weights(self):
        f1_scores = np.array([fold_metric['F1 Score'] for fold_metric in self.metrics['CatBoost']])
        exp_scores = np.exp(f1_scores / self.T)
        total_exp = np.sum(exp_scores)
        self.fold_weights = exp_scores / total_exp
        return self.fold_weights

    def calculate_weighted_metrics(self):
        fold_weights = self.calc_softmax_weights()
        model_metrics = self.metrics['CatBoost']
        metric_keys = [k for k in model_metrics[0] if k != 'Best_Threshold']
        self.weighted_avg_metrics = {
            metric: sum(fw * fold_metric[metric] for fw, fold_metric in zip(fold_weights, model_metrics))
            for metric in metric_keys
        }
        return self.weighted_avg_metrics

    def calculate_weighted_test_metrics(self):
        # Weighted avg of test metrics using F1-score-based fold weights
        if len(self.test_metrics['CatBoost']) == 0:
            return None
        fold_weights = self.calc_softmax_weights()
        model_metrics = self.test_metrics['CatBoost']
        metric_keys = [k for k in model_metrics[0] if k != 'Best_Threshold']
        self.weighted_avg_test_metrics = {
            metric: sum(fw * fold_metric[metric] for fw, fold_metric in zip(fold_weights, model_metrics))
            for metric in metric_keys
        }
        return self.weighted_avg_test_metrics

    def print_fold_results(self, y_test=None, mode_type=None):
        # y_test를 인자로 넘겨받지 않으면 self.y_test를 사용
        if y_test is None and hasattr(self, 'y_test'):
            y_test = self.y_test

        print("===== Val F1 Score 기준으로 Softmax 변환한 가중치 =====")
        fold_weights = self.calc_softmax_weights()
        avg_metrics = self.calculate_weighted_metrics()
        print("===== Stratified K-Fold Validation 평균 성능 (F1 Score Weighted) =====")
        print(f"\n== CatBoost Model (Validation) ==")
        print("Fold weights (by F1 Score):", fold_weights.tolist())
        for metric, value in avg_metrics.items():
            print(f"{metric}: {value:.6f}")
        avg_thr = np.mean(self.fold_thresholds["CatBoost"])
        print(f"Best Thresholds per fold: {np.round(self.fold_thresholds['CatBoost'],4)}  (mean: {avg_thr:.4f})")
        
        if y_test is not None and len(self.test_metrics['CatBoost']) > 0:
            avg_test_metrics = self.calculate_weighted_test_metrics()
            print("\n===== Stratified K-Fold Test 평균 성능 (F1 Score Weighted) =====")
            print(f"== CatBoost Model (Test Set) ==")
            for metric, value in avg_test_metrics.items():
                print(f"{metric}: {value:.6f}")

        if mode_type is None:
            print("\n--- 각 Fold 별 Validation 결과 ---")
            for i, metrics in enumerate(self.metrics["CatBoost"]):
                print(f"Fold {i+1}:")
                for metric, value in metrics.items():
                    print(f"  {metric}: {value:.6f}")
            if y_test is not None and len(self.test_metrics['CatBoost']) > 0:
                print("\n--- 각 Fold 별 Test 결과 ---")
                for i, metrics in enumerate(self.test_metrics["CatBoost"]):
                    print(f"Fold {i+1}:")
                    for metric, value in metrics.items():
                        print(f"  {metric}: {value:.6f}")
        elif mode_type == "soft_voting":
            if y_test is not None and 'CatBoost' in self.test_proba:
                test_probas = np.array(self.test_proba['CatBoost'])  # (n_folds, n_samples)
                test_preds = np.array(self.test_preds['CatBoost'])   # (n_folds, n_samples)

                # Hard voting
                hard_voting_pred = mode(test_preds, axis=0, keepdims=False).mode

                print(f":: [Test Set, Hard Voting (Majority)] ::")
                print(f"Test Accuracy: {accuracy_score(y_test, hard_voting_pred):.6f}")
                print(f"Test Precision: {precision_score(y_test, hard_voting_pred):.6f}")
                print(f"Test Recall: {recall_score(y_test, hard_voting_pred):.6f}")
                print(f"Test F1 Score: {f1_score(y_test, hard_voting_pred):.6f}")

    def get_val_metrics(self):
        return self.metrics

    def get_test_metrics(self):
        return self.test_metrics

    def get_feature_importances(self):
        return self.feature_importances

    def get_test_labels(self):
        return self.y_test

    def get_test_proba(self):
        return self.test_proba

    def get_test_preds(self):
        return self.test_preds

    def get_fold_thresholds(self):
        return self.fold_thresholds

    def get_shap_values_train(self):
        return self.shap_values_train

    def get_shap_values_test(self):
        return self.shap_values_test

# 사용 예시:
catboost_cv = CatBoostFoldTrainer(n_splits=5, random_state=42, T=0.01)
catboost_cv.fit(X_train, y_train, X_test, y_test=y_test)

===== Fold 1 =====
Fold 1 Validation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Best_Threshold
0,0.923077,0.852941,0.966667,0.90625,0.979861,0.40201


Fold 1 Test Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Best_Threshold
0,0.816667,0.895833,0.716667,0.796296,0.906944,0.40201


===== Fold 2 =====
Fold 2 Validation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Best_Threshold
0,0.961538,0.935484,0.966667,0.95082,0.989583,0.552764


Fold 2 Test Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Best_Threshold
0,0.825,0.882353,0.75,0.810811,0.915833,0.552764


===== Fold 3 =====
Fold 3 Validation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Best_Threshold
0,0.974026,1.0,0.931034,0.964286,0.99569,0.919598


Fold 3 Test Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Best_Threshold
0,0.75,0.916667,0.55,0.6875,0.921667,0.919598


===== Fold 4 =====
Fold 4 Validation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Best_Threshold
0,0.935065,1.0,0.827586,0.90566,0.979167,0.974874


Fold 4 Test Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Best_Threshold
0,0.666667,0.884615,0.383333,0.534884,0.911111,0.974874


===== Fold 5 =====
Fold 5 Validation Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Best_Threshold
0,0.935065,0.9,0.931034,0.915254,0.976293,0.487437


Fold 5 Test Metrics


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Best_Threshold
0,0.816667,0.865385,0.75,0.803571,0.920278,0.487437


===== Val F1 Score 기준으로 Softmax 변환한 가중치 =====
===== Stratified K-Fold Validation 평균 성능 (F1 Score Weighted) =====

== CatBoost Model (Validation) ==
Fold weights (by F1 Score): [0.0023690490020396634, 0.20427269119496738, 0.7852954671414462, 0.002233402808448755, 0.005829389853098059]
Accuracy: 0.971040
Precision: 0.985890
Recall: 0.938167
F1 Score: 0.960981
ROC AUC Score: 0.994255
Best Thresholds per fold: [0.402  0.5528 0.9196 0.9749 0.4874]  (mean: 0.6673)

===== Stratified K-Fold Test 평균 성능 (F1 Score Weighted) =====
== CatBoost Model (Test Set) ==
Accuracy: 0.765681
Precision: 0.909237
Recall: 0.592043
F1 Score: 0.713283
ROC AUC Score: 0.920409

--- 각 Fold 별 Validation 결과 ---
Fold 1:
  Accuracy: 0.923077
  Precision: 0.852941
  Recall: 0.966667
  F1 Score: 0.906250
  ROC AUC Score: 0.979861
  Best_Threshold: 0.402010
Fold 2:
  Accuracy: 0.961538
  Precision: 0.935484
  Recall: 0.966667
  F1 Score: 0.950820
  ROC AUC Score: 0.989583
  Best_Threshold: 0.552764
Fold 3:
  Accuracy: 0.97

<__main__.CatBoostFoldTrainer at 0x7f71225ba390>

## Softmax 변환 후 평균 성능 계산

In [None]:
# soft voting (평균 확률, best F1 기준 threshold 사용)
feature_importances = catboost_cv.get_feature_importances()
save_dict = {
    'test_inputs': X_test,
    'test_labels': catboost_cv.get_test_labels(),
    'test_proba': catboost_cv.get_test_proba(),
    'test_preds': catboost_cv.get_test_preds(),
    'test_metrics': catboost_cv.get_test_metrics(),
    'fold_thresholds': catboost_cv.get_fold_thresholds(),
    'shap_values_test': catboost_cv.get_shap_values_test(),
    'feature_importances': pd.DataFrame({'feature': X_train.columns, 'importance': np.mean(feature_importances['CatBoost'], axis=0)}).sort_values(by='importance', ascending=False),
}

import pickle
save_path = "/workspace/data/results/catboost_downsample_results.pkl"
with open(save_path, 'wb') as f:
    pickle.dump(save_dict, f)
    

# Analysis

## Feature Importance

In [28]:
# # CatBoost feature importance의 각 fold별 평균을 계산하여 테이블로 생성 및 정렬, 소수점 3자리까지 표시
# catboost_importances = np.mean(feature_importances['CatBoost'], axis=0)
# feature_importance_df = pd.DataFrame({
#     'feature': X_train.columns,
#     'importance': catboost_importances
# }).sort_values(by='importance', ascending=False).reset_index(drop=True)
# feature_importance_df['importance'] = feature_importance_df['importance'].round(3)
# display(feature_importance_df)

# Optimize

In [29]:
# from sklearn.model_selection import StratifiedKFold
# from catboost import CatBoostClassifier
# from utils.ml_pipeline import evaluate_model
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# import optuna
# import numpy as np
# import warnings
# import os
# os.environ['PYTHONWARNINGS'] = 'ignore'
# warnings.filterwarnings('ignore')
# # CatBoost GPU에서 "Default metric period is 5 because AUC is/are not implemented for GPU"
# # -> GPU에서 AUC를 강제로 사용하려면 1) eval_metric="Logloss" 2) leaf_estimation_method='Gradient' 3) devices='0'(또는 사용 GPU) 지정해야 함
# # 참고: https://github.com/catboost/catboost/issues/1729

# def find_best_auc_threshold(y_true, y_proba):
#     """
#     AUC가 최대가 되도록 threshold를 탐색하여 반환
#     """
#     thresholds = np.linspace(0.0, 1.0, 1001)
#     best_auc = -np.inf
#     best_thres = 0.5
#     for th in thresholds:
#         preds = (y_proba >= th).astype(int)
#         try:
#             auc = roc_auc_score(y_true, preds)
#         except Exception:
#             continue  # ROC AUC Score 계산 불가(클래스 한쪽만 나오는 경우) 시 무시
#         if auc > best_auc:
#             best_auc = auc
#             best_thres = th
#     return best_thres, best_auc

# def objective(trial):
#     param = {
#         # 각 하이퍼파라미터 정의를 주석으로 추가
#         'task_type': 'GPU',                              # 연산에 GPU 사용
#         'iterations': trial.suggest_int('iterations', 500, 1000),                # 트리의 개수(부스팅 라운드 수)
#         'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.5, log=True),  # 학습률(learning rate)
#         'depth': trial.suggest_int('depth', 5, 8),                                # 개별 트리의 깊이
#         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 30, log=True),       # L2 정규화 항의 계수
#         'subsample': trial.suggest_float('subsample', 0.6, 0.9),                  # 훈련 데이터 샘플링 비율
#         'random_strength': trial.suggest_int('random_strength', 1, 10),           # feature splits 선택에 randomness 적용 강도
#         'border_count': trial.suggest_int('border_count', 32, 128),               # 연속형 변수의 bin 개수(최대 분할 수)
#         'loss_function': 'Logloss',                                               # 손실 함수(Logloss: 이진 분류)
#         'eval_metric': 'Logloss',                                                 # 평가 지표(Logloss 사용)
#         'leaf_estimation_method': 'Gradient',                                     # 리프값 추정 방법
#         'devices': '0-1',                                                           # 사용할 GPU 디바이스 (1개만 쓸 때 '0', 여러개면 '0-1' 등)
#         'bootstrap_type': 'Bernoulli',                                            # 샘플링 방식(Bernoulli 방식)
#         'grow_policy': 'Depthwise',                                               # 트리 성장 방식(Depthwise 방식)
#         'early_stopping_rounds': 50,                                              # early stopping patience
#         'verbose': 0,                                                             # 학습 중 출력 설정(0이면 출력 안 함)
#         'gpu_ram_part': 1.0,                                                      # 전체 GPU RAM 중 사용 비율(최대값 1.0)
#     }
    
#     cv_fold_aucs = []
#     best_thresholds = []
#     kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

#     for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
#         X_fold_train = X_train.iloc[train_idx]
#         X_fold_val = X_train.iloc[val_idx]
#         y_fold_train = y_train.iloc[train_idx]
#         y_fold_val = y_train.iloc[val_idx]

#         model = CatBoostClassifier(**param)
#         model.fit(
#             X_fold_train, y_fold_train,
#             eval_set=[(X_fold_val, y_fold_val)],
#             verbose=0
#         )

#         val_pred_proba = model.predict_proba(X_fold_val)[:, 1]

#         # AUC가 최대가 되는 threshold를 찾아 해당 threshold로 평가
#         best_thres, best_auc = find_best_auc_threshold(y_fold_val.values, val_pred_proba)
#         cv_fold_aucs.append(best_auc)
#         best_thresholds.append(best_thres)
#     mean_auc = np.mean(cv_fold_aucs)
#     return mean_auc

# # Optuna를 사용한 하이퍼파라미터 최적화 (direction='maximize'로 AUC 최대화)
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=50)  # 50번 시도

# # 최적의 하이퍼파라미터 출력
# print('Best trial:')
# trial = study.best_trial
# print('  Value: {}'.format(trial.value))
# print('  Params: ')
# for key, value in trial.params.items():
#     print('    {}: {}'.format(key, value))

# # 최적의 하이퍼파라미터로 최종 모델 학습
# best_params = study.best_params
# best_params.update({
#     'task_type': 'GPU',  # GPU 사용
#     'loss_function': 'Logloss',
#     'eval_metric': 'Logloss',  # GPU에서 AUC 평가 강제 활성화
#     'bootstrap_type': 'Bernoulli',
#     'grow_policy': 'Depthwise',
#     'early_stopping_rounds': 100,
#     'verbose': 100,
#     'devices': '0',
#     'leaf_estimation_method': 'Gradient',
#     'gpu_ram_part': 0.3,
# })

# final_model = CatBoostClassifier(**best_params)

# print("Best Params")
# print(best_params)

In [30]:

# # 최종적으로 검증세트 기준 best threshold로 평가 및 AUC Curve 등 시각화

# import matplotlib.pyplot as plt
# from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, f1_score

# # 검증 세트 기준 평가 (best threshold 포함)
# from utils.ml_pipeline import evaluate_model

# # 검증 세트에서 best threshold 등 평가
# eval_results = evaluate_model(final_model, X_test, y_test)

# print("최종 모델 평가 결과 (Train/Validation 기준):")
# for key, value in eval_results.items():
#     print(f"{key}: {value}")

# # ROC Curve
# y_pred_proba = final_model.predict_proba(X_test)[:, 1]
# fpr, tpr, roc_thresholds = roc_curve(y_test, y_pred_proba)
# roc_auc = roc_auc_score(y_test, y_pred_proba)

# plt.figure(figsize=(7,5))
# plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
# plt.plot([0,1], [0,1], linestyle='--', color='gray')
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC Curve (Train Set)")
# plt.legend(loc="best")
# plt.show()

# # Best Threshold로 이진분류 및 f1/precision/recall 점수 확인
# best_threshold = eval_results['Best Threshold']
# y_test_pred_binary = (y_pred_proba >= best_threshold).astype(int)
# f1 = f1_score(y_test, y_test_pred_binary)
# precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)

# print(f"Best Threshold (Train Set): {best_threshold:.4f}")
# print(f"F1 Score (at best threshold): {f1:.4f}")

# # 최종 테스트 데이터 예측 및 저장
# y_test_pred_proba = final_model.predict_proba(X_test)[:, 1]
# ids = [f"TEST_{i:05d}" for i in range(len(X_test))]
# result_df = pd.DataFrame({
#     'ID': ids,
#     'probability': y_test_pred_proba
# })