In [1]:
!pip install --quiet pandas numpy scikit-learn matplotlib seaborn tqdm catboost tqdm-joblib
from google.colab import drive
drive.mount('/content/drive')

import os
from datetime import datetime
# 프로젝트 루트
base_dir = '/content/drive/MyDrive/Colab Notebooks/패턴인식'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib
from joblib import Parallel, delayed
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, ParameterGrid
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report,
    roc_curve, precision_recall_curve
)
import matplotlib.pyplot as plt
import seaborn as sns

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_root = os.path.join(base_dir, 'GBM/log/trial1_orig', timestamp)
os.makedirs(output_root, exist_ok=True)

log_path = os.path.join(output_root, f'log_{timestamp}.txt')
conf_matrix_path = os.path.join(output_root, f'confusion_matrix_{timestamp}.png')
roc_path = os.path.join(output_root, f'roc_curve_{timestamp}.png')
pr_path = os.path.join(output_root, f'pr_curve_{timestamp}.png')
fi_path = os.path.join(output_root, f'feature_importance_{timestamp}.png')

# 데이터 불러오기
train_src = os.path.join(base_dir, 'data_preprocessing/result', 'trial1_train.csv')
df = pd.read_csv(train_src)

X = df.drop(columns=['target'])
y = df['target']

X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(drop='first', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    # ('cat', categorical_pipe, cat_cols),
])

pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', GradientBoostingClassifier(
        n_estimators=400, learning_rate=0.05, max_depth=5,
        subsample=0.8, min_samples_split=4, min_samples_leaf=2,
        max_features='sqrt', random_state=42
    ))
])

# GridSearchCV with progress bar
param_grid = {
    'clf__n_estimators': [200, 400],
    'clf__learning_rate': [0.01, 0.05, 0.1],
    'clf__max_depth': [3, 5],
    'clf__subsample': [0.8, 1.0],
    'clf__min_samples_split': [2, 4],
    'clf__min_samples_leaf': [1, 2],
    'clf__max_features': ['sqrt']
}
n_candidates = len(list(ParameterGrid(param_grid)))
cv_folds = 3

grid_search = GridSearchCV(pipe, param_grid, cv=cv_folds, scoring='roc_auc', verbose=0, n_jobs=-1)
print("Starting GridSearchCV with progress bar...")
with tqdm_joblib(tqdm(total=n_candidates * cv_folds, desc="GridSearchCV")):
    grid_search.fit(X_trainval, y_trainval)

best_pipe = grid_search.best_estimator_

# 5-Fold CV evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc, f1, auc = [], [], []

with open(log_path, 'w') as log_file:
    log_file.write('Starting 5-Fold Cross-Validation:\n')
    for idx, (train_idx, val_idx) in enumerate(tqdm(cv.split(X_trainval, y_trainval), total=5, desc='5-Fold CV'), 1):
        X_tr, X_val = X_trainval.iloc[train_idx], X_trainval.iloc[val_idx]
        y_tr, y_val = y_trainval.iloc[train_idx], y_trainval.iloc[val_idx]

        best_pipe.fit(X_tr, y_tr)
        y_pred = best_pipe.predict(X_val)
        y_prob = best_pipe.predict_proba(X_val)[:, 1]

        acc.append(accuracy_score(y_val, y_pred))
        f1.append(f1_score(y_val, y_pred))
        auc.append(roc_auc_score(y_val, y_prob))

    comp = (np.array(acc) + np.array(f1) + np.array(auc)) / 3
    log_file.write('\nCross-Validation Results:\n')

    for i in range(5):
        line = f"[Fold {i+1}] Accuracy: {acc[i]:.4f}, F1: {f1[i]:.4f}, AUC: {auc[i]:.4f}, Composite: {comp[i]:.4f}"
        log_file.write(line + '\n')

    pipe.fit(X_trainval, y_trainval)
    y_pred = pipe.predict(X_test)
    y_prob = pipe.predict_proba(X_test)[:, 1]

    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_prob)
    }
    comp_test = sum(metrics.values()) / 3

    log_file.write('\nFinal Evaluation on Holdout Test Set:\n')
    for name, val in {**metrics, 'Composite': comp_test}.items():
        log_file.write(f"{name} : {val:.4f}\n")


Starting GridSearchCV with progress bar...


GridSearchCV:   0%|          | 0/288 [00:00<?, ?it/s]

  0%|          | 0/288 [00:00<?, ?it/s]


5-Fold CV:   0%|          | 0/5 [00:00<?, ?it/s][A
5-Fold CV:  20%|██        | 1/5 [00:07<00:30,  7.73s/it][A
5-Fold CV:  40%|████      | 2/5 [00:14<00:21,  7.03s/it][A
5-Fold CV:  60%|██████    | 3/5 [00:22<00:14,  7.36s/it][A
5-Fold CV:  80%|████████  | 4/5 [00:28<00:07,  7.06s/it][A
5-Fold CV: 100%|██████████| 5/5 [00:36<00:00,  7.27s/it]


In [6]:
!pip install --quiet catboost pandas numpy scikit-learn tqdm

from google.colab import drive
drive.mount('/content/drive')

import os
import time
import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, RandomizedSearchCV
)
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

base_dir   = '/content/drive/MyDrive/Colab Notebooks/패턴인식'
data_path  = os.path.join(base_dir, 'data_preprocessing', 'result', 'trial1_train.csv')
log_root   = os.path.join(base_dir, 'catboost', 'log')
timestamp  = time.strftime('%Y%m%d_%H%M%S')
output_dir = os.path.join(log_root, f'trial2_{timestamp}')
os.makedirs(output_dir, exist_ok=True)

MODEL_FILE = os.path.join(output_dir, 'catboost_final_model_cpu.cbm')
LOG_FILE   = os.path.join(output_dir, 'log.txt')
FI_CSV     = os.path.join(output_dir, 'feature_importances.csv')

start_time = time.time()

df = pd.read_csv(data_path)

X = df.drop(columns=['target'])
y = df['target']

X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)
print(f"▶ Data split: TrainVal={X_trainval.shape}, Test={X_test.shape}")

param_dist = {
    'learning_rate':       [0.01,0.03,0.05,0.07,0.1],
    'depth':               [4,6,8,10],
    'l2_leaf_reg':         [1,3,5,7,9,12],
    'border_count':        [32,64,128,254],
    'bagging_temperature': [0,0.5,1.0,1.5,2.0],
    'random_strength':     [0.1,0.5,1,2,5],
    'colsample_bylevel':   [0.6,0.7,0.8,0.9,1.0]
}

base_model = CatBoostClassifier(
    iterations=5,
    random_state=42,
    verbose=0,
    task_type='GPU',
    devices='0'
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print(f"▶ Starting RandomizedSearchCV (30 iters)...")
search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    n_iter=3,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
    refit=True
)
search.fit(X_trainval, y_trainval)

best_params = search.best_params_
best_score  = search.best_score_
print(f"▶ Best params: {best_params}")
print(f"▶ Best CV ROC AUC: {best_score:.4f}")

print("▶ Training final model with early stopping...")
final_model = CatBoostClassifier(
    iterations=5,
    **best_params,
    eval_metric='AUC',
    early_stopping_rounds=50,
    use_best_model=True,
    random_state=42,
    verbose=100,
    task_type='GPU',
    devices='0'
)
final_model.fit(
    X_trainval, y_trainval,
    eval_set=[(X_test, y_test)]
)
best_iter = final_model.get_best_iteration()
print(f"▶ Best iteration: {best_iter}")

fi = final_model.get_feature_importance(prettified=True)
fi.head(20).to_csv(FI_CSV, index=False)
print(f"▶ Feature importances saved to {FI_CSV}")

y_pred   = final_model.predict(X_test)
y_prob   = final_model.predict_proba(X_test)[:, 1]
acc_test = accuracy_score(y_test, y_pred)
f1_test  = f1_score(y_test, y_pred)
auc_test = roc_auc_score(y_test, y_prob)
comp_test= (acc_test + f1_test + auc_test) / 3

print("▶ Final hold-out performance:")
print(f"   Accuracy : {acc_test:.4f}")
print(f"   F1 Score : {f1_test:.4f}")
print(f"   ROC AUC  : {auc_test:.4f}")
print(f"   Composite: {comp_test:.4f}")

# 13) 모델 및 로그 저장
final_model.save_model(MODEL_FILE)
print(f"▶ Model saved to {MODEL_FILE}")

with open(LOG_FILE, 'w') as f:
    f.write(f"Best params      : {best_params}\n")
    f.write(f"Best CV ROC AUC  : {best_score:.4f}\n")
    f.write(f"Best iteration   : {best_iter}\n")
    f.write(f"Test Accuracy    : {acc_test:.4f}\n")
    f.write(f"Test F1 Score    : {f1_test:.4f}\n")
    f.write(f"Test ROC AUC     : {auc_test:.4f}\n")
    f.write(f"Test Composite   : {comp_test:.4f}\n")

end_time = time.time()
elapsed  = end_time - start_time
hrs      = int(elapsed // 3600)
mins     = int((elapsed % 3600) // 60)
secs     = int(elapsed % 60)
print(f"▶ Total execution time: {hrs}h {mins}m {secs}s")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
▶ Data split: TrainVal=(17760, 57), Test=(4440, 57)
▶ Starting RandomizedSearchCV (30 iters)...
Fitting 5 folds for each of 3 candidates, totalling 15 fits


10 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/catboost/core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "/usr/local/lib/python3.11/dist-packages/catboost/core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File 

▶ Best params: {'random_strength': 1, 'learning_rate': 0.1, 'l2_leaf_reg': 1, 'depth': 4, 'colsample_bylevel': 1.0, 'border_count': 128, 'bagging_temperature': 2.0}
▶ Best CV ROC AUC: 0.6706
▶ Training final model with early stopping...
0:	test: 0.6467121	best: 0.6467121 (0)	total: 11ms	remaining: 43.8ms
4:	test: 0.6846750	best: 0.6846750 (4)	total: 30.1ms	remaining: 0us
bestTest = 0.6846749783
bestIteration = 4
▶ Best iteration: 4
▶ Feature importances saved to /content/drive/MyDrive/Colab Notebooks/패턴인식/catboost/log/trial2_20250527_115019/feature_importances.csv
▶ Final hold-out performance:
   Accuracy : 0.6340
   F1 Score : 0.6370
   ROC AUC  : 0.6847
   Composite: 0.6519
▶ Model saved to /content/drive/MyDrive/Colab Notebooks/패턴인식/catboost/log/trial2_20250527_115019/catboost_final_model_cpu.cbm
▶ Total execution time: 0h 0m 11s


Default metric period is 5 because AUC is/are not implemented for GPU


-------

In [18]:
# Ensemble
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
ens_dir = os.path.join(base_dir,'GBM_catboost_ensemble','log',timestamp)
os.makedirs(ens_dir, exist_ok=True)
ens_log = os.path.join(ens_dir,'log.txt')
print(">> Ensemble log file path:", ens_log)

p1 = best_pipe.predict_proba(X_test)[:,1]
p2 = final_model.predict_proba(X_test)[:,1]
prob_ens = (p1+p2)/2
y_pred = (prob_ens>=0.5).astype(int)

acc = accuracy_score(y_test,y_pred)
f1m= f1_score(y_test,y_pred)
auc= roc_auc_score(y_test,prob_ens)
comp=(acc+f1m+auc)/3

with open(ens_log, 'w') as f:
    f.write("▶ Ensemble 평균 결과\n")
    f.write(f"Accuracy : {acc:.4f}\n")
    f.write(f"F1 Score : {f1m:.4f}\n")
    f.write(f"ROC AUC  : {auc:.4f}\n")
    f.write(f"Composite: {comp:.4f}\n")


>> Ensemble log file path: /content/drive/MyDrive/Colab Notebooks/패턴인식/GBM_catboost_ensemble/log/20250527_121327/log.txt


In [19]:
# VotingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics   import accuracy_score, f1_score, roc_auc_score
from catboost          import CatBoostClassifier
from datetime         import datetime
import os

# CatBoost 복사 후 'use_best_model' 제거
cb_params = final_model.get_params()
cb_params.pop('use_best_model', None)
cb_params.pop('early_stopping_rounds', None)

# Voting용 CatBoost
cb_for_voting = CatBoostClassifier(**cb_params)

timestamp  = datetime.now().strftime('%Y%m%d_%H%M%S')
vote_dir   = os.path.join(base_dir, 'GBM_catboost_voting', 'log', timestamp)
os.makedirs(vote_dir, exist_ok=True)
vote_log   = os.path.join(vote_dir, 'log.txt')
print(">> Voting log file path:", vote_log)

voter = VotingClassifier(
    estimators=[('gbm', best_pipe), ('cb', cb_for_voting)],
    voting='soft',
    weights=[1, 1],
    n_jobs=-1
)

voter.fit(X_trainval, y_trainval)

prob_vote   = voter.predict_proba(X_test)[:, 1]
y_pred_vote = voter.predict(X_test)

acc   = accuracy_score(y_test, y_pred_vote)
f1m   = f1_score    (y_test, y_pred_vote)
auc   = roc_auc_score(y_test, prob_vote)
comp  = (acc + f1m + auc) / 3

with open(vote_log, 'w') as f:
    f.write("▶ VotingClassifier (soft) 결과\n")
    f.write(f"Accuracy : {acc:.4f}\n")
    f.write(f"F1 Score : {f1m:.4f}\n")
    f.write(f"ROC AUC  : {auc:.4f}\n")
    f.write(f"Composite: {comp:.4f}\n")


>> Voting log file path: /content/drive/MyDrive/Colab Notebooks/패턴인식/GBM_catboost_voting/log/20250527_121354/log.txt


In [20]:
# StackingClassifier
from sklearn.ensemble      import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics      import accuracy_score, f1_score, roc_auc_score
from catboost             import CatBoostClassifier
from datetime             import datetime
import os

cb_params = final_model.get_params()
# 불필요한 파라미터 제거
for p in ('use_best_model','early_stopping_rounds','task_type','devices'):
    cb_params.pop(p, None)
# CPU 모드
cb_params['task_type'] = 'CPU'
cb_base = CatBoostClassifier(**cb_params)

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
stk_dir   = os.path.join(base_dir, 'GBM_catboost_stacking', 'log', timestamp)
os.makedirs(stk_dir, exist_ok=True)
stk_log   = os.path.join(stk_dir, 'log.txt')
print(">> Stacking log file path:", stk_log)

stack = StackingClassifier(
    estimators=[('gbm', best_pipe), ('cb', cb_base)],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
    stack_method='predict_proba',
    n_jobs=1,
    passthrough=False
)

stack.fit(X_trainval, y_trainval)

prob_stack   = stack.predict_proba(X_test)[:, 1]
y_pred_stack = stack.predict(X_test)

acc  = accuracy_score(y_test, y_pred_stack)
f1m  = f1_score    (y_test, y_pred_stack)
auc  = roc_auc_score(y_test, prob_stack)
comp = (acc + f1m + auc) / 3

with open(stk_log, 'w') as f:
    f.write("▶ StackingClassifier 결과\n")
    f.write(f"Accuracy : {acc:.4f}\n")
    f.write(f"F1 Score : {f1m:.4f}\n")
    f.write(f"ROC AUC  : {auc:.4f}\n")
    f.write(f"Composite: {comp:.4f}\n")

>> Stacking log file path: /content/drive/MyDrive/Colab Notebooks/패턴인식/GBM_catboost_stacking/log/20250527_121446/log.txt
0:	total: 7.95ms	remaining: 31.8ms
4:	total: 33.6ms	remaining: 0us
0:	total: 6.77ms	remaining: 27.1ms
4:	total: 30.5ms	remaining: 0us
0:	total: 6.18ms	remaining: 24.7ms
4:	total: 28.7ms	remaining: 0us
0:	total: 6.27ms	remaining: 25.1ms
4:	total: 28.7ms	remaining: 0us
0:	total: 6.29ms	remaining: 25.2ms
4:	total: 28.4ms	remaining: 0us
0:	total: 5.77ms	remaining: 23.1ms
4:	total: 28.2ms	remaining: 0us
