In [None]:
# Install dependencies
!pip install --quiet pandas numpy tqdm scikit-learn matplotlib seaborn tqdm-joblib


In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM; the project directory used by the
# following cells lives underneath this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Mounted at /content/drive


In [None]:
import os

# Project root on the mounted Drive. Input data and every output artifact
# are addressed relative to this directory.
base_dir = os.path.join('/content/drive', 'MyDrive/Colab Notebooks/패턴인식')


In [None]:

import os
import warnings
from datetime import datetime

warnings.filterwarnings("ignore", category=FutureWarning)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report,
    roc_curve, precision_recall_curve
)
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib

# Timestamp setup: each run writes into its own directory named after the
# start time, so earlier results are never overwritten.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_root = os.path.join(base_dir, 'GBM/log/trial1_orig', timestamp)
os.makedirs(output_root, exist_ok=True)

# One path per artifact, all of the form <output_root>/<stem>_<timestamp>.<ext>.
log_path, conf_matrix_path, roc_path, pr_path, fi_path = (
    os.path.join(output_root, f'{stem}_{timestamp}.{ext}')
    for stem, ext in (
        ('log', 'txt'),
        ('confusion_matrix', 'png'),
        ('roc_curve', 'png'),
        ('pr_curve', 'png'),
        ('feature_importance', 'png'),
    )
)

# Read the preprocessed training table from the project directory.
train_src = os.path.join(base_dir, 'data_preprocessing/result', 'trial1_train.csv')
df = pd.read_csv(train_src)

# The 'target' column is the label; every other column is a candidate feature.
y = df['target']
X = df.drop(columns=['target'])

# Reserve a stratified 20% holdout set; the remaining 80% is used for
# hyperparameter search and cross-validation.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_trainval, X_test, y_trainval, y_test = holdout_split

# Columns stored as int64/float64 are the numeric features.
num_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the mode, then one-hot encode
# (first level dropped). It is defined here but not wired into the
# preprocessor below.
categorical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(drop='first', sparse_output=False)),
])

# Only the numeric branch is active. ColumnTransformer drops columns that are
# not listed by default, so non-numeric columns never reach the classifier.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipe, num_cols),
    # ('cat', categorical_pipe, cat_cols),
])

# Baseline gradient-boosting configuration. The step name 'clf' is what the
# 'clf__*' keys of the hyperparameter grid refer to.
base_clf = GradientBoostingClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
)

pipe = Pipeline(steps=[
    ('pre', preprocessor),
    ('clf', base_clf),
])

# Hyperparameter grid for the 'clf' step of the pipeline.
param_grid = {
    'clf__n_estimators': [200, 400],
    'clf__learning_rate': [0.01, 0.05, 0.1],
    'clf__max_depth': [3, 5],
    'clf__subsample': [0.8, 1.0],
    'clf__min_samples_split': [2, 4],
    'clf__min_samples_leaf': [1, 2],
    'clf__max_features': ['sqrt']
}
# ParameterGrid implements __len__, so the number of candidates is available
# without materialising every combination.
n_candidates = len(ParameterGrid(param_grid))
cv_folds = 3

grid_search = GridSearchCV(pipe, param_grid, cv=cv_folds, scoring='roc_auc', verbose=0, n_jobs=-1)
print("Starting GridSearchCV with progress bar...")
# tqdm_joblib builds its own tqdm bar from the arguments it receives, so the
# bar is configured through keywords. Passing an already-built tqdm instance
# makes it wrap that bar in a second one, leaving the labelled bar stuck at 0%
# and never closed. One tick corresponds to one (candidate, fold) fit.
with tqdm_joblib(desc="GridSearchCV", total=n_candidates * cv_folds):
    grid_search.fit(X_trainval, y_trainval)

# Best candidate by mean ROC AUC, refit by GridSearchCV on all of X_trainval.
best_pipe = grid_search.best_estimator_

# Evaluate the tuned pipeline in two stages and write everything to log_path:
#   1. stratified k-fold cross-validation on the train/validation split,
#   2. a single fit on that whole split, scored on the holdout test set.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
n_folds = cv.get_n_splits()
acc, f1, auc = [], [], []

with open(log_path, 'w', encoding='utf-8') as log_file:
    log_file.write(f'Starting {n_folds}-Fold Cross-Validation:\n')
    fold_splits = tqdm(cv.split(X_trainval, y_trainval), total=n_folds, desc=f'{n_folds}-Fold CV')
    for train_idx, val_idx in fold_splits:
        X_tr, X_val = X_trainval.iloc[train_idx], X_trainval.iloc[val_idx]
        y_tr, y_val = y_trainval.iloc[train_idx], y_trainval.iloc[val_idx]

        # Train an unfitted copy for each fold. Refitting best_pipe itself
        # would leave grid_search.best_estimator_ trained on only the last
        # fold's subset instead of the full train/validation split.
        fold_model = clone(best_pipe).fit(X_tr, y_tr)
        fold_pred = fold_model.predict(X_val)
        fold_prob = fold_model.predict_proba(X_val)[:, 1]

        acc.append(accuracy_score(y_val, fold_pred))
        f1.append(f1_score(y_val, fold_pred))
        auc.append(roc_auc_score(y_val, fold_prob))

    # Composite score: unweighted mean of accuracy, F1 and ROC AUC per fold.
    comp = (np.array(acc) + np.array(f1) + np.array(auc)) / 3
    log_file.write('\nCross-Validation Results:\n')

    for fold_no, (fold_acc, fold_f1, fold_auc, fold_comp) in enumerate(zip(acc, f1, auc, comp), 1):
        log_file.write(
            f"[Fold {fold_no}] Accuracy: {fold_acc:.4f}, F1: {fold_f1:.4f}, "
            f"AUC: {fold_auc:.4f}, Composite: {fold_comp:.4f}\n"
        )

    # The holdout score must describe the same configuration as the CV scores
    # above, i.e. the hyperparameters selected by the grid search. They are
    # applied to `pipe` before fitting, so `pipe` ends up as the fitted final
    # model and y_pred / y_prob hold its predictions on the test set.
    pipe.set_params(**grid_search.best_params_)
    pipe.fit(X_trainval, y_trainval)
    y_pred = pipe.predict(X_test)
    y_prob = pipe.predict_proba(X_test)[:, 1]

    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_prob)
    }
    comp_test = sum(metrics.values()) / 3

    log_file.write('\nFinal Evaluation on Holdout Test Set:\n')
    for name, val in {**metrics, 'Composite': comp_test}.items():
        log_file.write(f"{name} : {val:.4f}\n")


Starting GridSearchCV with progress bar...


GridSearchCV:   0%|          | 0/288 [00:00<?, ?it/s]

  0%|          | 0/288 [00:00<?, ?it/s]


5-Fold CV:   0%|          | 0/5 [00:00<?, ?it/s][A
5-Fold CV:  20%|██        | 1/5 [00:06<00:24,  6.20s/it][A
5-Fold CV:  40%|████      | 2/5 [00:12<00:18,  6.21s/it][A
5-Fold CV:  60%|██████    | 3/5 [00:18<00:12,  6.23s/it][A
5-Fold CV:  80%|████████  | 4/5 [00:24<00:06,  6.22s/it][A
5-Fold CV: 100%|██████████| 5/5 [00:31<00:00,  6.22s/it]
