In [1]:
!pip install --quiet pandas numpy scikit-learn matplotlib seaborn tqdm catboost tqdm-joblib
from google.colab import drive
drive.mount('/content/drive')

import os
from datetime import datetime



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


In [2]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [10]:
!pip install --quiet catboost pandas numpy scikit-learn tqdm imbalanced-learn

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

import pandas as pd
import numpy as np
import os
import time

from sklearn.model_selection import (
    train_test_split, StratifiedKFold, RandomizedSearchCV
)
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, make_scorer
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.inspection import permutation_importance

# --- 설정 및 경로 정의 ---
# 프로젝트 루트
base_dir = '/content/drive/MyDrive/Colab Notebooks/패턴인식'
data_path  = os.path.join(base_dir, 'data_preprocessing/result', 'trial1_train.csv')
log_root   = os.path.join(base_dir, 'catboost', 'log')
timestamp  = time.strftime('%Y%m%d_%H%M%S')
output_dir = os.path.join(log_root, f'trial_derived_features_{timestamp}')
os.makedirs(output_dir, exist_ok=True)

MODEL_FILE = os.path.join(output_dir, 'catboost_final_model_cpu_derived.cbm')
LOG_FILE   = os.path.join(output_dir, 'log_derived_features.txt')
FI_CSV     = os.path.join(output_dir, 'feature_importances_derived.csv')
PI_CSV     = os.path.join(output_dir, 'permutation_importances_derived.csv')

N_ITER_SEARCH = 30  # RandomizedSearchCV 반복 횟수

# --- 1) 데이터 로드 ---
start_time = time.time()
print("1) Loading data...")
df = pd.read_csv(data_path)
X = df.drop(columns=['target'])
y = df['target']

# --- 2) 파생 변수 생성 ---
print("\n2) Creating ONLY the previously useful derived features...")
epsilon = 1e-6
if 'n_tokens_content' in X and 'num_imgs' in X:
    X['feat_content_to_img_ratio'] = X['n_tokens_content'] / (X['num_imgs'] + epsilon)
if 'global_subjectivity' in X and 'global_sentiment_polarity' in X:
    X['feat_global_sentiment_strength'] = X['global_subjectivity'] * X['global_sentiment_polarity']
if 'n_tokens_content' in X and 'num_hrefs' in X:
    X['feat_content_to_href_ratio'] = X['n_tokens_content'] / (X['num_hrefs'] + epsilon)
print("Only previously useful derived features created.")

# --- 3) Train/Test 분할 ---
print("\n3) Splitting data into training and test sets...")
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# --- 4) 수치형 결측치 처리 (중위수 대체) ---
print("\n4) Applying preprocessing (missing value imputation)...")
num_cols = X_trainval.select_dtypes(include=[np.number]).columns.tolist()
medi = X_trainval[num_cols].median()
X_trainval[num_cols] = X_trainval[num_cols].fillna(medi)
X_test[num_cols]      = X_test[num_cols].fillna(medi)

# --- 4.1) 원핫 복원 → 타깃 인코딩 ---
print("\n4.1) One-hot restore & Target encoding...")
global_mean = y_trainval.mean()
# data_channel 복원
data_channel_cols = [c for c in X_trainval if c.startswith("data_channel_")]
weekday_cols      = [c for c in X_trainval if c.startswith("weekday_")]

def restore_from_onehot(df, cols, prefix):
    return df[cols].idxmax(axis=1).str.replace(f"{prefix}_", "", regex=False)

if data_channel_cols:
    X_trainval['data_channel'] = restore_from_onehot(X_trainval, data_channel_cols, "data_channel")
    X_test['data_channel']     = restore_from_onehot(X_test, data_channel_cols, "data_channel")
if weekday_cols:
    X_trainval['weekday'] = restore_from_onehot(X_trainval, weekday_cols, "weekday")
    X_test['weekday']     = restore_from_onehot(X_test, weekday_cols, "weekday")

for col in ['data_channel', 'weekday']:
    if col in X_trainval:
        means = X_trainval.join(y_trainval).groupby(col)[y.name].mean()
        X_trainval[col] = X_trainval[col].map(means)
        X_test[col]     = X_test[col].map(means).fillna(global_mean)

# --- 4.5) SMOTE 적용 ---
print("\n4.5) Adjusting for class imbalance with SMOTE...")
smote = SMOTE(random_state=42)
X_trainval, y_trainval = smote.fit_resample(X_trainval, y_trainval)
print(f"After SMOTE → TrainVal: {X_trainval.shape}, class counts: {np.bincount(y_trainval)}")

# --- 5) 하이퍼파라미터 탐색 범위 정의 ---
print("\n5) Defining hyperparameter search space...")
param_dist_tuned = {
    'learning_rate':      [0.01, 0.03, 0.05, 0.07, 0.1],
    'depth':              [4, 6, 8, 10],
    'l2_leaf_reg':        [1, 3, 5, 7, 9, 12],
    'border_count':       [32, 64, 128, 254],
    'bagging_temperature':[0, 0.5, 1.0, 1.5, 2.0],
    'random_strength':    [0.1, 0.5, 1, 2, 5],
    'colsample_bylevel':  [0.6, 0.7, 0.8, 0.9, 1.0]
}

# --- 6) RandomizedSearchCV 세팅 및 실행 ---
print(f"\n6) Starting RandomizedSearchCV ({N_ITER_SEARCH} iterations) using CPU...")
base_model = CatBoostClassifier(
    iterations=1000,
    random_state=42,
    verbose=0,
    task_type='GPU',
    devices='0'
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist_tuned,
    n_iter=N_ITER_SEARCH,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
    refit=True
)

search.fit(
    X_trainval,
    y_trainval,
    cat_features=None  # 필요한 경우 cat_cols 리스트를 지정
)

best_params = search.best_params_
best_score  = search.best_score_
print("\n" + "="*50)
print("RandomizedSearchCV Results:")
print("="*50)
print("Best hyperparameters:", best_params)
print(f"Best CV ROC AUC: {best_score:.4f}")
print("="*50)

# --- 7) 최종 모델 재학습 (Early Stopping) ---
print("\n7) Training final tuned model with early stopping on test set...")
final_model = CatBoostClassifier(
    iterations=2000,
    eval_metric='AUC',
    early_stopping_rounds=50,
    use_best_model=True,
    random_state=42,
    verbose=100,
    task_type='GPU',
    devices='0',
    **best_params
)

final_model.fit(
    X_trainval, y_trainval,
    cat_features=None,
    eval_set=[(X_test, y_test)]
)
best_iter = final_model.get_best_iteration()
print(f"Best iteration: {best_iter}")

# feature importance, permutation importance, 평가, 저장
fi = final_model.get_feature_importance(prettified=True)
fi.head(20).to_csv(FI_CSV, index=False)
print(f"▶ Feature importances saved to {FI_CSV}")

y_pred   = final_model.predict(X_test)
y_prob   = final_model.predict_proba(X_test)[:, 1]
acc_test = accuracy_score(y_test, y_pred)
f1_test  = f1_score(y_test, y_pred)
auc_test = roc_auc_score(y_test, y_prob)
comp_test= (acc_test + f1_test + auc_test) / 3

print("▶ Final hold-out performance:")
print(f"   Accuracy : {acc_test:.4f}")
print(f"   F1 Score : {f1_test:.4f}")
print(f"   ROC AUC  : {auc_test:.4f}")
print(f"   Composite: {comp_test:.4f}")

final_model.save_model(MODEL_FILE)
print(f"▶ Model saved to {MODEL_FILE}")

with open(LOG_FILE, 'w') as f:
    f.write(f"Best params      : {best_params}\n")
    f.write(f"Best CV ROC AUC  : {best_score:.4f}\n")
    f.write(f"Best iteration   : {best_iter}\n")
    f.write(f"Test Accuracy    : {acc_test:.4f}\n")
    f.write(f"Test F1 Score    : {f1_test:.4f}\n")
    f.write(f"Test ROC AUC     : {auc_test:.4f}\n")
    f.write(f"Test Composite   : {comp_test:.4f}\n")

end_time = time.time()
elapsed  = end_time - start_time
hrs      = int(elapsed // 3600)
mins     = int((elapsed % 3600) // 60)
secs     = int(elapsed % 60)
print(f"▶ Total execution time: {hrs}h {mins}m {secs}s")


1) Loading data...

2) Creating ONLY the previously useful derived features...
Only previously useful derived features created.

3) Splitting data into training and test sets...

4) Applying preprocessing (missing value imputation)...

4.1) One-hot restore & Target encoding...

4.5) Adjusting for class imbalance with SMOTE...
After SMOTE → TrainVal: (17914, 62), class counts: [8957 8957]

5) Defining hyperparameter search space...

6) Starting RandomizedSearchCV (30 iterations) using CPU...
Fitting 5 folds for each of 30 candidates, totalling 150 fits


120 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/catboost/core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "/usr/local/lib/python3.11/dist-packages/catboost/core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  Fi


RandomizedSearchCV Results:
Best hyperparameters: {'random_strength': 1, 'learning_rate': 0.03, 'l2_leaf_reg': 3, 'depth': 8, 'colsample_bylevel': 1.0, 'border_count': 128, 'bagging_temperature': 1.5}
Best CV ROC AUC: 0.7200

7) Training final tuned model with early stopping on test set...


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6794876	best: 0.6794876 (0)	total: 104ms	remaining: 3m 27s
100:	test: 0.7220915	best: 0.7221278 (99)	total: 10.8s	remaining: 3m 22s
200:	test: 0.7256352	best: 0.7256352 (200)	total: 20.7s	remaining: 3m 5s
300:	test: 0.7273985	best: 0.7275464 (297)	total: 30.5s	remaining: 2m 52s
400:	test: 0.7279806	best: 0.7280679 (369)	total: 41s	remaining: 2m 43s
500:	test: 0.7278011	best: 0.7283238 (468)	total: 51.6s	remaining: 2m 34s
bestTest = 0.7283237576
bestIteration = 468
Shrink model to first 469 iterations.
Best iteration: 468
▶ Feature importances saved to /content/drive/MyDrive/Colab Notebooks/패턴인식/catboost/log/trial_derived_features_20250528_120658/feature_importances_derived.csv
▶ Final hold-out performance:
   Accuracy : 0.6662
   F1 Score : 0.6643
   ROC AUC  : 0.7283
   Composite: 0.6863
▶ Model saved to /content/drive/MyDrive/Colab Notebooks/패턴인식/catboost/log/trial_derived_features_20250528_120658/catboost_final_model_cpu_derived.cbm
▶ Total execution time: 0h 40m 12s
