In [3]:
# 1) 패키지 설치
!pip install --quiet catboost pandas numpy scikit-learn tqdm

# 2) Google Drive 마운트
from google.colab import drive
drive.mount('/content/drive')

# 3) 라이브러리 임포트 및 설정
import os
import time
import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, RandomizedSearchCV
)
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from catboost import CatBoostClassifier

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# 4) 경로 설정
base_dir      = '/content/drive/MyDrive/Colab Notebooks/패턴인식'
input_path    = os.path.join(base_dir, 'data', 'train.csv')
log_root      = os.path.join(base_dir, 'catboost', 'log')
timestamp     = time.strftime('%Y%m%d_%H%M%S')
output_dir    = os.path.join(log_root, f'trial2_{timestamp}')
os.makedirs(output_dir, exist_ok=True)

MODEL_FILE    = os.path.join(output_dir, 'catboost_final_model_cpu.cbm')
LOG_FILE      = os.path.join(output_dir, 'log.txt')
FI_CSV        = os.path.join(output_dir, 'feature_importances.csv')

start_time = time.time()

# 5) 데이터 로드 및 target 생성
df = pd.read_csv(input_path)

# target 생성: shares > 1400 => 1, else 0
df['target'] = (df['shares'] > 1400).astype(int)

# 불필요한 컬럼 제거
df = df.drop(columns=['id', 'shares', 'y'], errors='ignore')

# 피처/라벨 분리
X = df.drop(columns=['target'])
y = df['target']


# 6) 파생변수 생성
print("▶ Creating derived features...")
epsilon = 1e-6
# 1) content_to_href_ratio
if {'n_tokens_content','num_hrefs'}.issubset(X.columns):
    X['feat_content_to_href_ratio'] = X['n_tokens_content'] / (X['num_hrefs'] + epsilon)
# 2) content_to_img_ratio
if {'n_tokens_content','num_imgs'}.issubset(X.columns):
    X['feat_content_to_img_ratio'] = X['n_tokens_content'] / (X['num_imgs'] + epsilon)
# 3) kw_avg_spread
if {'kw_avg_max','kw_avg_min'}.issubset(X.columns):
    X['feat_kw_avg_spread'] = X['kw_avg_max'] - X['kw_avg_min']
# 4) global_sentiment_strength
if {'global_subjectivity','global_sentiment_polarity'}.issubset(X.columns):
    X['feat_global_sentiment_strength'] = X['global_subjectivity'] * X['global_sentiment_polarity']
# 5) title_sentiment_strength
if {'title_subjectivity','title_sentiment_polarity'}.issubset(X.columns):
    X['feat_title_sentiment_strength'] = X['title_subjectivity'] * X['title_sentiment_polarity']
# 6-7) LDA features
lda_cols = [f'LDA_{i:02d}' for i in range(5)]
if all(col in X.columns for col in lda_cols):
    X['feat_lda_max_value']  = X[lda_cols].max(axis=1)
    X['feat_lda_std_dev']    = X[lda_cols].std(axis=1)
print("▶ Derived features done.")

# 7) 간단 전처리: 결측치 채우기
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = ['data_channel', 'weekday']
X[num_cols] = X[num_cols].fillna(X[num_cols].median())
X[cat_cols] = X[cat_cols].fillna('missing')

# 8) train/test 분할
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print(f"▶ Data split: TrainVal={X_trainval.shape}, Test={X_test.shape}")

# 9) 하이퍼파라미터 서치 설정
param_dist = {
    'learning_rate':       [0.01,0.03,0.05,0.07,0.1],
    'depth':               [4,6,8,10],
    'l2_leaf_reg':         [1,3,5,7,9,12],
    'border_count':        [32,64,128,254],
    'bagging_temperature': [0,0.5,1.0,1.5,2.0],
    'random_strength':     [0.1,0.5,1,2,5],
    'colsample_bylevel':   [0.6,0.7,0.8,0.9,1.0]
}

base_model = CatBoostClassifier(
    iterations=1000,
    random_state=42,
    verbose=0,
    task_type='CPU'
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print(f"▶ Starting RandomizedSearchCV (30 iters)...")
search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    n_iter=30,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
    refit=True
)
search.fit(X_trainval, y_trainval, cat_features=cat_cols)

# 10) 서치 결과 기록
best_params = search.best_params_
best_score  = search.best_score_
print(f"▶ Best params: {best_params}")
print(f"▶ Best CV ROC AUC: {best_score:.4f}")

# 11) 최종 모델 재학습 (early stopping)
print("▶ Training final model with early stopping...")
final_model = CatBoostClassifier(
    iterations=2000,
    learning_rate=best_params['learning_rate'],
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    border_count=best_params['border_count'],
    bagging_temperature=best_params['bagging_temperature'],
    random_strength=best_params['random_strength'],
    colsample_bylevel=best_params['colsample_bylevel'],
    eval_metric='AUC',
    early_stopping_rounds=50,
    use_best_model=True,
    random_state=42,
    verbose=100,
    task_type='CPU'
)
final_model.fit(
    X_trainval, y_trainval,
    cat_features=cat_cols,
    eval_set=[(X_test, y_test)]
)
best_iter = final_model.get_best_iteration()
print(f"▶ Best iteration: {best_iter}")

# 12) Feature importance 저장
fi = final_model.get_feature_importance(prettified=True)
fi.head(20).to_csv(FI_CSV, index=False)
print(f"▶ Feature importances saved to {FI_CSV}")

# 13) Hold-out 성능 평가
y_pred = final_model.predict(X_test)
y_prob = final_model.predict_proba(X_test)[:,1]
acc_test = accuracy_score(y_test, y_pred)
f1_test  = f1_score(y_test, y_pred)
auc_test = roc_auc_score(y_test, y_prob)
comp_test= (acc_test + f1_test + auc_test)/3

print("▶ Final hold-out performance:")
print(f"   Accuracy : {acc_test:.4f}")
print(f"   F1 Score : {f1_test:.4f}")
print(f"   ROC AUC  : {auc_test:.4f}")
print(f"   Composite: {comp_test:.4f}")

# 14) 모델 및 로그 저장
final_model.save_model(MODEL_FILE)
print(f"▶ Model saved to {MODEL_FILE}")

with open(LOG_FILE, 'w') as f:
    f.write(f"Best params: {best_params}\n")
    f.write(f"Best CV ROC AUC: {best_score:.4f}\n")
    f.write(f"Best iteration: {best_iter}\n")
    f.write(f"Test Accuracy: {acc_test:.4f}\n")
    f.write(f"Test F1 Score: {f1_test:.4f}\n")
    f.write(f"Test ROC AUC: {auc_test:.4f}\n")
    f.write(f"Test Composite: {comp_test:.4f}\n")

# 15) 전체 소요 시간 출력
end_time = time.time()
elapsed = end_time - start_time
hrs = int(elapsed // 3600)
mins= int((elapsed % 3600)//60)
secs= int(elapsed % 60)
print(f"▶ Total execution time: {hrs}h {mins}m {secs}s")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
▶ Creating derived features...
▶ Derived features done.
▶ Data split: TrainVal=(17760, 53), Test=(4440, 53)
▶ Starting RandomizedSearchCV (30 iters)...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
▶ Best params: {'random_strength': 0.5, 'learning_rate': 0.01, 'l2_leaf_reg': 1, 'depth': 8, 'colsample_bylevel': 0.9, 'border_count': 128, 'bagging_temperature': 2.0}
▶ Best CV ROC AUC: 0.7186
▶ Training final model with early stopping...
0:	test: 0.6835094	best: 0.6835094 (0)	total: 26ms	remaining: 52s
100:	test: 0.7143633	best: 0.7143633 (100)	total: 2.31s	remaining: 43.4s
200:	test: 0.7205572	best: 0.7205627 (198)	total: 4.63s	remaining: 41.5s
300:	test: 0.7233573	best: 0.7234466 (298)	total: 7.08s	remaining: 40s
400:	test: 0.7255667	best: 0.7255771 (399)	total: 9.37s	remaining: 37.4s
500:	test: 0.7270521	best: 0.7270766 (499)	total: 11.6s	remai