In [3]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, ParameterSampler, cross_validate
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone

# -----------------------
# 0) 설정 (여기만 바꾸면 됨)
# -----------------------
SEARCH_CONFIG = {
    "n_iter": 30,         # 탐색 횟수 (많을수록 오래 걸림)
    "cv": 5,              # 교차 검증 폴드 수
    "random_state": 42,
    "n_jobs": -1,
    "verbose": 1,
    "top_k_test": 20,     # Test셋 재검증할 상위 모델 개수
    
    # --- 파라미터 튜닝 공간 (F1, Recall 향상 목표) ---
    "params": {
        "rf__n_estimators": [200, 500, 1000],
        
        # 트리가 깊을수록 Recall 확보에 유리할 수 있음 (과적합 주의)
        "rf__max_depth": [10, 20, 30, None],
        
        # 불균형 데이터셋에 필수 (Recall/F1 중요 시 'balanced' 권장)
        "rf__class_weight": ["balanced", "balanced_subsample"],
        
        # 소수 클래스 포착을 위해 규제를 약간 줄이거나 조정
        "rf__min_samples_split": [2, 5, 10],
        "rf__min_samples_leaf": [1, 2, 4],
        
        "rf__max_features": ["sqrt", "log2"],
        "rf__bootstrap": [True],
        "rf__criterion": ["gini", "entropy"],
    }
}

# -----------------------
# 1) 데이터 로드 (사용자 경로 유지)
# -----------------------
train_df = pd.read_csv("../../../../data/processed/train.csv")
test_df  = pd.read_csv("../../../../data/processed/test.csv")

X_train = train_df.drop("Revenue", axis=1)
y_train = train_df["Revenue"].astype(int)

X_test  = test_df.drop("Revenue", axis=1)
y_test  = test_df["Revenue"].astype(int)

cat_cols = X_train.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
num_cols = X_train.select_dtypes(include=["int64", "float64", "int32", "float32"]).columns.tolist()

# -----------------------
# 2) 전처리 (원핫+스케일링)
# -----------------------
numeric_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

categorical_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False
)

# -----------------------
# 3) 모델 파이프라인
# -----------------------
rf = RandomForestClassifier(
    random_state=SEARCH_CONFIG["random_state"],
    n_jobs=SEARCH_CONFIG["n_jobs"],
)

pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("rf", rf),
])

# -----------------------
# 4) 파라미터 공간 할당
# -----------------------
# 위 설정(SEARCH_CONFIG["params"])을 그대로 사용
param_distributions = SEARCH_CONFIG["params"]

# -----------------------
# 5) Manual Search Loop (Metric per iteration)
# -----------------------
scoring = {
    "acc": "accuracy",
    "prec": "precision",
    "rec": "recall",
    "f1": "f1",
    "roc_auc": "roc_auc",
}

cv = StratifiedKFold(
    n_splits=SEARCH_CONFIG["cv"],
    shuffle=True,
    random_state=SEARCH_CONFIG["random_state"],
)

# Generate candidates
print(f"Generating {SEARCH_CONFIG['n_iter']} candidates...")
param_list = list(ParameterSampler(
    param_distributions, 
    n_iter=SEARCH_CONFIG["n_iter"], 
    random_state=SEARCH_CONFIG["random_state"]
))

results = []

print(f"{'Iter':<5} | {'F1':<7} | {'AUC':<7} | {'Acc':<7} | {'Time(s)':<7} | Params")
print("-" * 120)

for i, params in enumerate(param_list):
    # Set params (Using set_params with kwargs directly works for Pipeline steps if keys are prefixed correctly)
    pipe.set_params(**params)
    
    # Evaluate
    cv_res = cross_validate(
        pipe, X_train, y_train, 
        cv=cv, scoring=scoring, 
        n_jobs=SEARCH_CONFIG["n_jobs"],
        return_train_score=False
    )
    
    # Aggregation
    mean_fit_time = cv_res['fit_time'].mean()
    mean_score_time = cv_res['score_time'].mean()
    
    mean_test_acc = cv_res['test_acc'].mean()
    mean_test_prec = cv_res['test_prec'].mean()
    mean_test_rec = cv_res['test_rec'].mean()
    mean_test_f1 = cv_res['test_f1'].mean()
    std_test_f1 = cv_res['test_f1'].std()
    mean_test_roc_auc = cv_res['test_roc_auc'].mean()
    
    # Print status
    print(f"{i+1:<5} | {mean_test_f1:.4f}  | {mean_test_roc_auc:.4f}  | {mean_test_acc:.4f}  | {mean_fit_time:.2f}    | {params}")
    
    # Store
    row = {
        "mean_fit_time": mean_fit_time,
        "mean_score_time": mean_score_time,
        "mean_test_acc": mean_test_acc,
        "mean_test_prec": mean_test_prec,
        "mean_test_rec": mean_test_rec,
        "mean_test_f1": mean_test_f1,
        "std_test_f1": std_test_f1,
        "mean_test_roc_auc": mean_test_roc_auc,
    }
    # Add param_ prefix to match RandomizedSearchCV behavior for results
    for k, v in params.items():
        row[f"param_{k}"] = v
        
    results.append(row)

# -----------------------
# 6) CV Results DataFrame
# -----------------------
cv_results = pd.DataFrame(results)

if not cv_results.empty:
    cv_results["rank_test_f1"] = cv_results["mean_test_f1"].rank(ascending=False, method="min").astype(int)

    param_cols = [c for c in cv_results.columns if c.startswith("param_")]
    keep_cols = (
        ["rank_test_f1", "mean_fit_time"] +
        param_cols +
        ["mean_test_f1", "std_test_f1",
         "mean_test_prec", "mean_test_rec", "mean_test_acc", "mean_test_roc_auc"]
    )

    leaderboard_cv = (
        cv_results[keep_cols]
        .sort_values(["mean_test_f1", "mean_test_roc_auc"], ascending=False)
        .reset_index(drop=True)
    )

    # 표로 출력 (상위 30개)
    display(leaderboard_cv.head(30).style.format(precision=4))
else:
    print("No results found.")



Generating 30 candidates...
Iter  | F1      | AUC     | Acc     | Time(s) | Params
------------------------------------------------------------------------------------------------------------------------
1     | 0.6784  | 0.9332  | 0.9025  | 1.19    | {'rf__n_estimators': 200, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 2, 'rf__max_features': 'log2', 'rf__max_depth': None, 'rf__criterion': 'gini', 'rf__class_weight': 'balanced', 'rf__bootstrap': True}
2     | 0.6904  | 0.9343  | 0.8962  | 0.95    | {'rf__n_estimators': 200, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 4, 'rf__max_features': 'sqrt', 'rf__max_depth': 20, 'rf__criterion': 'entropy', 'rf__class_weight': 'balanced_subsample', 'rf__bootstrap': True}
3     | 0.6823  | 0.9339  | 0.9027  | 5.65    | {'rf__n_estimators': 1000, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2, 'rf__max_features': 'log2', 'rf__max_depth': None, 'rf__criterion': 'gini', 'rf__class_weight': 'balanced_subsample', 'rf__bootstrap': 

Unnamed: 0,rank_test_f1,mean_fit_time,param_rf__n_estimators,param_rf__min_samples_split,param_rf__min_samples_leaf,param_rf__max_features,param_rf__max_depth,param_rf__criterion,param_rf__class_weight,param_rf__bootstrap,mean_test_f1,std_test_f1,mean_test_prec,mean_test_rec,mean_test_acc,mean_test_roc_auc
0,1,3.4104,500,5,4,sqrt,30.0,entropy,balanced_subsample,True,0.6921,0.0162,0.6452,0.7477,0.8969,0.9349
1,2,0.9501,200,2,4,sqrt,20.0,entropy,balanced_subsample,True,0.6904,0.0204,0.6431,0.7464,0.8962,0.9343
2,3,5.4646,1000,10,4,log2,30.0,gini,balanced_subsample,True,0.6895,0.0152,0.6327,0.7589,0.8941,0.9335
3,4,3.5543,500,10,2,log2,20.0,entropy,balanced_subsample,True,0.6884,0.0126,0.6532,0.7287,0.8978,0.9344
4,5,1.4445,200,10,1,log2,,entropy,balanced,True,0.6882,0.0121,0.6715,0.7071,0.9008,0.9336
5,6,1.2291,200,10,2,log2,20.0,gini,balanced,True,0.6876,0.0168,0.6514,0.7294,0.8973,0.9331
6,7,1.2964,200,10,2,sqrt,,entropy,balanced,True,0.686,0.0125,0.6601,0.7156,0.8985,0.934
7,8,8.4552,1000,5,2,sqrt,20.0,entropy,balanced_subsample,True,0.6853,0.0114,0.691,0.6809,0.9032,0.9348
8,9,1.387,200,5,2,log2,,entropy,balanced_subsample,True,0.6845,0.0119,0.6946,0.6756,0.9036,0.9334
9,10,3.4771,500,5,2,sqrt,20.0,gini,balanced,True,0.6842,0.0117,0.6898,0.6796,0.9029,0.9344


In [2]:
def eval_on_test(fitted_pipe, X_te, y_te):
    proba = fitted_pipe.predict_proba(X_te)[:, 1]
    pred = (proba >= 0.5).astype(int)
    return {
        "Test_Acc": accuracy_score(y_te, pred),
        "Test_Prec": precision_score(y_te, pred, zero_division=0),
        "Test_Rec": recall_score(y_te, pred, zero_division=0),
        "Test_F1": f1_score(y_te, pred, zero_division=0),
        "Test_AUC": roc_auc_score(y_te, proba),
    }

top_k = SEARCH_CONFIG["top_k_test"]
top_params = leaderboard_cv.head(top_k)[param_cols].to_dict(orient="records")

test_rows = []
base = clone(pipe)

for i, p in enumerate(top_params, 1):
    # RandomizedSearchCV의 param_ 컬럼은 이미 "rf__..." 형태라 그대로 set_params 가능
    fitted = clone(base).set_params(**p).fit(X_train, y_train)
    metrics = eval_on_test(fitted, X_test, y_test)
    row = {"CV_rank_f1": int(leaderboard_cv.loc[i-1, "rank_test_f1"]), **metrics}
    # 파라미터도 같이 붙임(원하면 축소 가능)
    row.update({k.replace("param_", ""): leaderboard_cv.loc[i-1, k] for k in param_cols})
    row.update({
        "CV_mean_f1": float(leaderboard_cv.loc[i-1, "mean_test_f1"]),
        "CV_mean_auc": float(leaderboard_cv.loc[i-1, "mean_test_roc_auc"]),
    })
    test_rows.append(row)

leaderboard_test = pd.DataFrame(test_rows).sort_values(["Test_F1", "Test_AUC"], ascending=False)
leaderboard_test.head(top_k).style.format(precision=4)


ValueError: Invalid parameter 'param_rf' for estimator Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['row_id', 'Administrative',
                                                   'Administrative_Duration',
                                                   'Informational',
                                                   'Informational_Duration',
                                                   'ProductRelated',
                                                   'ProductRelated_Duration',
                                                   'BounceRates', 'ExitRates',
                                                   'PageValues', 'SpecialDay',
                                                   'OperatingSystems',
                                                   'Browser', 'Region',
                                                   'TrafficType']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                                  ['Month', 'VisitorType',
                                                   'Weekend'])],
                                   verbose_feature_names_out=False)),
                ('rf', RandomForestClassifier(n_jobs=-1, random_state=42))]). Valid parameters are: ['memory', 'steps', 'transform_input', 'verbose'].