# 5 — Tuning: RandomizedSearchCV

Notebook này tuning các model mạnh (RF/ET/HGB/XGB/LGBM nếu có).

**Phụ thuộc:** đã chạy `app/04_baseline_oof.ipynb`.


## Tuning: RandomizedSearchCV (sau khi baseline xong)

In [64]:
# =========================
# GLOBAL SETUP (1 cell dùng chung cho mọi model tuning)
# =========================

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import precision_recall_curve

# ---- CONFIG ----
SEED = 42
N_ITER = 25  # 20–30 hợp lý cho máy cá nhân

cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# ---- SEARCH SCORING (multi-metric) ----
SCORING_OPT = {"pr_auc": "average_precision", "roc_auc": "roc_auc"}
REFIT_OPT = "pr_auc"

print(f"SEED={SEED} | N_ITER={N_ITER} | CV=5 | scoring={list(SCORING_OPT.keys())} | refit={REFIT_OPT}")

# ---- STORAGE (nếu chưa có) ----
if "best_params" not in globals():
    best_params = {}
if "tuned_results" not in globals():
    tuned_results = []

# ---- THRESHOLD PICKER (stable) ----
def pick_threshold_by_target_recall(
    y_true,
    scores,
    target_recall=0.85,
    min_precision=0.30
):
    """
    Chọn threshold ổn định:
    - Ưu tiên đạt recall >= target_recall
    - Đồng thời precision >= min_precision
    - Tránh threshold quá thấp (all-1)
    """
    prec, rec, thr = precision_recall_curve(y_true, scores)

    if len(thr) == 0:
        return 0.5

    prec2, rec2, thr2 = prec[:-1], rec[:-1], thr
    mask = (rec2 >= target_recall) & (prec2 >= min_precision)

    if mask.any():
        return float(thr2[mask][-1])

    idx = np.argmin(np.abs(rec2 - target_recall))
    return float(thr2[idx])

# ---- TEMPLATE SEARCH + OOF + THRESHOLD + METRICS ----
def run_search_oof_threshold(
    model_name: str,
    base_estimator,
    param_space: dict,      # param_distributions
    use_sampler: bool,
    X_train,
    y_train,
    cv,
    n_iter: int = None,
    target_recall: float = 0.85,
    min_precision: float = 0.30,
    topk: int = 10,
    verbose: int = 1,
    random_state: int = None
):
    """
    Template CHUNG cho mọi model:
    1) RandomizedSearchCV (PR-AUC optimize + log ROC-AUC)
    2) Fit lại best
    3) OOF scores
    4) Pick threshold theo target recall
    5) Compute metrics + append tuned_results
    """

    if n_iter is None:
        n_iter = N_ITER
    if random_state is None:
        random_state = SEED

    pipe = build_pipeline(base_estimator, use_sampler=use_sampler)

    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_space,
        n_iter=n_iter,
        scoring=SCORING_OPT,
        refit=REFIT_OPT,
        cv=cv,
        n_jobs=-1,
        verbose=verbose,
        random_state=random_state,
        return_train_score=False
    )

    search.fit(X_train, y_train)
    best_params[model_name] = search.best_params_

    res = pd.DataFrame(search.cv_results_)
    best_row = res[res["rank_test_pr_auc"] == 1].iloc[0]

    print(f"\n===== {model_name} | RANDOM SEARCH DONE =====")
    print(
        "Best CV:",
        "PR-AUC =", round(float(best_row["mean_test_pr_auc"]), 5),
        "| ROC-AUC =", round(float(best_row["mean_test_roc_auc"]), 5)
    )

    cols_show = [
        "rank_test_pr_auc", "mean_test_pr_auc", "std_test_pr_auc",
        "mean_test_roc_auc", "std_test_roc_auc", "params"
    ]
    display(res.sort_values("rank_test_pr_auc").head(topk)[cols_show])

    # ----- OOF + threshold + metrics -----
    tuned_estimator = base_estimator.__class__(**base_estimator.get_params())
    tuned_pipe = build_pipeline(tuned_estimator, use_sampler=use_sampler)
    tuned_pipe.set_params(**best_params[model_name])

    oof_scores = get_oof_scores(tuned_pipe, X_train, y_train, cv)

    thr = pick_threshold_by_target_recall(
        y_train, oof_scores,
        target_recall=target_recall,
        min_precision=min_precision
    )

    m, cm = compute_metrics(y_train, oof_scores, thr)
    tn, fp, fn, tp = cm.ravel()

    print("Chosen threshold =", round(thr, 4))
    print("OOF ROC-AUC =", round(m["roc_auc"], 4), "| PR-AUC =", round(m["pr_auc"], 4))
    print("Score quantiles:", np.quantile(oof_scores, [0.01, 0.05, 0.1, 0.2, 0.5]))

    tuned_results.append({
        "model": model_name,
        "phase": "tuned",
        "status": "OK",
        "thr": thr,
        **m
    })

    display(pd.DataFrame([{**m, "TP": tp, "FP": fp, "TN": tn, "FN": fn}]).round(4))

    return {
        "search": search,
        "cv_results": res,
        "oof_scores": oof_scores,
        "thr": thr,
        "metrics": m,
        "cm": cm
    }


SEED=42 | N_ITER=25 | CV=5 | scoring=['pr_auc', 'roc_auc'] | refit=pr_auc


### RF — Random Forest (Tuning, RandomizedSearchCV)

**Tuning Random Forest (RF)**

Ở bước này, Random Forest được tuning hyperparameters để cải thiện hiệu năng so với baseline trước khi so sánh lại với các mô hình khác.

- **RandomizedSearchCV** được dùng để thử ngẫu nhiên nhiều cấu hình hyperparameter với chi phí tính toán hợp lý.
- **PR-AUC (average precision)** được sử dụng làm metric trong tuning vì dữ liệu mất cân bằng và lớp quan trọng là **CHD = 1**.
- **Không sử dụng oversampling** cho Random Forest vì đây là mô hình tree-based và đã xử lý imbalance thông qua `class_weight`.
- Sau tuning, mô hình được đánh giá lại bằng **OOF-CV** để:
  - tính ROC-AUC, PR-AUC
  - chọn threshold ưu tiên recall
  - phân tích confusion matrix và ROC curve

**Mục tiêu:** so sánh RF (tuned) với các mô hình khác để chọn ra mô hình tốt nhất cho bước calibration và final evaluation.


In [65]:
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd

# -------------------------------------------------
# RF: param space
# -------------------------------------------------
rf_param_dist = {
    "model__n_estimators": randint(400, 900),
    "model__max_depth": randint(18, 32),
    "model__min_samples_leaf": randint(1, 6),
    "model__max_features": ["sqrt", "log2", None],
}

# đảm bảo best_params có tồn tại
if "best_params" not in globals():
    best_params = {}

# -------------------------------------------------
# RF: RandomizedSearchCV (opt PR-AUC, log ROC-AUC)
# -------------------------------------------------
rf_base = RandomForestClassifier(
    class_weight="balanced",
    random_state=SEED,
    n_jobs=-1
)

rf_pipe_for_search = build_pipeline(rf_base, use_sampler=False)

SCORING_OPT = {"pr_auc": "average_precision", "roc_auc": "roc_auc"}
REFIT_OPT = "pr_auc"

search_rf = RandomizedSearchCV(
    estimator=rf_pipe_for_search,
    param_distributions=rf_param_dist,
    n_iter=N_ITER,                 # hoặc ghi trực tiếp: 25
    scoring=SCORING_OPT,
    refit=REFIT_OPT,
    cv=cv5,
    n_jobs=-1,
    random_state=SEED,
    verbose=1,
    return_train_score=False
)

search_rf.fit(X_train, y_train)

best_params["RF"] = search_rf.best_params_

# -------------------------------------------------
# Report: best CV + top10
# -------------------------------------------------
res_rf = pd.DataFrame(search_rf.cv_results_)

print(f"Total candidates tried = {len(res_rf)} | Total fits ≈ {len(res_rf) * cv5.get_n_splits()}")

best_row_rf = res_rf.loc[res_rf["rank_test_pr_auc"] == 1].iloc[0]

print(
    "Best CV:",
    "PR-AUC =", round(float(best_row_rf["mean_test_pr_auc"]), 5),
    "| ROC-AUC =", round(float(best_row_rf["mean_test_roc_auc"]), 5)
)

cols_show = [
    "rank_test_pr_auc", "mean_test_pr_auc", "std_test_pr_auc",
    "mean_test_roc_auc", "std_test_roc_auc", "params"
]

print("\nTop 10 configs (by PR-AUC):")
display(res_rf.sort_values("rank_test_pr_auc").head(10)[cols_show])


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Total candidates tried = 25 | Total fits ≈ 125
Best CV: PR-AUC = 0.84599 | ROC-AUC = 0.93863

Top 10 configs (by PR-AUC):


Unnamed: 0,rank_test_pr_auc,mean_test_pr_auc,std_test_pr_auc,mean_test_roc_auc,std_test_roc_auc,params
5,1,0.845988,0.018197,0.938627,0.004478,"{'model__max_depth': 29, 'model__max_features'..."
24,2,0.844997,0.018982,0.938297,0.004476,"{'model__max_depth': 30, 'model__max_features'..."
23,3,0.844914,0.01865,0.938655,0.004585,"{'model__max_depth': 29, 'model__max_features'..."
22,4,0.840364,0.017965,0.93606,0.004388,"{'model__max_depth': 22, 'model__max_features'..."
1,5,0.803866,0.022823,0.920462,0.004026,"{'model__max_depth': 30, 'model__max_features'..."
19,6,0.803745,0.023281,0.920363,0.004257,"{'model__max_depth': 29, 'model__max_features'..."
12,7,0.803644,0.023368,0.920144,0.004546,"{'model__max_depth': 29, 'model__max_features'..."
17,8,0.80359,0.023658,0.920042,0.004622,"{'model__max_depth': 31, 'model__max_features'..."
13,9,0.803337,0.023422,0.920222,0.004137,"{'model__max_depth': 27, 'model__max_features'..."
4,10,0.803096,0.02376,0.919743,0.004774,"{'model__max_depth': 25, 'model__max_features'..."


In [66]:
print("Best params =", best_params["RF"])

Best params = {'model__max_depth': 29, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__n_estimators': 859}


### ET — Extra Trees (Tuning, RandomizedSearchCV)

In [67]:
from scipy.stats import randint
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import numpy as np

# -------------------------------------------------
# ET: param space
# -------------------------------------------------
et_param_dist = {
    "model__n_estimators": randint(200, 900),
    "model__max_depth": randint(3, 31),
    "model__min_samples_leaf": randint(1, 11),
    "model__max_features": ["sqrt", "log2", None],
}

# đảm bảo best_params / tuned_results có tồn tại
if "best_params" not in globals():
    best_params = {}
if "tuned_results" not in globals():
    tuned_results = []

# -------------------------------------------------
# ET: RandomizedSearchCV (opt PR-AUC, log ROC-AUC)
# -------------------------------------------------
et_base = ExtraTreesClassifier(
    class_weight="balanced",
    random_state=SEED,
    n_jobs=-1
)

et_pipe_for_search = build_pipeline(et_base, use_sampler=False)

SCORING_OPT = {"pr_auc": "average_precision", "roc_auc": "roc_auc"}
REFIT_OPT = "pr_auc"

search_et = RandomizedSearchCV(
    estimator=et_pipe_for_search,
    param_distributions=et_param_dist,
    n_iter=N_ITER,                 # muốn nhẹ hơn thì giảm (vd 15/20)
    scoring=SCORING_OPT,
    refit=REFIT_OPT,
    cv=cv5,
    n_jobs=-1,
    random_state=SEED,
    verbose=1,
    return_train_score=False
)

search_et.fit(X_train, y_train)

best_params["ET"] = search_et.best_params_

# -------------------------------------------------
# Report: best CV + top10
# -------------------------------------------------
res_et = pd.DataFrame(search_et.cv_results_)
print(f"Total candidates tried = {len(res_et)} | Total fits ≈ {len(res_et) * cv5.get_n_splits()}")

best_row_et = res_et.loc[res_et["rank_test_pr_auc"] == 1].iloc[0]
print(
    "Best CV:",
    "PR-AUC =", round(float(best_row_et["mean_test_pr_auc"]), 5),
    "| ROC-AUC =", round(float(best_row_et["mean_test_roc_auc"]), 5)
)

cols_show = [
    "rank_test_pr_auc", "mean_test_pr_auc", "std_test_pr_auc",
    "mean_test_roc_auc", "std_test_roc_auc", "params"
]
print("\nTop 10 configs (by PR-AUC):")
display(res_et.sort_values("rank_test_pr_auc").head(10)[cols_show])

# -------------------------------------------------
# ET: OOF + threshold + metrics
# -------------------------------------------------
et_tuned = ExtraTreesClassifier(
    class_weight="balanced",
    random_state=SEED,
    n_jobs=-1
)

et_pipe_tuned = build_pipeline(et_tuned, use_sampler=False)
et_pipe_tuned.set_params(**best_params["ET"])

et_oof_t = get_oof_scores(et_pipe_tuned, X_train, y_train, cv5)

et_thr_t = pick_threshold_by_target_recall(
    y_train,
    et_oof_t,
    target_recall=0.85,
    min_precision=0.30
)

et_m_t, et_cm_t = compute_metrics(y_train, et_oof_t, et_thr_t)
tn, fp, fn, tp = et_cm_t.ravel()

et_m_t_full = {**et_m_t, "TP": tp, "FP": fp, "TN": tn, "FN": fn}

tuned_results.append({
    "model": "ET",
    "phase": "tuned",
    "status": "OK",
    "thr": et_thr_t,
    **et_m_t
})

print("\nChosen threshold =", round(et_thr_t, 4))
print(
    "OOF ROC-AUC =", round(et_m_t["roc_auc"], 4),
    "| PR-AUC =", round(et_m_t["pr_auc"], 4)
)
print("Score quantiles:", np.quantile(et_oof_t, [0.01, 0.05, 0.1, 0.2, 0.5]))

print("\nOOF metrics + confusion:")
display(pd.DataFrame([et_m_t_full]).round(4))

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Total candidates tried = 25 | Total fits ≈ 125
Best CV: PR-AUC = 0.85672 | ROC-AUC = 0.94566

Top 10 configs (by PR-AUC):


Unnamed: 0,rank_test_pr_auc,mean_test_pr_auc,std_test_pr_auc,mean_test_roc_auc,std_test_roc_auc,params
5,1,0.85672,0.012424,0.945664,0.004941,"{'model__max_depth': 30, 'model__max_features'..."
8,2,0.812645,0.0182,0.917253,0.006053,"{'model__max_depth': 30, 'model__max_features'..."
10,3,0.762781,0.020798,0.898432,0.007304,"{'model__max_depth': 21, 'model__max_features'..."
22,4,0.759784,0.024308,0.891237,0.007457,"{'model__max_depth': 17, 'model__max_features'..."
24,5,0.756035,0.032565,0.890607,0.009813,"{'model__max_depth': 13, 'model__max_features'..."
11,6,0.729645,0.021434,0.885247,0.005435,"{'model__max_depth': 20, 'model__max_features'..."
18,7,0.681366,0.026968,0.87543,0.008,"{'model__max_depth': 18, 'model__max_features'..."
13,8,0.681252,0.027133,0.875946,0.008405,"{'model__max_depth': 17, 'model__max_features'..."
9,9,0.643818,0.023793,0.857893,0.007345,"{'model__max_depth': 22, 'model__max_features'..."
2,10,0.642442,0.029475,0.861814,0.008863,"{'model__max_depth': 13, 'model__max_features'..."



Chosen threshold = 0.1801
OOF ROC-AUC = 0.9456 | PR-AUC = 0.8565
Score quantiles: [0.         0.00455235 0.00781249 0.01696104 0.06069803]

OOF metrics + confusion:


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier,TP,FP,TN,FN
0,0.9456,0.8565,0.8507,0.5863,0.6942,0.8912,0.8709,0.0547,1111,784,6421,195


### HGB — HistGradientBoosting (Tuning, RandomizedSearchCV)

In [68]:
from scipy.stats import randint, loguniform
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import numpy as np

# đảm bảo best_params / tuned_results có tồn tại
if "best_params" not in globals():
    best_params = {}
if "tuned_results" not in globals():
    tuned_results = []

# -------------------------------------------------
# HGB: param space
# -------------------------------------------------
HGB_N_ITER = 80  # 60/80/120 tuỳ máy

hgb_param_dist = {
    "model__max_iter": randint(200, 1200),
    "model__learning_rate": loguniform(1e-3, 0.2),
    "model__max_leaf_nodes": randint(15, 255),
    "model__max_depth": [None, 3, 5, 7, 9],
    "model__min_samples_leaf": randint(10, 300),
    "model__l2_regularization": loguniform(1e-8, 1e-1),
    "model__max_bins": randint(64, 256),
}

# -------------------------------------------------
# HGB: RandomizedSearchCV (opt PR-AUC, log ROC-AUC)
# -------------------------------------------------
hgb_base = HistGradientBoostingClassifier(random_state=SEED)
hgb_pipe_for_search = build_pipeline(hgb_base, use_sampler=False)

SCORING_OPT = {"pr_auc": "average_precision", "roc_auc": "roc_auc"}
REFIT_OPT = "pr_auc"

search_hgb = RandomizedSearchCV(
    estimator=hgb_pipe_for_search,
    param_distributions=hgb_param_dist,
    n_iter=HGB_N_ITER,
    scoring=SCORING_OPT,
    refit=REFIT_OPT,
    cv=cv5,
    n_jobs=-1,
    random_state=SEED,
    verbose=2,
    return_train_score=False
)

search_hgb.fit(X_train, y_train)

best_params["HGB"] = search_hgb.best_params_

# -------------------------------------------------
# Report: best CV + top10
# -------------------------------------------------
res_hgb = pd.DataFrame(search_hgb.cv_results_)
print(f"Total candidates tried = {len(res_hgb)} | Total fits ≈ {len(res_hgb) * cv5.get_n_splits()}")

best_row_hgb = res_hgb.loc[res_hgb["rank_test_pr_auc"] == 1].iloc[0]
print(
    "Best CV:",
    "PR-AUC =", round(float(best_row_hgb["mean_test_pr_auc"]), 5),
    "| ROC-AUC =", round(float(best_row_hgb["mean_test_roc_auc"]), 5)
)

cols_show = [
    "rank_test_pr_auc","mean_test_pr_auc","std_test_pr_auc",
    "mean_test_roc_auc","std_test_roc_auc","params"
]
print("\nTop 10 configs (by PR-AUC):")
display(res_hgb.sort_values("rank_test_pr_auc").head(10)[cols_show])

# -------------------------------------------------
# HGB: OOF + threshold + metrics
# -------------------------------------------------
hgb_tuned = HistGradientBoostingClassifier(random_state=SEED)
hgb_pipe_tuned = build_pipeline(hgb_tuned, use_sampler=False)
hgb_pipe_tuned.set_params(**best_params["HGB"])

hgb_oof_t = get_oof_scores(hgb_pipe_tuned, X_train, y_train, cv5)

hgb_thr_t = pick_threshold_by_target_recall(
    y_train,
    hgb_oof_t,
    target_recall=0.85,
    min_precision=0.30
)

hgb_m_t, hgb_cm_t = compute_metrics(y_train, hgb_oof_t, hgb_thr_t)
tn, fp, fn, tp = hgb_cm_t.ravel()

hgb_m_t_full = {**hgb_m_t, "TP": tp, "FP": fp, "TN": tn, "FN": fn}

tuned_results.append({
    "model": "HGB",
    "phase": "tuned",
    "status": "OK",
    "thr": hgb_thr_t,
    **hgb_m_t
})

print("\nChosen threshold =", round(hgb_thr_t, 4))
print(
    "OOF ROC-AUC =", round(hgb_m_t["roc_auc"], 4),
    "| PR-AUC =", round(hgb_m_t["pr_auc"], 4)
)
print("Score quantiles:", np.quantile(hgb_oof_t, [0.01, 0.05, 0.1, 0.2, 0.5]))

print("\nOOF metrics + confusion:")
display(pd.DataFrame([hgb_m_t_full]).round(4))


Fitting 5 folds for each of 80 candidates, totalling 400 fits
Total candidates tried = 80 | Total fits ≈ 400
Best CV: PR-AUC = 0.83233 | ROC-AUC = 0.92551

Top 10 configs (by PR-AUC):


Unnamed: 0,rank_test_pr_auc,mean_test_pr_auc,std_test_pr_auc,mean_test_roc_auc,std_test_roc_auc,params
61,1,0.832329,0.016735,0.925508,0.005671,{'model__l2_regularization': 0.002205722945374...
25,2,0.824657,0.024508,0.920632,0.009088,{'model__l2_regularization': 2.30837413695962e...
36,3,0.817935,0.023963,0.91745,0.009483,{'model__l2_regularization': 0.019044598430332...
26,4,0.788429,0.029681,0.908485,0.006459,{'model__l2_regularization': 5.34727401833469e...
27,5,0.771296,0.024381,0.898214,0.008939,{'model__l2_regularization': 7.369993416775703...
45,6,0.769131,0.024255,0.897353,0.008837,{'model__l2_regularization': 0.000101402178224...
66,7,0.754743,0.027107,0.89194,0.010217,{'model__l2_regularization': 5.0433425224965e-...
14,8,0.734688,0.028036,0.889913,0.008579,{'model__l2_regularization': 0.030486400425112...
3,9,0.719229,0.026646,0.883661,0.006002,{'model__l2_regularization': 0.088202504091702...
42,10,0.717442,0.029442,0.886741,0.006691,{'model__l2_regularization': 0.000261800445943...



Chosen threshold = 0.0021
OOF ROC-AUC = 0.9253 | PR-AUC = 0.8316
Score quantiles: [3.32573365e-08 2.71249175e-07 9.33194160e-07 4.92465674e-06
 9.70717418e-05]

OOF metrics + confusion:


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier,TP,FP,TN,FN
0,0.9253,0.8316,0.8507,0.5486,0.6671,0.8731,0.8619,0.0599,1111,914,6291,195


### XGB — XGBoost (Tuning, RandomizedSearchCV)

In [69]:
from scipy.stats import randint, uniform, loguniform
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import numpy as np

# đảm bảo best_params / tuned_results có tồn tại
if "best_params" not in globals():
    best_params = {}
if "tuned_results" not in globals():
    tuned_results = []

if not HAS_XGB:
    print("XGB: package not installed -> Skipped tuning")
    tuned_results.append({"model": "XGB", "phase": "tuned", "status": "Skipped", "thr": np.nan})
else:
    # -------------------------------------------------
    # XGB: param space
    # -------------------------------------------------
    XGB_N_ITER = 60  # 30/60/100 tuỳ máy

    xgb_param_dist = {
        "model__n_estimators": randint(300, 1200),
        "model__max_depth": randint(2, 8),
        "model__learning_rate": loguniform(1e-3, 0.2),
        "model__subsample": uniform(0.6, 0.4),
        "model__colsample_bytree": uniform(0.6, 0.4),
        "model__min_child_weight": randint(1, 15),
        "model__gamma": loguniform(1e-8, 1.0),
    }

    # -------------------------------------------------
    # XGB: RandomizedSearchCV (opt PR-AUC, log ROC-AUC)
    # -------------------------------------------------
    xgb_base = XGBClassifier(
        random_state=SEED,
        n_jobs=-1,
        eval_metric="logloss",
        tree_method="hist"   # đổi "gpu_hist" nếu bạn setup GPU OK
    )

    xgb_pipe_for_search = build_pipeline(xgb_base, use_sampler=False)

    SCORING_OPT = {"pr_auc": "average_precision", "roc_auc": "roc_auc"}
    REFIT_OPT = "pr_auc"

    search_xgb = RandomizedSearchCV(
        estimator=xgb_pipe_for_search,
        param_distributions=xgb_param_dist,
        n_iter=XGB_N_ITER,
        scoring=SCORING_OPT,
        refit=REFIT_OPT,
        cv=cv5,
        n_jobs=-1,
        random_state=SEED,
        verbose=1,
        return_train_score=False
    )

    search_xgb.fit(X_train, y_train)

    best_params["XGB"] = search_xgb.best_params_

    # -------------------------------------------------
    # Report: best CV + top10
    # -------------------------------------------------
    res_xgb = pd.DataFrame(search_xgb.cv_results_)
    print(f"Total candidates tried = {len(res_xgb)} | Total fits ≈ {len(res_xgb) * cv5.get_n_splits()}")

    best_row_xgb = res_xgb.loc[res_xgb["rank_test_pr_auc"] == 1].iloc[0]
    print(
        "Best CV:",
        "PR-AUC =", round(float(best_row_xgb["mean_test_pr_auc"]), 5),
        "| ROC-AUC =", round(float(best_row_xgb["mean_test_roc_auc"]), 5)
    )

    cols_show = [
        "rank_test_pr_auc", "mean_test_pr_auc", "std_test_pr_auc",
        "mean_test_roc_auc", "std_test_roc_auc", "params"
    ]
    print("\nTop 10 configs (by PR-AUC):")
    display(res_xgb.sort_values("rank_test_pr_auc").head(10)[cols_show])

    # -------------------------------------------------
    # XGB: OOF + threshold + metrics
    # -------------------------------------------------
    xgb_tuned = XGBClassifier(
        random_state=SEED,
        n_jobs=-1,
        eval_metric="logloss",
        tree_method="hist"
    )

    xgb_pipe_tuned = build_pipeline(xgb_tuned, use_sampler=False)
    xgb_pipe_tuned.set_params(**best_params["XGB"])

    xgb_oof_t = get_oof_scores(xgb_pipe_tuned, X_train, y_train, cv5)

    xgb_thr_t = pick_threshold_by_target_recall(
        y_train,
        xgb_oof_t,
        target_recall=0.85,
        min_precision=0.30
    )

    xgb_m_t, xgb_cm_t = compute_metrics(y_train, xgb_oof_t, xgb_thr_t)
    tn, fp, fn, tp = xgb_cm_t.ravel()

    xgb_m_t_full = {**xgb_m_t, "TP": tp, "FP": fp, "TN": tn, "FN": fn}

    tuned_results.append({
        "model": "XGB",
        "phase": "tuned",
        "status": "OK",
        "thr": xgb_thr_t,
        **xgb_m_t
    })

    print("\nChosen threshold =", round(xgb_thr_t, 4))
    print(
        "OOF ROC-AUC =", round(xgb_m_t["roc_auc"], 4),
        "| PR-AUC =", round(xgb_m_t["pr_auc"], 4)
    )
    print("Score quantiles:", np.quantile(xgb_oof_t, [0.01, 0.05, 0.1, 0.2, 0.5]))

    print("\nOOF metrics + confusion:")
    display(pd.DataFrame([xgb_m_t_full]).round(4))


Fitting 5 folds for each of 60 candidates, totalling 300 fits
Total candidates tried = 60 | Total fits ≈ 300
Best CV: PR-AUC = 0.72524 | ROC-AUC = 0.88629

Top 10 configs (by PR-AUC):


Unnamed: 0,rank_test_pr_auc,mean_test_pr_auc,std_test_pr_auc,mean_test_roc_auc,std_test_roc_auc,params
33,1,0.725237,0.030694,0.886294,0.009698,{'model__colsample_bytree': 0.8801431319891084...
0,2,0.698612,0.027046,0.878474,0.007048,"{'model__colsample_bytree': 0.749816047538945,..."
38,3,0.685155,0.021021,0.868738,0.011461,{'model__colsample_bytree': 0.7001847274422337...
8,4,0.684738,0.028252,0.869814,0.012135,{'model__colsample_bytree': 0.8270801311279966...
29,5,0.675251,0.025557,0.867005,0.009082,{'model__colsample_bytree': 0.6880964190262193...
26,6,0.670353,0.029495,0.865659,0.011622,{'model__colsample_bytree': 0.6579579488364892...
45,7,0.663246,0.030633,0.866375,0.008828,{'model__colsample_bytree': 0.9394679179698697...
23,8,0.628941,0.028284,0.848809,0.010636,{'model__colsample_bytree': 0.6673164168691722...
43,9,0.611834,0.030121,0.83986,0.013634,{'model__colsample_bytree': 0.6677970986744369...
20,10,0.607211,0.028371,0.844214,0.011507,{'model__colsample_bytree': 0.9570235993959911...



Chosen threshold = 0.0448
OOF ROC-AUC = 0.8862 | PR-AUC = 0.7248
Score quantiles: [0.0001246  0.0005687  0.00130535 0.00345051 0.02136761]

OOF metrics + confusion:


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier,TP,FP,TN,FN
0,0.8862,0.7248,0.8507,0.3571,0.5031,0.7224,0.7866,0.0719,1111,2000,5205,195


### LGBM — LightGBM (Tuning, RandomizedSearchCV)

In [70]:
# =========================
# LGBM (ONE CELL ALL-IN-ONE):
# - RandomizedSearchCV tuning (optimize ROC-AUC, still report PR-AUC)
# - Print Top configs (by ROC-AUC & PR-AUC)
# - OOF scores + pick threshold (target recall/min precision)
# - Print: chosen threshold, OOF AUC/PR-AUC, score quantiles
# - Show OOF metrics + confusion table
# =========================

import numpy as np
import pandas as pd
from scipy.stats import randint, uniform, loguniform
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from lightgbm import LGBMClassifier

# ---- helper: show top configs ----
def show_top_configs(search, topk=10, metric="roc_auc"):
    df = pd.DataFrame(search.cv_results_)
    sort_col = f"mean_test_{metric}" if f"mean_test_{metric}" in df.columns else "mean_test_score"
    std_col  = f"std_test_{metric}"  if f"std_test_{metric}"  in df.columns else "std_test_score"
    rank_col = f"rank_test_{metric}" if f"rank_test_{metric}" in df.columns else "rank_test_score"

    base = df[[rank_col, sort_col, std_col, "params"]].sort_values(sort_col, ascending=False).head(topk).copy()
    params_df = pd.json_normalize(base["params"])
    out = pd.concat([base.drop(columns=["params"]).reset_index(drop=True),
                     params_df.reset_index(drop=True)], axis=1)
    display(out.round(5))

# ---- config ----
LGBM_N_ITER = 80
cv_lgbm = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

pos = int((y_train == 1).sum())
neg = int((y_train == 0).sum())
spw = neg / max(pos, 1)

# ---- model + pipeline (dùng helper của notebook) ----
lgbm_base = LGBMClassifier(
    objective="binary",
    random_state=SEED,
    n_jobs=1,
    force_col_wise=True,
    verbose=-1
)
lgbm_pipe = build_pipeline(lgbm_base, use_sampler=False)

# ---- param space (ổn định hơn để giảm "No further splits...") ----
lgbm_param_dist = {
    "model__n_estimators": randint(400, 2000),
    "model__learning_rate": loguniform(3e-3, 1e-1),
    "model__num_leaves": randint(16, 255),
    "model__max_depth": randint(2, 12),

    "model__min_child_samples": randint(5, 80),
    "model__min_child_weight": loguniform(1e-3, 10.0),

    "model__subsample": uniform(0.6, 0.4),
    "model__subsample_freq": randint(0, 10),
    "model__colsample_bytree": uniform(0.6, 0.4),

    "model__reg_alpha": loguniform(1e-8, 1e-1),
    "model__reg_lambda": loguniform(1e-8, 10.0),

    "model__scale_pos_weight": [1.0, spw*0.75, spw, spw*1.25],
}

# ---- tune: ưu tiên ROC-AUC ----
search_lgbm = RandomizedSearchCV(
    estimator=lgbm_pipe,
    param_distributions=lgbm_param_dist,
    n_iter=LGBM_N_ITER,
    scoring={"roc_auc": "roc_auc", "pr_auc": "average_precision"},
    refit="roc_auc",
    cv=cv_lgbm,
    n_jobs=-1,
    random_state=SEED,
    verbose=2,
    return_train_score=False
)

search_lgbm.fit(X_train, y_train)

print(f"\nTotal candidates tried = {LGBM_N_ITER} | Total fits ≈ {LGBM_N_ITER * cv_lgbm.get_n_splits()}")
print("Best CV: ROC-AUC =", round(float(search_lgbm.best_score_), 5))
print("Best params:\n", search_lgbm.best_params_)

cvres = pd.DataFrame(search_lgbm.cv_results_)
best_idx = int(search_lgbm.best_index_)
if "mean_test_pr_auc" in cvres.columns:
    print("PR-AUC of best-ROC-AUC config =", round(float(cvres.loc[best_idx, "mean_test_pr_auc"]), 5))

print("\nTop 10 configs (by ROC-AUC):")
show_top_configs(search_lgbm, topk=10, metric="roc_auc")

print("\nTop 10 configs (by PR-AUC):")
show_top_configs(search_lgbm, topk=10, metric="pr_auc")

# ---- OOF + threshold + report ----
best_pipe_lgbm = search_lgbm.best_estimator_
oof_lgbm = get_oof_scores(best_pipe_lgbm, X_train, y_train, cv_lgbm)

thr_lgbm = pick_threshold_by_target_recall(
    y_train, oof_lgbm,
    target_recall=0.85,
    min_precision=0.30
)

m_lgbm, cm_lgbm = compute_metrics(y_train, oof_lgbm, thr_lgbm)

print(f"\nChosen threshold = {thr_lgbm:.4f}")
print(f"OOF ROC-AUC = {m_lgbm.get('roc_auc', np.nan):.4f} | PR-AUC = {m_lgbm.get('pr_auc', np.nan):.4f}")

qs = np.array([0.01, 0.05, 0.10, 0.25, 0.50])
print("Score quantiles:", np.quantile(oof_lgbm, qs))

# ---- confusion extraction fallback (nếu compute_metrics không trả dict) ----
tp = m_lgbm.get("TP", None); fp = m_lgbm.get("FP", None); tn = m_lgbm.get("TN", None); fn = m_lgbm.get("FN", None)
if any(v is None for v in [tp, fp, tn, fn]):
    if isinstance(cm_lgbm, dict):
        tp = cm_lgbm.get("TP", tp); fp = cm_lgbm.get("FP", fp); tn = cm_lgbm.get("TN", tn); fn = cm_lgbm.get("FN", fn)
    else:
        try:
            # sklearn confusion_matrix format: [[TN, FP],[FN, TP]]
            tn, fp, fn, tp = cm_lgbm.ravel()
        except Exception:
            pass

print("\nOOF metrics + confusion:")
cols = ["roc_auc","pr_auc","recall","precision","f1","specificity","bal_acc","brier","TP","FP","TN","FN"]
row = {
    "roc_auc": m_lgbm.get("roc_auc", np.nan),
    "pr_auc": m_lgbm.get("pr_auc", np.nan),
    "recall": m_lgbm.get("recall", np.nan),
    "precision": m_lgbm.get("precision", np.nan),
    "f1": m_lgbm.get("f1", np.nan),
    "specificity": m_lgbm.get("specificity", np.nan),
    "bal_acc": m_lgbm.get("bal_acc", np.nan),
    "brier": m_lgbm.get("brier", np.nan),
    "TP": tp, "FP": fp, "TN": tn, "FN": fn
}
display(pd.DataFrame([row], columns=cols).round(4))


Fitting 5 folds for each of 80 candidates, totalling 400 fits

Total candidates tried = 80 | Total fits ≈ 400
Best CV: ROC-AUC = 0.919
Best params:
 {'model__colsample_bytree': np.float64(0.6463476238100518), 'model__learning_rate': np.float64(0.061876126258211095), 'model__max_depth': 9, 'model__min_child_samples': 16, 'model__min_child_weight': np.float64(0.021066486017042207), 'model__n_estimators': 1471, 'model__num_leaves': 166, 'model__reg_alpha': np.float64(0.0004807162118091464), 'model__reg_lambda': np.float64(0.009846938138527219), 'model__scale_pos_weight': 6.896056661562021, 'model__subsample': np.float64(0.8550229885420852), 'model__subsample_freq': 2}
PR-AUC of best-ROC-AUC config = 0.82422

Top 10 configs (by ROC-AUC):


Unnamed: 0,rank_test_roc_auc,mean_test_roc_auc,std_test_roc_auc,model__colsample_bytree,model__learning_rate,model__max_depth,model__min_child_samples,model__min_child_weight,model__n_estimators,model__num_leaves,model__reg_alpha,model__reg_lambda,model__scale_pos_weight,model__subsample,model__subsample_freq
0,1,0.919,0.00589,0.64635,0.06188,9,16,0.02107,1471,166,0.00048,0.00985,6.89606,0.85502,2
1,2,0.9064,0.00658,0.9046,0.02622,7,11,0.05063,1591,185,1e-05,1.06432,4.13763,0.63254,4
2,3,0.90632,0.0086,0.79393,0.03401,11,48,0.00947,1797,175,4e-05,0.02124,1.0,0.62596,9
3,4,0.90233,0.00632,0.88099,0.01058,11,30,1.73962,1320,167,0.0247,0.0004,1.0,0.85116,3
4,5,0.90178,0.00723,0.87841,0.02214,9,36,0.4837,1361,142,4e-05,0.78769,1.0,0.93947,8
5,6,0.90069,0.00943,0.87906,0.04898,11,60,2.33542,1184,99,2e-05,6.6114,1.0,0.73897,2
6,7,0.90067,0.00729,0.99156,0.01653,9,31,0.05465,1700,45,0.00033,0.01048,1.0,0.6781,2
7,8,0.9006,0.0049,0.89768,0.03268,10,67,0.08144,1832,71,5e-05,8e-05,4.13763,0.60486,4
8,9,0.89949,0.00479,0.78812,0.09435,7,15,1.84386,543,112,0.0,0.00037,6.89606,0.61631,3
9,10,0.89941,0.00744,0.93508,0.03207,10,54,4.99775,1721,114,0.0,0.0,6.89606,0.82208,0



Top 10 configs (by PR-AUC):


Unnamed: 0,rank_test_pr_auc,mean_test_pr_auc,std_test_pr_auc,model__colsample_bytree,model__learning_rate,model__max_depth,model__min_child_samples,model__min_child_weight,model__n_estimators,model__num_leaves,model__reg_alpha,model__reg_lambda,model__scale_pos_weight,model__subsample,model__subsample_freq
0,1,0.82422,0.02017,0.64635,0.06188,9,16,0.02107,1471,166,0.00048,0.00985,6.89606,0.85502,2
1,2,0.79369,0.02502,0.79393,0.03401,11,48,0.00947,1797,175,4e-05,0.02124,1.0,0.62596,9
2,3,0.7919,0.027,0.9046,0.02622,7,11,0.05063,1591,185,1e-05,1.06432,4.13763,0.63254,4
3,4,0.78026,0.02021,0.78812,0.09435,7,15,1.84386,543,112,0.0,0.00037,6.89606,0.61631,3
4,5,0.77303,0.01839,0.89768,0.03268,10,67,0.08144,1832,71,5e-05,8e-05,4.13763,0.60486,4
5,6,0.77186,0.02182,0.93508,0.03207,10,54,4.99775,1721,114,0.0,0.0,6.89606,0.82208,0
6,7,0.77084,0.01834,0.61736,0.07678,6,40,0.16422,1955,200,5e-05,0.00462,4.13763,0.7178,0
7,8,0.77071,0.02487,0.87841,0.02214,9,36,0.4837,1361,142,4e-05,0.78769,1.0,0.93947,8
8,9,0.76999,0.02846,0.99156,0.01653,9,31,0.05465,1700,45,0.00033,0.01048,1.0,0.6781,2
9,10,0.76928,0.02334,0.83406,0.03828,11,78,0.03246,1508,93,0.0,0.0,1.0,0.84247,0



Chosen threshold = 0.0037
OOF ROC-AUC = 0.9188 | PR-AUC = 0.8237
Score quantiles: [4.71525624e-08 5.17962824e-07 2.11214178e-06 2.00417450e-05
 2.35036347e-04]

OOF metrics + confusion:


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier,TP,FP,TN,FN
0,0.9188,0.8237,0.8507,0.5062,0.6347,0.8495,0.8501,0.0582,1111,1084,6121,195


## Tổng hợp sau tuning (OOF TRAIN)

## Bảng tổng hợp sau khi train + tuning (OOF TRAIN)

**Giải thích các thông số:**
- **PR-AUC (Average Precision)**: phù hợp khi mất cân bằng lớp; càng cao càng tốt.
- **ROC-AUC**: khả năng phân tách 2 lớp; càng cao càng tốt.
- **Accuracy (ACC)**: tỷ lệ dự đoán đúng trên toàn bộ mẫu (**(TP + TN) / (TP + TN + FP + FN)**). *Lưu ý:* khi dữ liệu mất cân bằng, ACC có thể “ảo” (cao nhưng vẫn bỏ sót nhiều ca dương tính), nên nên xem kèm Recall/PR-AUC.
- **Recall / Sensitivity (TPR)**: ưu tiên y tế (bắt đúng ca CHD); càng cao càng tốt.
- **Precision (PPV)**: trong các ca dự đoán CHD=1, tỷ lệ đúng.
- **F1**: cân bằng giữa precision và recall.


In [71]:
# ============================================================
# OOF TRAIN EVAL (BEFORE tuning - BASELINE ONLY) — 5 models
# + Metrics: ROC-AUC, PR-AUC, ACC, Precision, Recall, F1, thr_used
# + In THAM SỐ MODEL ĐÃ TRAIN (baseline params) — gọn, dễ đọc
# ============================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    precision_recall_curve,
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)

# 1) 5 model bạn muốn đánh giá
try:
    selected_models
except NameError:
    selected_models = ["ET", "HGB", "LGBM", "XGB", "RF"]

# 2) CV splitter
if "cv5" in globals():
    _cv = cv5
else:
    _cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED if "SEED" in globals() else 42)

# 3) CHỈ lấy baseline pipes (*_pipe_base)
_map_base = {
    "RF":  "rf_pipe_base",
    "ET":  "et_pipe_base",
    "HGB": "hgb_pipe_base",
    "XGB": "xgb_pipe_base",
    "LGBM":"lgbm_pipe_base",
}
baseline_pipes = {k: globals()[v] for k, v in _map_base.items() if v in globals()}

missing = [m for m in selected_models if m not in baseline_pipes]
if missing:
    raise NameError(f"Thiếu pipeline baseline cho: {missing}. "
                    f"Hãy chạy các cell tạo *_pipe_base (vd: et_pipe_base, rf_pipe_base, ...).")

# 4) threshold map từ baseline_results nếu có
thr_map = {}
if "baseline_results" in globals():
    _bdf = pd.DataFrame(baseline_results).copy()
    if "thr" in _bdf.columns:
        for c in ["f1","recall"]:
            if c in _bdf.columns:
                _bdf[c] = pd.to_numeric(_bdf[c], errors="coerce")
        if "f1" in _bdf.columns:
            _bdf = _bdf.sort_values("f1", ascending=False).drop_duplicates("model", keep="first")
        elif "recall" in _bdf.columns:
            _bdf = _bdf.sort_values("recall", ascending=False).drop_duplicates("model", keep="first")
        thr_map = _bdf.set_index("model")["thr"].to_dict()

def pick_thr_max_f1(y_true, scores):
    prec, rec, thr = precision_recall_curve(y_true, scores)
    if len(thr) == 0:
        return 0.5
    f1s = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
    return float(thr[np.argmax(f1s)])

# 5) Helpers lấy params gọn
def get_model_obj(pipe):
    if hasattr(pipe, "named_steps") and "model" in pipe.named_steps:
        return pipe.named_steps["model"]
    return None

def compact_params(model):
    # gọn theo từng loại model phổ biến
    cls = model.__class__.__name__.lower()
    p = model.get_params(deep=False)

    if "logistic" in cls:
        keys = ["C","penalty","solver","class_weight","max_iter"]
    elif "randomforest" in cls:
        keys = ["n_estimators","max_depth","min_samples_split","min_samples_leaf","max_features","class_weight","bootstrap"]
    elif "extra" in cls:
        keys = ["n_estimators","max_depth","min_samples_split","min_samples_leaf","max_features","bootstrap"]
    elif "histgradientboosting" in cls:
        keys = ["learning_rate","max_depth","max_iter","min_samples_leaf","l2_regularization","max_bins"]
    elif "xgb" in cls or "xgboost" in cls:
        keys = ["n_estimators","learning_rate","max_depth","subsample","colsample_bytree","reg_alpha","reg_lambda","min_child_weight","gamma","scale_pos_weight"]
    elif "lgbm" in cls or "lightgbm" in cls:
        keys = ["n_estimators","learning_rate","num_leaves","max_depth","min_child_samples",
                "subsample","subsample_freq","colsample_bytree","reg_alpha","reg_lambda","scale_pos_weight"]
    else:
        # fallback: lấy một số key hay gặp
        keys = ["n_estimators","max_depth","learning_rate","C","class_weight"]

    out = {"model_type": model.__class__.__name__}
    for k in keys:
        if k in p:
            out[k] = p[k]
    return out

# 6) Evaluate OOF + collect params
metric_rows = []
param_rows = []

for m in selected_models:
    pipe = baseline_pipes[m]
    oof_scores = get_oof_scores(pipe, X_train, y_train, _cv)

    thr = float(thr_map.get(m, np.nan))
    if not np.isfinite(thr):
        thr = pick_thr_max_f1(y_train, oof_scores)

    y_pred = (oof_scores >= thr).astype(int)

    metric_rows.append({
        "model": m,
        "thr_used": thr,
        "roc_auc": roc_auc_score(y_train, oof_scores),
        "pr_auc": average_precision_score(y_train, oof_scores),
        "acc": accuracy_score(y_train, y_pred),
        "precision": precision_score(y_train, y_pred, zero_division=0),
        "recall": recall_score(y_train, y_pred, zero_division=0),
        "f1": f1_score(y_train, y_pred, zero_division=0),
    })

    model_obj = get_model_obj(pipe)
    if model_obj is None:
        param_rows.append({"model": m, "note": "Không tìm thấy named_steps['model'] trong pipeline"})
    else:
        d = {"model": m}
        d.update(compact_params(model_obj))
        param_rows.append(d)

# 7) Display tables
report_before_oof = pd.DataFrame(metric_rows).sort_values("f1", ascending=False).reset_index(drop=True)
display(report_before_oof.round(4))

print("\nBaseline TRAINED PARAMS (compact):")
params_before = pd.DataFrame(param_rows)
display(params_before)


Unnamed: 0,model,thr_used,roc_auc,pr_auc,acc,precision,recall,f1
0,ET,0.3375,0.9462,0.8565,0.9356,0.8616,0.6914,0.7672
1,RF,0.28,0.9381,0.8435,0.9299,0.8043,0.7175,0.7584
2,LGBM,0.2468,0.8879,0.7256,0.9041,0.7018,0.6524,0.6762
3,HGB,0.2682,0.8621,0.649,0.8845,0.6308,0.5965,0.6131
4,XGB,0.2393,0.7772,0.478,0.8221,0.4318,0.5046,0.4654



Baseline TRAINED PARAMS (compact):


Unnamed: 0,model,model_type,n_estimators,max_depth,min_samples_split,min_samples_leaf,max_features,bootstrap,learning_rate,max_iter,...,min_child_samples,subsample,subsample_freq,colsample_bytree,reg_alpha,reg_lambda,min_child_weight,gamma,scale_pos_weight,class_weight
0,ET,ExtraTreesClassifier,400.0,,2.0,1.0,sqrt,False,,,...,,,,,,,,,,
1,HGB,HistGradientBoostingClassifier,,,,20.0,,,0.1,100.0,...,,,,,,,,,,
2,LGBM,LGBMClassifier,400.0,-1.0,,,,,0.05,,...,20.0,1.0,0.0,1.0,0.0,0.0,,,,
3,XGB,XGBClassifier,400.0,3.0,,,,,0.05,,...,,0.9,,0.9,,,,,,
4,RF,RandomForestClassifier,400.0,,2.0,1.0,sqrt,True,,,...,,,,,,,,,,balanced


In [72]:
# ============================================================
# IN THAM SỐ "MODEL ĐÃ TRAIN" (baseline & tuned) + threshold
# - KHÔNG in best_params_ (tuning)
# - Chỉ lấy từ object model đã fit / được dùng để predict:
#   + baseline: *_pipe_base (hoặc baseline_pipes nếu có)
#   + tuned: search_*.best_estimator_
# ============================================================

import pandas as pd
from pprint import pprint

# 1) 5 model bạn muốn in
try:
    selected_models
except NameError:
    selected_models = ["ET", "HGB", "LGBM", "XGB", "RF"]

# 2) Map threshold (nếu có)
def _thr_map(results):
    df = pd.DataFrame(results).copy()
    if df.empty or "model" not in df.columns:
        return {}
    thr_col = "thr" if "thr" in df.columns else None
    if not thr_col:
        return {}
    # nếu trùng model, giữ dòng tốt nhất theo f1 rồi recall
    for c in ["f1", "recall"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    if "f1" in df.columns:
        df = df.sort_values("f1", ascending=False)
    elif "recall" in df.columns:
        df = df.sort_values("recall", ascending=False)
    df = df.drop_duplicates("model", keep="first")
    return df.set_index("model")[thr_col].to_dict()

thr_before = _thr_map(baseline_results) if "baseline_results" in globals() else {}
thr_after  = _thr_map(tuned_results)    if "tuned_results" in globals() else {}

# 3) Lấy baseline pipes (nếu bạn chưa có dict baseline_pipes thì auto build từ *_pipe_base)
if "baseline_pipes" not in globals():
    baseline_pipes = {}
    for name, var in {
        "RF": "rf_pipe_base",
        "ET": "et_pipe_base",
        "HGB": "hgb_pipe_base",
        "XGB": "xgb_pipe_base",
        "LGBM":"lgbm_pipe_base",
    }.items():
        if var in globals():
            baseline_pipes[name] = globals()[var]

# 4) Map search objects (tuned)
search_map = {
    "RF":  "search_rf",
    "ET":  "search_et",
    "HGB": "search_hgb",
    "XGB": "search_xgb",
    "LGBM":"search_lgbm",
}

def _get_model_params_from_pipe(pipe, deep=False):
    if hasattr(pipe, "named_steps") and "model" in pipe.named_steps:
        return pipe.named_steps["model"].get_params(deep=deep)
    # fallback (hiếm)
    return pipe.get_params(deep=deep) if hasattr(pipe, "get_params") else {}

# 5) In tham số train (không in best_params_)
for m in selected_models:
    print("\n" + "="*90)
    print(f"MODEL: {m}")

    # ---- BASELINE trained params ----
    if m in baseline_pipes:
        base_params = _get_model_params_from_pipe(baseline_pipes[m], deep=False)
        print(f"[BASELINE] thr = {thr_before.get(m, None)}")
        print(f"[BASELINE] trained model params (get_params(deep=False)):")
        pprint(base_params)
    else:
        print("[BASELINE] Không tìm thấy pipeline baseline (vd: et_pipe_base/rf_pipe_base...).")

    # ---- TUNED trained params (best_estimator_) ----
    s_var = search_map.get(m)
    if s_var in globals():
        s = globals()[s_var]
        tuned_pipe = s.best_estimator_
        tuned_params = _get_model_params_from_pipe(tuned_pipe, deep=False)
        print(f"\n[TUNED] thr = {thr_after.get(m, None)}")
        print("[TUNED] trained model params (best_estimator_.named_steps['model'].get_params(deep=False)):")
        pprint(tuned_params)
    else:
        print("\n[TUNED] Không tìm thấy search object (vd: search_et/search_lgbm...).")



MODEL: ET
[BASELINE] thr = 0.3375
[BASELINE] trained model params (get_params(deep=False)):
{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 400,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

[TUNED] thr = 0.18009127949386214
[TUNED] trained model params (best_estimator_.named_steps['model'].get_params(deep=False)):
{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 30,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 659,
 '

In [73]:
# ============================================================
# OOF TRAIN EVAL (AFTER tuning) — 5 models
# Metrics: ACC, Precision, Recall, F1 + ROC-AUC + PR-AUC
# + IN RA "THÔNG SỐ MODEL ĐÃ TRAIN" (tuned params) + threshold đã dùng
# ============================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    precision_recall_curve,
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)
from pprint import pprint

# 1) 5 model bạn muốn đánh giá
try:
    selected_models
except NameError:
    selected_models = ["ET", "HGB", "LGBM", "XGB", "RF"]

# 2) CV splitter
if "cv5" in globals():
    _cv = cv5
else:
    _cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED if "SEED" in globals() else 42)

# 3) Auto-build tuned_pipes nếu chưa có (ưu tiên search_* => best_estimator_)
if "tuned_pipes" not in globals():
    _tuned_map = {
        "RF":  ["search_rf",  "best_pipe_rf",  "rf_pipe_tuned",  "rf_pipe_best"],
        "ET":  ["search_et",  "best_pipe_et",  "et_pipe_tuned",  "et_pipe_best"],
        "HGB": ["search_hgb", "best_pipe_hgb", "hgb_pipe_tuned", "hgb_pipe_best"],
        "XGB": ["search_xgb", "best_pipe_xgb", "xgb_pipe_tuned", "xgb_pipe_best"],
        "LGBM":["search_lgbm","best_pipe_lgbm","lgbm_pipe_tuned","lgbm_pipe_best"],
    }
    tuned_pipes = {}
    for k, cand_vars in _tuned_map.items():
        for v in cand_vars:
            if v in globals():
                obj = globals()[v]
                tuned_pipes[k] = obj.best_estimator_ if hasattr(obj, "best_estimator_") else obj
                break

# 4) Threshold map từ tuned_results nếu có
thr_map = {}
if "tuned_results" in globals():
    _tdf = pd.DataFrame(tuned_results).copy()
    if "thr" in _tdf.columns:
        for c in ["f1", "recall"]:
            if c in _tdf.columns:
                _tdf[c] = pd.to_numeric(_tdf[c], errors="coerce")
        if "f1" in _tdf.columns:
            _tdf = _tdf.sort_values("f1", ascending=False).drop_duplicates("model", keep="first")
        elif "recall" in _tdf.columns:
            _tdf = _tdf.sort_values("recall", ascending=False).drop_duplicates("model", keep="first")
        thr_map = _tdf.set_index("model")["thr"].to_dict()

def pick_thr_max_f1(y_true, scores):
    prec, rec, thr = precision_recall_curve(y_true, scores)
    if len(thr) == 0:
        return 0.5
    f1s = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
    return float(thr[np.argmax(f1s)])

# 5) Evaluate OOF + IN PARAMS
missing = [m for m in selected_models if m not in tuned_pipes]
if missing:
    raise NameError(
        f"Thiếu pipeline TUNED cho: {missing}. "
        f"Hãy chắc chắn đã chạy tuning để có search_* hoặc best_pipe_* (vd: search_lgbm, best_pipe_lgbm, ...)."
    )

rows = []
param_rows = []

for m in selected_models:
    pipe = tuned_pipes[m]
    oof_scores = get_oof_scores(pipe, X_train, y_train, _cv)

    thr = float(thr_map.get(m, np.nan))
    if not np.isfinite(thr):
        thr = pick_thr_max_f1(y_train, oof_scores)

    y_pred = (oof_scores >= thr).astype(int)

    rows.append({
        "model": m,
        "thr_used": thr,
        "roc_auc": roc_auc_score(y_train, oof_scores),
        "pr_auc": average_precision_score(y_train, oof_scores),
        "acc": accuracy_score(y_train, y_pred),
        "precision": precision_score(y_train, y_pred, zero_division=0),
        "recall": recall_score(y_train, y_pred, zero_division=0),
        "f1": f1_score(y_train, y_pred, zero_division=0),
    })

    # params "model đã train" (không in best_params_)
    if hasattr(pipe, "named_steps") and "model" in pipe.named_steps:
        params = pipe.named_steps["model"].get_params(deep=False)
    else:
        params = pipe.get_params(deep=False) if hasattr(pipe, "get_params") else {}

    param_rows.append({"model": m, "thr_used": thr, "trained_params": params})

report_after_oof = pd.DataFrame(rows).sort_values("f1", ascending=False).reset_index(drop=True)
display(report_after_oof.round(4))

print("\n" + "="*90)
print("TUNED TRAINED PARAMS (mỗi model):")
for r in param_rows:
    print("\n" + "-"*90)
    print(f"MODEL: {r['model']} | thr_used: {r['thr_used']:.6f}")
    pprint(r["trained_params"])


Unnamed: 0,model,thr_used,roc_auc,pr_auc,acc,precision,recall,f1
0,LGBM,0.1641,0.9188,0.8237,0.9331,0.8264,0.7144,0.7663
1,RF,0.2768,0.9388,0.8463,0.93,0.8024,0.7213,0.7597
2,ET,0.1801,0.9456,0.8565,0.885,0.5863,0.8507,0.6942
3,HGB,0.0021,0.9253,0.8316,0.8697,0.5486,0.8507,0.6671
4,XGB,0.0448,0.8862,0.7248,0.7421,0.3571,0.8507,0.5031



TUNED TRAINED PARAMS (mỗi model):

------------------------------------------------------------------------------------------
MODEL: ET | thr_used: 0.180091
{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 30,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 659,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

------------------------------------------------------------------------------------------
MODEL: HGB | thr_used: 0.002144
{'categorical_features': 'from_dtype',
 'class_weight': None,
 'early_stopping': 'auto',
 'interaction_cst': None,
 'l2_regularization': np.float64(0.0022057229453746087),
 'learning_rate': np.float64(0.07190034331426896),
 'loss': 'log_loss',
 'max_bins': 163,
 'max_depth': None,
 