# 4 — Baseline: huấn luyện & đánh giá từng model (OOF TRAIN)

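Notebook này huấn luyện baseline cho từng model và đánh giá trên **OOF TRAIN** (dự đoán out-of-fold thu được từ cross-validation trên tập train).
Với mỗi model, notebook sẽ:
- chọn threshold tối đa hoá F1 trên đường cong precision–recall
- in classification report và vẽ confusion matrix tại threshold đã chọn
- vẽ đường cong ROC (và PR, nếu có)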

**Phụ thuộc:** cần chạy `app/03_preprocess_helpers.ipynb` trước khi chạy notebook này.


## Baseline: huấn luyện & đánh giá từng model (OOF TRAIN)

### LR — Logistic Regression (Baseline, OOF TRAIN)

In [27]:
# [LR - Cell 1] Baseline Logistic Regression pipeline.
# Oversampling is left off so the class imbalance is not compensated twice.

from sklearn.linear_model import LogisticRegression

# class_weight="balanced" handles the imbalance; max_iter is raised to help
# the solver converge.
lr_base = LogisticRegression(solver="liblinear", class_weight="balanced",
                             max_iter=2000, random_state=SEED)

# IMPORTANT: keep the sampler disabled because class_weight is already in use
# (combining both would push probabilities strongly towards class 1).
lr_pipe_base = build_pipeline(lr_base, use_sampler=False)


In [28]:
# [LR - Cell 2] OOF scores + F1-optimal threshold + baseline_results entry

# Out-of-fold scores on TRAIN from cross-validation (intended as class-1 probabilities).
lr_oof = get_oof_scores(lr_pipe_base, X_train, y_train, cv5)

# Walk the precision-recall curve and keep the threshold with the highest F1
# (more stable than a pure recall target and avoids predicting all-positive).
prec, rec, thr = precision_recall_curve(y_train, lr_oof)
# Fallback used only if the curve returns no thresholds. It is 0.5 here, the
# same neutral default every other baseline model in this notebook uses.
lr_thr = 0.5
if len(thr) > 0:
    # precision/recall have one more entry than thr, so the last point is dropped;
    # the 1e-12 term avoids a 0/0 when precision and recall are both zero.
    f1s = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
    lr_thr = float(thr[np.argmax(f1s)])

# Metrics and confusion matrix at the selected threshold (OOF TRAIN).
lr_m, lr_cm = compute_metrics(y_train, lr_oof, lr_thr)

# Record the run in the shared baseline table.
lr_row = {"model": "LR", "phase": "baseline", "status": "OK", "thr": lr_thr}
lr_row.update(lr_m)
baseline_results.append(lr_row)

print("Chosen threshold (F1) =", round(lr_thr, 4))
print("Predicted positive rate =", round(float((lr_oof >= lr_thr).mean()), 4))
display(pd.DataFrame([lr_m]).round(4))


Chosen threshold (F1) = 0.5323
Predicted positive rate = 0.3335


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier
0,0.7216,0.3419,0.6133,0.2822,0.3866,0.7173,0.6653,0.2119


In [29]:
# [LR - Cell 3] ROC-AUC + classification report + confusion matrix + ROC (OOF TRAIN)

# Hard labels at the threshold chosen in Cell 2.
lr_pred_oof = (lr_oof >= lr_thr).astype(int)

# ROC-AUC on the raw OOF scores -- primary metric.
lr_roc_auc = roc_auc_score(y_train, lr_oof)
print("ROC-AUC (OOF TRAIN) =", round(float(lr_roc_auc), 4))

# Per-class precision / recall / F1 at the chosen threshold.
print(classification_report(y_train, lr_pred_oof, digits=3))

# Confusion matrix, computed once and shared by both rendering paths.
cm = confusion_matrix(y_train, lr_pred_oof)
if HAS_PLOTLY:
    fig = go.Figure(
        data=go.Heatmap(
            z=cm,
            x=["Pred 0", "Pred 1"],
            y=["True 0", "True 1"],
            colorscale="Blues",
            zmin=0,
            zmax=int(cm.max()),
        )
    )
    fig.update_layout(
        title="LR Baseline - Confusion Matrix (OOF TRAIN)",
        xaxis_title="Predicted",
        yaxis_title="True",
    )
    # Plotly draws heatmap rows bottom-up by default; reverse the y axis so
    # "True 0" is the top row, matching the conventional confusion-matrix layout.
    fig.update_yaxes(autorange="reversed")
    # Write each count into its cell (category axes accept the serial index).
    for i in range(2):
        for j in range(2):
            fig.add_annotation(
                x=j, y=i, text=str(cm[i, j]),
                showarrow=False, font=dict(size=16)
            )
    fig.show()
else:
    plot_cm(cm, title="LR Baseline - Confusion Matrix (OOF TRAIN)")

# ROC curve
plot_roc(y_train, lr_oof, title="LR Baseline - ROC (OOF TRAIN)")


ROC-AUC (OOF TRAIN) = 0.7216
              precision    recall  f1-score   support

           0      0.911     0.717     0.803      7205
           1      0.282     0.613     0.387      1306

    accuracy                          0.701      8511
   macro avg      0.597     0.665     0.595      8511
weighted avg      0.815     0.701     0.739      8511



### GNB — Gaussian Naive Bayes (Baseline, OOF TRAIN)

In [30]:
# [GNB - Cell 1] Build the baseline GaussianNB pipeline.
# Note: GaussianNB should NOT be combined with oversampling (RandomOverSampler):
# NB relies on a distributional assumption, and oversampling would distort
# the class distributions it estimates.

from sklearn.naive_bayes import GaussianNB

# GaussianNB baseline (the estimator has no class_weight option)
gnb_base = GaussianNB()

# Sampler disabled for GNB
gnb_pipe_base = build_pipeline(gnb_base, use_sampler=False)


In [31]:
# [GNB - Cell 2] OOF scores + F1-optimal threshold + baseline_results entry

# Out-of-fold scores on TRAIN from cross-validation (intended as class-1 probabilities).
gnb_oof = get_oof_scores(gnb_pipe_base, X_train, y_train, cv5)

# Walk the precision-recall curve and keep the threshold with the highest F1
# (more stable than optimising for recall alone).
prec, rec, thr = precision_recall_curve(y_train, gnb_oof)
gnb_thr = 0.5  # fallback, kept only if the curve returns no thresholds
if len(thr) > 0:
    # precision/recall have one more entry than thr, so the last point is dropped;
    # the 1e-12 term avoids a 0/0 when precision and recall are both zero.
    f1s = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
    gnb_thr = float(thr[np.argmax(f1s)])

# Metrics and confusion matrix at the selected threshold (OOF TRAIN).
gnb_m, gnb_cm = compute_metrics(y_train, gnb_oof, gnb_thr)

# Record the run in the shared baseline table.
gnb_row = {"model": "GNB", "phase": "baseline", "status": "OK", "thr": gnb_thr}
gnb_row.update(gnb_m)
baseline_results.append(gnb_row)

print("Chosen threshold (F1) =", round(gnb_thr, 4))
print("Predicted positive rate =", round(float((gnb_oof >= gnb_thr).mean()), 4))
display(pd.DataFrame([gnb_m]).round(4))


Chosen threshold (F1) = 0.0108
Predicted positive rate = 0.2658


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier
0,0.7016,0.2891,0.4954,0.286,0.3627,0.7759,0.6356,0.1666


In [32]:
# [GNB - Cell 3] ROC-AUC + classification report + confusion matrix + ROC (OOF TRAIN)

# Hard labels at the threshold chosen in Cell 2.
gnb_pred_oof = (gnb_oof >= gnb_thr).astype(int)

# ROC-AUC on the raw OOF scores -- primary metric.
gnb_roc_auc = roc_auc_score(y_train, gnb_oof)
print("ROC-AUC (OOF TRAIN) =", round(float(gnb_roc_auc), 4))

# Per-class precision / recall / F1 at the chosen threshold.
print(classification_report(y_train, gnb_pred_oof, digits=3))

# Confusion matrix: annotated Plotly heatmap when available, plot_cm otherwise.
if HAS_PLOTLY:
    cm = gnb_cm
    fig = go.Figure(
        data=go.Heatmap(
            z=cm,
            x=["Pred 0", "Pred 1"],
            y=["True 0", "True 1"],
            colorscale="Blues",
            zmin=0,
            zmax=int(cm.max()),
        )
    )
    fig.update_layout(
        title="GNB Baseline - Confusion Matrix (OOF TRAIN)",
        xaxis_title="Predicted",
        yaxis_title="True",
    )
    # Plotly draws heatmap rows bottom-up by default; reverse the y axis so
    # "True 0" is the top row, matching the conventional confusion-matrix layout.
    fig.update_yaxes(autorange="reversed")
    # Write each count into its cell (category axes accept the serial index).
    for i in range(2):
        for j in range(2):
            fig.add_annotation(
                x=j, y=i, text=str(cm[i, j]),
                showarrow=False, font=dict(size=16)
            )
    fig.show()
else:
    plot_cm(gnb_cm, title="GNB Baseline - Confusion Matrix (OOF TRAIN)")

# ROC curve
plot_roc(y_train, gnb_oof, title="GNB Baseline - ROC (OOF TRAIN)")


ROC-AUC (OOF TRAIN) = 0.7016
              precision    recall  f1-score   support

           0      0.895     0.776     0.831      7205
           1      0.286     0.495     0.363      1306

    accuracy                          0.733      8511
   macro avg      0.590     0.636     0.597      8511
weighted avg      0.801     0.733     0.759      8511



### KNN — K-Nearest Neighbors (Baseline, OOF TRAIN)

In [33]:
# [KNN - Cell 1] Baseline KNN pipeline.
# Oversampling (RandomOverSampler) is not used: KNN is very sensitive to distances.

from sklearn.neighbors import KNeighborsClassifier

# Baseline settings: k=15 with uniform weights; both are to be tuned later.
knn_base = KNeighborsClassifier(weights="uniform", n_neighbors=15)

# Sampler disabled for KNN
knn_pipe_base = build_pipeline(knn_base, use_sampler=False)


In [34]:
# [KNN - Cell 2] OOF scores + F1-optimal threshold + baseline_results entry

# Out-of-fold scores on TRAIN from cross-validation (intended as class-1 probabilities).
knn_oof = get_oof_scores(knn_pipe_base, X_train, y_train, cv5)

# Walk the precision-recall curve and keep the threshold with the highest F1
# (stable, and avoids predicting all-positive).
prec, rec, thr = precision_recall_curve(y_train, knn_oof)
knn_thr = 0.5  # fallback, kept only if the curve returns no thresholds
if len(thr) > 0:
    # precision/recall have one more entry than thr, so the last point is dropped;
    # the 1e-12 term avoids a 0/0 when precision and recall are both zero.
    f1s = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
    knn_thr = float(thr[np.argmax(f1s)])

# Metrics and confusion matrix at the selected threshold (OOF TRAIN).
knn_m, knn_cm = compute_metrics(y_train, knn_oof, knn_thr)

# Record the run in the shared baseline table.
knn_row = {"model": "KNN", "phase": "baseline", "status": "OK", "thr": knn_thr}
knn_row.update(knn_m)
baseline_results.append(knn_row)

print("Chosen threshold (F1) =", round(knn_thr, 4))
print("Predicted positive rate =", round(float((knn_oof >= knn_thr).mean()), 4))
display(pd.DataFrame([knn_m]).round(4))


Chosen threshold (F1) = 0.2667
Predicted positive rate = 0.2278


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier
0,0.7716,0.3552,0.536,0.361,0.4314,0.828,0.682,0.1127


In [35]:
# [KNN - Cell 3] ROC-AUC + classification report + confusion matrix + ROC (OOF TRAIN)

# Hard labels at the threshold chosen in Cell 2.
knn_pred_oof = (knn_oof >= knn_thr).astype(int)

# ROC-AUC on the raw OOF scores -- primary metric.
knn_roc_auc = roc_auc_score(y_train, knn_oof)
print("ROC-AUC (OOF TRAIN) =", round(float(knn_roc_auc), 4))

# Per-class precision / recall / F1 at the chosen threshold.
print(classification_report(y_train, knn_pred_oof, digits=3))

# Confusion matrix: annotated Plotly heatmap when available, plot_cm otherwise.
if HAS_PLOTLY:
    cm = knn_cm
    fig = go.Figure(
        data=go.Heatmap(
            z=cm,
            x=["Pred 0", "Pred 1"],
            y=["True 0", "True 1"],
            colorscale="Blues",
            zmin=0,
            zmax=int(cm.max()),
        )
    )
    fig.update_layout(
        title="KNN Baseline - Confusion Matrix (OOF TRAIN)",
        xaxis_title="Predicted",
        yaxis_title="True",
    )
    # Plotly draws heatmap rows bottom-up by default; reverse the y axis so
    # "True 0" is the top row, matching the conventional confusion-matrix layout.
    fig.update_yaxes(autorange="reversed")
    # Write each count into its cell (category axes accept the serial index).
    for i in range(2):
        for j in range(2):
            fig.add_annotation(
                x=j, y=i, text=str(cm[i, j]),
                showarrow=False, font=dict(size=16)
            )
    fig.show()
else:
    plot_cm(knn_cm, title="KNN Baseline - Confusion Matrix (OOF TRAIN)")

# ROC curve
plot_roc(y_train, knn_oof, title="KNN Baseline - ROC (OOF TRAIN)")


ROC-AUC (OOF TRAIN) = 0.7716
              precision    recall  f1-score   support

           0      0.908     0.828     0.866      7205
           1      0.361     0.536     0.431      1306

    accuracy                          0.783      8511
   macro avg      0.634     0.682     0.649      8511
weighted avg      0.824     0.783     0.799      8511



### SVC — Support Vector Classifier (RBF) (Baseline, OOF TRAIN)

In [36]:
# [SVC - Cell 1] Baseline SVC (RBF kernel) pipeline.
# Notes:
# - no oversampling for SVC
# - imbalance is handled with class_weight="balanced"
# - probability=True so predict_proba is available for ROC / threshold search

from sklearn.svm import SVC

svc_base = SVC(kernel="rbf", C=1.0, gamma="scale",
               class_weight="balanced", probability=True,
               random_state=SEED)

# Sampler disabled for SVC
svc_pipe_base = build_pipeline(svc_base, use_sampler=False)


In [37]:
# [SVC - Cell 2] OOF scores + F1-optimal threshold + baseline_results entry

# Out-of-fold scores on TRAIN from cross-validation (intended as class-1 probabilities).
svc_oof = get_oof_scores(svc_pipe_base, X_train, y_train, cv5)

# Walk the precision-recall curve and keep the threshold with the highest F1
# (stable, and avoids an extreme recall-only operating point).
prec, rec, thr = precision_recall_curve(y_train, svc_oof)
svc_thr = 0.5  # fallback, kept only if the curve returns no thresholds
if len(thr) > 0:
    # precision/recall have one more entry than thr, so the last point is dropped;
    # the 1e-12 term avoids a 0/0 when precision and recall are both zero.
    f1s = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
    svc_thr = float(thr[np.argmax(f1s)])

# Metrics and confusion matrix at the selected threshold (OOF TRAIN).
svc_m, svc_cm = compute_metrics(y_train, svc_oof, svc_thr)

# Record the run in the shared baseline table.
svc_row = {"model": "SVC", "phase": "baseline", "status": "OK", "thr": svc_thr}
svc_row.update(svc_m)
baseline_results.append(svc_row)

print("Chosen threshold (F1) =", round(svc_thr, 4))
print("Predicted positive rate =", round(float((svc_oof >= svc_thr).mean()), 4))
display(pd.DataFrame([svc_m]).round(4))


Chosen threshold (F1) = 0.26
Predicted positive rate = 0.2016


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier
0,0.7713,0.372,0.526,0.4003,0.4547,0.8572,0.6916,0.1119


In [38]:
# [SVC - Cell 3] ROC-AUC + classification report + confusion matrix + ROC (OOF TRAIN)

# Hard labels at the threshold chosen in Cell 2.
svc_pred_oof = (svc_oof >= svc_thr).astype(int)

# ROC-AUC on the raw OOF scores -- primary metric.
svc_roc_auc = roc_auc_score(y_train, svc_oof)
print("ROC-AUC (OOF TRAIN) =", round(float(svc_roc_auc), 4))

# Per-class precision / recall / F1 at the chosen threshold.
print(classification_report(y_train, svc_pred_oof, digits=3))

# Confusion matrix: annotated Plotly heatmap when available, plot_cm otherwise.
if HAS_PLOTLY:
    cm = svc_cm
    fig = go.Figure(
        data=go.Heatmap(
            z=cm,
            x=["Pred 0", "Pred 1"],
            y=["True 0", "True 1"],
            colorscale="Blues",
            zmin=0,
            zmax=int(cm.max()),
        )
    )
    fig.update_layout(
        title="SVC Baseline - Confusion Matrix (OOF TRAIN)",
        xaxis_title="Predicted",
        yaxis_title="True",
    )
    # Plotly draws heatmap rows bottom-up by default; reverse the y axis so
    # "True 0" is the top row, matching the conventional confusion-matrix layout.
    fig.update_yaxes(autorange="reversed")
    # Write each count into its cell (category axes accept the serial index).
    for i in range(2):
        for j in range(2):
            fig.add_annotation(
                x=j, y=i, text=str(cm[i, j]),
                showarrow=False, font=dict(size=16)
            )
    fig.show()
else:
    plot_cm(svc_cm, title="SVC Baseline - Confusion Matrix (OOF TRAIN)")

# ROC curve
plot_roc(y_train, svc_oof, title="SVC Baseline - ROC (OOF TRAIN)")


ROC-AUC (OOF TRAIN) = 0.7713
              precision    recall  f1-score   support

           0      0.909     0.857     0.882      7205
           1      0.400     0.526     0.455      1306

    accuracy                          0.806      8511
   macro avg      0.655     0.692     0.668      8511
weighted avg      0.831     0.806     0.817      8511



### DT — Decision Tree (Baseline, OOF TRAIN)

In [39]:
# [DT - Cell 1] Baseline Decision Tree pipeline.
# Notes:
# - no oversampling for DT (prone to overfitting)
# - imbalance is handled with class_weight="balanced"

from sklearn.tree import DecisionTreeClassifier

dt_base = DecisionTreeClassifier(random_state=SEED, class_weight="balanced")

# Sampler disabled for DT
dt_pipe_base = build_pipeline(dt_base, use_sampler=False)


In [40]:
# [DT - Cell 2] OOF scores + F1-optimal threshold + baseline_results entry

# Out-of-fold scores on TRAIN from cross-validation (intended as class-1 probabilities).
dt_oof = get_oof_scores(dt_pipe_base, X_train, y_train, cv5)

# Walk the precision-recall curve and keep the threshold with the highest F1
# (avoids an extreme recall-only operating point).
prec, rec, thr = precision_recall_curve(y_train, dt_oof)
dt_thr = 0.5  # fallback, kept only if the curve returns no thresholds
if len(thr) > 0:
    # precision/recall have one more entry than thr, so the last point is dropped;
    # the 1e-12 term avoids a 0/0 when precision and recall are both zero.
    f1s = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
    dt_thr = float(thr[np.argmax(f1s)])

# Metrics and confusion matrix at the selected threshold (OOF TRAIN).
dt_m, dt_cm = compute_metrics(y_train, dt_oof, dt_thr)

# Record the run in the shared baseline table.
dt_row = {"model": "DT", "phase": "baseline", "status": "OK", "thr": dt_thr}
dt_row.update(dt_m)
baseline_results.append(dt_row)

print("Chosen threshold (F1) =", round(dt_thr, 4))
print("Predicted positive rate =", round(float((dt_oof >= dt_thr).mean()), 4))
display(pd.DataFrame([dt_m]).round(4))


Chosen threshold (F1) = 1.0
Predicted positive rate = 0.1477


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier
0,0.7262,0.3646,0.5306,0.5513,0.5408,0.9217,0.7262,0.1383


In [41]:
# [DT - Cell 3] ROC-AUC + classification report + confusion matrix + ROC (OOF TRAIN)

# Hard labels at the threshold chosen in Cell 2.
dt_pred_oof = (dt_oof >= dt_thr).astype(int)

# ROC-AUC on the raw OOF scores -- an important metric for this task.
dt_roc_auc = roc_auc_score(y_train, dt_oof)
print("ROC-AUC (OOF TRAIN) =", round(float(dt_roc_auc), 4))

# Per-class precision / recall / F1 at the chosen threshold.
print(classification_report(y_train, dt_pred_oof, digits=3))

# Confusion matrix: annotated Plotly heatmap when available, plot_cm otherwise.
if HAS_PLOTLY:
    cm = dt_cm
    fig = go.Figure(
        data=go.Heatmap(
            z=cm,
            x=["Pred 0", "Pred 1"],
            y=["True 0", "True 1"],
            colorscale="Blues",
            zmin=0,
            zmax=int(cm.max()),
        )
    )
    fig.update_layout(
        title="DT Baseline - Confusion Matrix (OOF TRAIN)",
        xaxis_title="Predicted",
        yaxis_title="True",
    )
    # Plotly draws heatmap rows bottom-up by default; reverse the y axis so
    # "True 0" is the top row, matching the conventional confusion-matrix layout.
    fig.update_yaxes(autorange="reversed")
    # Write each count into its cell (category axes accept the serial index).
    for i in range(2):
        for j in range(2):
            fig.add_annotation(
                x=j, y=i, text=str(cm[i, j]),
                showarrow=False, font=dict(size=16)
            )
    fig.show()
else:
    plot_cm(dt_cm, title="DT Baseline - Confusion Matrix (OOF TRAIN)")

# ROC curve
plot_roc(y_train, dt_oof, title="DT Baseline - ROC (OOF TRAIN)")


ROC-AUC (OOF TRAIN) = 0.7262
              precision    recall  f1-score   support

           0      0.915     0.922     0.919      7205
           1      0.551     0.531     0.541      1306

    accuracy                          0.862      8511
   macro avg      0.733     0.726     0.730      8511
weighted avg      0.860     0.862     0.861      8511



### RF — Random Forest (Baseline, OOF TRAIN)

In [42]:
# [RF - Cell 1] Baseline Random Forest pipeline.
# Notes:
# - no oversampling for RF
# - class_weight="balanced" is considered sufficient for the imbalance

from sklearn.ensemble import RandomForestClassifier

rf_base = RandomForestClassifier(n_estimators=400, class_weight="balanced",
                                 n_jobs=-1, random_state=SEED)

# Sampler disabled for RF
rf_pipe_base = build_pipeline(rf_base, use_sampler=False)


In [43]:
# [RF - Cell 2] OOF scores + F1-optimal threshold + baseline_results entry

# Out-of-fold scores on TRAIN from cross-validation (intended as class-1 probabilities).
rf_oof = get_oof_scores(rf_pipe_base, X_train, y_train, cv5)

# Walk the precision-recall curve and keep the threshold with the highest F1
# (stable, and the same rule for every baseline so comparisons are fair).
prec, rec, thr = precision_recall_curve(y_train, rf_oof)
rf_thr = 0.5  # fallback, kept only if the curve returns no thresholds
if len(thr) > 0:
    # precision/recall have one more entry than thr, so the last point is dropped;
    # the 1e-12 term avoids a 0/0 when precision and recall are both zero.
    f1s = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
    rf_thr = float(thr[np.argmax(f1s)])

# Metrics and confusion matrix at the selected threshold (OOF TRAIN).
rf_m, rf_cm = compute_metrics(y_train, rf_oof, rf_thr)

# Record the run in the shared baseline table.
rf_row = {"model": "RF", "phase": "baseline", "status": "OK", "thr": rf_thr}
rf_row.update(rf_m)
baseline_results.append(rf_row)

print("Chosen threshold (F1) =", round(rf_thr, 4))
print("Predicted positive rate =", round(float((rf_oof >= rf_thr).mean()), 4))
display(pd.DataFrame([rf_m]).round(4))


Chosen threshold (F1) = 0.28
Predicted positive rate = 0.1369


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier
0,0.9381,0.8435,0.7175,0.8043,0.7584,0.9684,0.8429,0.0659


In [44]:
# [RF - Cell 3] ROC-AUC + classification report + confusion matrix + ROC (OOF TRAIN)

# Hard labels at the threshold chosen in Cell 2.
rf_pred_oof = (rf_oof >= rf_thr).astype(int)

# ROC-AUC on the raw OOF scores -- primary metric for comparing models.
rf_roc_auc = roc_auc_score(y_train, rf_oof)
print("ROC-AUC (OOF TRAIN) =", round(float(rf_roc_auc), 4))

# Per-class precision / recall / F1 at the chosen threshold.
print(classification_report(y_train, rf_pred_oof, digits=3))

# Confusion matrix: annotated Plotly heatmap when available, plot_cm otherwise.
if HAS_PLOTLY:
    cm = rf_cm
    fig = go.Figure(
        data=go.Heatmap(
            z=cm,
            x=["Pred 0", "Pred 1"],
            y=["True 0", "True 1"],
            colorscale="Blues",
            zmin=0,
            zmax=int(cm.max()),
        )
    )
    fig.update_layout(
        title="RF Baseline - Confusion Matrix (OOF TRAIN)",
        xaxis_title="Predicted",
        yaxis_title="True",
    )
    # Plotly draws heatmap rows bottom-up by default; reverse the y axis so
    # "True 0" is the top row, matching the conventional confusion-matrix layout.
    fig.update_yaxes(autorange="reversed")
    # Write each count into its cell (category axes accept the serial index).
    for i in range(2):
        for j in range(2):
            fig.add_annotation(
                x=j, y=i, text=str(cm[i, j]),
                showarrow=False, font=dict(size=16)
            )
    fig.show()
else:
    plot_cm(rf_cm, title="RF Baseline - Confusion Matrix (OOF TRAIN)")

# ROC curve
plot_roc(y_train, rf_oof, title="RF Baseline - ROC (OOF TRAIN)")


ROC-AUC (OOF TRAIN) = 0.9381
              precision    recall  f1-score   support

           0      0.950     0.968     0.959      7205
           1      0.804     0.717     0.758      1306

    accuracy                          0.930      8511
   macro avg      0.877     0.843     0.859      8511
weighted avg      0.927     0.930     0.928      8511



### ET — Extra Trees (Baseline, OOF TRAIN)

In [45]:
# [ET - Cell 1] Baseline Extra Trees pipeline.
# Notes:
# - no oversampling for ET
# - class_weight="balanced" is considered sufficient

from sklearn.ensemble import ExtraTreesClassifier

et_base = ExtraTreesClassifier(n_estimators=400, class_weight="balanced",
                               n_jobs=-1, random_state=SEED)

# Sampler disabled for ET
et_pipe_base = build_pipeline(et_base, use_sampler=False)


In [46]:
# [ET - Cell 2] OOF scores + F1-optimal threshold + baseline_results entry

# Out-of-fold scores on TRAIN from cross-validation (intended as class-1 probabilities).
et_oof = get_oof_scores(et_pipe_base, X_train, y_train, cv5)

# Walk the precision-recall curve and keep the threshold with the highest F1
# (stable, and the same rule for every baseline).
prec, rec, thr = precision_recall_curve(y_train, et_oof)
et_thr = 0.5  # fallback, kept only if the curve returns no thresholds
if len(thr) > 0:
    # precision/recall have one more entry than thr, so the last point is dropped;
    # the 1e-12 term avoids a 0/0 when precision and recall are both zero.
    f1s = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
    et_thr = float(thr[np.argmax(f1s)])

# Metrics and confusion matrix at the selected threshold (OOF TRAIN).
et_m, et_cm = compute_metrics(y_train, et_oof, et_thr)

# Record the run in the shared baseline table.
et_row = {"model": "ET", "phase": "baseline", "status": "OK", "thr": et_thr}
et_row.update(et_m)
baseline_results.append(et_row)

print("Chosen threshold (F1) =", round(et_thr, 4))
print("Predicted positive rate =", round(float((et_oof >= et_thr).mean()), 4))
display(pd.DataFrame([et_m]).round(4))


Chosen threshold (F1) = 0.3375
Predicted positive rate = 0.1231


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier
0,0.9462,0.8565,0.6914,0.8616,0.7672,0.9799,0.8356,0.0544


In [47]:
# [ET - Cell 3] ROC-AUC + classification report + confusion matrix + ROC (OOF TRAIN)

# Hard labels at the threshold chosen in Cell 2.
et_pred_oof = (et_oof >= et_thr).astype(int)

# ROC-AUC on the raw OOF scores.
et_roc_auc = roc_auc_score(y_train, et_oof)
print("ROC-AUC (OOF TRAIN) =", round(float(et_roc_auc), 4))

# Per-class precision / recall / F1 at the chosen threshold.
print(classification_report(y_train, et_pred_oof, digits=3))

# Confusion matrix: annotated Plotly heatmap when available, plot_cm otherwise.
if HAS_PLOTLY:
    cm = et_cm
    fig = go.Figure(
        data=go.Heatmap(
            z=cm,
            x=["Pred 0", "Pred 1"],
            y=["True 0", "True 1"],
            colorscale="Blues",
            zmin=0,
            zmax=int(cm.max()),
        )
    )
    fig.update_layout(
        title="ET Baseline - Confusion Matrix (OOF TRAIN)",
        xaxis_title="Predicted",
        yaxis_title="True",
    )
    # Plotly draws heatmap rows bottom-up by default; reverse the y axis so
    # "True 0" is the top row, matching the conventional confusion-matrix layout.
    fig.update_yaxes(autorange="reversed")
    # Write each count into its cell (category axes accept the serial index).
    for i in range(2):
        for j in range(2):
            fig.add_annotation(
                x=j, y=i, text=str(cm[i, j]),
                showarrow=False, font=dict(size=16)
            )
    fig.show()
else:
    plot_cm(et_cm, title="ET Baseline - Confusion Matrix (OOF TRAIN)")

# ROC curve
plot_roc(y_train, et_oof, title="ET Baseline - ROC (OOF TRAIN)")


ROC-AUC (OOF TRAIN) = 0.9462
              precision    recall  f1-score   support

           0      0.946     0.980     0.963      7205
           1      0.862     0.691     0.767      1306

    accuracy                          0.936      8511
   macro avg      0.904     0.836     0.865      8511
weighted avg      0.933     0.936     0.933      8511



### ADA — AdaBoost (Baseline, OOF TRAIN)

In [48]:
# [ADA - Cell 1] Baseline AdaBoost pipeline.
# Notes:
# - no oversampling for AdaBoost
# - AdaBoost is sensitive to noise / duplicated rows

from sklearn.ensemble import AdaBoostClassifier

ada_base = AdaBoostClassifier(learning_rate=0.05, n_estimators=300,
                              random_state=SEED)

# Sampler disabled for AdaBoost
ada_pipe_base = build_pipeline(ada_base, use_sampler=False)


In [49]:
# [ADA - Cell 2] OOF scores + F1-optimal threshold + baseline_results entry

# Out-of-fold scores on TRAIN from cross-validation (intended as class-1 probabilities).
ada_oof = get_oof_scores(ada_pipe_base, X_train, y_train, cv5)

# Walk the precision-recall curve and keep the threshold with the highest F1
# (stable, and the same rule for every baseline).
prec, rec, thr = precision_recall_curve(y_train, ada_oof)
ada_thr = 0.5  # fallback, kept only if the curve returns no thresholds
if len(thr) > 0:
    # precision/recall have one more entry than thr, so the last point is dropped;
    # the 1e-12 term avoids a 0/0 when precision and recall are both zero.
    f1s = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
    ada_thr = float(thr[np.argmax(f1s)])

# Metrics and confusion matrix at the selected threshold (OOF TRAIN).
ada_m, ada_cm = compute_metrics(y_train, ada_oof, ada_thr)

# Record the run in the shared baseline table.
ada_row = {"model": "ADA", "phase": "baseline", "status": "OK", "thr": ada_thr}
ada_row.update(ada_m)
baseline_results.append(ada_row)

print("Chosen threshold (F1) =", round(ada_thr, 4))
print("Predicted positive rate =", round(float((ada_oof >= ada_thr).mean()), 4))
display(pd.DataFrame([ada_m]).round(4))


Chosen threshold (F1) = 0.2559
Predicted positive rate = 0.3514


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier
0,0.7126,0.316,0.621,0.2711,0.3775,0.6974,0.6592,0.1274


In [50]:
# [ADA - Cell 3] ROC-AUC + classification report + confusion matrix + ROC (OOF TRAIN)

# Hard labels at the threshold chosen in Cell 2.
ada_pred_oof = (ada_oof >= ada_thr).astype(int)

# ROC-AUC on the raw OOF scores.
ada_roc_auc = roc_auc_score(y_train, ada_oof)
print("ROC-AUC (OOF TRAIN) =", round(float(ada_roc_auc), 4))

# Per-class precision / recall / F1 at the chosen threshold.
print(classification_report(y_train, ada_pred_oof, digits=3))

# Confusion matrix: annotated Plotly heatmap when available, plot_cm otherwise.
if HAS_PLOTLY:
    cm = ada_cm
    fig = go.Figure(
        data=go.Heatmap(
            z=cm,
            x=["Pred 0", "Pred 1"],
            y=["True 0", "True 1"],
            colorscale="Blues",
            zmin=0,
            zmax=int(cm.max()),
        )
    )
    fig.update_layout(
        title="ADA Baseline - Confusion Matrix (OOF TRAIN)",
        xaxis_title="Predicted",
        yaxis_title="True",
    )
    # Plotly draws heatmap rows bottom-up by default; reverse the y axis so
    # "True 0" is the top row, matching the conventional confusion-matrix layout.
    fig.update_yaxes(autorange="reversed")
    # Write each count into its cell (category axes accept the serial index).
    for i in range(2):
        for j in range(2):
            fig.add_annotation(
                x=j, y=i, text=str(cm[i, j]),
                showarrow=False, font=dict(size=16)
            )
    fig.show()
else:
    plot_cm(ada_cm, title="ADA Baseline - Confusion Matrix (OOF TRAIN)")

# ROC curve
plot_roc(y_train, ada_oof, title="ADA Baseline - ROC (OOF TRAIN)")


ROC-AUC (OOF TRAIN) = 0.7126
              precision    recall  f1-score   support

           0      0.910     0.697     0.790      7205
           1      0.271     0.621     0.377      1306

    accuracy                          0.686      8511
   macro avg      0.591     0.659     0.584      8511
weighted avg      0.812     0.686     0.727      8511



### GB — Gradient Boosting (Baseline, OOF TRAIN)

In [51]:
# [GB - Cell 1] Build the baseline GradientBoosting pipeline.
# Notes:
# - no oversampling for GradientBoosting
# - GB learns non-linear patterns well but is sensitive to noise
# NOTE(review): no class-imbalance option is set on this estimator, unlike the
# class_weight-based baselines in this notebook.

from sklearn.ensemble import GradientBoostingClassifier

# All hyperparameters are left at library defaults; only the seed is fixed.
gb_base = GradientBoostingClassifier(
    random_state=SEED
)

# Sampler disabled for GB
gb_pipe_base = build_pipeline(gb_base, use_sampler=False)


In [52]:
# [GB - Cell 2] OOF scores + F1-optimal threshold + baseline_results entry

# Out-of-fold scores on TRAIN from cross-validation (intended as class-1 probabilities).
gb_oof = get_oof_scores(gb_pipe_base, X_train, y_train, cv5)

# Walk the precision-recall curve and keep the threshold with the highest F1.
prec, rec, thr = precision_recall_curve(y_train, gb_oof)
gb_thr = 0.5  # fallback, kept only if the curve returns no thresholds
if len(thr) > 0:
    # precision/recall have one more entry than thr, so the last point is dropped;
    # the 1e-12 term avoids a 0/0 when precision and recall are both zero.
    f1s = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
    gb_thr = float(thr[np.argmax(f1s)])

# Metrics and confusion matrix at the selected threshold (OOF TRAIN).
gb_m, gb_cm = compute_metrics(y_train, gb_oof, gb_thr)

# Record the run in the shared baseline table.
gb_row = {"model": "GB", "phase": "baseline", "status": "OK", "thr": gb_thr}
gb_row.update(gb_m)
baseline_results.append(gb_row)

print("Chosen threshold (F1) =", round(gb_thr, 4))
print("Predicted positive rate =", round(float((gb_oof >= gb_thr).mean()), 4))
display(pd.DataFrame([gb_m]).round(4))


Chosen threshold (F1) = 0.1987
Predicted positive rate = 0.2402


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier
0,0.753,0.4365,0.5482,0.3503,0.4275,0.8157,0.682,0.1101


In [53]:
# [GB - Cell 3] ROC-AUC + classification report + confusion matrix + ROC (OOF TRAIN)

# Hard labels at the threshold chosen in Cell 2.
gb_pred_oof = (gb_oof >= gb_thr).astype(int)

# ROC-AUC on the raw OOF scores.
gb_roc_auc = roc_auc_score(y_train, gb_oof)
print("ROC-AUC (OOF TRAIN) =", round(float(gb_roc_auc), 4))

# Per-class precision / recall / F1 at the chosen threshold.
print(classification_report(y_train, gb_pred_oof, digits=3))

# Confusion matrix: annotated Plotly heatmap when available, plot_cm otherwise.
if HAS_PLOTLY:
    cm = gb_cm
    fig = go.Figure(
        data=go.Heatmap(
            z=cm,
            x=["Pred 0", "Pred 1"],
            y=["True 0", "True 1"],
            colorscale="Blues",
            zmin=0,
            zmax=int(cm.max()),
        )
    )
    fig.update_layout(
        title="GB Baseline - Confusion Matrix (OOF TRAIN)",
        xaxis_title="Predicted",
        yaxis_title="True",
    )
    # Plotly draws heatmap rows bottom-up by default; reverse the y axis so
    # "True 0" is the top row, matching the conventional confusion-matrix layout.
    fig.update_yaxes(autorange="reversed")
    # Write each count into its cell (category axes accept the serial index).
    for i in range(2):
        for j in range(2):
            fig.add_annotation(
                x=j, y=i, text=str(cm[i, j]),
                showarrow=False, font=dict(size=16)
            )
    fig.show()
else:
    plot_cm(gb_cm, title="GB Baseline - Confusion Matrix (OOF TRAIN)")

# ROC curve
plot_roc(y_train, gb_oof, title="GB Baseline - ROC (OOF TRAIN)")


ROC-AUC (OOF TRAIN) = 0.753
              precision    recall  f1-score   support

           0      0.909     0.816     0.860      7205
           1      0.350     0.548     0.427      1306

    accuracy                          0.775      8511
   macro avg      0.630     0.682     0.644      8511
weighted avg      0.823     0.775     0.793      8511



### HGB — HistGradientBoosting (Baseline, OOF TRAIN)

In [54]:
# [HGB - Cell 1] Build the baseline HistGradientBoosting pipeline.
# Notes:
# - no oversampling for HGB
# - author's note: HGB is strong on tabular data and handles non-linearity well
# NOTE(review): no class-imbalance option is set on this estimator, unlike the
# class_weight-based baselines in this notebook.

from sklearn.ensemble import HistGradientBoostingClassifier

# All hyperparameters are left at library defaults; only the seed is fixed.
hgb_base = HistGradientBoostingClassifier(
    random_state=SEED
)

# Sampler disabled for HGB
hgb_pipe_base = build_pipeline(hgb_base, use_sampler=False)


In [55]:
# [HGB - Cell 2] OOF scores + F1-based threshold selection

# OOF scores on TRAIN via the cv5 splitter
# (per the original note these are class-1 probabilities; get_oof_scores is defined elsewhere).
hgb_oof = get_oof_scores(hgb_pipe_base, X_train, y_train, cv5)

# Pick the threshold that maximises F1 along the precision-recall curve.
prec, rec, thr = precision_recall_curve(y_train, hgb_oof)
if len(thr) > 0:
    # prec/rec are sliced with [:-1] so they line up with thr;
    # 1e-12 guards against a zero denominator.
    f1s = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
    hgb_thr = float(thr[np.argmax(f1s)])
else:
    # No candidate threshold returned: fall back to 0.5.
    hgb_thr = 0.5

# Metrics and confusion matrix at the chosen threshold (OOF TRAIN).
hgb_m, hgb_cm = compute_metrics(y_train, hgb_oof, hgb_thr)

# Record the baseline result for the summary table.
baseline_results.append(
    {"model": "HGB", "phase": "baseline", "status": "OK", "thr": hgb_thr, **hgb_m}
)

print("Chosen threshold (F1) =", round(hgb_thr, 4))
print("Predicted positive rate =", round(float((hgb_oof >= hgb_thr).mean()), 4))
display(pd.DataFrame([hgb_m]).round(4))


Chosen threshold (F1) = 0.2682
Predicted positive rate = 0.1451


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier
0,0.8621,0.649,0.5965,0.6308,0.6131,0.9367,0.7666,0.0863


In [56]:
# [HGB - Cell 3] ROC-AUC + classification report + confusion matrix + ROC curve (OOF TRAIN)

# Turn the OOF scores into 0/1 labels using the threshold chosen in HGB Cell 2.
hgb_pred_oof = (hgb_oof >= hgb_thr).astype(int)

# Threshold-independent ROC-AUC computed on the OOF scores.
hgb_roc_auc = roc_auc_score(y_train, hgb_oof)
print("ROC-AUC (OOF TRAIN) =", round(float(hgb_roc_auc), 4))

# Per-class precision / recall / F1 for the thresholded predictions.
print(classification_report(y_train, hgb_pred_oof, digits=3))

# Confusion matrix: use the plot_cm helper unless HAS_PLOTLY is set,
# in which case an annotated Plotly heatmap is drawn instead.
if not HAS_PLOTLY:
    plot_cm(hgb_cm, title="HGB Baseline - Confusion Matrix (OOF TRAIN)")
else:
    cm = hgb_cm
    fig = go.Figure(
        data=go.Heatmap(
            z=cm,
            x=["Pred 0", "Pred 1"],
            y=["True 0", "True 1"],
            colorscale="Blues",
            zmin=0,
            zmax=int(cm.max()),
        )
    )
    fig.update_layout(
        title="HGB Baseline - Confusion Matrix (OOF TRAIN)",
        xaxis_title="Predicted",
        yaxis_title="True",
    )
    # Write the raw count into each of the four cells (row i = true, column j = predicted).
    for i, j in ((r, c) for r in range(2) for c in range(2)):
        fig.add_annotation(
            x=j,
            y=i,
            text=str(cm[i, j]),
            showarrow=False,
            font={"size": 16},
        )
    fig.show()

# ROC curve drawn by the plot_roc helper.
plot_roc(y_train, hgb_oof, title="HGB Baseline - ROC (OOF TRAIN)")


ROC-AUC (OOF TRAIN) = 0.8621
              precision    recall  f1-score   support

           0      0.928     0.937     0.932      7205
           1      0.631     0.596     0.613      1306

    accuracy                          0.885      8511
   macro avg      0.779     0.767     0.773      8511
weighted avg      0.882     0.885     0.883      8511



### XGB — XGBoost (Baseline, OOF TRAIN)

In [57]:
# [XGB - Cell 1] Build the baseline XGBoost pipeline
# Notes (from the original author):
# - XGB is NOT oversampled
# - XGB is considered strong on tabular data and handles non-linear relations well

if HAS_XGB:
    # Fixed baseline hyper-parameters; tuning happens in a later step.
    xgb_base = XGBClassifier(
        n_estimators=400,
        max_depth=3,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        random_state=SEED,
        n_jobs=-1,
    )

    # Sampler is switched OFF for XGB.
    xgb_pipe_base = build_pipeline(xgb_base, use_sampler=False)
else:
    # Package unavailable: log it and record a placeholder row so the
    # summary table still lists the model.
    print("XGB: package not installed -> Skipped")
    baseline_results.append(
        {"model": "XGB", "phase": "baseline", "status": "Skipped", "thr": np.nan}
    )


In [58]:
# [XGB - Cell 2] OOF scores + F1-based threshold selection

if HAS_XGB:
    # OOF scores on TRAIN via the cv5 splitter (get_oof_scores is defined elsewhere).
    xgb_oof = get_oof_scores(xgb_pipe_base, X_train, y_train, cv5)

    # Pick the threshold that maximises F1 along the precision-recall curve.
    prec, rec, thr = precision_recall_curve(y_train, xgb_oof)
    if len(thr) > 0:
        # prec/rec are sliced with [:-1] so they line up with thr;
        # 1e-12 guards against a zero denominator.
        f1s = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
        xgb_thr = float(thr[np.argmax(f1s)])
    else:
        # No candidate threshold returned: fall back to 0.5.
        xgb_thr = 0.5

    # Metrics and confusion matrix at the chosen threshold (OOF TRAIN).
    xgb_m, xgb_cm = compute_metrics(y_train, xgb_oof, xgb_thr)

    # Record the baseline result for the summary table.
    baseline_results.append(
        {"model": "XGB", "phase": "baseline", "status": "OK", "thr": xgb_thr, **xgb_m}
    )

    print("Chosen threshold (F1) =", round(xgb_thr, 4))
    print("Predicted positive rate =", round(float((xgb_oof >= xgb_thr).mean()), 4))
    display(pd.DataFrame([xgb_m]).round(4))


Chosen threshold (F1) = 0.2393
Predicted positive rate = 0.1793


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier
0,0.7772,0.478,0.5046,0.4318,0.4654,0.8797,0.6921,0.106


In [59]:
# [XGB - Cell 3] ROC-AUC + classification report + confusion matrix + ROC curve (OOF TRAIN)

if HAS_XGB:
    # Turn the OOF scores into 0/1 labels using the threshold chosen in XGB Cell 2.
    xgb_pred_oof = (xgb_oof >= xgb_thr).astype(int)

    # Threshold-independent ROC-AUC computed on the OOF scores.
    xgb_roc_auc = roc_auc_score(y_train, xgb_oof)
    print("ROC-AUC (OOF TRAIN) =", round(float(xgb_roc_auc), 4))

    # Per-class precision / recall / F1 for the thresholded predictions.
    print(classification_report(y_train, xgb_pred_oof, digits=3))

    # Confusion matrix: use the plot_cm helper unless HAS_PLOTLY is set,
    # in which case an annotated Plotly heatmap is drawn instead.
    if not HAS_PLOTLY:
        plot_cm(xgb_cm, title="XGB Baseline - Confusion Matrix (OOF TRAIN)")
    else:
        cm = xgb_cm
        fig = go.Figure(
            data=go.Heatmap(
                z=cm,
                x=["Pred 0", "Pred 1"],
                y=["True 0", "True 1"],
                colorscale="Blues",
                zmin=0,
                zmax=int(cm.max()),
            )
        )
        fig.update_layout(
            title="XGB Baseline - Confusion Matrix (OOF TRAIN)",
            xaxis_title="Predicted",
            yaxis_title="True",
        )
        # Write the raw count into each of the four cells (row i = true, column j = predicted).
        for i, j in ((r, c) for r in range(2) for c in range(2)):
            fig.add_annotation(
                x=j,
                y=i,
                text=str(cm[i, j]),
                showarrow=False,
                font={"size": 16},
            )
        fig.show()

    # ROC curve drawn by the plot_roc helper.
    plot_roc(y_train, xgb_oof, title="XGB Baseline - ROC (OOF TRAIN)")


ROC-AUC (OOF TRAIN) = 0.7772
              precision    recall  f1-score   support

           0      0.907     0.880     0.893      7205
           1      0.432     0.505     0.465      1306

    accuracy                          0.822      8511
   macro avg      0.670     0.692     0.679      8511
weighted avg      0.834     0.822     0.828      8511



### LGBM — LightGBM (Baseline, OOF TRAIN)

In [60]:
# [LGBM - Cell 1] Build the baseline LightGBM pipeline
# Notes (from the original author):
# - LGBM is NOT oversampled
# - LGBM is considered strong on tabular data and fast to train

if HAS_LGBM:
    from lightgbm import LGBMClassifier

    # Fixed baseline hyper-parameters; tuning happens in a later step.
    lgbm_base = LGBMClassifier(
        n_estimators=400,
        learning_rate=0.05,
        random_state=SEED,
        n_jobs=-1,
    )

    # Sampler is switched OFF for LGBM.
    lgbm_pipe_base = build_pipeline(lgbm_base, use_sampler=False)
else:
    # Package unavailable: log it and record a placeholder row so the
    # summary table still lists the model.
    print("LGBM: package not installed -> Skipped")
    baseline_results.append(
        {"model": "LGBM", "phase": "baseline", "status": "Skipped", "thr": np.nan}
    )


In [61]:
# [LGBM - Cell 2] OOF scores + F1-based threshold selection

if HAS_LGBM:
    # OOF scores on TRAIN via the cv5 splitter (get_oof_scores is defined elsewhere).
    lgbm_oof = get_oof_scores(lgbm_pipe_base, X_train, y_train, cv5)

    # Pick the threshold that maximises F1 along the precision-recall curve.
    prec, rec, thr = precision_recall_curve(y_train, lgbm_oof)
    if len(thr) > 0:
        # prec/rec are sliced with [:-1] so they line up with thr;
        # 1e-12 guards against a zero denominator.
        f1s = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
        lgbm_thr = float(thr[np.argmax(f1s)])
    else:
        # No candidate threshold returned: fall back to 0.5.
        lgbm_thr = 0.5

    # Metrics and confusion matrix at the chosen threshold (OOF TRAIN).
    lgbm_m, lgbm_cm = compute_metrics(y_train, lgbm_oof, lgbm_thr)

    # Record the baseline result for the summary table.
    baseline_results.append(
        {"model": "LGBM", "phase": "baseline", "status": "OK", "thr": lgbm_thr, **lgbm_m}
    )

    print("Chosen threshold (F1) =", round(lgbm_thr, 4))
    print("Predicted positive rate =", round(float((lgbm_oof >= lgbm_thr).mean()), 4))
    display(pd.DataFrame([lgbm_m]).round(4))


Chosen threshold (F1) = 0.2468
Predicted positive rate = 0.1426


Unnamed: 0,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier
0,0.8879,0.7256,0.6524,0.7018,0.6762,0.9498,0.8011,0.0755


In [62]:
# [LGBM - Cell 3] ROC-AUC + classification report + confusion matrix + ROC curve (OOF TRAIN)

if HAS_LGBM:
    # Turn the OOF scores into 0/1 labels using the threshold chosen in LGBM Cell 2.
    lgbm_pred_oof = (lgbm_oof >= lgbm_thr).astype(int)

    # Threshold-independent ROC-AUC computed on the OOF scores.
    lgbm_roc_auc = roc_auc_score(y_train, lgbm_oof)
    print("ROC-AUC (OOF TRAIN) =", round(float(lgbm_roc_auc), 4))

    # Per-class precision / recall / F1 for the thresholded predictions.
    print(classification_report(y_train, lgbm_pred_oof, digits=3))

    # Confusion matrix: use the plot_cm helper unless HAS_PLOTLY is set,
    # in which case an annotated Plotly heatmap is drawn instead.
    if not HAS_PLOTLY:
        plot_cm(lgbm_cm, title="LGBM Baseline - Confusion Matrix (OOF TRAIN)")
    else:
        cm = lgbm_cm
        fig = go.Figure(
            data=go.Heatmap(
                z=cm,
                x=["Pred 0", "Pred 1"],
                y=["True 0", "True 1"],
                colorscale="Blues",
                zmin=0,
                zmax=int(cm.max()),
            )
        )
        fig.update_layout(
            title="LGBM Baseline - Confusion Matrix (OOF TRAIN)",
            xaxis_title="Predicted",
            yaxis_title="True",
        )
        # Write the raw count into each of the four cells (row i = true, column j = predicted).
        for i, j in ((r, c) for r in range(2) for c in range(2)):
            fig.add_annotation(
                x=j,
                y=i,
                text=str(cm[i, j]),
                showarrow=False,
                font={"size": 16},
            )
        fig.show()

    # ROC curve drawn by the plot_roc helper.
    plot_roc(y_train, lgbm_oof, title="LGBM Baseline - ROC (OOF TRAIN)")


ROC-AUC (OOF TRAIN) = 0.8879
              precision    recall  f1-score   support

           0      0.938     0.950     0.944      7205
           1      0.702     0.652     0.676      1306

    accuracy                          0.904      8511
   macro avg      0.820     0.801     0.810      8511
weighted avg      0.902     0.904     0.903      8511



### Tổng hợp baseline (OOF TRAIN)

**Ý nghĩa các metrics dùng để so sánh và xếp hạng mô hình (Baseline & Tuning)**

Trong bài toán dự đoán nguy cơ bệnh tim (TenYearCHD), dữ liệu bị **mất cân bằng lớp** (số ca không bệnh lớn hơn nhiều so với số ca bệnh). Vì vậy, **accuracy không được dùng làm tiêu chí so sánh/xếp hạng mô hình** (accuracy chỉ xuất hiện trong classification report để tham khảo); thay vào đó là các metrics phản ánh tốt hơn khả năng **xếp hạng rủi ro** và **phát hiện ca bệnh**.

---

**ROC-AUC (Receiver Operating Characteristic – Area Under Curve)**  
- Đo khả năng mô hình **xếp hạng rủi ro** giữa người có bệnh và không bệnh.  
- Không phụ thuộc vào threshold và ít bị ảnh hưởng bởi mất cân bằng lớp.  
- Là **metric quan trọng nhất** trong pipeline này để so sánh và xếp hạng mô hình.

---

**PR-AUC (Average Precision / Precision–Recall AUC)**  
- Đánh giá hiệu quả của mô hình trên **lớp bệnh (positive class)**.  
- Nhạy với dữ liệu mất cân bằng hơn ROC-AUC.  
- Dùng để so sánh bổ sung khi các mô hình có ROC-AUC gần nhau.

---

**Recall (Sensitivity / True Positive Rate)**  
- Tỷ lệ bệnh nhân thật sự có bệnh được mô hình phát hiện.  
- Recall thấp đồng nghĩa với việc **bỏ sót bệnh nhân**, điều không mong muốn trong y tế.  
- Được ưu tiên trong các kịch bản screening.

---

**Precision (Positive Predictive Value)**  
- Trong các ca được dự đoán là có bệnh, tỷ lệ dự đoán đúng.  
- Precision thấp dẫn đến nhiều cảnh báo giả, làm tăng gánh nặng cho hệ thống y tế.

---

**F1-score**  
- Trung bình điều hoà giữa precision và recall.  
- Dùng để đánh giá mức cân bằng giữa hai yếu tố này, nhưng **không phải metric chính** trong pipeline.

---

**Specificity (True Negative Rate)**  
- Tỷ lệ người không bệnh được dự đoán đúng.  
- Specificity cao giúp giảm số lượng false positive.

---

**Balanced Accuracy**  
- Trung bình của recall (sensitivity) và specificity.  
- Phù hợp hơn accuracy khi dữ liệu mất cân bằng.

---

**Brier Score**  
- Đánh giá **chất lượng xác suất dự đoán** của mô hình.  
- Giá trị càng thấp thì xác suất dự đoán càng đáng tin cậy.  
- Quan trọng cho các bước **calibration** và ra quyết định dựa trên xác suất.

---

**Threshold (thr)**  
- Ngưỡng xác suất dùng để chuyển từ probability sang nhãn 0/1.  
- Threshold được chọn tuỳ theo mục tiêu (ưu tiên recall hoặc tối ưu F1); trong notebook baseline này, threshold của mỗi model được chọn bằng cách tối ưu F1 trên OOF TRAIN.  
- **Không dùng để xếp hạng mô hình**, chỉ dùng sau khi đã chọn được mô hình tốt.

---

**Nguyên tắc xếp hạng mô hình trong pipeline**

Thứ tự ưu tiên khi so sánh các mô hình (baseline và sau tuning):
1. ROC-AUC  
2. PR-AUC  
3. Recall  

Các metric còn lại được dùng để tham khảo và phân tích sâu hơn.


In [63]:
# ===============================
# BASELINE SUMMARY + TOP MODELS
# ===============================
# Builds the baseline comparison table from `baseline_results`, ranks the
# models, and exposes the five best "OK" models as `top5_models` /
# `TOP5_MODEL_NAMES` for the tuning step.

# Collect every record appended by the per-model cells.
baseline_df = pd.DataFrame(baseline_results)

# Canonical column order of the summary table.
cols = [
    "model",
    "status",
    "roc_auc",      # primary ranking metric
    "pr_auc",
    "recall",
    "precision",
    "f1",
    "specificity",
    "bal_acc",
    "brier",
    "thr",
]

# reindex enforces the column order and creates any missing column as NaN
# (e.g. metric columns when only "Skipped" records exist).
baseline_df = baseline_df.reindex(columns=cols)

# Re-running a model cell appends another record for the same model to
# `baseline_results`. Keep only the most recent record per model so that a
# model cannot occupy two rows of the ranking or appear twice in
# TOP5_MODEL_NAMES.
baseline_df = baseline_df.drop_duplicates(subset="model", keep="last")

# Ranking order:
# 1) rows with status == "OK" first (explicit flag, so the result does not
#    depend on the alphabetical order of the status strings)
# 2) ROC-AUC, descending  — primary metric
# 3) PR-AUC, descending   — positive class
# 4) Recall, descending   — limit missed positive cases
baseline_df = (
    baseline_df
    .assign(_not_ok=baseline_df["status"].ne("OK"))
    .sort_values(
        by=["_not_ok", "roc_auc", "pr_auc", "recall"],
        ascending=[True, False, False, False],
    )
    .drop(columns="_not_ok")
)

# Show the full baseline table.
print("=== BASELINE RESULTS (OOF-CV) ===")
display(baseline_df.round(4))

# ===============================
# SELECT TOP 5 MODELS FOR TUNING
# ===============================

# Only successfully evaluated models are eligible; the table is already ranked.
top5_models = baseline_df.query("status == 'OK'").head(5)

print("=== TOP 5 MODELS FOR TUNING ===")
display(top5_models.round(4))

# Model names consumed by the tuning step.
TOP5_MODEL_NAMES = top5_models["model"].tolist()
print("Top 5 model names:", TOP5_MODEL_NAMES)


=== BASELINE RESULTS (OOF-CV) ===


Unnamed: 0,model,status,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier,thr
6,ET,OK,0.9462,0.8565,0.6914,0.8616,0.7672,0.9799,0.8356,0.0544,0.3375
5,RF,OK,0.9381,0.8435,0.7175,0.8043,0.7584,0.9684,0.8429,0.0659,0.28
11,LGBM,OK,0.8879,0.7256,0.6524,0.7018,0.6762,0.9498,0.8011,0.0755,0.2468
9,HGB,OK,0.8621,0.649,0.5965,0.6308,0.6131,0.9367,0.7666,0.0863,0.2682
10,XGB,OK,0.7772,0.478,0.5046,0.4318,0.4654,0.8797,0.6921,0.106,0.2393
2,KNN,OK,0.7716,0.3552,0.536,0.361,0.4314,0.828,0.682,0.1127,0.2667
3,SVC,OK,0.7713,0.372,0.526,0.4003,0.4547,0.8572,0.6916,0.1119,0.26
8,GB,OK,0.753,0.4365,0.5482,0.3503,0.4275,0.8157,0.682,0.1101,0.1987
4,DT,OK,0.7262,0.3646,0.5306,0.5513,0.5408,0.9217,0.7262,0.1383,1.0
0,LR,OK,0.7216,0.3419,0.6133,0.2822,0.3866,0.7173,0.6653,0.2119,0.5323


=== TOP 5 MODELS FOR TUNING ===


Unnamed: 0,model,status,roc_auc,pr_auc,recall,precision,f1,specificity,bal_acc,brier,thr
6,ET,OK,0.9462,0.8565,0.6914,0.8616,0.7672,0.9799,0.8356,0.0544,0.3375
5,RF,OK,0.9381,0.8435,0.7175,0.8043,0.7584,0.9684,0.8429,0.0659,0.28
11,LGBM,OK,0.8879,0.7256,0.6524,0.7018,0.6762,0.9498,0.8011,0.0755,0.2468
9,HGB,OK,0.8621,0.649,0.5965,0.6308,0.6131,0.9367,0.7666,0.0863,0.2682
10,XGB,OK,0.7772,0.478,0.5046,0.4318,0.4654,0.8797,0.6921,0.106,0.2393


Top 5 model names: ['ET', 'RF', 'LGBM', 'HGB', 'XGB']
