# 6 — Đánh giá cuối trên TEST + Lưu model

Notebook này:
- Lấy các model tuned tốt nhất
- Đánh giá cuối cùng trên **TEST** (chỉ chạy 1 lần)
- Lưu model ra `saved_models/` dưới dạng `.joblib`

**Phụ thuộc:** đã chạy `app/5_tuning.ipynb`.


## Model tốt nhất sau tuning + đánh giá cuối trên TEST (chỉ chạy 1 lần)



In [74]:
# ==============================
# RF (TUNED) -> TEST (no plot)
# ==============================
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix
)

# Resolve the RF pipeline to evaluate. Lookup order: the tuned search object,
# then the tuned_pipes mapping, then the untuned baseline as a last resort.
if "search_rf" in globals():
    rf_pipe = search_rf.best_estimator_
elif "tuned_pipes" in globals() and "RF" in tuned_pipes:
    rf_pipe = tuned_pipes["RF"]
elif "rf_pipe_base" in globals():
    print("⚠️ No tuned RF found -> using baseline rf_pipe_base.")
    rf_pipe = rf_pipe_base
else:
    raise NameError("❌ RF pipeline not found. Run RF build/tuning cells first (search_rf or tuned_pipes).")

# Resolve the decision threshold. Lookup order: the RF row of
# report_after_oof, then thr_map["RF"], then an rf_thr left behind by an
# earlier cell, then the 0.5 default.
# The earlier rf_thr has to be captured BEFORE the name is reset: resetting
# first makes `"rf_thr" in globals()` always true, which passed None to
# float() and raised TypeError instead of ever reaching the 0.5 default.
_rf_thr_prev = globals().get("rf_thr")
rf_thr = None
if "report_after_oof" in globals():
    try:
        rf_thr = float(report_after_oof.loc[report_after_oof["model"] == "RF", "thr_used"].iloc[0])
    except (AttributeError, KeyError, IndexError, TypeError, ValueError):
        # Best effort: missing column, no RF row or a non-numeric value
        # simply means "try the next source".
        rf_thr = None
if rf_thr is None and "thr_map" in globals() and "RF" in thr_map:
    rf_thr = float(thr_map["RF"])
if rf_thr is None and _rf_thr_prev is not None:
    rf_thr = float(_rf_thr_prev)
if rf_thr is None:
    print("⚠️ No RF threshold found -> using default 0.5.")
    rf_thr = 0.5

# Fit on TRAIN only, then score TEST. The positive-class probability
# (column 1 of predict_proba) is cut at rf_thr to obtain hard labels.
rf_pipe.fit(X_train, y_train)
proba = rf_pipe.predict_proba(X_test)[:, 1]
pred = (proba >= rf_thr).astype(int)

# Ranking metrics use the probabilities; the others use the thresholded labels.
roc = roc_auc_score(y_test, proba)
pra = average_precision_score(y_test, proba)
acc = accuracy_score(y_test, pred)
pre = precision_score(y_test, pred, zero_division=0)
rec = recall_score(y_test, pred, zero_division=0)
f1 = f1_score(y_test, pred, zero_division=0)
cm = confusion_matrix(y_test, pred)

print(f"[RF TEST] thr={rf_thr:.4f} | AUC={roc:.4f} | PR-AUC={pra:.4f} | ACC={acc:.4f} | P={pre:.4f} | R={rec:.4f} | F1={f1:.4f}")
# Confusion matrix as a labelled table (rows = true class, columns = predicted).
display(pd.DataFrame(cm, index=["True0","True1"], columns=["Pred0","Pred1"]))


[RF TEST] thr=0.2768 | AUC=0.9655 | PR-AUC=0.9040 | ACC=0.9445 | P=0.8276 | R=0.8073 | F1=0.8173


Unnamed: 0,Pred0,Pred1
True0,1746,55
True1,63,264


In [None]:
# ==============================
# ET (TUNED) -> TEST (no plot)
# ==============================
import numpy as np
import pandas as pd
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix
)

# Resolve the ET pipeline to evaluate. Lookup order: the tuned search object
# (either spelling of its name), then the tuned_pipes mapping, then the
# untuned baseline as a last resort.
if "search_et" in globals():
    et_pipe = search_et.best_estimator_
elif "search_ET" in globals():
    et_pipe = search_ET.best_estimator_
elif "tuned_pipes" in globals() and "ET" in tuned_pipes:
    et_pipe = tuned_pipes["ET"]
elif "et_pipe_base" in globals():
    print("⚠️ No tuned ET found -> using baseline et_pipe_base.")
    et_pipe = et_pipe_base
else:
    raise NameError("❌ ET pipeline not found. Run ET build/tuning cells first (search_et or tuned_pipes).")

# Resolve the decision threshold. Lookup order: the ET row of
# report_after_oof, then thr_map["ET"], then an et_thr left behind by an
# earlier cell, then the 0.5 default.
# The earlier et_thr has to be captured BEFORE the name is reset: resetting
# first makes `"et_thr" in globals()` always true, which passed None to
# float() and raised TypeError instead of ever reaching the 0.5 default.
_et_thr_prev = globals().get("et_thr")
et_thr = None
if "report_after_oof" in globals():
    try:
        et_thr = float(report_after_oof.loc[report_after_oof["model"] == "ET", "thr_used"].iloc[0])
    except (AttributeError, KeyError, IndexError, TypeError, ValueError):
        # Best effort: missing column, no ET row or a non-numeric value
        # simply means "try the next source".
        et_thr = None
if et_thr is None and "thr_map" in globals() and "ET" in thr_map:
    et_thr = float(thr_map["ET"])
if et_thr is None and _et_thr_prev is not None:
    et_thr = float(_et_thr_prev)
if et_thr is None:
    print("⚠️ No ET threshold found -> using default 0.5.")
    et_thr = 0.5

# Fit on TRAIN only, then score TEST. The positive-class probability
# (column 1 of predict_proba) is cut at et_thr to obtain hard labels.
et_pipe.fit(X_train, y_train)
proba = et_pipe.predict_proba(X_test)[:, 1]
pred = (proba >= et_thr).astype(int)

# Ranking metrics use the probabilities; the others use the thresholded labels.
roc = roc_auc_score(y_test, proba)
pra = average_precision_score(y_test, proba)
acc = accuracy_score(y_test, pred)
pre = precision_score(y_test, pred, zero_division=0)
rec = recall_score(y_test, pred, zero_division=0)
f1 = f1_score(y_test, pred, zero_division=0)
cm = confusion_matrix(y_test, pred)

print(f"[ET TEST] thr={et_thr:.4f} | AUC={roc:.4f} | PR-AUC={pra:.4f} | ACC={acc:.4f} | P={pre:.4f} | R={rec:.4f} | F1={f1:.4f}")
# Confusion matrix as a labelled table (rows = true class, columns = predicted).
display(pd.DataFrame(cm, index=["True0","True1"], columns=["Pred0","Pred1"]))


[ET TEST] thr=0.1801 | AUC=0.9705 | PR-AUC=0.9114 | ACC=0.9023 | P=0.6258 | R=0.9052 | F1=0.7400


Unnamed: 0,Pred0,Pred1
True0,1624,177
True1,31,296


## Lưu model (portable path)

In [None]:
# SAVE 2 TUNED MODELS: RF + ET (portable path)
from pathlib import Path
import joblib

# Models are written to a "saved_models" folder under the current working
# directory; the folder (and any missing parents) is created on demand.
PROJECT_ROOT = Path.cwd()
out_dir = PROJECT_ROOT.joinpath("saved_models")
out_dir.mkdir(exist_ok=True, parents=True)

# Lấy pipeline tuned (ưu tiên search_* nếu có)
def _get_best_pipe(model_key: str):
    """Return the best available pipeline for ``model_key``.

    The key is matched case-insensitively against ``"RF"`` and ``"ET"``.
    Module-level names are probed in this order and the first hit wins:

    1. a search object (``search_rf`` for RF; ``search_et`` then
       ``search_ET`` for ET), whose ``best_estimator_`` is returned;
    2. the matching entry of the ``tuned_pipes`` mapping;
    3. the baseline pipeline (``rf_pipe_base`` / ``et_pipe_base``), after
       printing a warning.

    Raises:
        NameError: none of the candidates exists for a valid key.
        ValueError: ``model_key`` is neither RF nor ET.
    """
    scope = globals()
    label = model_key.upper()

    # Per-model configuration: which names to probe and what to report.
    if label == "RF":
        search_names = ("search_rf",)
        base_name = "rf_pipe_base"
        fallback_note = "⚠️ RF tuned not found -> using baseline rf_pipe_base"
        missing_note = "❌ RF pipeline not found (search_rf / tuned_pipes['RF'] / rf_pipe_base)."
    elif label == "ET":
        search_names = ("search_et", "search_ET")
        base_name = "et_pipe_base"
        fallback_note = "⚠️ ET tuned not found -> using baseline et_pipe_base"
        missing_note = "❌ ET pipeline not found (search_et / tuned_pipes['ET'] / et_pipe_base)."
    else:
        raise ValueError("model_key must be 'RF' or 'ET'")

    # 1) A search object takes precedence over everything else.
    for candidate in search_names:
        if candidate in scope:
            return scope[candidate].best_estimator_

    # 2) Next, the tuned_pipes mapping.
    if "tuned_pipes" in scope and label in scope["tuned_pipes"]:
        return scope["tuned_pipes"][label]

    # 3) Finally the baseline, with a visible warning.
    if base_name in scope:
        print(fallback_note)
        return scope[base_name]

    raise NameError(missing_note)

# Pick up the thresholds computed earlier in the notebook (rf_thr / et_thr).
# A name that is missing OR bound to None falls back to 0.5 with a warning;
# previously a None value reached float() at dump time and raised TypeError
# after the (possibly slow) refit had already run.
rf_thr = globals().get("rf_thr")
if rf_thr is None:
    print("⚠️ rf_thr not found -> saving RF with default threshold 0.5")
    rf_thr = 0.5
rf_thr = float(rf_thr)

et_thr = globals().get("et_thr")
if et_thr is None:
    print("⚠️ et_thr not found -> saving ET with default threshold 0.5")
    et_thr = 0.5
et_thr = float(et_thr)

rf_pipe = _get_best_pipe("RF")
et_pipe = _get_best_pipe("ET")

# (Recommended) Refit on the full TRAIN split for stability; TEST is never used.
if "X_train" in globals() and "y_train" in globals():
    rf_pipe.fit(X_train, y_train)
    et_pipe.fit(X_train, y_train)
else:
    # Make it visible that the pipelines are saved exactly as found.
    print("⚠️ X_train / y_train not found -> saving pipelines without refitting")

rf_path = out_dir / "CHD_RF_tuned.joblib"
et_path = out_dir / "CHD_ET_tuned.joblib"

# Each artifact bundles the pipeline with the threshold it should be used with.
joblib.dump({"name":"RF","pipeline":rf_pipe,"threshold":rf_thr}, rf_path)
joblib.dump({"name":"ET","pipeline":et_pipe,"threshold":et_thr}, et_path)

print("✅ Saved models to:")
print(" -", rf_path, f"(thr={rf_thr:.4f})")
print(" -", et_path, f"(thr={et_thr:.4f})")
