In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

try:
    from xgboost import XGBClassifier
    XGB_OK = True
except Exception:
    XGB_OK = False
    print("Không tìm thấy xgboost, sẽ chạy các model còn lại.")

print("dang doc du lieu tu file bank.csv...")
file_id = "1rbbDfRI7uAWc3wb0bHBOXGdgiL6GQTVu"
url = f"https://drive.google.com/uc?id={file_id}"

print(f"Đang tải dữ liệu từ Google Drive (ID: {file_id})...")

try:
    df = pd.read_csv(url)
except pd.errors.ParserError:
    print("Dấu phẩy không được, đang thử dấu chấm phẩy (;)...")
    df = pd.read_csv(url, sep=";")

print("Đọc dữ liệu thành công!")
print("Data sample:")
print(df.head(), "\n")

Không tìm thấy xgboost, sẽ chạy các model còn lại.
dang doc du lieu tu file bank.csv...
Đang tải dữ liệu từ Google Drive (ID: 1rbbDfRI7uAWc3wb0bHBOXGdgiL6GQTVu)...
Đọc dữ liệu thành công!
Data sample:
   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  
0  cellular   19   oct        79         1     -1         0  unknown  no  
1  cellular   11   may       220         1    339         4  failure  no  
2  cellular   16   apr       185         1    330         1  failure  no  
3   unknown    3   jun       199         4     -1         0 

In [2]:
df["y"] = (df["y"].astype(str).str.lower() == "yes").astype(int)

num_cols = ["age", "balance", "day", "duration", "campaign", "pdays", "previous"]
cat_cols = ["job", "marital", "education", "default", "housing", "loan", "contact", "month", "poutcome"]

required_cols = num_cols + cat_cols + ["y"]
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Thiếu cột: {missing}")

X = df[num_cols + cat_cols]
y = df["y"]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ],
    remainder="drop"
)

In [3]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, n_jobs=None),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
}

if XGB_OK:
    models["XGBoost"] = XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
        eval_metric="logloss",
        use_label_encoder=False
    )

kf = KFold(n_splits=5, shuffle=True, random_state=42)

scorers = {
    "Accuracy": "accuracy",
    "Precision": make_scorer(precision_score),
    "Recall": make_scorer(recall_score),
    "F1": make_scorer(f1_score),
    "AUC": "roc_auc"
}

cv_results = {}

In [4]:
print("===== Bắt đầu huấn luyện và so sánh (5-Fold CV) =====")

for name, clf in models.items():
    pipe = Pipeline(steps=[("preprocess", preprocessor), ("clf", clf)])
    cv_scores = {}

    print(f"-> Đang chạy {name}...")
    for metric_name, scorer in scorers.items():
        try:
            scores = cross_val_score(pipe, X, y, cv=kf, scoring=scorer, n_jobs=-1)
            cv_scores[metric_name] = (scores.mean(), scores.std())
        except Exception as e:
            cv_scores[metric_name] = (np.nan, np.nan)
            print(f"{name} - lỗi khi tính {metric_name}: {e}")

    cv_results[name] = cv_scores

print("\n===== Kết quả 5-Fold CV (mean ± std) =====")
for name, metrics in cv_results.items():
    print(f"\n{name}:")
    for metric_name, (m, s) in metrics.items():
        if np.isnan(m):
            print(f"  {metric_name}: N/A")
        else:
            print(f"  {metric_name}: {m:.4f} ± {s:.4f}")

===== Bắt đầu huấn luyện và so sánh (5-Fold CV) =====
-> Đang chạy Logistic Regression...
-> Đang chạy Random Forest...
-> Đang chạy XGBoost...

===== Kết quả 5-Fold CV (mean ± std) =====

Logistic Regression:
  Accuracy: 0.8998 ± 0.0145
  Precision: 0.6392 ± 0.0535
  Recall: 0.3148 ± 0.0530
  F1: 0.4182 ± 0.0490
  AUC: 0.8911 ± 0.0098

Random Forest:
  Accuracy: 0.8960 ± 0.0151
  Precision: 0.6964 ± 0.1111
  Recall: 0.1845 ± 0.0252
  F1: 0.2907 ± 0.0374
  AUC: 0.9083 ± 0.0166

XGBoost:
  Accuracy: 0.8967 ± 0.0109
  Precision: 0.5929 ± 0.0850
  Recall: 0.3381 ± 0.0209
  F1: 0.4297 ± 0.0364
  AUC: 0.9101 ± 0.0129


In [None]:
def pick_best(cv_results):
    def get_metric(name, metric):
        val = cv_results[name].get(metric, (np.nan,))[0]
        return -1.0 if np.isnan(val) else val

    candidates = list(cv_results.keys())
    candidates.sort(
        key=lambda n: (
            get_metric(n, "F1"),
            get_metric(n, "AUC"),
            get_metric(n, "Accuracy")
        ),
        reverse=True
    )
    return candidates[0]

best_name = pick_best(cv_results)
print(f"\n Best model (by F1 → AUC → Accuracy): {best_name}")

best_clf = models[best_name]
best_pipeline = Pipeline(steps=[("preprocess", preprocessor), ("clf", best_clf)])
best_pipeline.fit(X, y)

out_path = "best_bank_model_fixed.pkl"
joblib.dump(best_pipeline, out_path)
print("✅ Saved:", out_path)

print(f"\n Đã lưu toàn bộ pipeline (preprocess + {best_name}) vào: {out_path}")

sample_payload = {
    "age": 30,
    "job": "management",
    "balance": 1500,
    "housing": "yes",
    "loan": "no",
    "contact": "cellular",
    "duration": 200,
    "campaign": 1,
    "poutcome": "success"
    # ... các cột khác
}
print("\nVí dụ input (JSON) cho App:")
print(sample_payload)


 Best model (by F1 → AUC → Accuracy): XGBoost

 Đã lưu toàn bộ pipeline (preprocess + XGBoost) vào: best_bank_model.pkl

Ví dụ input (JSON) cho App:
{'age': 30, 'job': 'management', 'balance': 1500, 'housing': 'yes', 'loan': 'no', 'contact': 'cellular', 'duration': 200, 'campaign': 1, 'poutcome': 'success'}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
