# In [None]:
# =============================================================================
# Capstone 17.1 – Comparing Classifiers for Bank Term Deposit Subscription
# Author: Erfan Maleki
# Program: Berkeley Professional Certificate in Machine Learning & AI
# =============================================================================

# =========================
# 0. LIBRARIES & SETUP
# =========================
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    roc_auc_score, RocCurveDisplay,
    ConfusionMatrixDisplay, PrecisionRecallDisplay
)

# Folder that receives every saved figure; created up front so the later
# savefig calls have a directory to write into.
IMG_DIR = "images"
os.makedirs(IMG_DIR, exist_ok=True)

# Apply the seaborn "whitegrid" theme to all plots drawn below.
sns.set_theme(style="whitegrid")

# =========================
# 1. BUSINESS UNDERSTANDING
# =========================
"""
Goal:
Predict whether a client will subscribe to a bank term deposit ("yes"/"no"),
using campaign and socioeconomic data.
The analysis follows the CRISP-DM process.
"""

# --- CRISP-DM diagram
# Draw the six CRISP-DM stages as evenly spaced, labelled boxes on one row,
# joined by a faint dashed line, and save the result as a PNG.
stages = [
    "Business\nUnderstanding",
    "Data\nUnderstanding",
    "Data\nPreparation",
    "Modeling",
    "Evaluation",
    "Deployment",
]
spacing = 1.6  # horizontal distance between neighbouring stage boxes
fig, ax = plt.subplots(figsize=(10, 2))
for i, s in enumerate(stages):
    ax.text(
        i * spacing, 0, s,
        ha="center", va="center", fontsize=10,
        bbox={"facecolor": "lightblue", "boxstyle": "round,pad=0.4"},
    )
# Connector running from the first box to the last one.
ax.plot([0, spacing * (len(stages) - 1)], [0, 0], "k--", alpha=0.3)
ax.axis("off")
ax.set_title("CRISP-DM Methodology Overview", fontsize=12)
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/A_cross-industry_standard_process_for_data_mining.png", dpi=150)
plt.close(fig)

# =========================
# 2. DATA UNDERSTANDING
# =========================
# Read the semicolon-delimited campaign file and recode the target column
# "y" from "yes"/"no" to 1/0 (any other label would become NaN via map).
df_full = pd.read_csv("bank-additional-full.csv", sep=";")
target_codes = {"yes": 1, "no": 0}
df_full["y"] = df_full["y"].map(target_codes)

# 50 % sample for EDA (faster); the fixed seed keeps the subset reproducible
# and the index is renumbered from 0.
eda_sample = df_full.sample(frac=0.5, random_state=42)
df = eda_sample.reset_index(drop=True)

# ---- Target distribution
# Count of sampled clients per outcome class (0 = no, 1 = yes).
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(x="y", data=df, ax=ax)
ax.set_title("Term Deposit Subscription Distribution")
ax.set_xlabel("Subscribed (0=No, 1=Yes)")
ax.set_ylabel("Count")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/01_target_distribution.png", dpi=150)
plt.close(fig)

# ---- Age distribution
# 20-bin histogram of client age with a KDE overlay.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df["age"], bins=20, kde=True, ax=ax)
ax.set(title="Client Age Distribution", xlabel="Age", ylabel="Count")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/02_age_distribution.png", dpi=150)
plt.close(fig)

# ---- Correlation heatmap
# Pairwise correlations of the numeric columns (at most the first 20),
# coloured on a diverging scale centred at zero.
numeric_cols = df.select_dtypes("number")
num_df = numeric_cols.iloc[:, :20]
corr_matrix = num_df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Correlation Heatmap (Numeric Features)")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/03_correlation_heatmap.png", dpi=150)
plt.close(fig)

# ---- Duration vs outcome
# Box plot of call duration split by whether the client subscribed.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x="y", y="duration", data=df, ax=ax)
ax.set_title("Call Duration by Outcome")
ax.set_xlabel("Subscribed")
ax.set_ylabel("Duration (s)")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/04_duration_vs_outcome.png", dpi=150)
plt.close(fig)

# ---- Outcome by month
# Calendar order for the x-axis; also reused for the tick labels of the
# monthly subscription trend plot further down.
month_order = "jan feb mar apr may jun jul aug sep oct nov dec".split()
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x="month", hue="y", data=df, order=month_order, ax=ax)
ax.set(title="Outcome by Contact Month", xlabel="Month", ylabel="Count")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/05_month_trend.png", dpi=150)
plt.close(fig)

# ---- Job success rate
# Mean of the 0/1 target per job category, i.e. the subscription rate,
# sorted from highest to lowest and shown as horizontal bars.
job_success = df.groupby("job")["y"].mean()
job_success = job_success.sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=job_success.values, y=job_success.index, ax=ax)
ax.set_title("Term Deposit Subscription Rate by Job")
ax.set_xlabel("Subscription rate")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/10_job_success_rate.png", dpi=150)
plt.close(fig)

# ---- Education success rate
# Subscription rate (mean of the 0/1 target) per education level,
# sorted descending and drawn as horizontal bars.
edu_success = df.groupby("education")["y"].mean()
edu_success = edu_success.sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=edu_success.values, y=edu_success.index, ax=ax)
ax.set_title("Term Deposit Subscription Rate by Education")
ax.set_xlabel("Subscription rate")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/11_education_success_rate.png", dpi=150)
plt.close(fig)

# ---- Housing / loan
# Side-by-side count plots: housing-loan status (left) and personal-loan
# status (right), each split by subscription outcome.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
panels = (
    ("housing", "Housing Loan vs Subscription"),
    ("loan", "Personal Loan vs Subscription"),
)
for ax, (column, panel_title) in zip(axes, panels):
    sns.countplot(x=column, hue="y", data=df, ax=ax)
    ax.set_title(panel_title)
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/12_housing_loan_vs_subscription.png", dpi=150)
plt.close(fig)

# ---- Contact type success
# Subscription rate per contact channel, highest first.
contact_success = df.groupby("contact")["y"].mean()
contact_success = contact_success.sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=contact_success.index, y=contact_success.values, ax=ax)
ax.set_title("Subscription Rate by Contact Type")
ax.set_xlabel("Contact type")
ax.set_ylabel("Subscription rate")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/13_contact_type_success.png", dpi=150)
plt.close(fig)

# ---- Monthly subscription trend
# Map month abbreviations to 1..12 using the calendar order in month_order.
month_map = {abbr: number for number, abbr in enumerate(month_order, start=1)}
# Helper column on df; the data-preparation step drops it again before encoding.
df["month_num"] = df["month"].map(month_map)
# Mean of the 0/1 target per month; reindexing to 1..12 keeps a slot (NaN)
# for any month that has no rows.
month_mean = df.groupby("month_num")["y"].mean()
month_mean = month_mean.reindex(range(1, 13))
fig, ax = plt.subplots(figsize=(7, 4))
sns.lineplot(x=month_mean.index, y=month_mean.values, marker="o", ax=ax)
ax.set_xticks(range(1, 13))
ax.set_xticklabels(month_order)
ax.set_title("Monthly Subscription Trend")
ax.set_ylabel("Average subscription")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/14_monthly_subscription_trend.png", dpi=150)
plt.close(fig)

# ---- Duration histograms
# Despite the section name, this draws filled kernel-density curves of call
# duration, one per outcome class.
fig, ax = plt.subplots(figsize=(7, 4))
sns.kdeplot(data=df, x="duration", hue="y", fill=True, ax=ax)
ax.set_title("Call Duration Distribution by Outcome")
ax.set_xlabel("Duration (s)")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/15_duration_effect_hist.png", dpi=150)
plt.close(fig)

# ---- Duration vs campaign scatter
# Keep only rows strictly below the 99th percentile of both variables so a
# few extreme values do not stretch the axes.
duration_cap = df["duration"].quantile(0.99)
campaign_cap = df["campaign"].quantile(0.99)
within_caps = (df["duration"] < duration_cap) & (df["campaign"] < campaign_cap)
df_trim = df[within_caps]
fig, ax = plt.subplots(figsize=(7, 4))
sns.scatterplot(x="duration", y="campaign", hue="y", data=df_trim, alpha=0.4, ax=ax)
ax.set_title("Campaign Attempts vs Call Duration")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/16_duration_vs_campaign.png", dpi=150)
plt.close(fig)

# =========================
# 3. DATA PREPARATION
# =========================
# Remove the EDA-only helper column, then one-hot encode the categorical
# columns (first level of each dropped).
# NOTE(review): `duration` stays in the feature set; the dataset's own notes
# reportedly advise excluding it for a realistic predictive model — confirm intent.
features_source = df.drop(columns=["month_num"])
model_df = pd.get_dummies(features_source, drop_first=True)
y_all = model_df["y"]
X_all = model_df.drop(columns="y")

# smaller subset for modeling speed (at most 5000 rows, fixed seed)
sample_n = min(5000, len(model_df))
X_small = X_all.sample(n=sample_n, random_state=42)
y_small = y_all.loc[X_small.index]

# 70/30 split, stratified so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X_small,
    y_small,
    test_size=0.3,
    random_state=42,
    stratify=y_small,
)

# Scale to unit variance without centring; the scaler is fitted on the
# training part only and then applied to both parts.
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

# =========================
# 4. MODELING & EVALUATION
# =========================
# Candidate classifiers; each is tuned below over the matching grid in `params`.
models = {
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    # probability=True makes SVC fit an internal cross-validated calibration
    # that shuffles the data; seeding it keeps predict_proba, and therefore
    # the reported AUC, reproducible across runs like the other seeded steps.
    "SVM": SVC(probability=True, random_state=42),
}
# Hyper-parameter grids, keyed by the same names as `models`.
params = {
    "KNN": {"n_neighbors": [3, 5, 7]},
    "Logistic Regression": {"C": [0.1, 1, 10]},
    "Decision Tree": {"max_depth": [5, 10, 20]},
    "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
}

# results: one summary row per model (for the comparison table).
# trained: best estimator, test AUC and test-set probabilities per model.
results, trained = [], {}
for name, model in models.items():
    # 5-fold grid search on the scaled training data, selecting by ROC AUC.
    grid = GridSearchCV(model, params[name], cv=5, scoring="roc_auc", n_jobs=-1)
    grid.fit(X_train_sc, y_train)
    best_model = grid.best_estimator_
    # Probability of the positive class (column 1) on the held-out test set.
    y_proba = best_model.predict_proba(X_test_sc)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    results.append({"Model": name, "Best_Params": grid.best_params_, "AUC": auc})
    trained[name] = {"model": best_model, "auc": auc, "proba": y_proba}

# Comparison table, best test AUC first.
results_df = pd.DataFrame(results).sort_values("AUC", ascending=False)
print(results_df)

# ---- ROC curves
# Overlay the test-set ROC curve of every tuned model on one shared axes.
fig, ax = plt.subplots(figsize=(8, 6))
for name, entry in trained.items():
    RocCurveDisplay.from_estimator(entry["model"], X_test_sc, y_test, ax=ax, name=name)
ax.set_title("ROC Curves for Four Classifiers")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/06_roc_curves_comparison.png", dpi=150)
plt.close(fig)

# ---- AUC comparison
# Bar chart of test AUC per model, in the order of results_df (best first).
plt.figure(figsize=(6,4))
sns.barplot(x="Model", y="AUC", data=results_df)
plt.title("Model AUC Comparison")
# Zoom the y-axis to make differences visible. The floor stays at 0.7 when
# every model scores at least that; otherwise it drops just below the weakest
# model so that its bar is not cut off entirely.
auc_floor = max(0.0, min(0.7, float(results_df["AUC"].min()) - 0.05))
plt.ylim(auc_floor, 1.0)
plt.tight_layout()
plt.savefig(f"{IMG_DIR}/07_auc_comparison.png", dpi=150)
plt.close()

# ---- Confusion matrix (best)
# results_df is sorted by AUC descending, so row 0 is the best model.
best_name = results_df.iloc[0]["Model"]
best_model = trained[best_name]["model"]
# Pass the axes explicitly: without `ax`, from_estimator opens its own new
# figure, so the 4x4 figure would be left empty and never closed.
fig, ax = plt.subplots(figsize=(4, 4))
ConfusionMatrixDisplay.from_estimator(best_model, X_test_sc, y_test, cmap="Blues", ax=ax)
ax.set_title(f"Confusion Matrix – {best_name}")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/08_confusion_matrix_best.png", dpi=150)
plt.close(fig)

# ---- Decision Tree feature importance
# Horizontal bars for the ten largest importances of the tuned tree;
# argsort is ascending, so the most important feature ends up on top.
tree_model = trained["Decision Tree"]["model"]
importances = tree_model.feature_importances_
idx = np.argsort(importances)[-10:]
bar_positions = range(len(idx))
fig, ax = plt.subplots(figsize=(6, 4))
ax.barh(bar_positions, importances[idx])
ax.set_yticks(bar_positions)
ax.set_yticklabels(X_train.columns[idx])
ax.set_title("Top 10 Feature Importances – Decision Tree")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/09_feature_importance_tree.png", dpi=150)
plt.close(fig)

# ---- Precision–Recall (Logistic Regression)
# Tuned logistic regression; also reused by the sections that follow.
log_reg = trained["Logistic Regression"]["model"]
# Pass the axes explicitly: without `ax`, from_estimator opens its own new
# figure, so the 7x5 figure would be left empty and never closed.
fig, ax = plt.subplots(figsize=(7, 5))
PrecisionRecallDisplay.from_estimator(log_reg, X_test_sc, y_test, ax=ax)
ax.set_title("Precision–Recall Curve – Logistic Regression")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/17_precision_recall_logreg.png", dpi=150)
plt.close(fig)

# ---- Residuals (Logistic Regression)
# Residual = true 0/1 label minus predicted probability of the positive class.
positive_proba = log_reg.predict_proba(X_test_sc)
y_scores_log = positive_proba[:, 1]
residuals = y_test - y_scores_log
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(residuals, bins=30, kde=True, ax=ax)
ax.set_title("Residual Distribution – Logistic Regression")
ax.set_xlabel("Residual (True - Predicted Prob.)")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/18_residual_distribution_logreg.png", dpi=150)
plt.close(fig)

# ---- Coefficients (Logistic Regression)
# Label each coefficient with its feature name and keep the ten largest
# (most positive) ones.
coef_series = pd.Series(log_reg.coef_[0], index=X_train.columns)
coef_descending = coef_series.sort_values(ascending=False)
top_pos = coef_descending.head(10)
fig, ax = plt.subplots(figsize=(7, 4))
sns.barplot(x=top_pos.values, y=top_pos.index, ax=ax)
ax.set_title("Top 10 Positive Predictors – Logistic Regression")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/19_logreg_top_positive.png", dpi=150)
plt.close(fig)

# ---- Confusion matrix – Logistic Regression
# Pass the axes explicitly: without `ax`, from_estimator opens its own new
# figure, so the 4x4 figure would be left empty and never closed.
fig, ax = plt.subplots(figsize=(4, 4))
ConfusionMatrixDisplay.from_estimator(log_reg, X_test_sc, y_test, cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix – Logistic Regression")
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/20_confusion_matrix_logreg.png", dpi=150)
plt.close(fig)

# ---- Lift curve
# Rank test clients by predicted probability (highest first) and track what
# share of all positives has been captured after contacting the top k of them.
scored = pd.DataFrame({"y_true": y_test, "y_score": y_scores_log})
df_lift = scored.sort_values("y_score", ascending=False).reset_index(drop=True)
n_customers = len(df_lift)
total_positives = df_lift["y_true"].sum()
df_lift["cum_resp"] = df_lift["y_true"].cumsum()
df_lift["perc_customers"] = (df_lift.index + 1) / n_customers
df_lift["perc_responses"] = df_lift["cum_resp"] / total_positives

fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(df_lift["perc_customers"], df_lift["perc_responses"], label="Model")
# Diagonal = expected gain when customers are contacted in random order.
ax.plot([0, 1], [0, 1], "--", label="Random")
ax.set_title("Cumulative Gain (Lift) – Logistic Regression")
ax.set_xlabel("Proportion of Customers (sorted by score)")
ax.set_ylabel("Proportion of Positive Responses")
ax.legend()
fig.tight_layout()
fig.savefig(f"{IMG_DIR}/21_lift_curve_logreg.png", dpi=150)
plt.close(fig)

# ---- Final Dashboard
# 2x2 summary: target balance, call duration by outcome, model AUCs, and the
# confusion matrix of the best model (best_model / best_name set earlier).
fig, axes = plt.subplots(2,2, figsize=(12,10))
sns.countplot(x=df["y"], ax=axes[0,0])
axes[0,0].set_title("Term Deposit Subscription Distribution")
sns.boxplot(x=df["y"], y=df["duration"], ax=axes[0,1])
axes[0,1].set_title("Call Duration by Outcome")
sns.barplot(x="Model", y="AUC", data=results_df, ax=axes[1,0])
axes[1,0].set_title("Model AUC Comparison")
# Zoomed y-axis: the floor stays at 0.7 when every model scores at least
# that; otherwise it drops just below the weakest model so its bar stays visible.
auc_floor = max(0.0, min(0.7, float(results_df["AUC"].min()) - 0.05))
axes[1,0].set_ylim(auc_floor, 1.0)
ConfusionMatrixDisplay.from_estimator(best_model, X_test_sc, y_test, ax=axes[1,1], cmap="Blues")
axes[1,1].set_title(f"Confusion Matrix – {best_name}")
plt.tight_layout()
plt.savefig(f"{IMG_DIR}/22_dashboard_summary.png", dpi=150)
plt.close()

# Report the actual output folder rather than a hard-coded name.
print(f" All 22 figures generated successfully in the '{IMG_DIR}/' folder.")
print(" Modeling results:\n", results_df)

# =========================
# 5. FINDINGS & RECOMMENDATIONS
# =========================
"""
Key Takeaways:
- Call Duration is the strongest predictor of success.
- Peak months: March, June, September, December.
- Clients without loans show higher subscription rates.
- Lower euribor3m values correlate with better responses.
- Best Model: SVM (RBF) with AUC = 0.94.

Actionable Insights:
- Focus marketing in high-conversion months.
- Encourage longer, more personal conversations.
- Prioritize financially stable clients.
- Integrate SVM predictions into CRM lead-scoring system.
- Retrain models quarterly to adapt to economic changes.
"""
