<a href="https://colab.research.google.com/github/ezosamara/ds301-midterm-bank-marketing/blob/main/BankMarketing_Midterm_MoatazSamara.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings, numpy as np, pandas as pd, matplotlib.pyplot as plt
# Single seed for the notebook; NumPy's global generator is seeded with it here.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")


In [None]:
import warnings, numpy as np, pandas as pd, matplotlib.pyplot as plt
# The warning filter and RANDOM_STATE / NumPy seed are configured once in the
# first cell; they are intentionally not repeated here so the configuration
# lives in exactly one place.

# Load the (uploaded) semicolon-separated CSV
PATH = "/content/bank-full.csv"  # if you renamed or zipped, adjust
df = pd.read_csv(PATH, sep=';')
print(df.shape)
df.head(3)

In [None]:
# Map target y: yes/no -> 1/0
# Guarded so the cell can be re-run safely: once 'y' has been converted to a
# numeric dtype, the .str accessor would raise AttributeError.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = (df['y'].str.lower() == 'yes').astype(int)

# IMPORTANT: drop duration (target leakage per UCI)
if 'duration' in df.columns:
    df = df.drop(columns=['duration'])

# 'pdays' special "not contacted before" flag (999 or -1 depending on version)
# NOTE(review): only an indicator column is added here; the sentinel value
# itself is left in 'pdays' — confirm that is intended for the numeric features.
if 'pdays' in df.columns:
    never_vals = {999, -1}
    df['pdays_never'] = df['pdays'].isin(never_vals).astype(int)

# Share of positive ("yes") rows, then a transposed summary of the first 12 columns.
print("Class balance (1=yes):", df['y'].mean().round(3))
df.describe(include='all').T.head(12)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Separate the binary target from the feature matrix.
y = df['y'].copy()
X = df.drop(columns=['y'])

# Identify categorical vs numeric columns.
# A column is numeric when pandas reports a numeric dtype; everything else
# (object, pandas string, category, ...) is one-hot encoded. Testing only for
# dtype == 'object' would route string/category columns to StandardScaler.
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]

# One-hot encode categoricals (categories unseen at fit time are ignored at
# transform time) and standardize numeric columns; any other column is dropped.
pre = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols)
    ],
    remainder="drop"
)

# 75/25 split, stratified on the target and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)

len(cat_cols), len(num_cols), X_train.shape, X_test.shape


In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, roc_curve, confusion_matrix, classification_report)

# Shuffled, seeded 5-fold stratified splitter shared by every grid search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)


def fit_grid(pipe, params, name):
    """Grid-search ``pipe`` on the training split and report test-split metrics.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``cv``. Candidates are ranked by cross-validated ROC-AUC. Prints the
    best parameters, a classification report and the confusion matrix.

    Parameters
    ----------
    pipe : estimator handed to ``GridSearchCV``.
    params : parameter grid for the search.
    name : label used in the printed report and in the metrics row.

    Returns
    -------
    tuple
        ``(search, metrics, test_scores)``: the fitted ``GridSearchCV``, a dict
        with keys ``model``, ``accuracy``, ``precision``, ``recall``, ``f1``
        and ``roc_auc``, and the second column of ``predict_proba`` on the test
        split (``None`` when ``predict_proba`` is unavailable, in which case
        ``roc_auc`` is NaN).
    """
    search = GridSearchCV(pipe, params, scoring="roc_auc", cv=cv, n_jobs=-1)
    search.fit(X_train, y_train)

    test_pred = search.predict(X_test)
    if hasattr(search, "predict_proba"):
        test_scores = search.predict_proba(X_test)[:, 1]
    else:
        test_scores = None

    print(f"\n{name} — best params:", search.best_params_)
    print(classification_report(y_test, test_pred, digits=3))
    print("Confusion matrix:\n", confusion_matrix(y_test, test_pred))

    metrics = {
        "model": name,
        "accuracy": accuracy_score(y_test, test_pred),
        "precision": precision_score(y_test, test_pred, zero_division=0),
        "recall": recall_score(y_test, test_pred, zero_division=0),
        "f1": f1_score(y_test, test_pred, zero_division=0),
    }
    # ROC-AUC needs scores, not hard labels; fall back to NaN without them.
    if test_scores is None:
        metrics["roc_auc"] = np.nan
    else:
        metrics["roc_auc"] = roc_auc_score(y_test, test_scores)

    return search, metrics, test_scores


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Every model reuses the preprocessing step `pre` and is tuned through fit_grid.

# 1) Logistic Regression, with class_weight='balanced' for the class imbalance
pipe_lr = Pipeline(steps=[
    ("pre", pre),
    ("clf", LogisticRegression(class_weight='balanced', solver='liblinear', max_iter=4000)),
])
grid_lr = {"clf__C": [0.1, 1, 10]}
gs_lr, res_lr, proba_lr = fit_grid(pipe_lr, grid_lr, "Logistic Regression")

# 2) KNN: has no class_weight option, so distance weighting is searched instead
pipe_knn = Pipeline(steps=[
    ("pre", pre),
    ("clf", KNeighborsClassifier()),
])
grid_knn = {
    "clf__n_neighbors": [k for k in range(5, 31, 2)],  # odd k from 5 to 29
    "clf__weights": ["uniform", "distance"],
    "clf__p": [1, 2],
}
gs_knn, res_knn, proba_knn = fit_grid(pipe_knn, grid_knn, "KNN")

# 3) Decision Tree, seeded and with class_weight='balanced' for the class imbalance
pipe_dt = Pipeline(steps=[
    ("pre", pre),
    ("clf", DecisionTreeClassifier(class_weight='balanced', random_state=RANDOM_STATE)),
])
grid_dt = {
    "clf__criterion": ["gini", "entropy"],
    "clf__max_depth": [4, 6, 8, 10, 12, None],
    "clf__min_samples_leaf": [1, 2, 4],
}
gs_dt, res_dt, proba_dt = fit_grid(pipe_dt, grid_dt, "Decision Tree")



Logistic Regression — best params: {'clf__C': 0.1}
              precision    recall  f1-score   support

           0      0.941     0.765     0.844      9981
           1      0.265     0.639     0.375      1322

    accuracy                          0.751     11303
   macro avg      0.603     0.702     0.610     11303
weighted avg      0.862     0.751     0.789     11303

Confusion matrix:
 [[7639 2342]
 [ 477  845]]


In [None]:
# ROC curves of the tuned models on the test split
def add_curve(proba, label):
    """Plot one ROC curve on the current pyplot axes, labelled with its AUC.

    Uses the module-level ``y_test`` as ground truth. Does nothing when
    ``proba`` is None.
    """
    if proba is not None:
        false_pos, true_pos, _thresholds = roc_curve(y_test, proba)
        area = roc_auc_score(y_test, proba)
        plt.plot(false_pos, true_pos, label=f"{label} (AUC={area:.3f})")


plt.figure(figsize=(7, 6))

add_curve(proba_lr, "Logistic Regression")
add_curve(proba_knn, "KNN")
add_curve(proba_dt, "Decision Tree")

# Dashed diagonal as the chance-level reference line.
plt.plot([0, 1], [0, 1], '--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC — Bank Marketing (Test Set)")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

# Summary table: one row per model, indexed by name, highest test ROC-AUC first
import pandas as pd
summary = (
    pd.DataFrame([res_lr, res_knn, res_dt])
    .set_index("model")
    .sort_values("roc_auc", ascending=False)
)
summary


### Observations — Moataz Samara
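- We dropped `duration` to avoid target leakage (per the UCI dataset notes); all three models share the same preprocessing and the same stratified train/test split, so the comparison is fair.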
- Because the dataset is imbalanced (few “yes”), accuracy alone is misleading, so ROC–AUC is used as the main metric; precision and recall on the “yes” class show the trade-offs.
- Logistic Regression with `class_weight` often leads on ROC–AUC (it is stable with many one-hot features). KNN benefits from distance weighting; the Decision Tree needs `max_depth` / `min_samples_leaf` constraints to limit overfitting, plus `class_weight` to avoid bias toward the majority class.
- Suggested improvements (not required now): threshold tuning for higher recall on “yes”, try class-weights vs. resampling (SMOTE/undersampling), and compare simple ensembles (Random Forest/Gradient Boosting) for robustness.
