# Bank Customer Churn Prediction

Predict, from a bank customer's profile and account activity, whether they will leave the bank (churn).

**Dataset:** [https://www.kaggle.com/datasets/gauravtopre/bank-customer-churn-dataset/data](https://www.kaggle.com/datasets/gauravtopre/bank-customer-churn-dataset/data)  
**Target:** `churn`  
**Type:** Imbalanced Binary Classification

> **TODO:** Download the dataset, place it in `../../data/raw/`, then update `DATA_PATH` and `TARGET` below.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    classification_report, roc_auc_score,
    roc_curve, ConfusionMatrixDisplay,
)
# Apply seaborn's 'whitegrid' theme to the plots drawn later in this notebook.
sns.set_theme(style='whitegrid')

## 1. Load Data

In [None]:
# Notebook-wide configuration: name of the label column and location of the raw CSV.
TARGET = "churn"  # TODO: verify column name
# TODO: update path after downloading from https://www.kaggle.com/datasets/gauravtopre/bank-customer-churn-dataset/data
DATA_PATH = "../../data/raw/bank_customer_churn.csv"

# Read the whole file into memory and report its dimensions.
df = pd.read_csv(DATA_PATH)
print(f'Shape: {df.shape}')

# Last expression of the cell: renders a preview of the first rows.
df.head()

## 2. Exploratory Data Analysis

In [None]:
# Structural overview: column dtypes, non-null counts and memory usage.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

# The 15 columns with the most missing values, worst first.
print('\nNull counts:')
print(df.isnull().sum().sort_values(ascending=False).head(15))

# Summary statistics for every column (numeric and non-numeric), transposed
# so that each row of the displayed table describes one column of df.
df.describe(include='all').T

In [None]:
# Class balance of the target: bar chart of absolute counts, then the
# relative frequencies printed below the figure.
target_counts = df[TARGET].value_counts()

fig, ax = plt.subplots()
target_counts.plot(kind='bar', ax=ax)
ax.set_title(f'Target distribution: {TARGET}')
ax.set_xlabel(TARGET)
ax.set_ylabel('Count')
plt.xticks(rotation=0)  # keep the class labels horizontal
plt.tight_layout()
plt.show()

target_share = df[TARGET].value_counts(normalize=True).round(3)
print(target_share)

In [None]:
# Pairwise correlation between the numeric columns, drawn as a heatmap.
# Skipped when fewer than two numeric columns exist, since there is then
# nothing to correlate.
num_df = df.select_dtypes(include='number')
if num_df.shape[1] > 1:
    corr_matrix = num_df.corr()
    plt.figure(figsize=(10, 6))
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', linewidths=0.5)
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()

## 3. Feature Engineering

In [None]:
# Row identifiers carry no predictive signal; left in X they would be treated
# as an ordinary numeric feature and invite overfitting to arbitrary IDs.
# The referenced Kaggle CSV is expected to expose one as `customer_id`
# (TODO: verify against the downloaded file). errors="ignore" makes the drop
# a no-op when the column is absent.
ID_COLS = ["customer_id"]

# Features are every remaining column except the target; y is the target itself.
X = df.drop(columns=[TARGET]).drop(columns=ID_COLS, errors="ignore")
y = df[TARGET]

# TODO: encode binary string targets if needed, e.g.:
# y = y.map({'Yes': 1, 'No': 0})

# Split the feature columns by dtype so each group can get its own
# preprocessing pipeline.
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
print('Numeric cols:', numeric_cols)
print('Categorical cols:', categorical_cols)

In [None]:
# Numeric features: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_cols),
        ('cat', categorical_pipeline, categorical_cols),
    ]
)

## 4. Train / Test Split

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts
print(f'Train: {X_train.shape}, Test: {X_test.shape}')

## 5. Model Training

In [None]:
# Local import keeps this cell self-contained.
from sklearn.base import clone

# Candidate classifiers. Logistic Regression and Random Forest use
# class_weight="balanced"; Gradient Boosting is constructed without a
# class-weight argument.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "Random Forest": RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# results maps model name -> fitted pipeline, hard predictions, positive-class
# probabilities and ROC-AUC, all computed on the held-out test set.
results = {}
for name, clf in models.items():
    # Pipeline does not copy its steps, so every pipeline gets its own unfitted
    # clone of the preprocessor. Reusing the single shared instance would make
    # all stored pipelines point at (and refit) the same transformer object.
    pipe = Pipeline([('preprocessor', clone(preprocessor)), ('clf', clf)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    # Column 1 of predict_proba is taken as the probability of the positive class.
    probs = pipe.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, probs)
    results[name] = {'pipe': pipe, 'preds': preds, 'probs': probs, 'roc_auc': auc}
    print(f'\n=== {name} ===')
    print(f'ROC-AUC: {auc:.4f}')
    print(classification_report(y_test, preds))

## 6. Evaluation

In [None]:
from sklearn.metrics import average_precision_score

# Pick the entry with the highest test-set ROC-AUC; on a tie the model that
# was inserted into `results` first wins.
best_name, best = max(results.items(), key=lambda item: item[1]['roc_auc'])
best_probs = best['probs']
print(f'Best model: {best_name}  ROC-AUC: {best["roc_auc"]:.4f}')

# Precision-Recall AUC (better for imbalanced data)
pr_auc = average_precision_score(y_test, best_probs)
print(f"PR-AUC: {pr_auc:.4f}")

In [None]:
# Confusion matrix of the best model's hard predictions on the test set.
ConfusionMatrixDisplay.from_predictions(y_test, best['preds'])
# The title uses a real em dash; the previous text was a mis-decoded UTF-8
# sequence that rendered as garbage characters in the figure.
plt.title(f'Confusion Matrix — {best_name}')
plt.show()

In [None]:
# ROC curve of every trained model on shared axes, with the chance diagonal
# drawn as a dashed reference line.
fig, ax = plt.subplots()
for name, res in results.items():
    fpr, tpr, _ = roc_curve(y_test, res['probs'])
    curve_label = f"{name} (AUC={res['roc_auc']:.3f})"
    ax.plot(fpr, tpr, label=curve_label)
ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
ax.set_title('ROC Curves')
plt.tight_layout()
plt.show()

In [None]:
# Feature importances (Random Forest)
# Pull the fitted forest and its preprocessor out of the stored pipeline so
# each importance can be labelled with the transformed feature name.
rf_pipe = results['Random Forest']['pipe']
rf_clf = rf_pipe.named_steps['clf']
rf_preprocessor = rf_pipe.named_steps['preprocessor']
feat_names = rf_preprocessor.get_feature_names_out()

importances = pd.Series(rf_clf.feature_importances_, index=feat_names)

# Keep the 15 largest, then sort ascending so the most important bar ends up
# at the top of the horizontal chart.
top_importances = importances.nlargest(15).sort_values()
top_importances.plot(kind='barh', figsize=(8, 5))
plt.title('Top 15 Feature Importances (Random Forest)')
plt.tight_layout()
plt.show()

## 7. Conclusion

| Model | ROC-AUC |
|---|---|
| *(fill after running)* | |

**Observations:**
- *(fill after running)*

**Next steps:**
- Hyperparameter tuning (GridSearchCV / RandomizedSearchCV)
- Try XGBoost / LightGBM
- Threshold optimisation for Precision/Recall trade-off