# Customer Churn Prediction_Feature Engineering

In [1]:
# Import libraries & load data from EDA step
import pandas as pd
import numpy as np

# Load EDA-cleaned dataset
df = pd.read_csv("../data/processed/churn_eda_clean.csv")

# Drop identifiers (no predictive value)
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"], errors="ignore")

In [2]:
# STEP 6: Feature Engineering (Bank Customer Churn)
#6.1: Create a Working Copy from EDA Data
df_fe = df.copy()

In [3]:
# 6.2: Tenure-Based Risk Buckets (In Business Logic: Customers are most vulnerable in early tenure.)
df_fe["TenureGroup"] = pd.cut(
    df_fe["Tenure"],
    bins=[-1, 12, 36, 60, 100],
    labels=["≤1yr", "1-3yr", "3-5yr", "5yr+"] 
)    

In [4]:
# 6.3 Age Risk Buckets (Very Common in Banking)
df_fe["AgeGroup"] = pd.cut(
    df_fe["Age"],
    bins=[17, 30, 45, 60, 100],
    labels=["18-30", "31-45", "46-60", "60+"]
)

In [5]:
# 6.4: Balance Utilization Indicator  
df_fe["HasBalance"] = (df_fe["Balance"] > 0).astype(int)

In [6]:
# 6.5 Financial Intensity Features
# Credit Utilization Proxy
df_fe["BalanceSalaryRatio"] = (
    df_fe["Balance"] / (df_fe["EstimatedSalary"] + 1)
)

In [7]:
# 6.6: Product Risk Flags
df_fe["HighProductCount"] = (df_fe["NumOfProducts"] >= 3).astype(int)

In [8]:
# 6.7: Engagement × Activity Interaction 
df_fe["InactiveHighProducts"] = (
    (df_fe["IsActiveMember"] == 0) &
    (df_fe["NumOfProducts"] >= 2)
).astype(int)

In [11]:
# 6.8: Credit Score Risk Buckets
df_fe["CreditScoreGroup"] = pd.cut(
    df_fe["CreditScore"],
    bins=[0, 580, 670, 740, 900],
    labels=["Poor", "Fair", "Good", "Excellent"],
    include_lowest=True
)

# Validate missing values AFTER creation
df_fe["CreditScoreGroup"].isna().sum()

0

In [12]:
# 6.9: Model Ready Encoding
categorical_features = [
    "Geography",
    "Gender",
    "TenureGroup",
    "CreditScoreGroup"
]
df_model = pd.get_dummies(
    df_fe,
    columns=categorical_features,
    drop_first=True
)

In [13]:
# 6.10: Validate Engineered Features
df_fe[
    [
        "Tenure",
        "TenureGroup",
        "Balance",
        "HasBalance",
        "NumOfProducts",
        "HighProductCount",
        "InactiveHighProducts",
        "CreditScore",
        "CreditScoreGroup"
    ]
].head(10)

Unnamed: 0,Tenure,TenureGroup,Balance,HasBalance,NumOfProducts,HighProductCount,InactiveHighProducts,CreditScore,CreditScoreGroup
0,2,≤1yr,0.0,0,1,0,0,619,Fair
1,1,≤1yr,83807.86,1,1,0,0,608,Fair
2,8,≤1yr,159660.8,1,3,1,1,502,Poor
3,1,≤1yr,0.0,0,2,0,1,699,Good
4,2,≤1yr,125510.82,1,1,0,0,850,Excellent
5,8,≤1yr,113755.78,1,2,0,1,645,Fair
6,7,≤1yr,0.0,0,2,0,0,822,Excellent
7,4,≤1yr,115046.74,1,4,1,1,376,Poor
8,4,≤1yr,142051.07,1,2,0,0,501,Poor
9,2,≤1yr,134603.88,1,1,0,0,684,Good


In [14]:
# 6.11 Final Sanity Checks
assert "Exited" in df_model.columns
df_model.isnull().sum().sort_values(ascending=False).head()
df_model.shape

(10000, 23)

In [16]:
# 6.12: Save Feature-Ready Dataset (Critical) 
# (Note: This file becomes the single source of truth for modeling.)
df_fe.to_csv(
    "../data/processed/churn_feature_engineered.csv",
    index=False
)

# Fully model-ready dataset
df_model.to_csv(
    "../data/processed/churn_model_ready.csv",
    index=False
)