## Customer Churn Prediction — Train–Test Split, Feature Scaling, and Class Imbalance Handling

In [1]:
# 7.1 Read the feature-engineered churn table
import pandas as pd

# Output of the feature-engineering stage; one column ("Exited") is the target.
df_model = pd.read_csv(
    "../data/processed/churn_feature_engineered.csv"
)

# (rows, columns) of the loaded frame
df_model.shape

(10000, 18)

In [2]:
# 7.2 Split the table into the churn label (y) and the predictors (X)
y = df_model["Exited"]
X = df_model.drop(columns=["Exited"])

In [3]:
# 7.3 Stratified 80/20 train–test split
from sklearn.model_selection import train_test_split

# stratify=y keeps the churn ratio the same in both partitions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

# Shapes of the two feature partitions
X_train.shape, X_test.shape

((8000, 17), (2000, 17))

In [4]:
# 7.4 Sanity check: class proportions of the target in train and test
tuple(
    labels.value_counts(normalize=True)
    for labels in (y_train, y_test)
)

(Exited
 0    0.79625
 1    0.20375
 Name: proportion, dtype: float64,
 Exited
 0    0.7965
 1    0.2035
 Name: proportion, dtype: float64)

In [5]:
# 7.5 Persist the split so later steps reload exactly the same partitions
# Targets
y_train.to_csv(path_or_buf="../data/processed/y_train.csv", index=False)
y_test.to_csv(path_or_buf="../data/processed/y_test.csv", index=False)
# Features (still unscaled / unencoded at this point)
X_train.to_csv(path_or_buf="../data/processed/X_train_unscaled.csv", index=False)
X_test.to_csv(path_or_buf="../data/processed/X_test_unscaled.csv", index=False)

In [18]:
# STEP 8 — Feature Scaling (scaler fitted on TRAIN only)
# 8.1 Reload the unscaled train–test partitions written in step 7.5
import pandas as pd

# Targets: each file holds a single column, so squeeze() turns it into a Series.
y_train = pd.read_csv("../data/processed/y_train.csv").squeeze()
y_test = pd.read_csv("../data/processed/y_test.csv").squeeze()

# Feature matrices
X_train = pd.read_csv("../data/processed/X_train_unscaled.csv")
X_test = pd.read_csv("../data/processed/X_test_unscaled.csv")

X_train.shape, X_test.shape

((8000, 17), (2000, 17))

In [19]:
# 8.2 Collect the names of the non-numeric (object / category) columns,
# using the training partition as the reference
categorical_cols = list(
    X_train.select_dtypes(include=["object", "category"]).columns
)

categorical_cols

['Geography', 'Gender', 'TenureGroup', 'AgeGroup', 'CreditScoreGroup']

In [20]:
# 8.3 One-Hot Encode (TRAIN + TEST) with category levels learned from TRAIN
#
# Encoding each split independently with drop_first=True lets every split pick
# its own "first" level to drop. If the test split happens to lack the first
# level of a feature, a different reference level is dropped there, and a later
# column alignment with fill_value=0 would silently mis-encode those rows.
# Pinning the levels to the ones observed in TRAIN makes both splits share the
# same dummy columns and the same dropped reference level.
level_dtypes = {
    col: pd.CategoricalDtype(categories=sorted(X_train[col].dropna().unique()))
    for col in categorical_cols
}

# Levels are sorted, so the TRAIN encoding keeps the column names and order
# that a plain get_dummies call on the raw object columns produces.
X_train_encoded = pd.get_dummies(
    X_train.astype(level_dtypes),
    columns=categorical_cols,
    drop_first=True
)

# Test values never seen in TRAIN become missing under the pinned dtype and
# therefore encode as all-zero dummies for that feature.
X_test_encoded = pd.get_dummies(
    X_test.astype(level_dtypes),
    columns=categorical_cols,
    drop_first=True
)

In [21]:
# 8.4 Force the test matrix onto the training column layout
# join="left" keeps exactly the training columns; any column the test frame
# lacks is created and filled with 0, and test-only columns are dropped.
X_train_encoded, X_test_encoded = X_train_encoded.align(
    X_test_encoded, axis=1, join="left", fill_value=0
)

In [22]:
# 8.5 List the int64 / float64 columns of the encoded training matrix;
# these are the columns that get scaled in the next step
numeric_features = list(
    X_train_encoded.select_dtypes(include=["int64", "float64"]).columns
)

numeric_features

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary',
 'HasBalance',
 'BalanceSalaryRatio',
 'HighProductCount',
 'InactiveHighProducts']

In [23]:
# 8.6 Standardise numeric features (scaler fitted on TRAIN only, applied to both)
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, so the test
# partition never influences them.
scaler = StandardScaler().fit(X_train_encoded[numeric_features])

# Work on copies so the encoded frames stay untouched.
X_train_scaled = X_train_encoded.copy()
X_train_scaled[numeric_features] = scaler.transform(
    X_train_encoded[numeric_features]
)

X_test_scaled = X_test_encoded.copy()
X_test_scaled[numeric_features] = scaler.transform(
    X_test_encoded[numeric_features]
)

In [25]:
# 8.7 Mandatory Sanity Checks
# A notebook cell renders only its LAST expression, so the earlier checks are
# shown with display() (the Jupyter built-in) and enforced with assertions
# instead of being computed and silently discarded.

# Mean ≈ 0, Std ≈ 1 (train set)
scaled_train_stats = X_train_scaled[numeric_features].describe().loc[["mean", "std"]]
display(scaled_train_stats)
assert scaled_train_stats.loc["mean"].abs().max() < 1e-6, (
    "scaled training features are not centred at 0"
)

# Missing values (largest per-column null count in each split)
max_missing = (
    X_train_scaled.isnull().sum().max(),
    X_test_scaled.isnull().sum().max(),
)
assert max_missing == (0, 0), f"missing values after scaling: {max_missing}"

# Column consistency: both splits must expose the same features in the same order
assert list(X_train_scaled.columns) == list(X_test_scaled.columns), (
    "train and test feature columns differ"
)

# Shape consistency
X_train_scaled.shape, X_test_scaled.shape


((8000, 21), (2000, 21))

In [26]:
# 8.8 Write the encoded + scaled matrices to disk as the modelling input
X_train_scaled.to_csv(path_or_buf="../data/processed/X_train_scaled.csv", index=False)
X_test_scaled.to_csv(path_or_buf="../data/processed/X_test_scaled.csv", index=False)

In [27]:
# STEP 9 — Class Imbalance Handling (Churn-Focused)
# 9.1 Reload the scaled matrices and targets (train and test; only the
#     training partition is resampled below)
import pandas as pd

# Targets: single-column files, squeezed into Series.
y_train = pd.read_csv("../data/processed/y_train.csv").squeeze()
y_test = pd.read_csv("../data/processed/y_test.csv").squeeze()

# Scaled feature matrices
X_train_scaled = pd.read_csv("../data/processed/X_train_scaled.csv")
X_test_scaled = pd.read_csv("../data/processed/X_test_scaled.csv")

X_train_scaled.shape, y_train.shape

((8000, 21), (8000,))

In [28]:
# 9.2 Baseline class balance: share of each target value in the training labels
y_train.value_counts(
    normalize=True
)

Exited
0    0.79625
1    0.20375
Name: proportion, dtype: float64

In [29]:
# 9.3 Approach 1 — "balanced" class weights computed from the training labels
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# One weight per distinct label, returned in the order of np.unique(y_train).
class_weights = compute_class_weight(
    "balanced", classes=np.unique(y_train), y=y_train
)

# Map each label to its weight.
class_weight_dict = {
    label: weight
    for label, weight in zip(np.unique(y_train), class_weights)
}
class_weight_dict

{0: 0.6279434850863422, 1: 2.4539877300613497}

In [30]:
!pip install imbalanced-learn




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [31]:
# 9.4 Approach 2 — SMOTE over-sampling, applied to the TRAINING partition only
from imblearn.over_sampling import SMOTE

# sampling_strategy=0.5 requests a minority:majority ratio of 1:2 after
# resampling; random_state fixes the synthetic samples for repeatability.
# NOTE(review): plain SMOTE interpolates across every column, including the
# one-hot dummy and 0/1 flag columns — confirm whether a categorical-aware
# variant (e.g. SMOTENC) is more appropriate for this feature matrix.
smote = SMOTE(sampling_strategy=0.5, random_state=42)

X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Resampled shape and resulting class proportions
X_train_smote.shape, y_train_smote.value_counts(normalize=True)

((9555, 21),
 Exited
 0    0.666667
 1    0.333333
 Name: proportion, dtype: float64)

In [32]:
# 9.5 Show the test partition's shape and class proportions.
# This only displays the test set for comparison with the values seen before
# resampling; it does not by itself prove the absence of leakage.
(
    X_test_scaled.shape,
    y_test.value_counts(normalize=True),
)

((2000, 21),
 Exited
 0    0.7965
 1    0.2035
 Name: proportion, dtype: float64)

In [33]:
# 9.6 Persist the SMOTE-resampled training features and labels
y_train_smote.to_csv(path_or_buf="../data/processed/y_train_smote.csv", index=False)
X_train_smote.to_csv(path_or_buf="../data/processed/X_train_smote.csv", index=False)