# Customer Churn Prediction — Part 2: Preprocessing & Feature Engineering

In this part, we will:
1. Load the cleaned dataset
2. Encode categorical variables
3. Scale numeric features
4. Handle class imbalance
5. Save the preprocessed data for modeling

In [1]:
# NOTE: `imblearn` is provided by the `imbalanced-learn` distribution
# (e.g. `pip install imbalanced-learn`).
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 1️⃣ Load the cleaned data
# Path is relative to the notebook's working directory.
cleaned_csv_path = "../data/customer_churn_cleaned.csv"
df = pd.read_csv(cleaned_csv_path)
print(f"Shape: {df.shape}")
# Last expression of the cell: renders a preview of the first rows.
df.head()

ModuleNotFoundError: No module named 'imblearn'

In [None]:
# 2️⃣ Identify categorical and numerical columns
# Object-dtype columns are treated as categorical, every other dtype as numeric.
cat_cols = df.select_dtypes(include='object').columns.tolist()

# The target 'Churn' is filtered out of the numeric feature list here.
# NOTE(review): if 'Churn' is stored as an object column it stays in cat_cols
# and is label-encoded together with the features — confirm this is intended.
num_cols = [
    name
    for name in df.select_dtypes(exclude='object').columns
    if name != 'Churn'
]

print("Categorical columns:", cat_cols)
print("Numeric columns:", num_cols)

In [None]:
# 3️⃣ Encode categorical columns using LabelEncoder
# One fitted encoder is kept per column so the integer codes can be mapped
# back to the original labels later (via `inverse_transform`).
label_encoders = {name: LabelEncoder().fit(df[name]) for name in cat_cols}

# Replace each categorical column in place with its integer codes.
for name, encoder in label_encoders.items():
    df[name] = encoder.transform(df[name])

print("✅ Encoded categorical columns.")
df.head()

In [None]:
# 4️⃣ Scale numeric columns using StandardScaler
# The scaler is fitted on every row of `df` and the numeric columns are
# overwritten in place with their standardized values.
# NOTE(review): fitting on the full dataset leaks statistics into any
# train/test split made later — confirm where the split happens.
scaler = StandardScaler()
numeric_block = df[num_cols]
scaler.fit(numeric_block)
df[num_cols] = scaler.transform(numeric_block)

print("✅ Scaled numerical columns.")
df.head()

In [None]:
# 5️⃣ Handle class imbalance using SMOTE
# Split the frame into the feature matrix and the 'Churn' target.
X = df.drop(columns='Churn')
y = df['Churn']

print("Before SMOTE:")
print(y.value_counts())

# Column positions of the label-encoded categorical features inside X.
# The membership check skips the target in case 'Churn' is listed in cat_cols.
cat_idx = [X.columns.get_loc(name) for name in cat_cols if name in X.columns]

if 0 < len(cat_idx) < X.shape[1]:
    # Plain SMOTE interpolates between neighbours, which would turn integer
    # category codes into meaningless fractional values. SMOTENC interpolates
    # only the continuous features and assigns each categorical feature the
    # most frequent category among the nearest neighbours.
    sm = SMOTENC(categorical_features=cat_idx, random_state=42)
else:
    # SMOTENC requires a mix of categorical and continuous features; with no
    # categorical columns plain SMOTE is the correct choice.
    sm = SMOTE(random_state=42)

# NOTE(review): resampling is applied to the whole dataset here; if a
# train/test split is made afterwards, synthetic rows can end up in the test
# set — consider resampling the training portion only.
X_res, y_res = sm.fit_resample(X, y)

print("\nAfter SMOTE:")
print(pd.Series(y_res).value_counts())

In [None]:
# 6️⃣ Combine back into a single DataFrame
# Rebuild the feature frame with the original column names and append the
# resampled target as the 'Churn' column.
features_resampled = pd.DataFrame(X_res, columns=X.columns)
target_resampled = pd.Series(y_res, name='Churn')
df_resampled = pd.concat([features_resampled, target_resampled], axis=1)

# 7️⃣ Visualize new target distribution
# One bar per class of the resampled target.
sns.countplot(data=df_resampled, x='Churn')
plt.title('Balanced Target Distribution (After SMOTE)')
plt.show()

In [None]:
# 8️⃣ Save the preprocessed dataset
# The row index is not written, so the CSV contains only the feature columns
# and 'Churn'.
preprocessed_csv_path = "../data/customer_churn_preprocessed.csv"
df_resampled.to_csv(preprocessed_csv_path, index=False)
print(f"✅ Saved preprocessed dataset as '{preprocessed_csv_path}'")