# Notebook 03 — Feature Engineering & Preprocessing Pipeline

## 03.1 Imports

In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


## 03.2 Load cleaned dataset

In [2]:
# Location of the cleaned dataset. Set the CHURN_DATA_PATH environment variable
# to run this notebook on another machine; the original author's path is kept
# as the fallback so existing runs behave exactly as before.
DEFAULT_DATA_PATH = r"C:\Users\farbo\OneDrive\Desktop\churn-analysis\data\telco_churn_clean.csv"
DATA_PATH = os.environ.get("CHURN_DATA_PATH", DEFAULT_DATA_PATH)
df = pd.read_csv(DATA_PATH)

print("Shape:", df.shape)
df.head()


Shape: (7043, 21)


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


## 03.3 Separate target and features

In [3]:
# Name of the label column; everything else in `df` is treated as a feature.
TARGET_COL = "churn"

# Pull out the label first, then build the feature frame without it.
# `drop` returns a new frame, so `df` itself is left unchanged.
y = df[TARGET_COL]
X = df.drop(columns=TARGET_COL)

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (7043, 20)
y shape: (7043,)


## 03.4 Identify ID column (exclude from modeling)

In [4]:
# Use the first candidate identifier name that actually exists in X,
# or None when none of them is present.
ID_COL = next(
    (name for name in ("customerid", "customer_id", "id") if name in X.columns),
    None,
)

ID_COL


'customerid'

In [5]:
# Remove the identifier so it cannot be picked up as a feature.
# The membership check keeps this cell safe to re-run: once the column has
# been dropped, X no longer contains it and a second drop would raise KeyError.
if ID_COL and ID_COL in X.columns:
    X = X.drop(columns=[ID_COL])
    print(f"Dropped ID column: {ID_COL}")


Dropped ID column: customerid


## 03.5 Identify numerical and categorical features

In [6]:
# Split the feature columns into two groups by dtype.
# "number" matches every integer/float width (not just int64/float64), and the
# complement catches object, string, category and bool columns alike. Together
# the two lists cover every column of X, so nothing is silently left out of the
# ColumnTransformer built later (whose default is to drop unlisted columns).
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)


Numeric features: ['seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges']
Categorical features: ['gender', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']


## 03.6 Stratified train/test split (VERY important)

In [7]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# churn rate nearly identical in both partitions, and the fixed seed makes the
# split repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

# The mean of the 0/1 target is the churn rate of each partition.
print("\nTrain churn rate:", y_train.mean())
print("Test churn rate:", y_test.mean())


Train shape: (5634, 19) (5634,)
Test shape: (1409, 19) (1409,)

Train churn rate: 0.2653532126375577
Test churn rate: 0.2654364797728886


## 03.7 Build preprocessing pipelines

In [8]:
# Numeric columns: fill gaps with the column median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(numeric_steps)


In [9]:
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array. Categories unseen during fit are ignored at
# transform time instead of raising.
#
# scikit-learn 1.2 renamed the encoder's `sparse` keyword to `sparse_output`
# and 1.4 removed the old name, so try the current spelling first and fall
# back to the legacy one on older installations.
try:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", onehot_encoder)
])


## 03.8 Combine pipelines using ColumnTransformer

In [10]:
# Route each column group through its own pipeline; the outputs are
# concatenated in the order listed here (numeric block first, then categorical).
column_transformers = [
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

preprocessor


## 03.9 Fit preprocessing on TRAIN only

In [11]:
# Learn imputation values, scaling statistics and category levels from the
# training rows only, then apply those fitted transforms to the test rows.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

for label, matrix in (
    ("Processed train shape:", X_train_processed),
    ("Processed test shape:", X_test_processed),
):
    print(label, matrix.shape)


Processed train shape: (5634, 45)
Processed test shape: (1409, 45)




## 03.10 (Optional) Get feature names after encoding

In [12]:
# Fetch the fitted one-hot encoder from inside the categorical pipeline and ask
# it for the expanded column names.
ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
cat_feature_names = ohe.get_feature_names_out(categorical_features)

# Numeric names come first because the "num" transformer is listed before "cat"
# in the ColumnTransformer, which fixes the column order of the output.
feature_names = [*numeric_features, *cat_feature_names]

# Sanity check: the two numbers should match.
len(feature_names), X_train_processed.shape[1]


(45, 45)

In [13]:
# Preview the first ten output column names.
feature_names[0:10]


['seniorcitizen',
 'tenure',
 'monthlycharges',
 'totalcharges',
 'gender_Female',
 'gender_Male',
 'partner_No',
 'partner_Yes',
 'dependents_No',
 'dependents_Yes']

## 03.11 Summary

### Preprocessing Summary

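- Dropped the `customerid` identifier so it cannot be used as a feature.
- Performed a stratified 80/20 train/test split to preserve the churn distribution (about 26.5% churn in both sets).
- Built separate pipelines for numerical and categorical features.
- Imputed missing values with the median (numerical) and the most frequent value (categorical).
- Applied scaling only to numerical variables.
- Used one-hot encoding for categorical variables; categories not seen during fitting are encoded as all zeros instead of raising an error.
- All preprocessing was fit exclusively on the training data to prevent data leakage.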
