# Data Preprocessing and Feature Engineering

## Objective
This notebook focuses on transforming raw customer data into a clean, model-ready format.
The goal is to build a **robust, reusable preprocessing pipeline** that can be used
consistently during training, evaluation, and production inference.

### Key Outcomes
- Handle missing and inconsistent values
- Encode categorical variables
- Scale numerical features
- Build a production-grade preprocessing + modeling pipeline
- Persist artifacts for downstream phases


In [1]:
import sys
sys.path.append("../src")

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from data_audit import basic_data_audit

# Load dataset
df = pd.read_csv("../data/raw/telco_churn.csv")

# Convert TotalCharges to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(0)

# Data audit
audit = basic_data_audit(df)
audit

{'shape': (7043, 21),
 'missing_values': customerID          0
 gender              0
 SeniorCitizen       0
 Partner             0
 Dependents          0
 tenure              0
 PhoneService        0
 MultipleLines       0
 InternetService     0
 OnlineSecurity      0
 OnlineBackup        0
 DeviceProtection    0
 TechSupport         0
 StreamingTV         0
 StreamingMovies     0
 Contract            0
 PaperlessBilling    0
 PaymentMethod       0
 MonthlyCharges      0
 TotalCharges        0
 Churn               0
 dtype: int64,
 'duplicates': np.int64(0),
 'dtypes': customerID           object
 gender               object
 SeniorCitizen         int64
 Partner              object
 Dependents           object
 tenure                int64
 PhoneService         object
 MultipleLines        object
 InternetService      object
 OnlineSecurity       object
 OnlineBackup         object
 DeviceProtection     object
 TechSupport          object
 StreamingTV          object
 StreamingMovies  

In [2]:
# Encode target variable - Map Churn to 0/1
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

In [3]:
# Identify Features & Target
target = 'Churn'

# Drop identifiers
drop_cols = ['customerID']
X = df.drop(columns=[target] + drop_cols)
y = df[target]


In [4]:
#Feature Categorization
# Categorical features
cat_features = X.select_dtypes(include='object').columns.tolist()

# Numerical features
num_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()


In [5]:
# Handle Binary Columns
binary_cols = [
    'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies'
]

for col in binary_cols:
    X[col] = X[col].map({'No': 0, 'Yes': 1})


In [6]:
# Handle Multi-category Variables
multi_cat_features = [
    'gender', 'MultipleLines', 'InternetService', 'Contract', 'PaymentMethod'
]

# Replace 'No internet service' / 'No phone service' with 'No'
X.replace({'No internet service': 'No', 'No phone service': 'No'}, inplace=True)


In [7]:
# Split Dataset into Train/Test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [8]:
# Preprocessing Pipeline
# Preprocessing for numerical features
num_transformer = StandardScaler()

# Preprocessing for categorical features (UPDATED)
cat_transformer = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False
)

# Combine in ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, multi_cat_features)
    ]
)



In [9]:
# Create ML Pipeline
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42))
])

# Fit on training data
pipeline.fit(X_train, y_train)

# Evaluate
y_pred = pipeline.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.7955997161107168
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      1035
           1       0.64      0.52      0.58       374

    accuracy                           0.80      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409



In [10]:
import os

# Create models directory if it doesn't exist
os.makedirs("../models", exist_ok=True)


In [11]:
import joblib

joblib.dump(preprocessor, "../models/preprocessor.pkl")
joblib.dump(pipeline, "../models/churn_pipeline.pkl")


['../models/churn_pipeline.pkl']

In [13]:

# Ensure folder exists
os.makedirs("../data/processed", exist_ok=True)

# Save cleaned dataset used for modeling
df.to_csv("../data/processed/cleaned_telco.csv", index=False)
os.makedirs("../models", exist_ok=True)
joblib.dump(pipeline, "../models/churn_pipeline.pkl")
print("✅ cleaned_telco.csv saved successfully")


✅ cleaned_telco.csv saved successfully
