In [1]:
# 1. Imports & Setup
import os
import sys

# Make the project root importable before pulling in project-local modules.
sys.path.append(os.path.abspath(".."))

import joblib
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from pipeline.preprocess import Preprocessor

# 2. Problem Statement
"""
Customer churn prediction using a scikit-learn pipeline.
Objective: Build a reusable, tunable, and exportable pipeline for binary classification.
"""

In [2]:
# 3. Load & Clean Data
# Mapping from the raw Churn labels to the binary target.
CHURN_LABELS = {'Yes': 1, 'No': 0}

# Keep the frame exactly as read from disk; every later frame is derived from it.
raw_df = pd.read_csv("../data/telco_churn.csv")

# TotalCharges is parsed with errors='coerce', so entries that cannot be read
# as numbers become NaN; those rows are then dropped.
df = (
    raw_df
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    .dropna(subset=['TotalCharges'])
)

# Series.map yields NaN for any value that is not a key of the dict, which
# would silently produce a float target containing NaN. Fail loudly instead.
unknown_labels = df.loc[~df['Churn'].isin(list(CHURN_LABELS)), 'Churn'].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Unexpected Churn labels: {unknown_labels.tolist()}")

df = df.assign(Churn=df['Churn'].map(CHURN_LABELS))

# Feature matrix (every column except the target) and the binary target.
X = df.drop(columns=['Churn'])
y = df['Churn']

In [3]:
# 4. Load Trained Pipeline
# joblib.load unpickles the file, which can execute arbitrary code: only load
# artifacts from a trusted source.
# NOTE(review): presumably the saved pipeline references
# pipeline.preprocess.Preprocessor, which would explain why that class is
# imported above although never used directly — confirm.
model_path = "../models/churn_pipeline_tuned.joblib"
pipeline = joblib.load(model_path)

In [4]:
# 5. Evaluate Final Model
# Hold out a stratified share of the rows for evaluation; the fixed seed makes
# the split reproducible across runs.
# NOTE(review): the pipeline is loaded already trained, so X_test is only a
# genuine held-out set if training used this same cleaning and this same split
# (test size, stratification and seed) — confirm against the training code.
TEST_SIZE = 0.2
SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=SEED,
)

In [5]:
# Predict on the evaluation split with the loaded pipeline, then print the
# per-class precision / recall / F1 table.
y_pred = pipeline.predict(X_test)
report_text = classification_report(y_test, y_pred)
print("Final Classification Report:")
print(report_text)


Final Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1033
           1       0.65      0.57      0.61       374

    accuracy                           0.80      1407
   macro avg       0.75      0.73      0.74      1407
weighted avg       0.80      0.80      0.80      1407



# 6. Summary & Insights
"""
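The final tuned pipeline reached 80% accuracy on the 1,407-row evaluation split. On the churn class (1) it scored 0.65 precision and 0.57 recall (F1 0.61), so it still misses roughly four in ten churners.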
It uses a full preprocessing + modeling flow with support for scaling and encoding.

Key Takeaways:
- Pipeline model can be reused in production
- Churn prediction helps optimize customer retention strategies
- Easily extensible with new models or features
"""