In [None]:
# Using PIPELINE: build classification models that predict customer churn with logistic regression and XGBoost
# Dataset: Customer-Churn.csv
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Single seed for every stochastic component (CV shuffling, the liblinear
# feature selector, XGBoost) so that re-running the cell reproduces the scores.
SEED = 42

# Load data
df = pd.read_csv("Customer-Churn.csv")

# Clean TotalCharges: values that cannot be parsed as numbers become NaN and
# are then replaced by 0.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0)

# Drop customerID (not used as a feature); rebind instead of mutating in place
# so the cell stays idempotent.
df = df.drop(columns="customerID")

# Encode target; fail fast if a label falls outside the mapping, because
# .map() would otherwise turn it into NaN without any warning.
df["Churn"] = df["Churn"].map({"No": 0, "Yes": 1})
if df["Churn"].isna().any():
    raise ValueError("Churn contains values other than 'No'/'Yes'")
y = df["Churn"]

# Feature frame WITHOUT the target. The estimators only ever see X, so the
# label cannot reach a transformer even if the column lists below change or
# the ColumnTransformer is later switched to remainder="passthrough".
X = df.drop(columns="Churn")

# Define ordinal features (with a meaningful order)
ordinal_features = ["Contract"]
ordinal_mapping = [["Month-to-month", "One year", "Two year"]]  # low to high commitment

# All categorical features
cat_cols = X.select_dtypes(include="object").columns.tolist()

# Identify nominal (non-ordinal) features
nominal_features = [col for col in cat_cols if col not in ordinal_features]

# Numerical columns (taken from X, so the target is never among them)
num_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# Column transformer for preprocessing; columns not listed here are dropped
# (ColumnTransformer's default remainder="drop").
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features),
        ("ord", OrdinalEncoder(categories=ordinal_mapping), ordinal_features),
        ("nom", OneHotEncoder(drop="first", handle_unknown="ignore"), nominal_features),
    ]
)


def make_l1_selector(max_iter=200):
    """Return a fresh SelectFromModel backed by an L1-penalised logistic regression.

    Parameters
    ----------
    max_iter : int, default=200
        Iteration cap passed to the liblinear solver.

    Returns
    -------
    SelectFromModel
        Unfitted selector; a new instance is created on every call so the
        pipelines do not share estimator objects.
    """
    return SelectFromModel(
        LogisticRegression(
            penalty="l1",
            solver="liblinear",
            max_iter=max_iter,
            random_state=SEED,  # liblinear shuffles the data; seed it for reproducibility
        )
    )


# Logistic Regression pipeline with feature selection
lr_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("feature_selection", make_l1_selector()),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)

# XGBoost pipeline (with same preprocessor and feature selection)
xgb_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("feature_selection", make_l1_selector()),
        ("classifier", XGBClassifier(eval_metric="logloss", random_state=SEED)),
    ]
)

# Stratified K-Fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Evaluate Logistic Regression
lr_scores = cross_val_score(lr_pipeline, X, y, cv=cv, scoring="accuracy")
print(
    f"Logistic Regression CV Accuracy: {lr_scores.mean():.4f} ± {lr_scores.std():.4f}"
)

# Evaluate XGBoost
xgb_scores = cross_val_score(xgb_pipeline, X, y, cv=cv, scoring="accuracy")
print(f"XGBoost CV Accuracy: {xgb_scores.mean():.4f} ± {xgb_scores.std():.4f}")

Logistic Regression CV Accuracy: 0.8055 ± 0.0104
XGBoost CV Accuracy: 0.7876 ± 0.0109


In [274]:
# Using PIPELINE
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)

from xgboost import XGBClassifier

# Single seed for every stochastic component (CV shuffling, the liblinear
# feature selectors, XGBoost, the train/test split).
SEED = 42

df = pd.read_csv("Customer-Churn.csv")
# Column list recorded from ", ".join(df.columns):
# 'customerID, gender, SeniorCitizen, Partner, Dependents, tenure, PhoneService, MultipleLines, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection,
# TechSupport, StreamingTV, StreamingMovies, Contract, PaperlessBilling, PaymentMethod, MonthlyCharges, TotalCharges, Churn'

# Data Cleanup activities
# Rebind instead of inplace=True so the cell stays idempotent.
df = df.drop(columns="customerID")

# Only coerce TotalCharges to numeric here (unparseable entries become NaN).
# The mean imputation itself happens inside the pipeline, so the mean is
# learned from the training rows of each CV fold / split only; filling with the
# mean of the full dataset up front would leak test-fold information.
# NOTE: df["TotalCharges"] therefore still contains NaN after this cell.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Encode Target; fail fast if a label falls outside the mapping, because
# .map() would otherwise turn it into NaN without any warning.
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})
if df["Churn"].isna().any():
    raise ValueError("Churn contains values other than 'No'/'Yes'")
y = df["Churn"]

# Feature frame WITHOUT the target, so the label can never reach a transformer.
X = df.drop(columns="Churn")

# Classify Cols

ordinal_features = ["Contract"]
ordinal_mapping = [["Month-to-month", "One year", "Two year"]]

cat_cols = X.select_dtypes(include="object").columns.tolist()

nominal_features = [col for col in cat_cols if col not in ordinal_features]

# Taken from X, so the target is never among the numeric features.
num_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# Impute first, then scale: StandardScaler would pass NaN through unchanged.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Columns not listed here are dropped (ColumnTransformer's default remainder="drop").
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("ord", OrdinalEncoder(categories=ordinal_mapping), ordinal_features),
        ("nom", OneHotEncoder(drop="first", handle_unknown="ignore"), nominal_features),
    ]
)

lr_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "feature_selection",
            SelectFromModel(
                LogisticRegression(
                    penalty="l1",
                    solver="liblinear",
                    max_iter=100,
                    random_state=SEED,  # liblinear shuffles the data; seed it
                )
            ),
        ),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
)

xgb_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "feature_selection",
            SelectFromModel(
                LogisticRegression(
                    penalty="l1",
                    solver="liblinear",
                    max_iter=500,
                    random_state=SEED,  # liblinear shuffles the data; seed it
                ),
                # Keep only features whose importance is at least the mean importance.
                threshold="mean",
            ),
        ),
        ("classifier", XGBClassifier(eval_metric="logloss", random_state=SEED)),
    ]
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

lr_scores = cross_val_score(lr_pipeline, X, y, cv=cv, scoring="accuracy")
xgb_scores = cross_val_score(xgb_pipeline, X, y, cv=cv, scoring="accuracy")

print(f"LR Scores: {lr_scores.mean()}")
print(f"XGB Scores: {xgb_scores.mean()}")

# Hold-out evaluation of the XGBoost pipeline on a stratified 75/25 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=SEED
)

# Fitting happens on the training rows only, including the imputer and scaler.
xgb_pipeline.fit(X_train, y_train)
y_pred = xgb_pipeline.predict(X_test)
y_prob = xgb_pipeline.predict_proba(X_test)[:, 1]  # probability of class 1 (churn)

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

LR Scores: 0.8051933794115749
XGB Scores: 0.7827633032776309
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1294
           1       0.62      0.50      0.55       467

    accuracy                           0.79      1761
   macro avg       0.73      0.69      0.71      1761
weighted avg       0.78      0.79      0.78      1761

ROC AUC: 0.82599810027503
