##### 📊 **Customer Churn Prediction (Python, scikit-learn, XGBoost)**

* Built classification models to predict customer churn using logistic regression and XGBoost.
* Achieved ~80% accuracy under 5-fold stratified cross-validation (logistic regression 80.5%, XGBoost 78.8%) with L1-based feature selection.
* Created detailed EDA dashboards and model evaluation reports.

In [1]:
# Pipeline-based churn classification: logistic regression and XGBoost models
# Input data: e.g. Customer-Churn.csv
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Read the raw churn dataset from disk.
df = pd.read_csv("Customer-Churn.csv")

# TotalCharges: entries that cannot be parsed as numbers are coerced to NaN,
# then every NaN is replaced with 0 so the column is fully numeric.
total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = total_charges.fillna(0)

# Remove the customerID column before any features are derived.
df = df.drop(columns="customerID")

# Binary target: "No" -> 0, "Yes" -> 1. The encoded column stays in df and is
# also exposed separately as y.
churn_codes = {"No": 0, "Yes": 1}
df["Churn"] = df["Churn"].map(churn_codes)
y = df["Churn"]

# Ordinal features have a meaningful order; each inner list of
# ordinal_mapping gives the categories of the matching feature, lowest first.
ordinal_features = ["Contract"]
ordinal_mapping = [["Month-to-month", "One year", "Two year"]]  # low to high commitment

# Work on the feature columns only: the target is excluded by name, so it can
# never end up in a feature list regardless of its dtype.
feature_frame = df.drop(columns="Churn", errors="ignore")

# Numerical columns, selected by the generic "number" kind instead of the
# literal int64/float64 names so other numeric widths are not missed.
num_cols = feature_frame.select_dtypes(include="number").columns.tolist()

# All categorical features: every non-numeric feature column (object, string,
# category, bool, ...). Taking the complement of the numeric selection means
# each feature is routed to exactly one transformer below instead of silently
# falling through to the ColumnTransformer remainder.
cat_cols = feature_frame.select_dtypes(exclude="number").columns.tolist()

# Nominal (non-ordinal) features: categorical columns without a declared order.
nominal_features = [col for col in cat_cols if col not in ordinal_features]

# Column transformer for preprocessing:
#   num -> standardised, ord -> integer codes following ordinal_mapping,
#   nom -> one-hot encoded with the first level of each feature dropped.
# remainder="drop" (the default, spelled out here) discards any column not
# listed above.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("ord", OrdinalEncoder(categories=ordinal_mapping), ordinal_features),
        ("nom", OneHotEncoder(drop="first", handle_unknown="ignore"), nominal_features),
    ],
    remainder="drop",
)

def _build_pipeline(classifier):
    """Return a Pipeline: preprocessing -> L1 feature selection -> ``classifier``.

    Every call clones the module-level ``preprocessor`` and creates a new
    selector, so each pipeline owns its own transformer instances. Sharing one
    instance between pipelines would let a direct ``fit`` on one pipeline
    overwrite the fitted state used by the other.

    Parameters
    ----------
    classifier : estimator
        Unfitted final estimator placed in the "classifier" step.

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps "preprocessor", "feature_selection"
        and "classifier".
    """
    from sklearn.base import clone  # local import: only this helper needs it

    selector = SelectFromModel(
        # L1-penalised logistic regression is the model SelectFromModel
        # bases its feature selection on. random_state seeds liblinear's data
        # shuffling so the selection is repeatable, matching the seed used
        # elsewhere in this script.
        LogisticRegression(
            penalty="l1", solver="liblinear", max_iter=1000, random_state=42
        )
    )
    return Pipeline(
        steps=[
            ("preprocessor", clone(preprocessor)),
            ("feature_selection", selector),
            ("classifier", classifier),
        ]
    )


# Logistic Regression pipeline with feature selection
lr_pipeline = _build_pipeline(LogisticRegression(max_iter=1000))

# XGBoost pipeline (same preprocessing and feature selection recipe)
xgb_pipeline = _build_pipeline(
    XGBClassifier(eval_metric="logloss", random_state=42)
)

# Stratified K-Fold CV: 5 shuffled folds with a fixed seed, so both models are
# scored on identical splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Feature matrix without the target. Passing the full frame would keep the
# label column inside X, so leakage would be prevented only by the column
# transformer happening to drop unlisted columns.
X = df.drop(columns="Churn", errors="ignore")

# Evaluate Logistic Regression
lr_scores = cross_val_score(lr_pipeline, X, y, cv=cv, scoring="accuracy")
print(
    f"Logistic Regression CV Accuracy: {lr_scores.mean():.4f} ± {lr_scores.std():.4f}"
)

# Evaluate XGBoost
xgb_scores = cross_val_score(xgb_pipeline, X, y, cv=cv, scoring="accuracy")
print(f"XGBoost CV Accuracy: {xgb_scores.mean():.4f} ± {xgb_scores.std():.4f}")

Logistic Regression CV Accuracy: 0.8053 ± 0.0102
XGBoost CV Accuracy: 0.7876 ± 0.0109
