### Baseline Logistic Regression
- Data: Telco Clean (7,043 rows × 38 features)
- Cross‑validation: 5‑fold StratifiedKFold (shuffle=True, random_state=42)
- ROC‑AUC (mean ± std): **0.848 ± 0.011**
- PR‑AUC (mean ± std): **0.661 ± 0.015**
- Parameters: `class_weight='balanced', max_iter=1000`
- Model file: `models/logreg.pkl` (refit on the full training set after cross‑validation)


In [11]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay
from sklearn.model_selection import StratifiedKFold, cross_validate

# Project paths are derived from the current working directory: the project
# root is taken to be its parent, and the processed data lives beneath it.
THIS_NOTEBOOK = Path.cwd()
PROJECT_ROOT = THIS_NOTEBOOK.parent
DATA_DIR = PROJECT_ROOT.joinpath("data", "processed")

# Load the pickled training features (X) and target (y), in that order.
X, y = (joblib.load(DATA_DIR / fname) for fname in ("X_train.pkl", "y_train.pkl"))

# Baseline estimator and a shuffled, seeded, stratified 5-fold splitter.
clf = LogisticRegression(class_weight="balanced", max_iter=1000)
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Score every fold on both metrics; return_estimator=True additionally keeps
# the fitted per-fold models under cv_res["estimator"].
scoring = ["roc_auc", "average_precision"]
cv_res = cross_validate(
    clf,
    X,
    y,
    scoring=scoring,
    cv=cv,
    return_estimator=True,
)

# Report the mean of each metric across folds, rounded to three decimals.
for label, key in (
    ("ROC‑AUC  :", "test_roc_auc"),
    ("PR‑AUC   :", "test_average_precision"),
):
    print(label, cv_res[key].mean().round(3))

ROC‑AUC  : 0.848
PR‑AUC   : 0.661


In [15]:
import pathlib as pl  # alias kept only in case a later cell still refers to it

# Fold with the highest held-out ROC-AUC. Every fold estimator shares the same
# hyperparameters, so this only decides which one is used as the template for
# the final model; it does not change what that model learns.
best_idx = np.argmax(cv_res["test_roc_auc"])

# Fit an unfitted copy on all rows. Calling .fit() on the fold estimator itself
# would overwrite it in place, leaving cv_res["estimator"][best_idx] as a model
# that has already seen its own validation fold.
best_clf = clone(cv_res["estimator"][best_idx]).fit(X, y)

# parents=True lets the save succeed even if intermediate folders are missing.
model_dir = PROJECT_ROOT / "models"
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(best_clf, model_dir / "logreg.pkl")


['/Users/pc/churn-prediction-pipeline/models/logreg.pkl']

In [19]:
import numpy as np

# Summarise the per-fold scores as "mean ± std", each rounded to 3 decimals.
# ddof=0 means the spread is the population standard deviation over the folds.
roc_scores = cv_res["test_roc_auc"]
roc_mean, roc_std = roc_scores.mean().round(3), roc_scores.std(ddof=0).round(3)

pr_scores = cv_res["test_average_precision"]
pr_mean, pr_std = pr_scores.mean().round(3), pr_scores.std(ddof=0).round(3)

print(f"ROC‑AUC  : {roc_mean} ± {roc_std}")
print(f"PR‑AUC   : {pr_mean} ± {pr_std}")

ROC‑AUC  : 0.848 ± 0.011
PR‑AUC   : 0.661 ± 0.015
