<a href="https://colab.research.google.com/github/dlytica-gcp/bootcamp-2025-sept/blob/main/MachineLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
# Step 1: model configuration
# Options (must be a key of MODEL_ZOO):
#   "random_forest", "gradient_boosting", "log_reg", "sgd_log"
model_name = "sgd_log"

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier

# Step 2: model zoo
# Maps a short model key to a configured (unfitted) scikit-learn classifier.
# Every entry is seeded with random_state=42.
MODEL_ZOO = {
    "random_forest": RandomForestClassifier(
        random_state=42,
        n_estimators=200,
    ),
    "gradient_boosting": GradientBoostingClassifier(
        random_state=42,
    ),
    "log_reg": LogisticRegression(
        random_state=42,
        max_iter=1000,
    ),
    # SGD with the logistic loss.
    "sgd_log": SGDClassifier(
        random_state=42,
        max_iter=1000,
        loss="log_loss",
    ),
}

# Step 3: load data
# Read the churn table from the local data directory.
DATA_DIR = Path("data")
df = pd.read_csv(DATA_DIR.joinpath("customer_churn.csv"))

# "churn" is the target column; every other column is a feature.
y = df["churn"]
X = df.drop(columns=["churn"])

# Partition the feature columns by dtype: numeric/boolean columns versus
# everything else (treated as categorical by the preprocessing step).
cat_cols = X.select_dtypes(exclude=["number", "bool"]).columns.tolist()
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()

# Step 4: data preprocessing
# Column-wise preprocessing shared by every pipeline:
#   - "num": scale numeric/boolean columns without centering (with_mean=False)
#   - "cat": one-hot encode the remaining columns; categories not seen during
#            fit are ignored at transform time instead of raising
# Columns not listed in either group are dropped.
preproc = ColumnTransformer(
    [
        (
            "num",
            StandardScaler(with_mean=False),
            num_cols,
        ),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore"),
            cat_cols,
        ),
    ],
    remainder="drop",
    sparse_threshold=0.3,
)

# Step 5: building the pipeline
def build_pipeline(name: str) -> Pipeline:
    """Return a preprocessing + classifier pipeline for the given model key.

    Args:
        name: Key of the classifier to use; must be present in ``MODEL_ZOO``.

    Returns:
        A ``Pipeline`` with two steps: ``"prep"`` (the module-level
        ``preproc`` transformer) and ``"clf"`` (the ``MODEL_ZOO`` entry).

    Raises:
        ValueError: If ``name`` is not a key of ``MODEL_ZOO``.
    """
    # Explicit raise instead of `assert`: assertions are stripped under
    # `python -O`, which would turn a bad key into an unhelpful KeyError.
    if name not in MODEL_ZOO:
        raise ValueError(f"unknown model '{name}', options: {list(MODEL_ZOO)}")
    return Pipeline(steps=[("prep", preproc), ("clf", MODEL_ZOO[name])])

# Step 6: train, evaluate and export
def run(model_key="random_forest", test_size=0.2, seed=42):
    """Train one model, print hold-out metrics, and export scored test rows.

    The module-level ``X`` / ``y`` are split with stratification on ``y``,
    the pipeline from ``build_pipeline(model_key)`` is fitted on the training
    part, and the test part is scored. Accuracy and ROC AUC are printed, and
    the test rows are written to ``DATA_DIR / "churn_scored.csv"`` with the
    extra columns ``churn_actual``, ``prob`` and ``predicted``.

    Args:
        model_key: Model key forwarded to ``build_pipeline``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        seed: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        None. Results are printed and written to disk.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=test_size, random_state=seed, stratify=y
    )

    pipe = build_pipeline(model_key)
    pipe.fit(X_tr, y_tr)

    if hasattr(pipe.named_steps["clf"], "predict_proba"):
        # Column 1 is taken as the positive-class probability.
        prob = pipe.predict_proba(X_te)[:, 1]
    else:
        # Fallback for classifiers that only expose decision_function.
        # The scores must come from the held-out FEATURES (X_te); the previous
        # call passed (y_te, pred), where `pred` was not yet defined.
        scores = pipe.decision_function(X_te)
        # Squash the raw margins into (0, 1) with a sigmoid. This is a
        # monotone transform, not a calibrated probability.
        prob = 1 / (1 + np.exp(-scores))

    # NOTE(review): thresholding to 0/1 assumes `churn` is encoded as 0/1.
    pred = (prob >= 0.5).astype(int)
    acc = accuracy_score(y_te, pred)
    auc = roc_auc_score(y_te, prob)

    print(f"Model: {model_key}")
    print(f"Accuracy: {acc:.3f} | ROC_AUC: {auc:.3f}")

    # Export the test features alongside actual label, score and prediction.
    out = X_te.copy()
    out["churn_actual"] = y_te.values
    out["prob"] = prob
    out["predicted"] = pred

    out_path = DATA_DIR / "churn_scored.csv"
    out.to_csv(out_path, index=False)
    print("saved:", out_path)


# Train, evaluate and export using the model selected in step 1.
run(model_key=model_name)

Model: sgd_log
Accuracy: 0.819 | ROC_AUC: 0.734
saved: data/churn_scored.csv
