# 03 — ML: pipeline + comparação de modelos

Objetivo: montar um pipeline robusto e comparar modelos com validação cruzada.

Tempo: ~25–30 min

## O que você vai fazer

1- Montar um **Pipeline** com pré-processamento (variáveis numéricas + categóricas)  
2- Comparar modelos com validação cruzada (AUC)  
3- Manter tudo reprodutível (mesma função, mesma base)

In [None]:
from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    RocCurveDisplay,
)
import matplotlib.pyplot as plt

def find_repo_root(start: Path | None = None) -> Path:
    """Locate the repository root by walking upward from a starting directory.

    A directory counts as the root when it contains both a ``README.md``
    entry and a ``data`` entry.

    Args:
        start: Directory to begin the search from. Defaults to the current
            working directory.

    Returns:
        The first matching directory among the resolved start directory and
        its ancestors (at most ten candidates are inspected). When none
        matches, the resolved current working directory is returned.
    """
    origin = (start or Path.cwd()).resolve()
    # The starting directory followed by its ancestors, capped at ten levels.
    candidates = [origin, *origin.parents][:10]
    for folder in candidates:
        has_readme = (folder / "README.md").exists()
        if has_readme and (folder / "data").exists():
            return folder
    return Path.cwd().resolve()

# Resolve the repository root once; every data path below is built from DATA.
root = find_repo_root()
DATA = root / "data"

In [None]:
def build_rfm_from_sample() -> pd.DataFrame:
    """Build the per-customer RFM feature table from the sample CSV files.

    Reads ``sales.csv`` and ``customers.csv`` from ``DATA / "sample"``,
    left-joins customers onto sales, aggregates one row per ``customer_id``,
    and adds ``recency_days`` and the binary ``is_vip`` label (customers whose
    total revenue is at or above the 80th percentile).

    The resulting table is also written to
    ``DATA / "processed" / "rfm_features.parquet"`` for later reuse.

    Returns:
        The feature table, one row per customer.
    """
    sample_dir = DATA / "sample"
    orders = pd.read_csv(sample_dir / "sales.csv")
    people = pd.read_csv(sample_dir / "customers.csv")
    orders["date"] = pd.to_datetime(orders["date"])
    people["signup_date"] = pd.to_datetime(people["signup_date"])

    merged = orders.merge(people, on="customer_id", how="left")
    # Reference date for recency: the day after the most recent sale.
    snapshot = merged["date"].max() + pd.Timedelta(days=1)

    # Output column -> (source column, aggregation).
    aggregations = {
        "last_purchase": ("date", "max"),
        "frequency": ("order_id", "nunique"),
        "monetary": ("revenue", "sum"),
        "avg_order_value": ("revenue", "mean"),
        "category_nunique": ("category", "nunique"),
        "region_nunique": ("region", "nunique"),
        "segment": ("segment", "first"),
        "signup_date": ("signup_date", "first"),
    }
    per_customer = merged.groupby("customer_id").agg(**aggregations)
    features = per_customer.reset_index()
    features["recency_days"] = (snapshot - features["last_purchase"]).dt.days

    # VIP label: total revenue at or above the 80th percentile.
    vip_cutoff = features["monetary"].quantile(0.80)
    features["is_vip"] = (features["monetary"] >= vip_cutoff).astype(int)

    # Persist for reuse.
    target_dir = DATA / "processed"
    target_dir.mkdir(parents=True, exist_ok=True)
    features.to_parquet(target_dir / "rfm_features.parquet", index=False)

    return features

def load_rfm() -> pd.DataFrame:
    """Return the RFM feature table, preferring the cached parquet file.

    Returns:
        The table read from ``DATA / "processed" / "rfm_features.parquet"``
        when that file exists; otherwise the table freshly built by
        ``build_rfm_from_sample()``.
    """
    cached = DATA / "processed" / "rfm_features.parquet"
    if not cached.exists():
        return build_rfm_from_sample()
    return pd.read_parquet(cached)

# Load the feature table (from the parquet cache when present) and preview it.
rfm = load_rfm()
rfm.head()

## Features numéricas + categóricas

In [None]:
target = "is_vip"
# "monetary" is deliberately NOT a feature: the is_vip label is computed
# directly from it (monetary >= 80th percentile), so including it would leak
# the target and make the cross-validated AUC trivially close to 1.0.
# NOTE(review): frequency and avg_order_value may still jointly approximate
# monetary if each order is a single row in the sales data -- confirm and
# consider dropping one of them as well.
num_features = ["recency_days", "frequency", "avg_order_value", "category_nunique", "region_nunique"]
cat_features = ["segment"]

X = rfm[num_features + cat_features].copy()
y = rfm[target].copy()

# Stratified hold-out split so both partitions keep the VIP class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

## Pipeline com ColumnTransformer

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Numeric columns are standardized; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
numeric_branch = Pipeline(steps=[("scaler", StandardScaler())])
categorical_branch = OneHotEncoder(handle_unknown="ignore")

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_branch, num_features),
        ("cat", categorical_branch, cat_features),
    ]
)

## Comparando modelos (AUC)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# Candidate classifiers, keyed by the short name used in the results table.
models = {
    "logreg": LogisticRegression(max_iter=400, class_weight="balanced", random_state=42),
    "rf": RandomForestClassifier(n_estimators=300, random_state=42, class_weight="balanced"),
}

# Same shuffled, stratified 5-fold splitter for every candidate.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = []
for model_name, estimator in models.items():
    candidate = Pipeline(steps=[("prep", preprocess), ("model", estimator)])
    fold_aucs = cross_val_score(
        candidate, X_train, y_train, cv=cv, scoring="roc_auc"
    )
    scores.append(
        {
            "model": model_name,
            "auc_mean": fold_aucs.mean(),
            "auc_std": fold_aucs.std(),
        }
    )

# Best mean AUC first.
scores_df = pd.DataFrame(scores)
scores_df = scores_df.sort_values("auc_mean", ascending=False)
display(scores_df)

## Treinando o melhor e avaliando no teste

In [None]:
# scores_df is sorted by mean AUC in descending order, so row 0 is the winner.
best_name = scores_df.iloc[0]["model"]
best_model = models[best_name]

# Refit the winning model on the full training split.
pipe = Pipeline([("prep", preprocess), ("model", best_model)])
pipe.fit(X_train, y_train)

# Probability of the positive class, thresholded at 0.5 for hard predictions.
proba = pipe.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

print("Best:", best_name)
# Built-in round() accepts both NumPy scalars and plain Python floats; the
# previous ``.round(4)`` method call raises AttributeError whenever the metric
# returns a plain float, which has no such method.
print("ROC AUC:", round(roc_auc_score(y_test, proba), 4))
print(classification_report(y_test, pred))
RocCurveDisplay.from_predictions(y_test, proba)
plt.show()

## Salvando artefatos do pipeline

In [None]:
import json
import joblib

# Persist the fitted pipeline and the cross-validation scores side by side.
out_dir = DATA / "output" / "ml"
out_dir.mkdir(parents=True, exist_ok=True)

model_path = out_dir / "model_pipeline_best.joblib"
scores_path = out_dir / "scores_cv.json"

joblib.dump(pipe, model_path)
scores_path.write_text(json.dumps(scores, indent=2), encoding="utf-8")

for saved_path in (model_path, scores_path):
    print("Salvo:", saved_path)

## Exercícios (10–15 min)

1- Adicione um terceiro modelo: `GradientBoostingClassifier`.  
2- Troque scoring por `average_precision` e compare.  
3- Plote a distribuição das probabilidades (`proba`) por classe real.