# 02 — ML: split, métricas e baseline (VIP)

Objetivo: criar um baseline rápido com métricas e artefatos salvos.

Tempo: ~25–30 min

## O que você vai fazer

1. Carregar (ou gerar) `rfm_features.parquet`
2. Definir features/target (`is_vip`)
3. Treinar um baseline simples e medir a qualidade (AUC, matriz de confusão, relatório de classificação)

In [None]:
from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    RocCurveDisplay,
)
import matplotlib.pyplot as plt

def find_repo_root(start: Path | None = None) -> Path:
    """Locate the repository root by walking up from *start*.

    A directory is accepted as the root when it contains both a
    ``README.md`` entry and a ``data`` entry. The starting directory and up
    to nine of its ancestors are inspected, nearest first.

    Args:
        start: Directory to begin the search from. Defaults to the current
            working directory.

    Returns:
        The first inspected directory that matches, or the resolved current
        working directory when none of them does.
    """
    origin = (start if start else Path.cwd()).resolve()
    # Ten candidates in total: the origin itself plus its nearest ancestors.
    for candidate in (origin, *origin.parents)[:10]:
        has_readme = (candidate / "README.md").exists()
        if has_readme and (candidate / "data").exists():
            return candidate
    # Nothing matched within the search depth: fall back to the working dir.
    return Path.cwd().resolve()

# Repository root, searched for once starting at the current working directory.
root = find_repo_root()
# Base directory for the data files this notebook reads and writes.
DATA = root / "data"

In [None]:
def build_rfm_from_sample() -> pd.DataFrame:
    """Build the per-customer RFM feature table from the sample CSV files.

    Reads ``sales.csv`` and ``customers.csv`` from ``DATA / "sample"``,
    left-joins customers onto sales by ``customer_id`` and aggregates one row
    per customer. ``recency_days`` is measured against the day after the
    latest sale date in the data. ``is_vip`` is 1 for customers whose
    ``monetary`` is at or above the 80th percentile, else 0.

    Side effect: the result is written to
    ``DATA / "processed" / "rfm_features.parquet"`` (directories are created
    as needed) so later runs can reuse it.

    Returns:
        The aggregated feature table, including ``recency_days`` and
        ``is_vip``.
    """
    sample_dir = DATA / "sample"
    orders = pd.read_csv(sample_dir / "sales.csv")
    clients = pd.read_csv(sample_dir / "customers.csv")
    orders["date"] = pd.to_datetime(orders["date"])
    clients["signup_date"] = pd.to_datetime(clients["signup_date"])

    joined = orders.merge(clients, on="customer_id", how="left")
    # Reference date: one day after the most recent sale in the data.
    reference_date = joined["date"].max() + pd.Timedelta(days=1)

    # Output column -> (source column, aggregation).
    # NOTE(review): avg_order_value averages over sales rows; if one order can
    # span several rows this is a per-row mean, not a per-order mean — confirm.
    aggregations = {
        "last_purchase": ("date", "max"),
        "frequency": ("order_id", "nunique"),
        "monetary": ("revenue", "sum"),
        "avg_order_value": ("revenue", "mean"),
        "category_nunique": ("category", "nunique"),
        "region_nunique": ("region", "nunique"),
        "segment": ("segment", "first"),
        "signup_date": ("signup_date", "first"),
    }
    table = joined.groupby("customer_id").agg(**aggregations).reset_index()
    table["recency_days"] = (reference_date - table["last_purchase"]).dt.days

    # Label: customers at or above the 80th percentile of total revenue.
    cutoff = table["monetary"].quantile(0.80)
    table["is_vip"] = (table["monetary"] >= cutoff).astype(int)

    # Save for reuse.
    target_dir = DATA / "processed"
    target_dir.mkdir(parents=True, exist_ok=True)
    table.to_parquet(target_dir / "rfm_features.parquet", index=False)

    return table

def load_rfm() -> pd.DataFrame:
    """Return the RFM feature table, preferring the cached parquet file.

    Reads ``DATA / "processed" / "rfm_features.parquet"`` when that file
    exists; otherwise delegates to ``build_rfm_from_sample``. A cached file is
    returned as-is, without any check of its columns.
    """
    cached = DATA.joinpath("processed", "rfm_features.parquet")
    if not cached.exists():
        return build_rfm_from_sample()
    return pd.read_parquet(cached)

# Load the cached feature table, or build it from the sample CSVs if absent.
rfm = load_rfm()
# Preview the first rows (rendered when this runs as a notebook cell).
rfm.head()

## Dataset de treino (features numéricas)

In [None]:
# Label column and the numeric input columns used by the model.
# NOTE(review): when the table comes from build_rfm_from_sample, is_vip is a
# threshold on monetary, and monetary is also a feature, so the label is
# recoverable from a single input — presumably intentional for the exercise
# that removes it; confirm.
target = "is_vip"
features = [
    "recency_days",
    "frequency",
    "monetary",
    "avg_order_value",
    "category_nunique",
    "region_nunique",
]

# Copies, so later changes to X / y do not touch rfm.
X, y = rfm[features].copy(), rfm[target].copy()

# 25% held out for testing; stratify=y asks for the split to follow the class
# labels in y, and the fixed seed makes it repeatable.
split = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split

print("train:", X_train.shape, "test:", X_test.shape)
vip_share_train = y_train.mean().round(3)
vip_share_test = y_test.mean().round(3)
print("VIP share (train):", vip_share_train, "VIP share (test):", vip_share_test)

## Baseline: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Baseline: standardize the features, then fit a logistic regression with
# class weights balanced and a fixed seed.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=200, class_weight="balanced", random_state=42),
)

model.fit(X_train, y_train)
# Second predict_proba column: with 0/1 labels this is the probability of
# class 1 (VIP).
proba = model.predict_proba(X_test)[:, 1]
# Hard predictions at a 0.5 probability threshold.
pred = (proba >= 0.5).astype(int)

# roc_auc_score is documented to return a float. The builtin round() works for
# both Python floats and NumPy scalars, whereas the .round() method exists
# only on NumPy scalars, so it is not relied upon here.
roc_auc = float(roc_auc_score(y_test, proba))
print("ROC AUC:", round(roc_auc, 4))
print(classification_report(y_test, pred))

cm = confusion_matrix(y_test, pred)
print("Confusion matrix:\n", cm)

RocCurveDisplay.from_predictions(y_test, proba)
plt.show()

## Salvando modelo + métricas (para portfólio)

In [None]:
import json
import joblib

# Persist the fitted pipeline and its ROC AUC under DATA/output/ml.
out_dir = DATA.joinpath("output", "ml")
out_dir.mkdir(parents=True, exist_ok=True)

model_path = out_dir / "model_baseline.joblib"
metrics_path = out_dir / "metrics_baseline.json"

joblib.dump(model, model_path)

# float() turns the score into a plain Python float before JSON encoding.
metrics = {"roc_auc": float(roc_auc_score(y_test, proba))}
with metrics_path.open("w", encoding="utf-8") as fh:
    json.dump(metrics, fh, indent=2)

print("Salvo:", model_path)
print("Salvo:", metrics_path)

## Exercícios (10–15 min)

1. Troque o threshold 0.5 por 0.3 e compare precisão vs recall.
2. Remova `monetary` e avalie o impacto no AUC.
3. Compare com um baseline `DummyClassifier`.