# 04 — ML: tuning + interpretabilidade (Permutation Importance)

Objetivo: melhorar performance com tuning leve e explicar o modelo de forma prática.

Tempo: ~25–30 min

## O que você vai fazer

1- Rodar um tuning simples com `GridSearchCV`  
2- Avaliar no conjunto de teste (AUC + relatório)  
3- Interpretar com **Permutation Importance** (o que mais influencia)

In [None]:
from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    RocCurveDisplay,
)
import matplotlib.pyplot as plt

def find_repo_root(start: Path | None = None) -> Path:
    """Locate the repository root by walking up from a starting directory.

    A directory is accepted as the root when it contains both a ``README.md``
    entry and a ``data`` entry.

    Args:
        start: Directory to begin the search from. Defaults to the current
            working directory.

    Returns:
        The first matching directory among the resolved start and its
        ancestors (at most ten directories are inspected), or the resolved
        current working directory when none of them matches.
    """
    origin = (start or Path.cwd()).resolve()
    # Search window: the start itself plus up to nine ancestors.
    for candidate in [origin, *origin.parents][:10]:
        if all((candidate / marker).exists() for marker in ("README.md", "data")):
            return candidate
    return Path.cwd().resolve()

# Resolve the repository root once; DATA is the base directory the cells below
# use for both inputs and outputs.
root = find_repo_root()
DATA = root / "data"

In [None]:
def build_rfm_from_sample() -> pd.DataFrame:
    """Build the per-customer RFM feature table from the sample CSV files.

    Reads ``sales.csv`` and ``customers.csv`` from ``DATA / "sample"``,
    left-joins customers onto sales by ``customer_id`` and aggregates one row
    per customer. The result is also written to
    ``DATA / "processed" / "rfm_features.parquet"`` so later runs can reuse it.

    Returns:
        A DataFrame with ``customer_id``, the aggregated columns
        (``last_purchase``, ``frequency``, ``monetary``, ``avg_order_value``,
        ``category_nunique``, ``region_nunique``, ``segment``,
        ``signup_date``), ``recency_days`` and the binary label ``is_vip``.
    """
    sample_dir = DATA / "sample"
    sales = pd.read_csv(sample_dir / "sales.csv")
    customers = pd.read_csv(sample_dir / "customers.csv")
    sales["date"] = pd.to_datetime(sales["date"])
    customers["signup_date"] = pd.to_datetime(customers["signup_date"])

    merged = sales.merge(customers, on="customer_id", how="left")
    # Reference point for recency: the day after the latest sale in the data.
    reference_date = merged["date"].max() + pd.Timedelta(days=1)

    # Output column -> (source column, aggregation).
    # NOTE(review): avg_order_value is the mean of `revenue` over merged rows;
    # it is a per-order average only if each order is a single row — confirm.
    aggregations = {
        "last_purchase": ("date", "max"),
        "frequency": ("order_id", "nunique"),
        "monetary": ("revenue", "sum"),
        "avg_order_value": ("revenue", "mean"),
        "category_nunique": ("category", "nunique"),
        "region_nunique": ("region", "nunique"),
        "segment": ("segment", "first"),
        "signup_date": ("signup_date", "first"),
    }
    rfm = merged.groupby("customer_id").agg(**aggregations).reset_index()
    rfm["recency_days"] = (reference_date - rfm["last_purchase"]).dt.days

    # Label: customers at or above the 80th percentile of total revenue.
    vip_cutoff = rfm["monetary"].quantile(0.80)
    rfm["is_vip"] = (rfm["monetary"] >= vip_cutoff).astype(int)

    # Persist for reuse.
    processed_dir = DATA / "processed"
    processed_dir.mkdir(parents=True, exist_ok=True)
    rfm.to_parquet(processed_dir / "rfm_features.parquet", index=False)

    return rfm

def load_rfm() -> pd.DataFrame:
    """Return the RFM feature table, preferring the cached parquet file.

    Returns:
        The contents of ``DATA / "processed" / "rfm_features.parquet"`` when
        that file exists; otherwise the table produced by
        ``build_rfm_from_sample()``.
    """
    cached = DATA / "processed" / "rfm_features.parquet"
    if not cached.exists():
        return build_rfm_from_sample()
    return pd.read_parquet(cached)

# Load (or build) the per-customer feature table and preview its first rows.
rfm = load_rfm()
rfm.head()

## Split

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Target column and feature columns, grouped by how they are preprocessed.
target = "is_vip"
num_features = [
    "recency_days",
    "frequency",
    "monetary",
    "avg_order_value",
    "category_nunique",
    "region_nunique",
]
cat_features = ["segment"]

# NOTE(review): build_rfm_from_sample derives `is_vip` by thresholding
# `monetary`, and `monetary` is also a feature here, so the label can be
# recovered from a single input. Expect near-perfect scores and an importance
# ranking dominated by `monetary`; confirm this is intended for the exercise.
X = rfm[[*num_features, *cat_features]].copy()
y = rfm[target].copy()

# Stratified 75/25 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Scale the numeric columns and one-hot encode the categorical one; categories
# not seen during fit are ignored at transform time.
preprocess = ColumnTransformer(
    [
        ("num", Pipeline(steps=[("scaler", StandardScaler())]), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
    ]
)

# Preprocessing and classifier in one estimator, so cross-validation refits
# the preprocessing on each training fold.
pipe = Pipeline(
    steps=[
        ("prep", preprocess),
        ("model", RandomForestClassifier(class_weight="balanced", random_state=42)),
    ]
)

## GridSearchCV (tuning leve)

In [None]:
# Candidate hyper-parameters; the "model__" prefix routes each entry to the
# pipeline's "model" step (2 x 3 x 3 = 18 combinations).
param_grid = dict(
    model__n_estimators=[200, 500],
    model__max_depth=[None, 8, 12],
    model__min_samples_leaf=[1, 3, 5],
)

# Shuffled, stratified 5-fold CV with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored by ROC AUC; refit=True retrains the best
# combination on the whole training split after the search.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    refit=True,
)
gs.fit(X_train, y_train)

print("Best AUC (cv):", gs.best_score_.round(4))
print("Best params:", gs.best_params_)

## Avaliação no teste

In [None]:
# Best pipeline found by the grid search (already refit on the training split).
best = gs.best_estimator_

# Column 1 of predict_proba is the score for the second class, i.e. is_vip == 1
# given 0/1 labels; 0.5 is the decision threshold for hard predictions.
proba = best.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

# roc_auc_score is documented as returning a float, so use the built-in
# round(): calling `.round(4)` on the result only works when it happens to be
# a NumPy scalar and raises AttributeError on a plain Python float.
print("Test ROC AUC:", round(roc_auc_score(y_test, proba), 4))
print(classification_report(y_test, pred))

RocCurveDisplay.from_predictions(y_test, proba)
plt.show()

## Permutation Importance (interpretabilidade simples)

In [None]:
from sklearn.inspection import permutation_importance

# Shuffle one raw input column at a time (20 repeats each) and measure the
# drop in test ROC AUC. The whole pipeline is passed in, so the importances
# line up with the columns of X_test.
result = permutation_importance(
    best,
    X_test,
    y_test,
    scoring="roc_auc",
    n_repeats=20,
    random_state=42,
)

imp = pd.DataFrame(
    {
        "feature": X_test.columns,
        "importance_mean": result.importances_mean,
        "importance_std": result.importances_std,
    }
)
imp = imp.sort_values("importance_mean", ascending=False)

display(imp)

# Horizontal bars with the most important feature at the top.
fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(imp["feature"], imp["importance_mean"])
ax.invert_yaxis()
ax.set(
    title="Permutation Importance (AUC)",
    xlabel="queda média no AUC ao permutar",
)
plt.show()

## Salvando resultados do tuning

In [None]:
import json
import joblib

# Output folder for every artifact produced by this notebook.
out_dir = DATA / "output" / "ml"
out_dir.mkdir(parents=True, exist_ok=True)

model_path = out_dir / "model_rf_tuned.joblib"
tuning_path = out_dir / "tuning_rf.json"
importance_path = out_dir / "feature_importance_permutation.csv"

# Persist the tuned pipeline (preprocessing + model) as a single object.
joblib.dump(best, model_path)

# Summary of the search: cross-validated score, chosen parameters, test score.
payload = {
    "best_score_cv_auc": float(gs.best_score_),
    "best_params": gs.best_params_,
    "test_auc": float(roc_auc_score(y_test, proba)),
}
tuning_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")

# Let pandas write the file itself. Passing the string from `to_csv()` through
# `write_text` translates newlines a second time, which produces "\r\r\n" line
# endings on Windows.
imp.to_csv(importance_path, index=False, encoding="utf-8")

print("Salvo:", model_path)
print("Salvo:", tuning_path)
print("Salvo:", importance_path)

## Exercícios (10–15 min)

1- Troque `RandomForestClassifier` por `GradientBoostingClassifier` (remova `class_weight`, que esse modelo não aceita) e compare os resultados do tuning.  
2- Aumente `n_repeats` e veja se a ordem muda.  
3- Faça uma análise de erros: quais clientes o modelo erra mais?