# 01 — Projeto Final: EDA + RFM + ML (VIP)

Objetivo: um fluxo end-to-end pronto para portfólio (com evidências e artefatos).

Tempo: ~30–45 min

## Visão geral

Este notebook consolida o fluxo:
- carregamento e checagens
- EDA (distribuições, grupos, pareto, tendência)
- features RFM
- ML baseline + pipeline + tuning leve
- outputs para portfólio (assets + métricas)

In [None]:
from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" theme as the default look for the figures below.
sns.set_theme(style="whitegrid")

def find_repo_root(start: Path | None = None) -> Path:
    """Locate the repository root by walking up from ``start``.

    A directory is accepted as the root when it contains both a
    ``README.md`` entry and a ``data`` entry. The search inspects the
    resolved starting directory (the current working directory when
    ``start`` is omitted) and at most nine of its ancestors.

    Returns:
        The first matching directory, or the resolved current working
        directory when none of the candidates matches.
    """
    base = (start if start else Path.cwd()).resolve()
    # Ten candidates in total: the starting directory plus nine ancestors.
    for candidate in (base, *base.parents)[:10]:
        has_readme = (candidate / "README.md").exists()
        if has_readme and (candidate / "data").exists():
            return candidate
    return Path.cwd().resolve()

# Resolve the repository root once; every data path below hangs off DATA.
root = find_repo_root()
DATA = root.joinpath("data")

In [None]:
# Read the sample sales and customer tables from <DATA>/sample.
sample_dir = DATA / "sample"
sales = pd.read_csv(sample_dir / "sales.csv")
customers = pd.read_csv(sample_dir / "customers.csv")

# Parse the date columns into pandas datetimes before any time-based work.
for frame, column in ((sales, "date"), (customers, "signup_date")):
    frame[column] = pd.to_datetime(frame[column])

# Left join: every sales row is kept and gains the matching customer columns.
df = sales.merge(
    customers,
    how="left",
    on="customer_id",
)

df.head()

## EDA: tendência mensal (evidência visual)

In [None]:
# Total revenue per calendar month; each month is labelled by the timestamp
# at the start of its period so it can be plotted on a date axis.
monthly = (
    df.assign(month=df["date"].dt.to_period("M").dt.to_timestamp())
      .groupby("month", as_index=False)["revenue"].sum()
      .sort_values("month")
)

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(monthly["month"], monthly["revenue"])
ax.set_title("Revenue total por mês")
ax.set_xlabel("mês")
ax.set_ylabel("revenue")
plt.xticks(rotation=30)

# Portfolio assets folder (reused by the later plotting cells).
assets = root / "projects" / "99_projeto_final_end_to_end" / "assets"
assets.mkdir(parents=True, exist_ok=True)

# Save before plt.show(): depending on the backend, show() may close the
# figure, so exporting first keeps the PNG independent of that behaviour.
fig.savefig(assets / "eda_monthly.png", dpi=160, bbox_inches="tight")

plt.show()

## EDA: Pareto por produto (80/20)

In [None]:
# Revenue per product, largest first, then the running share of total revenue.
prod = df.groupby("product")["revenue"].sum().sort_values(ascending=False)
pareto = (prod.cumsum() / prod.sum()).reset_index()
pareto.columns = ["product", "cum_share"]

# Smallest number of top products whose cumulative share reaches 80%.
# Counting only the rows still below 0.80 leaves out the product that
# actually crosses the line (and yields 0 when the top product alone exceeds
# 80%), hence the +1; min() caps the count at the number of products.
k = min(int((pareto["cum_share"] < 0.80).sum()) + 1, len(pareto))
print("Produtos que acumulam ~80% da receita:", k)

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(range(1, len(pareto) + 1), pareto["cum_share"])
ax.axhline(0.8, linestyle="--")  # 80% reference line
ax.set_title("Pareto: share acumulado de receita por produto")
ax.set_xlabel("rank do produto")
ax.set_ylabel("share acumulado")

# Save before plt.show() so the export does not depend on how the backend
# treats figures after they have been shown.
fig.savefig(assets / "pareto_produtos.png", dpi=160, bbox_inches="tight")

plt.show()

## Feature Engineering: RFM

In [None]:
# Reference date: one day after the latest purchase in the data, so every
# customer's recency is at least 1 day.
as_of = df["date"].max() + pd.Timedelta(days=1)

# One row per customer with the RFM aggregates plus a few extra descriptors.
# NOTE(review): avg_order_value is the mean of row-level revenue; if an order
# can span several rows this is not revenue per order - confirm against the
# sales schema.
per_customer = df.groupby("customer_id")
rfm = per_customer.agg(
    last_purchase=pd.NamedAgg(column="date", aggfunc="max"),
    frequency=pd.NamedAgg(column="order_id", aggfunc="nunique"),
    monetary=pd.NamedAgg(column="revenue", aggfunc="sum"),
    avg_order_value=pd.NamedAgg(column="revenue", aggfunc="mean"),
    category_nunique=pd.NamedAgg(column="category", aggfunc="nunique"),
    region_nunique=pd.NamedAgg(column="region", aggfunc="nunique"),
    segment=pd.NamedAgg(column="segment", aggfunc="first"),
    signup_date=pd.NamedAgg(column="signup_date", aggfunc="first"),
).reset_index()

recency = as_of - rfm["last_purchase"]
rfm["recency_days"] = recency.dt.days

# Didactic target: customers at or above the 80th percentile of total spend.
thr = rfm["monetary"].quantile(0.80)
is_top_spender = rfm["monetary"] >= thr
rfm["is_vip"] = is_top_spender.astype(int)

# Persist the feature table.
processed = DATA / "processed"
processed.mkdir(parents=True, exist_ok=True)
features_path = processed / "rfm_features.parquet"
rfm.to_parquet(features_path, index=False)

rfm.head()

## ML: Pipeline + modelo

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, RocCurveDisplay

target = "is_vip"
num_features = [
    "recency_days",
    "frequency",
    "monetary",
    "avg_order_value",
    "category_nunique",
    "region_nunique",
]
cat_features = ["segment"]

# NOTE(review): `is_vip` is defined in the RFM cell as a threshold on
# `monetary`, and `monetary` is also a model input here, so the classifier can
# recover the label from that single column. Treat the resulting AUC as a
# pipeline smoke test rather than evidence of predictive power - confirm
# whether `monetary` should stay in the feature list.
X = rfm[num_features + cat_features].copy()
y = rfm[target].copy()

# Stratified hold-out split keeps the VIP rate equal in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Scale numeric columns and one-hot encode the categorical one; categories not
# seen during fit are ignored at prediction time.
preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("scaler", StandardScaler())]), num_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
    ]
)

pipe = Pipeline([
    ("prep", preprocess),
    ("model", RandomForestClassifier(n_estimators=500, random_state=42, class_weight="balanced")),
])

pipe.fit(X_train, y_train)

# Probability of the positive class, then hard labels at a 0.5 cut-off.
proba = pipe.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

# Convert to a built-in float and use round(): `.round(4)` only exists on
# NumPy scalars, so this does not depend on the metric's return type.
test_auc = float(roc_auc_score(y_test, proba))
print("Test ROC AUC:", round(test_auc, 4))
print(classification_report(y_test, pred))

RocCurveDisplay.from_predictions(y_test, proba)
plt.show()

## Interpretabilidade: Permutation Importance (features originais)

In [None]:
from sklearn.inspection import permutation_importance

# Permute each original input column on the test split (20 repeats) and
# measure how the pipeline's ROC AUC responds.
perm = permutation_importance(pipe, X_test, y_test, n_repeats=20, random_state=42, scoring="roc_auc")
imp = pd.DataFrame({
    "feature": X_test.columns,
    "importance_mean": perm.importances_mean,
}).sort_values("importance_mean", ascending=False)

display(imp)

fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(imp["feature"], imp["importance_mean"])
ax.invert_yaxis()  # most important feature at the top
ax.set_title("Permutation Importance (AUC)")
ax.set_xlabel("queda média no AUC ao permutar")

# Save before plt.show() so the export does not depend on how the backend
# treats figures after they have been shown.
fig.savefig(assets / "feature_importance.png", dpi=160, bbox_inches="tight")

plt.show()

## Salvando artefatos (modelo + métricas)

In [None]:
import json
import joblib

# Folder for the ML artefacts, created on demand.
out_dir = DATA / "output" / "ml"
out_dir.mkdir(parents=True, exist_ok=True)

model_path = out_dir / "project_model.joblib"
metrics_path = out_dir / "project_metrics.json"

# Persist the whole fitted pipeline (preprocessing + model) in one file.
joblib.dump(pipe, model_path)

# Store the test AUC as a plain float so it serialises to JSON.
metrics = {"test_auc": float(roc_auc_score(y_test, proba))}
metrics_path.write_text(json.dumps(metrics, indent=2), encoding="utf-8")

for saved in (model_path, metrics_path):
    print("Salvo:", saved)
print("Assets:", assets)

## Checklist final (portfólio)
- [ ] Executou do zero (venv + requirements)  
- [ ] Gerou `rfm_features.parquet`  
- [ ] Salvou `assets/*.png`  
- [ ] Atualizou o README do projeto com resultados (AUC + insights)