# Mini-projeto 08 — ML baseline

Classificar segmento (B2B vs B2C) a partir de comportamento de compra.

> Entregável de portfólio — gerado em 2026-01-30.


## Objetivo

Criar features por cliente, treinar um baseline com scikit-learn e salvar métricas + artefatos.

## Entregáveis (para portfólio)

- [ ] `reports/ml_metrics.json`
- [ ] `assets/confusion_matrix.png`
- [ ] `outputs/model.joblib`

## Como usar

1. Rode este notebook (kernel com o `.venv` do repo)
2. Gere **assets** (imagens/HTML) e **reports** (markdown/json) dentro desta pasta
3. Faça commit dos arquivos gerados para evidenciar o resultado no GitHub

In [None]:
# Setup: detectar raiz do repositório + paths padrão
from pathlib import Path
import pandas as pd
import numpy as np

def find_repo_root(start: Path | None = None) -> Path:
    p = (start or Path.cwd()).resolve()
    for _ in range(12):
        if (p / "requirements.txt").exists() and (p / "README.md").exists():
            return p
        p = p.parent
    return (start or Path.cwd()).resolve()

# Resolve the repository root once; every other path below derives from it.
ROOT = find_repo_root()
DATA_SAMPLE = ROOT.joinpath("data", "sample")
DATA_SOURCE = ROOT.joinpath("data", "source", "bases-dados-analytics-powerbi-ml")

# Project folder that receives the generated assets/outputs/reports.
PROJ = ROOT.joinpath("projects", "08_machine_learning")
ASSETS, OUTPUTS, REPORTS = (PROJ / sub for sub in ("assets", "outputs", "reports"))
for d in (ASSETS, OUTPUTS, REPORTS):
    # Create missing folders (and parents); already-existing ones are left alone.
    d.mkdir(parents=True, exist_ok=True)

# Default dataset: the sample CSVs under data/sample.
# NOTE(review): presumably always shipped with the repo; read_csv raises if absent.
sales_path = DATA_SAMPLE / "sales.csv"
customers_path = DATA_SAMPLE / "customers.csv"
sales = pd.read_csv(sales_path, parse_dates=["date"])
customers = pd.read_csv(customers_path, parse_dates=["signup_date"])

# Real dataset (optional): present only if the submodule/clone was added
# under data/source/. Only its existence is recorded here.
has_real = DATA_SOURCE.exists()

print("ROOT:", ROOT)
print("Dataset sample:", sales.shape, customers.shape)
print("Dataset real disponível?", has_real)

In [None]:
import json

import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1) Per-customer behavioural features
# Left merge keeps every sales row and attaches the matching customer columns.
df = pd.merge(sales, customers, on="customer_id", how="left")
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# Output column -> (source column, aggregation), applied per group.
feature_aggs = {
    "orders": ("order_id", "nunique"),
    "qty": ("qty", "sum"),
    "revenue": ("revenue", "sum"),
    "avg_ticket": ("revenue", "mean"),
}
# NOTE(review): groupby drops NaN keys by default, so rows without a segment
# would be excluded from the features — confirm that is intended.
per_customer = df.groupby(["customer_id", "segment"])
feats = per_customer.agg(**feature_aggs).reset_index()

# 2) Feature matrix and target
feature_cols = ["orders", "qty", "revenue", "avg_ticket"]
X = feats[feature_cols]
y = feats["segment"]

# Hold out 25% for testing, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# 3) Baseline pipeline: standardise the features, then logistic regression
baseline_steps = [
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000)),
]
pipe = Pipeline(steps=baseline_steps)
pipe.fit(X_train, y_train)

# Score the fitted pipeline on the held-out rows.
pred = pipe.predict(X_test)
acc = accuracy_score(y_test, pred)
print("Accuracy:", acc)

# 4) Metrics
# Plain-text classification report, printed for the notebook log.
report_txt = classification_report(y_test, pred, output_dict=False)
print(report_txt)

# Compute the sorted label list once so the matrix rows/columns and both axes'
# tick labels are guaranteed to share the same ordering.
labels = sorted(y.unique())
cm = confusion_matrix(y_test, pred, labels=labels)

plt.figure()
plt.imshow(cm)
plt.title("Confusion Matrix")
plt.xticks(range(cm.shape[1]), labels)
plt.yticks(range(cm.shape[0]), labels)
# Write each cell's count at its centre; imshow puts columns on x and rows on y.
for (i, j), count in np.ndenumerate(cm):
    plt.text(j, i, str(count), ha="center", va="center")
out_img = ASSETS / "confusion_matrix.png"
plt.tight_layout()
plt.savefig(out_img, dpi=160)
# Close the figure so it is not kept open after the image is saved.
plt.close()
print("Salvo:", out_img)

# Values are cast to built-in float/int/list so they are JSON-serialisable.
metrics = {
    "accuracy": float(acc),
    "n_train": int(len(X_train)),
    "n_test": int(len(X_test)),
    "features": list(X.columns),
}
out_metrics = REPORTS / "ml_metrics.json"
# Requires the stdlib `json` module; the original cell never imported it, so
# this line raised NameError before the report was written.
out_metrics.write_text(json.dumps(metrics, indent=2, ensure_ascii=False), encoding="utf-8")
print("Salvo:", out_metrics)

# 5) Persist the fitted pipeline object to disk with joblib
out_model = OUTPUTS.joinpath("model.joblib")
joblib.dump(pipe, filename=out_model)
print("Salvo:", out_model)

## Evidências

![](./assets/confusion_matrix.png)
