# 03 - Modélisation : Clustering RFM

Ce notebook compare plusieurs algorithmes de clustering sur les features RFM scalées.

**Étapes :**
1. Chargement des données RFM scalées
2. Recherche du K optimal (Elbow + Silhouette)
3. K-means avec le K retenu (métriques, distribution, silhouette, visualisation 2D)
4. Comparaison : Hierarchical, DBSCAN
5. Comparaison finale des modèles
6. Stabilité du clustering (bootstrap)
7. Sauvegarde du modèle K-means et des résultats

In [None]:
# Imports
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Project modules
from customer_segmentation.models.clustering import run_kmeans, run_hierarchical, run_dbscan
from customer_segmentation.evaluation.metrics import compute_internal_metrics, stability_score
from customer_segmentation.visualization.plots import plot_elbow, plot_silhouette, plot_clusters_2d

# Plot configuration: apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet
# to every figure created below.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Paths: locate the project root. If configs/params.yaml is not found in the
# current working directory, fall back to its parent (notebook launched one
# level below the root).
ROOT = Path.cwd()
if not (ROOT / "configs" / "params.yaml").exists():
    ROOT = ROOT.parent

# Load the configuration. The encoding is set explicitly so the file is read
# the same way on every platform instead of relying on the locale default.
config_path = ROOT / "configs" / "params.yaml"
with open(config_path, "r", encoding="utf-8") as f:
    params = yaml.safe_load(f)

OUTPUT_DIR = ROOT / params["outputs"]["figures_dir"]
MODELS_DIR = ROOT / params["outputs"]["models_dir"]

# Create the output folders up front: plt.savefig and joblib.dump further down
# write into them and fail if the directory is missing (e.g. fresh checkout).
for out_dir in (OUTPUT_DIR, MODELS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

print(f"ROOT: {ROOT}")

---
## 1. Chargement des données RFM scalées

In [None]:
# Load the scaled RFM features (clustering input) and the raw RFM table
# (used later to profile the clusters in original units).
features_dir = ROOT / "data" / "features"
rfm_scaled = pd.read_csv(features_dir / "rfm_scaled.csv")
rfm_raw = pd.read_csv(features_dir / "rfm_raw.csv")

# The cluster labels computed on rfm_scaled are attached to rfm_raw by row
# position in the save cell, so both tables must have the same number of rows.
# Fail fast here rather than after the whole modelling pipeline has run.
# NOTE(review): identical row ORDER is also assumed but cannot be checked
# from the columns visible here - confirm against the feature-building step.
assert len(rfm_scaled) == len(rfm_raw), (
    f"rfm_scaled ({len(rfm_scaled)} rows) and rfm_raw ({len(rfm_raw)} rows) "
    "must contain the same customers"
)

print(f"Clients: {len(rfm_scaled):,}")
print(f"Features: {list(rfm_scaled.columns)}")
rfm_scaled.head()

In [None]:
# Build the clustering matrix X from the feature columns listed in params.yaml
feature_cols = params["rfm"]["features"]
rfm_features = rfm_scaled[feature_cols]
X = rfm_features.to_numpy()

print(f"Shape X: {X.shape}")
print(f"Features: {feature_cols}")

---
## 2. Recherche du K optimal

In [None]:
# Elbow + silhouette curves over the candidate K values (2 to 10 inclusive)
k_range = range(2, 11)
elbow_png = OUTPUT_DIR / "03_elbow_silhouette.png"

fig = plot_elbow(
    X,
    k_range=k_range,
    random_state=params["models"]["random_state"],
)
plt.savefig(elbow_png, dpi=200, bbox_inches="tight")
plt.show()

In [None]:
# Detailed score table: one K-means fit per candidate K.
# KMeans and the sklearn metric functions come from the top import cell.
#
# The seed and n_init are read from params.yaml - the same values that are
# passed to run_kmeans for the final model - so that K is selected under the
# configuration that is actually used, not under hard-coded 42 / 10.
kmeans_seed = params["models"]["random_state"]
kmeans_n_init = params["models"]["kmeans"]["n_init"]

results_k = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=kmeans_seed, n_init=kmeans_n_init)
    labels = km.fit_predict(X)
    results_k.append({
        "K": k,
        "Inertie": km.inertia_,
        "Silhouette": silhouette_score(X, labels),
        "Davies-Bouldin": davies_bouldin_score(X, labels),
        "Calinski-Harabasz": calinski_harabasz_score(X, labels)
    })

df_k = pd.DataFrame(results_k)
# Highlight the best value of each metric: max for Silhouette and
# Calinski-Harabasz, min for Davies-Bouldin.
df_k.style.highlight_max(subset=["Silhouette", "Calinski-Harabasz"], color="lightgreen") \
          .highlight_min(subset=["Davies-Bouldin"], color="lightgreen")

**Analyse :**
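- **Silhouette** : plus haut = meilleur (clusters bien séparés et cohérents)
- **Davies-Bouldin** : plus bas = meilleur (clusters compacts et éloignés les uns des autres)
- **Calinski-Harabasz** : plus haut = meilleur (rapport variance inter-cluster / variance intra-cluster)

Le coude de l'inertie et le maximum de silhouette suggèrent le K optimal ; la cellule suivante retient le K qui maximise la silhouette.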

In [None]:
# Pick the K with the highest silhouette score.
# Cast once to a built-in int: the value read from the DataFrame is a NumPy
# integer, and K_OPTIMAL is reused as n_clusters throughout the notebook.
K_OPTIMAL = int(df_k.loc[df_k["Silhouette"].idxmax(), "K"])
print(f"K optimal suggéré (max silhouette): {K_OPTIMAL}")

---
## 3. K-means avec K optimal

In [None]:
# Final K-means fit with the selected K and the seed / n_init from params.yaml
kmeans_settings = {
    "n_clusters": int(K_OPTIMAL),
    "random_state": params["models"]["random_state"],
    "n_init": params["models"]["kmeans"]["n_init"],
}
labels_kmeans, model_kmeans = run_kmeans(X, **kmeans_settings)

# Internal validation metrics; floats are printed with 4 decimals,
# any other value is printed as-is.
metrics_kmeans = compute_internal_metrics(X, labels_kmeans)
print(f"=== K-MEANS (K={int(K_OPTIMAL)}) ===")
for k, v in metrics_kmeans.items():
    if isinstance(v, float):
        print(f"  {k}: {v:.4f}")
    else:
        print(f"  {k}: {v}")

In [None]:
# Cluster sizes: number of customers per label, ordered by cluster id,
# printed with each cluster's share of all customers.
label_series = pd.Series(labels_kmeans)
cluster_counts = label_series.value_counts().sort_index()

print("Distribution des clusters:")
for cluster, count in cluster_counts.items():
    print(f"  Cluster {cluster}: {count:,} clients ({count/len(labels_kmeans)*100:.1f}%)")

In [None]:
# Silhouette diagram for the K-means labels, saved under OUTPUT_DIR
silhouette_png = OUTPUT_DIR / "03_silhouette_kmeans.png"

fig = plot_silhouette(X, labels_kmeans)
plt.savefig(silhouette_png, dpi=200, bbox_inches="tight")
plt.show()

In [None]:
# 2D view of the K-means clusters, saved under OUTPUT_DIR
clusters_png = OUTPUT_DIR / "03_clusters_2d_kmeans.png"

fig = plot_clusters_2d(X, labels_kmeans, feature_names=feature_cols)
plt.savefig(clusters_png, dpi=200, bbox_inches="tight")
plt.show()

---
## 4. Comparaison : Hierarchical et DBSCAN

In [None]:
# Hierarchical clustering with the same K as K-means and the linkage
# configured in params.yaml
hier_settings = {
    "n_clusters": int(K_OPTIMAL),
    "linkage": params["models"]["hierarchical"]["linkage"],
}
labels_hier, model_hier = run_hierarchical(X, **hier_settings)
metrics_hier = compute_internal_metrics(X, labels_hier)

# Floats are printed with 4 decimals, any other value is printed as-is.
print(f"=== HIERARCHICAL (K={int(K_OPTIMAL)}, linkage={params['models']['hierarchical']['linkage']}) ===")
for k, v in metrics_hier.items():
    if isinstance(v, float):
        print(f"  {k}: {v:.4f}")
    else:
        print(f"  {k}: {v}")

In [None]:
# DBSCAN: exhaustive search over the eps / min_samples grids from params.yaml
print("=== DBSCAN GRID SEARCH ===")

dbscan_cfg = params["models"]["dbscan"]
dbscan_results = []
for eps in dbscan_cfg["eps_range"]:
    for min_samples in dbscan_cfg["min_samples_range"]:
        labels_db, _ = run_dbscan(X, eps=eps, min_samples=min_samples)

        # Label -1 marks noise points and is not counted as a cluster.
        n_clusters = len(set(labels_db) - {-1})
        noise_pct = (labels_db == -1).mean() * 100

        # Keep only usable configurations: at least two clusters and less
        # than half of the points flagged as noise.
        if not (n_clusters >= 2 and noise_pct < 50):
            continue

        # NOTE(review): the full label vector, noise included, is passed to
        # compute_internal_metrics - confirm how that helper treats label -1.
        metrics = compute_internal_metrics(X, labels_db)
        dbscan_results.append({
            "eps": eps,
            "min_samples": min_samples,
            "n_clusters": n_clusters,
            "noise_%": noise_pct,
            "silhouette": metrics["silhouette"]
        })

# Rank the valid configurations by silhouette (best first) and show the top 10;
# fall back to an empty frame when nothing passed the filter.
if not dbscan_results:
    print("Aucune configuration DBSCAN valide trouvée")
    df_dbscan = pd.DataFrame()
else:
    df_dbscan = pd.DataFrame(dbscan_results).sort_values("silhouette", ascending=False)
    display(df_dbscan.head(10))

In [None]:
# Best DBSCAN: refit the top-ranked grid configuration, or fall back to NaN
# metrics when the grid search produced no valid configuration.
if len(df_dbscan) == 0:
    labels_dbscan = None
    metrics_dbscan = {"silhouette": np.nan, "davies_bouldin": np.nan, "calinski_harabasz": np.nan}
    print("DBSCAN non applicable sur ces données")
else:
    # df_dbscan is sorted by silhouette descending, so row 0 is the best one.
    best_dbscan = df_dbscan.iloc[0]
    labels_dbscan, model_dbscan = run_dbscan(
        X,
        eps=best_dbscan["eps"],
        # cast back to int before passing min_samples to run_dbscan
        min_samples=int(best_dbscan["min_samples"]),
    )
    metrics_dbscan = compute_internal_metrics(X, labels_dbscan)

    print(f"=== MEILLEUR DBSCAN (eps={best_dbscan['eps']}, min_samples={int(best_dbscan['min_samples'])}) ===")
    for k, v in metrics_dbscan.items():
        if isinstance(v, float):
            print(f"  {k}: {v:.4f}")
        else:
            print(f"  {k}: {v}")

---
## 5. Comparaison finale des modèles

In [None]:
# Tableau comparatif
comparison = pd.DataFrame({
    "Modèle": ["K-Means", "Hierarchical", "DBSCAN"],
    "K / Clusters": [int(K_OPTIMAL), int(K_OPTIMAL), metrics_dbscan.get("n_clusters", "N/A")],
    "Silhouette": [metrics_kmeans["silhouette"], metrics_hier["silhouette"], metrics_dbscan.get("silhouette", np.nan)],
    "Davies-Bouldin": [metrics_kmeans["davies_bouldin"], metrics_hier["davies_bouldin"], metrics_dbscan.get("davies_bouldin", np.nan)],
    "Calinski-Harabasz": [metrics_kmeans["calinski_harabasz"], metrics_hier["calinski_harabasz"], metrics_dbscan.get("calinski_harabasz", np.nan)]
})

comparison.style.highlight_max(subset=["Silhouette", "Calinski-Harabasz"], color="lightgreen") \
               .highlight_min(subset=["Davies-Bouldin"], color="lightgreen")

In [None]:
# Select the model with the highest silhouette score in the comparison table.
# NOTE(review): best_model_name is only reported here; the save cell below
# persists the K-means model whatever the outcome of this selection.
best_idx = comparison["Silhouette"].idxmax()
best_row = comparison.loc[best_idx]
best_model_name = best_row["Modèle"]

print(f"\n=== MEILLEUR MODÈLE: {best_model_name} ===")
print(best_row.to_string())

---
## 6. Stabilité du clustering (Bootstrap)

In [None]:
# K-means stability
def kmeans_func(X_sample):
    """Fit K-means on a sample and return its cluster labels.

    Uses K_OPTIMAL clusters together with the random_state and n_init from
    params.yaml - the same values passed to run_kmeans for the final model -
    so the stability score describes the configuration that is actually
    saved, rather than hard-coded 42 / 10.

    Parameters
    ----------
    X_sample : feature matrix accepted by ``KMeans.fit_predict``.

    Returns
    -------
    The cluster label assigned to each row of ``X_sample``.
    """
    km = KMeans(
        n_clusters=int(K_OPTIMAL),
        random_state=params["models"]["random_state"],
        n_init=params["models"]["kmeans"]["n_init"],
    )
    return km.fit_predict(X_sample)

# Resampling settings (number of bootstrap rounds, sample ratio) come from
# the "evaluation" section of params.yaml.
stability = stability_score(
    X,
    kmeans_func,
    n_bootstrap=params["evaluation"]["n_bootstrap"],
    sample_ratio=params["evaluation"]["bootstrap_ratio"]
)

print(f"Stabilité K-means (ARI moyen): {stability:.4f}")
print(f"  > 0.8 = stable, 0.6-0.8 = modéré, < 0.6 = instable")

---
## 7. Sauvegarde du modèle et des résultats

In [None]:
# Attach the K-means labels to a copy of the raw (unscaled) RFM table
rfm_with_clusters = rfm_raw.copy()
rfm_with_clusters["Cluster"] = labels_kmeans

# Make sure the destination folders exist before writing: to_csv and
# joblib.dump do not create missing directories.
reports_dir = ROOT / "outputs" / "reports"
for target_dir in (MODELS_DIR, reports_dir):
    target_dir.mkdir(parents=True, exist_ok=True)

# Persist the clustered table and the fitted K-means model
rfm_with_clusters.to_csv(ROOT / "data" / "features" / "rfm_clustered.csv", index=False)
joblib.dump(model_kmeans, MODELS_DIR / "kmeans_model.joblib")

# Persist the model comparison table
comparison.to_csv(reports_dir / "model_comparison.csv", index=False)

print("=== SAUVEGARDE TERMINÉE ===")
print(f"✓ RFM avec clusters: data/features/rfm_clustered.csv")
print(f"✓ Modèle K-means: outputs/models/kmeans_model.joblib")
print(f"✓ Comparaison: outputs/reports/model_comparison.csv")

In [None]:
# Final overview: mean, median and count of each raw RFM column per cluster
print("\n=== PROFIL DES CLUSTERS (valeurs brutes) ===")
rfm_columns = ["Recency", "Frequency", "Monetary"]
summary_stats = ["mean", "median", "count"]
rfm_by_cluster = rfm_with_clusters.groupby("Cluster")[rfm_columns]
cluster_profile = rfm_by_cluster.agg(summary_stats)
cluster_profile

---
## Résumé

| Élément | Valeur |
|---------|--------|
| Algorithme retenu | K-Means |
| Nombre de clusters | `K_OPTIMAL` (maximum de silhouette, section 2) |
| Score Silhouette | voir le tableau comparatif (section 5) |
| Stabilité (ARI) | voir la section 6 |

**Artefacts générés :**
- `data/features/rfm_clustered.csv`
- `outputs/models/kmeans_model.joblib`
- `outputs/reports/model_comparison.csv`
- Figures dans `outputs/figures/`

→ **Prochaine étape** : Notebook 04_interpretation.ipynb (profils segments, nommage business, recommandations)