# 02 - Preprocessing & Feature Engineering RFM

In [1]:
# Imports
import pandas as pd
import numpy as np
import yaml
from pathlib import Path
import joblib

# Modules projet
from customer_segmentation.data.loader import load_raw_data, validate_dataframe
from customer_segmentation.features.rfm import compute_rfm, scale_features

In [15]:
# Project root: the notebook may be started either from the repository root
# or from a folder directly below it, so probe both for the config file.
CONFIG_RELPATH = Path("configs") / "params.yaml"
ROOT = Path.cwd()
if not (ROOT / CONFIG_RELPATH).exists():
    ROOT = ROOT.parent
if not (ROOT / CONFIG_RELPATH).exists():
    # Fail with an explicit message instead of an opaque open() error on the
    # parent directory path.
    raise FileNotFoundError(
        f"{CONFIG_RELPATH} introuvable depuis {Path.cwd()} ni depuis son dossier parent"
    )

# Load the pipeline configuration. The encoding is explicit so that accented
# characters in the YAML are read identically regardless of the OS locale.
with open(ROOT / CONFIG_RELPATH, "r", encoding="utf-8") as f:
    params = yaml.safe_load(f)

print(f"ROOT: {ROOT}")
print(f"Paramètres chargés: {list(params.keys())}")

ROOT: /home/apollinaire_12/customer-segmentation
Paramètres chargés: ['data', 'preprocessing', 'rfm', 'models', 'evaluation', 'visualization', 'outputs']


---
## 1. Chargement des données brutes

In [3]:
# Load the raw transaction export; its location comes from the configuration.
df_raw = pd.read_excel(ROOT / params["data"]["raw_file"])

# Force a datetime dtype. errors="coerce" turns unparseable values into NaT
# instead of raising, so surface any NaT explicitly: the reference date and
# the recency feature computed later both depend on this column.
df_raw["InvoiceDate"] = pd.to_datetime(df_raw["InvoiceDate"], errors="coerce")
n_missing_dates = int(df_raw["InvoiceDate"].isna().sum())
if n_missing_dates:
    print(f"⚠ InvoiceDate manquantes ou non parsables (NaT): {n_missing_dates:,}")

print(f"Shape initiale: {df_raw.shape}")
print(f"Mémoire: {df_raw.memory_usage(deep=True).sum() / 1e6:.1f} MB")

Shape initiale: (541909, 8)
Mémoire: 141.5 MB


---
## 2. Nettoyage

Critères d'exclusion validés en EDA (toute ligne vérifiant l'un d'eux est supprimée) :

| Critère | Raison |
|---------|--------|
| CustomerID null | Non rattachable à un client |
| Quantity ≤ 0 | Retours / erreurs |
| UnitPrice ≤ 0 | Prix invalides |
| InvoiceNo commençant par "C" | Factures annulées |

In [4]:
# Row count before any filtering; reused by the retention summary
n_initial = len(df_raw)

# One boolean mask per cleaning rule: True marks a row that passes the rule
mask_customer = df_raw["CustomerID"].notna()
mask_quantity = df_raw["Quantity"].gt(0)
mask_price = df_raw["UnitPrice"].gt(0)
mask_cancelled = ~df_raw["InvoiceNo"].astype(str).str.startswith("C")

# Per-rule report: number and share of rows that each rule would reject on
# its own (rules overlap, so these figures do not add up to the total removed)
print("=== IMPACT DES FILTRES ===")
for label, keep_mask in (
    ("CustomerID null:      ", mask_customer),
    ("Quantity ≤ 0:         ", mask_quantity),
    ("UnitPrice ≤ 0:        ", mask_price),
    ("Factures annulées:    ", mask_cancelled),
):
    rejected = ~keep_mask
    print(f"{label}{rejected.sum():>7,} lignes ({rejected.mean()*100:.2f}%)")

=== IMPACT DES FILTRES ===
CustomerID null:      135,080 lignes (24.93%)
Quantity ≤ 0:          10,624 lignes (1.96%)
UnitPrice ≤ 0:          2,517 lignes (0.46%)
Factures annulées:      9,288 lignes (1.71%)


In [5]:
# Keep only the rows that satisfy all four cleaning rules simultaneously
keep_rows = mask_customer & mask_quantity & mask_price & mask_cancelled
df_clean = df_raw.loc[keep_rows].copy()

# Line-level amount: quantity times unit price
df_clean["TotalAmount"] = df_clean["Quantity"] * df_clean["UnitPrice"]

# Retention summary relative to the raw row count
n_final = len(df_clean)
n_removed = n_initial - n_final
print(f"\n=== RÉSULTAT ===")
print(f"Lignes initiales:  {n_initial:,}")
print(f"Lignes conservées: {n_final:,} ({n_final/n_initial*100:.1f}%)")
print(f"Lignes supprimées: {n_removed:,} ({n_removed/n_initial*100:.1f}%)")


=== RÉSULTAT ===
Lignes initiales:  541,909
Lignes conservées: 397,884 (73.4%)
Lignes supprimées: 144,025 (26.6%)


In [6]:
# Invariants that must hold after cleaning; any failure stops the notebook
invoice_is_cancelled = df_clean["InvoiceNo"].astype(str).str.startswith("C")

assert not df_clean["CustomerID"].isna().any(), "CustomerID contient encore des nulls"
assert df_clean["Quantity"].gt(0).all(), "Quantity contient des valeurs ≤ 0"
assert df_clean["UnitPrice"].gt(0).all(), "UnitPrice contient des valeurs ≤ 0"
assert not invoice_is_cancelled.any(), "Factures annulées présentes"

print("✓ Toutes les assertions passées")

# Cardinalities of the cleaned dataset
n_customers = df_clean["CustomerID"].nunique()
n_invoices = df_clean["InvoiceNo"].nunique()
print(f"\nClients uniques: {n_customers:,}")
print(f"Factures uniques: {n_invoices:,}")

✓ Toutes les assertions passées

Clients uniques: 4,338
Factures uniques: 18,532


---
## 3. Feature Engineering : Calcul RFM

In [9]:
# Reference date for the recency computation: one day after the most recent
# transaction in the cleaned data.
reference_date = df_clean["InvoiceDate"].max() + pd.Timedelta(days=1)
print(f"Date de référence: {reference_date}")

# Per-customer RFM table, computed by the project module from the cleaned
# transactions (one row per CustomerID is expected — see the printed shape).
rfm = compute_rfm(
    df_clean,
    customer_col="CustomerID",
    date_col="InvoiceDate",
    invoice_col="InvoiceNo",
    amount_col="TotalAmount",
    reference_date=reference_date
)

print(f"\nShape RFM: {rfm.shape}")
# Fixed seed so the preview shows the same rows on every Restart & Run All
rfm.sample(5, random_state=42)

Date de référence: 2011-12-10 12:50:00

Shape RFM: (4338, 4)


Unnamed: 0,CustomerID,Recency,Frequency,Monetary
193,12587.0,8,1,144.0
3098,16572.0,127,3,1009.6
1065,13787.0,76,2,309.04
475,12963.0,8,8,1856.63
1410,14262.0,8,8,2618.23


In [10]:
# Stats descriptives RFM
rfm[["Recency", "Frequency", "Monetary"]].describe().round(2)

Unnamed: 0,Recency,Frequency,Monetary
count,4338.0,4338.0,4338.0
mean,92.54,4.27,2054.27
std,100.01,7.7,8989.23
min,1.0,1.0,3.75
25%,18.0,1.0,307.41
50%,51.0,2.0,674.48
75%,142.0,5.0,1661.74
max,374.0,209.0,280206.02


---
## 4. Preprocessing : Scaling

In [None]:
# Scaling settings come from the "rfm" section of the configuration;
# the winsorization percentile is optional and defaults to None (disabled).
rfm_cols = params["rfm"]["features"]
scaling_method = params["rfm"]["scaling_method"]
winsorize_pct = params["rfm"].get("winsorize_percentile", None)

print(f"Features: {rfm_cols}")
print(f"Méthode: {scaling_method}")
print(f"Winsorization: P{winsorize_pct}" if winsorize_pct else "Winsorization: Non")

# Scale the configured columns via the project module, which returns the
# transformed frame together with the fitted scaler object.
rfm_scaled, scaler = scale_features(
    rfm,
    columns=rfm_cols,
    method=scaling_method,
    winsorize_percentile=winsorize_pct
)

print(f"\nScaler: {type(scaler).__name__}")
# Describe exactly the columns that were scaled (driven by the configuration,
# not a hardcoded list) so this summary cannot drift from params.yaml.
rfm_scaled[rfm_cols].describe().round(2)

In [None]:
# Summary statistics of each scaled feature, for a visual sanity check
print("=== VÉRIFICATION SCALING ===")
for col in rfm_cols:
    median = rfm_scaled[col].median()
    iqr = rfm_scaled[col].quantile(0.75) - rfm_scaled[col].quantile(0.25)
    max_val = rfm_scaled[col].max()
    print(f"{col}: médiane={median:.3f}, IQR={iqr:.3f}, max={max_val:.3f}")

# Report what was actually applied instead of a hardcoded
# "Winsor + RobustScaler" message: winsorization is optional and the scaler
# depends on the configured method.
scaler_name = type(scaler).__name__
if winsorize_pct:
    print(f"\n✓ Winsor P{winsorize_pct} + {scaler_name} appliqué")
    print("  → Outliers cappés avant scaling")
else:
    print(f"\n✓ {scaler_name} appliqué (sans winsorization)")
if scaler_name == "RobustScaler":
    # The median ≈ 0 / IQR ≈ 1 expectation only applies to robust scaling
    print("  → Médiane ≈ 0, IQR ≈ 1 attendus")

---
## 5. Sauvegarde des artefacts

In [13]:
# Destination folders for the artefacts produced by this notebook
processed_dir = ROOT / "data" / "processed"
features_dir = ROOT / "data" / "features"
models_dir = ROOT / "outputs" / "models"

# Create the folders (and any missing parents); no-op when they already exist
for out_dir in (processed_dir, features_dir, models_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

# Output files
path_transactions = processed_dir / "transactions_clean.csv"
path_rfm_raw = features_dir / "rfm_raw.csv"
path_rfm_scaled = features_dir / "rfm_scaled.csv"
path_scaler = models_dir / "rfm_scaler.joblib"

print("Fichiers à sauvegarder:")
for target in (path_transactions, path_rfm_raw, path_rfm_scaled, path_scaler):
    print(f"  - {target}")

Fichiers à sauvegarder:
  - /home/apollinaire_12/customer-segmentation/data/processed/transactions_clean.csv
  - /home/apollinaire_12/customer-segmentation/data/features/rfm_raw.csv
  - /home/apollinaire_12/customer-segmentation/data/features/rfm_scaled.csv
  - /home/apollinaire_12/customer-segmentation/outputs/models/rfm_scaler.joblib


In [14]:
# Write the three tabular artefacts as CSV, without the pandas index column
for frame, destination in (
    (df_clean, path_transactions),
    (rfm, path_rfm_raw),
    (rfm_scaled, path_rfm_scaled),
):
    frame.to_csv(destination, index=False)

# Persist the fitted scaler object next to the other model outputs
joblib.dump(scaler, path_scaler)

# Recap of what was written
scaler_name = type(scaler).__name__
print("=== SAUVEGARDE TERMINÉE ===")
print(f"✓ Transactions nettoyées: {len(df_clean):,} lignes")
print(f"✓ RFM brut: {len(rfm):,} clients")
print(f"✓ RFM scalé: {len(rfm_scaled):,} clients")
print(f"✓ Scaler: {scaler_name}")

=== SAUVEGARDE TERMINÉE ===
✓ Transactions nettoyées: 397,884 lignes
✓ RFM brut: 4,338 clients
✓ RFM scalé: 4,338 clients
✓ Scaler: RobustScaler
