# Customer_Segmentation - RFM Feature Engineering

In [None]:
# Step 2 — RFM Feature Engineering
# Step 2.0 Environment & Inputs
import pandas as pd

# Read the cleaned order extracts from the processed-data folder
olist, northwind = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/olist_orders_cleaned.csv",
        "../data/processed/northwind_orders_cleaned.csv",
    )
)

# Parse the order-date columns as datetimes; values that cannot be parsed
# become NaT because of errors="coerce"
olist = olist.assign(
    order_date=pd.to_datetime(olist["order_date"], errors="coerce")
)
northwind = northwind.assign(
    orderDate=pd.to_datetime(northwind["orderDate"], errors="coerce")
)

In [None]:
# Step 2.1 — Reference Date Selection
# Step 2.1.1 Definition (applied identically to both datasets)
#   reference date = latest observed transaction date + 1 day
# Step 2.1.2 Compute Reference Dates
olist_reference_date, northwind_reference_date = (
    order_dates.max() + pd.Timedelta(1, unit="D")
    for order_dates in (olist["order_date"], northwind["orderDate"])
)

# Hard validation: each reference date must fall strictly after the latest
# order date (this also fails when the maximum is NaT, because comparisons
# against NaT evaluate to False)
assert olist["order_date"].max() < olist_reference_date
assert northwind["orderDate"].max() < northwind_reference_date

In [None]:
# Step 2.2 — Olist RFM Feature Construction
# Step 2.2.1 Aggregate to Customer Level
#
# One row per customer_unique_id with:
#   recency   — whole days between olist_reference_date and the customer's
#               latest order_date
#   frequency — number of distinct order_id values
#   monetary  — sum of payment_value
#
# The latest order date is taken with the built-in "max" aggregation and the
# day difference is computed once on the whole column, instead of calling a
# Python lambda for every customer group.
olist_rfm = (
    olist
    .groupby("customer_unique_id")
    .agg(
        last_order_date=("order_date", "max"),
        frequency=("order_id", "nunique"),
        monetary=("payment_value", "sum"),
    )
    .assign(
        recency=lambda per_customer: (
            olist_reference_date - per_customer["last_order_date"]
        ).dt.days
    )
    # Drop the helper column and keep the recency / frequency / monetary order
    .loc[:, ["recency", "frequency", "monetary"]]
    .reset_index()
)

In [4]:
# Surface customers whose total spend is zero or negative
bad_rows = olist_rfm.loc[olist_rfm["monetary"].le(0)]
print(bad_rows)

# Distribution summary of total spend per customer
print(olist_rfm["monetary"].describe())

                     customer_unique_id  recency  frequency  monetary
47882  830d5b7aaa3b6f1e9ad63703bec97d23      714          1       0.0
count    93358.000000
mean       165.197003
std        226.314012
min          0.000000
25%         63.052500
50%        107.780000
75%        182.557500
max      13664.080000
Name: monetary, dtype: float64


In [14]:
# Keep only customers with strictly positive total spend; .copy() gives an
# independent frame rather than a slice of the previous one
olist_rfm = olist_rfm.loc[olist_rfm["monetary"].gt(0)].copy()

# Rationale: zero-monetary customers cannot be meaningfully ranked in
# value-based segmentation, so they are excluded prior to scoring.

In [None]:
# Step 2.2.2 Structural Validations
# Keep customers with strictly positive spend, then keep the first row for
# each customer_unique_id
olist_rfm = (
    olist_rfm[olist_rfm["monetary"] > 0]
    .drop_duplicates(subset="customer_unique_id")
)

# Invariants the scoring cells rely on:
#   recency is at least 1 day (the reference date is the latest order + 1 day),
#   every customer has at least one order, spend is positive, ids are unique
assert olist_rfm["recency"].ge(1).all()
assert olist_rfm["frequency"].ge(1).all()
assert olist_rfm["monetary"].gt(0).all()
assert not olist_rfm["customer_unique_id"].duplicated().any()


In [None]:
# Step 2.3 — Northwind RFM Feature Construction (Independent, identical logic)
# Step 2.3.1 Aggregate to Customer Level
#
# One row per customerID with:
#   recency   — whole days between northwind_reference_date and the customer's
#               latest orderDate
#   frequency — number of distinct orderID values
#   monetary  — sum of order_revenue
#
# The latest order date is taken with the built-in "max" aggregation and the
# day difference is computed once on the whole column, instead of calling a
# Python lambda for every customer group.
northwind_rfm = (
    northwind
    .groupby("customerID")
    .agg(
        last_order_date=("orderDate", "max"),
        frequency=("orderID", "nunique"),
        monetary=("order_revenue", "sum"),
    )
    .assign(
        recency=lambda per_customer: (
            northwind_reference_date - per_customer["last_order_date"]
        ).dt.days
    )
    # Drop the helper column and keep the recency / frequency / monetary order
    .loc[:, ["recency", "frequency", "monetary"]]
    .reset_index()
)


In [None]:
# Step 2.3.2 Structural Validations
# Same invariants as for Olist: recency of at least one day, at least one
# order, positive spend, and exactly one row per customerID
assert northwind_rfm["recency"].ge(1).all()
assert northwind_rfm["frequency"].ge(1).all()
assert northwind_rfm["monetary"].gt(0).all()
assert not northwind_rfm["customerID"].duplicated().any()

In [20]:
# Step 2.4 — Quantile-Based RFM Score Construction (Scoring only, no interpretation)
# Step 2.4.1 Olist RFM Scores
#
# Each dimension is cut into up to five quantile bins. All three scores are
# derived from the integer bin codes (labels=False) rather than from a fixed
# list of five labels: pd.qcut raises a ValueError when duplicates="drop"
# removes a bin edge and the label list no longer matches the bin count.
# When all five bins exist the scores are identical to the labelled form.

# Recency: bin code 0 holds the most recent customers, so invert the code
# (lower recency = better = score 5).
olist_rfm["R_score"] = (
    5 - pd.qcut(olist_rfm["recency"], q=5, labels=False, duplicates="drop")
).astype(int)

# Frequency: bin code + 1. In the recorded run most customers have exactly one
# order, the quantile edges coincide, and every customer ends up with
# F_score == 1 (see the summary statistics further down).
olist_rfm["F_score"] = pd.qcut(
    olist_rfm["frequency"],
    q=5,
    labels=False,
    duplicates="drop"
) + 1

# Monetary: bin code + 1 (higher spend = higher score).
olist_rfm["M_score"] = (
    pd.qcut(olist_rfm["monetary"], q=5, labels=False, duplicates="drop") + 1
).astype(int)

In [21]:
# Step 2.4.2 Northwind RFM Scores
# Quintile scores per dimension. Recency labels run 5..1 so that the most
# recent customers get the highest score; frequency and monetary run 1..5.
for feature_name, score_name, quintile_labels in (
    ("recency", "R_score", [5, 4, 3, 2, 1]),
    ("frequency", "F_score", [1, 2, 3, 4, 5]),
    ("monetary", "M_score", [1, 2, 3, 4, 5]),
):
    northwind_rfm[score_name] = pd.qcut(
        northwind_rfm[feature_name],
        q=5,
        labels=quintile_labels,
    ).astype(int)

In [22]:
# Step 2.4.3 Composite RFM Code
# Concatenate the three scores, as text, in R-F-M order for each dataset
for rfm_frame in (olist_rfm, northwind_rfm):
    score_text = rfm_frame[["R_score", "F_score", "M_score"]].astype(str)
    rfm_frame["RFM"] = (
        score_text["R_score"] + score_text["F_score"] + score_text["M_score"]
    )

In [23]:
# Step 3 Validation & Inspection
# Output checks only (non-modeling): shapes, a few sample rows per dataset,
# and summary statistics for the Olist table

print("Olist RFM shape:", olist_rfm.shape)
print("Northwind RFM shape:", northwind_rfm.shape)

for sample_heading, rfm_table in (
    ("\nOlist RFM sample:", olist_rfm),
    ("\nNorthwind RFM sample:", northwind_rfm),
):
    print(sample_heading)
    display(rfm_table.head())

print("\nOlist RFM summary statistics:")
# Left as the final expression so the notebook renders it as the cell output
olist_rfm.describe()


Olist RFM shape: (93357, 8)
Northwind RFM shape: (89, 8)

Olist RFM sample:


Unnamed: 0,customer_unique_id,recency,frequency,monetary,R_score,F_score,M_score,RFM
0,0000366f3b9a7992bf8c76cfdf3221e2,112,1,141.9,4,1,4,414
1,0000b849f77a49e4a4ce2b2a4ca5be3f,115,1,27.19,4,1,1,411
2,0000f46a3911fa3c0805444483337064,537,1,86.22,1,1,2,112
3,0000f6ccb0745a6a4b88665a16c9f078,321,1,43.62,2,1,1,211
4,0004aac84e0df4da2b147fca70cf8255,288,1,196.89,2,1,4,214



Northwind RFM sample:


Unnamed: 0,customerID,recency,frequency,monetary,R_score,F_score,M_score,RFM
0,ALFKI,28,6,4273.0,3,2,2,322
1,ANATR,64,4,1402.95,2,1,1,211
2,ANTON,99,7,7023.9775,1,2,3,123
3,AROUT,27,13,13390.65,3,4,4,344
4,BERGS,64,18,24927.5775,2,5,5,255



Olist RFM summary statistics:


Unnamed: 0,recency,frequency,monetary,R_score,F_score,M_score
count,93357.0,93357.0,93357.0,93357.0,93357.0,93357.0
mean,237.937155,1.03342,165.198772,3.003299,1.0,2.999807
std,152.584344,0.209099,226.314579,1.414338,0.0,1.414251
min,1.0,1.0,9.59,1.0,1.0,1.0
25%,114.0,1.0,63.06,2.0,1.0,2.0
50%,219.0,1.0,107.78,3.0,1.0,3.0
75%,346.0,1.0,182.56,4.0,1.0,4.0
max,695.0,15.0,13664.08,5.0,1.0,5.0


## Methodological Notes and Justifications
### Frequency Score Behavior in Olist Dataset

The Olist dataset exhibits a highly right-skewed purchase distribution, where the overwhelming majority of customers placed exactly one order. As a result, the quantile edges for the frequency dimension coincide, and once duplicate edges are dropped the binning collapses into a single bin: every customer, including repeat buyers with up to 15 orders, receives an F_score of 1. This behavior reflects the underlying data characteristics rather than a methodological error. The frequency feature is retained for methodological completeness and consistency with standard RFM frameworks, but the resulting score provides no discriminative power for this dataset, so the composite RFM code effectively varies only in its R and M digits.  


### Treatment of Zero-Monetary Customers

A small number of customers (one of 93,358 in this run) exhibited zero total monetary value across all recorded transactions. These observations were excluded prior to RFM scoring, as monetary value is a required dimension for value-based customer segmentation. Including zero-monetary customers would prevent meaningful ranking and distort quantile-based scoring. The exclusion affects a negligible fraction of the dataset and does not materially impact overall results.  


### Choice of Quantile-Based RFM Scoring

Quantile-based scoring was selected to ensure scale-free, distribution-agnostic segmentation and to allow relative comparison among customers within the same dataset. Each RFM dimension is scored independently based on its empirical distribution, which avoids assumptions of normality and supports interpretability in marketing and customer analytics contexts. Scores are therefore dataset-relative and should not be directly compared across different datasets.

In [24]:
# Write both customer-level RFM tables to the processed-data folder,
# without the DataFrame index
for rfm_table, destination in (
    (olist_rfm, "../data/processed/olist_rfm.csv"),
    (northwind_rfm, "../data/processed/northwind_rfm.csv"),
):
    rfm_table.to_csv(destination, index=False)