# Customer_Segmentation - Customer Personas & Business Translation

In [1]:
# Step 5: Import libraries 
import pandas as pd
import numpy as np

# Load clustered RFM outputs
olist = pd.read_csv("../data/processed/olist_rfm_with_clusters.csv")
northwind = pd.read_csv("../data/processed/northwind_rfm_with_clusters.csv")

In [2]:
# Step 5.1: Structural validation (hard checks)
required_cols = ["recency", "frequency", "monetary", "cluster"]

for df, name in [(olist, "Olist"), (northwind, "Northwind")]:
    for col in required_cols:
        assert col in df.columns, f"{name}: Missing {col}"
    assert df[required_cols].isna().sum().sum() == 0

In [3]:
# Step 5.2: Cluster Profiling (Numerical)
# Step 5.2.1: Olist Cluster Profile
olist_cluster_profile = (
    olist
    .groupby("cluster")
    .agg(
        customers=("customer_unique_id", "count"),
        avg_recency=("recency", "mean"),
        avg_frequency=("frequency", "mean"),
        avg_monetary=("monetary", "mean"),
        total_revenue=("monetary", "sum")
    )
    .sort_values("total_revenue", ascending=False)
)

olist_cluster_profile

Unnamed: 0_level_0,customers,avg_recency,avg_frequency,avg_monetary,total_revenue
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,50644,128.076001,1.0,134.380553,6805568.73
1,37525,387.400453,1.0,133.461317,5008135.91
3,2416,239.46399,1.012831,1161.32594,2805763.47
2,2772,220.444084,2.114358,289.680253,802993.66


In [6]:
# Revenue share (business critical-Olist)
olist_cluster_profile["revenue_share_pct"] = (
    olist_cluster_profile["total_revenue"]
    / olist_cluster_profile["total_revenue"].sum()
    * 100
)

olist_cluster_profile

Unnamed: 0_level_0,customers,avg_recency,avg_frequency,avg_monetary,total_revenue,revenue_share_pct
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,50644,128.076001,1.0,134.380553,6805568.73,44.127642
1,37525,387.400453,1.0,133.461317,5008135.91,32.472999
3,2416,239.46399,1.012831,1161.32594,2805763.47,18.192708
2,2772,220.444084,2.114358,289.680253,802993.66,5.20665


In [7]:
# Step 5.2.2: Northwind Cluster Profile (Validation)
northwind_cluster_profile = (
    northwind
    .groupby("cluster")
    .agg(
        customers=("customerID", "count"),
        avg_recency=("recency", "mean"),
        avg_frequency=("frequency", "mean"),
        avg_monetary=("monetary", "mean"),
        total_revenue=("monetary", "sum")
    )
    .sort_values("total_revenue", ascending=False)
)

northwind_cluster_profile

Unnamed: 0_level_0,customers,avg_recency,avg_frequency,avg_monetary,total_revenue
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,28,23.464286,13.821429,22777.556143,637771.572
3,3,10.333333,29.666667,106504.7445,319514.2335
1,56,53.25,6.267857,5500.88275,308049.434
2,2,504.0,1.5,228.9,457.8


In [8]:
# Revenue share (business critical-northwind)
northwind_cluster_profile["revenue_share_pct"] = (
    northwind_cluster_profile["total_revenue"]
    / northwind_cluster_profile["total_revenue"].sum()
    * 100
)

northwind_cluster_profile

Unnamed: 0_level_0,customers,avg_recency,avg_frequency,avg_monetary,total_revenue,revenue_share_pct
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,28,23.464286,13.821429,22777.556143,637771.572,50.385138
3,3,10.333333,29.666667,106504.7445,319514.2335,25.242218
1,56,53.25,6.267857,5500.88275,308049.434,24.336477
2,2,504.0,1.5,228.9,457.8,0.036167


In [9]:
# Step 5.3: Persona Definition Logic (Rule-Based, Interpretable)
# Step 5.3.1: Define Persona Mapping (Olist-Calibrated)
def assign_persona(row):
    if row["avg_monetary"] > olist["monetary"].quantile(0.75) and row["avg_recency"] < olist["recency"].quantile(0.25):
        return "Champions"
    elif row["avg_monetary"] > olist["monetary"].median():
        return "High-Value Loyal"
    elif row["avg_recency"] < olist["recency"].median():
        return "Potential Loyalists"
    else:
        return "At-Risk / Low Value"

In [11]:
# Step 5.4: Assign Personas to Clusters
# Step 5.4.1: Olist Personas
olist_cluster_profile["persona"] = olist_cluster_profile.apply(assign_persona, axis=1)
olist_cluster_profile

Unnamed: 0_level_0,customers,avg_recency,avg_frequency,avg_monetary,total_revenue,revenue_share_pct,persona
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,50644,128.076001,1.0,134.380553,6805568.73,44.127642,High-Value Loyal
1,37525,387.400453,1.0,133.461317,5008135.91,32.472999,High-Value Loyal
3,2416,239.46399,1.012831,1161.32594,2805763.47,18.192708,High-Value Loyal
2,2772,220.444084,2.114358,289.680253,802993.66,5.20665,High-Value Loyal


In [12]:
# Map personas back to customer-level table
olist = olist.merge(
    olist_cluster_profile["persona"],
    left_on="cluster",
    right_index=True,
    how="left"
)

In [13]:
# Step 5.4.2: Northwind Personas (Structural Validation)
northwind_cluster_profile["persona"] = northwind_cluster_profile.apply(assign_persona, axis=1)
northwind_cluster_profile

Unnamed: 0_level_0,customers,avg_recency,avg_frequency,avg_monetary,total_revenue,revenue_share_pct,persona
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,28,23.464286,13.821429,22777.556143,637771.572,50.385138,Champions
3,3,10.333333,29.666667,106504.7445,319514.2335,25.242218,Champions
1,56,53.25,6.267857,5500.88275,308049.434,24.336477,Champions
2,2,504.0,1.5,228.9,457.8,0.036167,High-Value Loyal


In [14]:
northwind = northwind.merge(
    northwind_cluster_profile["persona"],
    left_on="cluster",
    right_index=True,
    how="left"
)

In [15]:
# Step 5.5: Persona-Level Business Metrics
# Step 5.5.1 Olist Persona Summary
olist_persona_summary = (
    olist
    .groupby("persona")
    .agg(
        customers=("customer_unique_id", "count"),
        total_revenue=("monetary", "sum"),
        avg_order_value=("monetary", "mean"),
        avg_recency=("recency", "mean")
    )
    .sort_values("total_revenue", ascending=False)
)

olist_persona_summary

Unnamed: 0_level_0,customers,total_revenue,avg_order_value,avg_recency
persona,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
High-Value Loyal,93357,15422461.77,165.198772,237.937155


In [16]:
olist_persona_summary["revenue_share_pct"] = (
    olist_persona_summary["total_revenue"]
    / olist_persona_summary["total_revenue"].sum()
    * 100
)

olist_persona_summary

Unnamed: 0_level_0,customers,total_revenue,avg_order_value,avg_recency,revenue_share_pct
persona,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
High-Value Loyal,93357,15422461.77,165.198772,237.937155,100.0


In [17]:
# Step 5.5.2: Northwind Persona Summary (Validation)
northwind_persona_summary = (
    northwind
    .groupby("persona")
    .agg(
        customers=("customerID", "count"),
        total_revenue=("monetary", "sum"),
        avg_order_value=("monetary", "mean"),
        avg_recency=("recency", "mean")
    )
    .sort_values("total_revenue", ascending=False)
)

northwind_persona_summary

Unnamed: 0_level_0,customers,total_revenue,avg_order_value,avg_recency
persona,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Champions,87,1265335.0,14544.083213,42.183908
High-Value Loyal,2,457.8,228.9,504.0


In [18]:
# Step 5.6: Business Action Mapping (Decision Translation)
business_actions = {
    "Champions": "Retention, VIP rewards, early access, premium bundles",
    "High-Value Loyal": "Upsell, cross-sell, subscription nudges",
    "Potential Loyalists": "Engagement campaigns, incentives to repeat",
    "At-Risk / Low Value": "Reactivation offers, churn prevention or cost control"
}

In [None]:
olist_persona_summary["recommended_action"] = (
    olist_persona_summary.index.map(business_actions)
)

olist_persona_summary 

Unnamed: 0_level_0,customers,total_revenue,avg_order_value,avg_recency,revenue_share_pct,recommended_action
persona,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
High-Value Loyal,93357,15422461.77,165.198772,237.937155,100.0,"Upsell, cross-sell, subscription nudges"


In [20]:
# Step 5.7: Cross-Dataset Validation (Executive Check)
comparison = pd.concat(
    [
        olist_persona_summary[["customers", "revenue_share_pct"]],
        northwind_persona_summary[["customers", "total_revenue"]]
    ],
    axis=1,
    keys=["Olist", "Northwind"]
)

comparison

Unnamed: 0_level_0,Olist,Olist,Northwind,Northwind
Unnamed: 0_level_1,customers,revenue_share_pct,customers,total_revenue
persona,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
High-Value Loyal,93357.0,100.0,2,457.8
Champions,,,87,1265335.0


In [21]:
# Step 5.8: Save Final Business Outputs
olist.to_csv("../data/processed/olist_customers_with_personas.csv", index=False)
northwind.to_csv("../data/processed/northwind_customers_with_personas.csv", index=False)

olist_cluster_profile.to_csv("../data/processed/olist_cluster_personas.csv")
northwind_cluster_profile.to_csv("../data/processed/northwind_cluster_personas.csv")
