# E-Commerce-Analysis-and-Prediction-Modelling: RFM Segmentation

In [None]:
import os
import pandas as pd
import numpy as np

# Make sure the processed-data folder exists (the RFM table is written there later).
os.makedirs(name="data/processed", exist_ok=True)

# Read the feature-enriched transactions, parsing InvoiceDate as datetimes
df = pd.read_csv(
    "data/processed/transactions_with_features.csv",
    parse_dates=["InvoiceDate"],
)

In [9]:
# Reference point for recency: one day after the latest invoice in the data
latest_invoice = df["InvoiceDate"].max()
snapshot_date = latest_invoice + pd.Timedelta(days=1)
print("Snapshot date:", snapshot_date)


def days_since_last_purchase(invoice_dates):
    """Return whole days between the snapshot date and the latest date in `invoice_dates`."""
    return (snapshot_date - invoice_dates.max()).days


# One row per customer with the three RFM metrics
rfm = (
    df.groupby("CustomerID")
    .agg(
        Recency=("InvoiceDate", days_since_last_purchase),  # days since last purchase
        Frequency=("InvoiceNo", "nunique"),                 # number of distinct invoices
        Monetary=("TotalAmount", "sum"),                    # total spent
    )
    .reset_index()
)

print("RFM base table:")
print(rfm.head())

# Defensive cleanup: discard customers missing any of the three metrics
rfm = rfm.dropna(subset=["Recency", "Frequency", "Monetary"]).copy()

Snapshot date: 2025-12-09 23:48:00
RFM base table:
  CustomerID  Recency  Frequency  Monetary
0   CUST1000    157.0         11  18138.19
1   CUST1001    155.0         11  31621.84
2   CUST1002      3.0         14  33668.52
3   CUST1003    429.0          6  10754.45
4   CUST1004     62.0         12  24496.20


In [10]:
# Quartile-score each RFM metric (1 = worst, 4 = best).
#
# Ranking with method="first" gives tied values distinct ranks, so qcut does not
# break on ties when many customers share the same value.
# NOTE(review): ties are broken by row order, so customers with identical
# Recency/Frequency/Monetary values can end up with different scores.
r_rank = rfm["Recency"].rank(method="first")     # lower is better (more recent)
f_rank = rfm["Frequency"].rank(method="first")   # higher is better
m_rank = rfm["Monetary"].rank(method="first")    # higher is better

# labels=False -> numeric bins: 0,1,2,3
r_bins = pd.qcut(r_rank, 4, labels=False, duplicates="drop")
f_bins = pd.qcut(f_rank, 4, labels=False, duplicates="drop")
m_bins = pd.qcut(m_rank, 4, labels=False, duplicates="drop")

# Translate bins into scores, leaving unbinned customers as NaN for now.
raw_scores = pd.DataFrame({
    # Recency: lower recency (more recent) => higher score
    "R_Score": r_bins.map({0: 4, 1: 3, 2: 2, 3: 1}),
    # Frequency: higher frequency => higher score
    "F_Score": f_bins.map({0: 1, 1: 2, 2: 3, 3: 4}),
    # Monetary: higher spend => higher score
    "M_Score": m_bins.map({0: 1, 1: 2, 2: 3, 3: 4}),
})

# Check for missing scores BEFORE applying the fallback: once the values have
# been through fillna(1).astype(int) the check could only ever report zeros.
print("\nScore NaN check:")
print(raw_scores.isna().sum())

# Any customer that could not be binned falls back to the lowest score (1).
for score_col in raw_scores.columns:
    rfm[score_col] = raw_scores[score_col].fillna(1).astype(int)

# Three-digit string code, e.g. R=4, F=3, M=2 -> "432"
rfm["RFM_Score"] = (
    rfm["R_Score"].astype(str) +
    rfm["F_Score"].astype(str) +
    rfm["M_Score"].astype(str)
)

print("\nRFM with scores:")
print(rfm.head())



Score NaN check:
R_Score    0
F_Score    0
M_Score    0
dtype: int64

RFM with scores:
  CustomerID  Recency  Frequency  Monetary  R_Score  F_Score  M_Score  \
0   CUST1000    157.0         11  18138.19        3        2        1   
1   CUST1001    155.0         11  31621.84        3        2        3   
2   CUST1002      3.0         14  33668.52        4        4        3   
3   CUST1003    429.0          6  10754.45        1        1        1   
4   CUST1004     62.0         12  24496.20        4        3        2   

  RFM_Score  
0       321  
1       323  
2       443  
3       111  
4       432  


In [None]:
# Define segments based on RFM scores.
# np.select assigns the FIRST matching condition, so the rules are ordered from
# most to least valuable; anything matching none of them becomes "Low Value".
conditions = [
    (rfm["R_Score"] >= 3) & (rfm["F_Score"] >= 3) & (rfm["M_Score"] >= 3),    # good on all 3
    (rfm["R_Score"] >= 3) & (rfm["F_Score"] >= 2),                            # recent & frequent
    (rfm["R_Score"] >= 2) & (rfm["F_Score"] >= 2),                            # average but engaged
    # Used to be active: bought repeatedly (F >= 2) but sits in the least-recent
    # quartile (R == 1). The lowest-frequency quartile (F == 1) is deliberately
    # left to fall through to "Low Value", whatever its recency.
    (rfm["R_Score"] < 2) & (rfm["F_Score"] >= 2),
]

choices = ["VIP", "Loyal", "Regular", "At Risk"]

rfm["Segment"] = np.select(conditions, choices, default="Low Value")

print("\nSegment counts:")
print(rfm["Segment"].value_counts())


Segment counts:
Segment
Low Value    2223
VIP          2060
Regular      1660
Loyal        1616
At Risk      1333
Name: count, dtype: int64


In [None]:
# Persist the segmented RFM table as CSV, leaving out the row index
rfm_path = "data/processed/rfm_table.csv"
rfm.to_csv(path_or_buf=rfm_path, index=False)

print(f"\nRFM table with segments saved to: {rfm_path}")



RFM table with segments saved to: data/processed/rfm_table.csv
