<h4><b>Importing necessary libraries and connecting to the database</b></h4>

In [22]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Open a connection to the SQLite database file queried by the cells below.
# Note: sqlite3.connect creates the file when it does not exist, so a wrong
# path only surfaces later as a "no such table" error on the first query.
DB_PATH = "../ecommerce.db"
conn = sqlite3.connect(DB_PATH)

<h4><b>Loading and joining tables as dataset</b></h4>

In [23]:
# Load the sales fact table and the date_key -> date mapping from dim_date.
fact_sales = pd.read_sql("SELECT * FROM fact_sales", conn)
dim_date = pd.read_sql("SELECT date_key, date FROM dim_date", conn)

# Attach the calendar date to every sales row.
# validate="many_to_one" makes pandas raise MergeError if dim_date contains a
# duplicated date_key; without it such duplicates would silently multiply
# sales rows and inflate every revenue/frequency figure computed afterwards.
fact_sales = fact_sales.merge(
    dim_date,
    on="date_key",
    how="left",
    validate="many_to_one",
)
# Parse the joined column into datetime64 so max() and date arithmetic work.
fact_sales["date"] = pd.to_datetime(fact_sales["date"])

<h4><b>Defining Analysis Date (For Recency Calculations)</b></h4>

In [24]:
# Reference point for recency: the latest transaction date present in the data.
analysis_date = fact_sales.loc[:, "date"].max()
analysis_date

Timestamp('2011-12-09 00:00:00')

<h4><b>Computing Recency</b></h4>

In [25]:
# Latest purchase date per customer, kept as a flat frame (customer_id, date).
recency = fact_sales.groupby("customer_id", as_index=False)["date"].max()
# Whole days elapsed between that last purchase and the analysis date.
recency["recency_days"] = recency["date"].rsub(analysis_date).dt.days
recency.head()

Unnamed: 0,customer_id,date,recency_days
0,12346,2011-01-18,325
1,12347,2011-12-07,2
2,12348,2011-09-25,75
3,12349,2011-11-21,18
4,12350,2011-02-02,310


<h4><b>Computing Frequency</b></h4>

In [26]:
# Frequency = number of fact_sales rows recorded for each customer.
# NOTE(review): size() counts rows, so if fact_sales holds one row per line
# item this is a line-item count rather than a count of distinct orders —
# confirm that this is the intended definition of frequency.
frequency = (
    fact_sales
    .groupby("customer_id")
    .size()
    .rename("frequency")
    .reset_index()
)
frequency.head()

Unnamed: 0,customer_id,frequency
0,12346,1
1,12347,182
2,12348,31
3,12349,73
4,12350,17


<h4><b>Computing Monetary value and log transforming it</b></h4>

In [27]:
# Monetary = total revenue per customer.
monetary = (
    fact_sales
    .groupby("customer_id")["revenue"]
    .sum()
    .rename("monetary")
    .reset_index()
)
# log1p computes log(1 + x), which stays finite when a total is exactly 0.
monetary["monetary_log"] = np.log1p(monetary["monetary"])
monetary.describe()

Unnamed: 0,customer_id,monetary,monetary_log
count,4338.0,4338.0,4338.0
mean,15300.408022,2054.26646,6.593627
std,1721.808492,8989.230441,1.257578
min,12346.0,3.75,1.558145
25%,13813.25,307.415,5.731446
50%,15299.5,674.485,6.515431
75%,16778.75,1661.74,7.416222
max,18287.0,280206.02,12.543284


<h4><b>Merging Recency, Frequency, Monetary (R, F, M)</b></h4>

In [28]:
# Combine the three per-customer tables; inner joins keep only customers
# present in all of monetary, recency and frequency.
rfm = pd.merge(monetary, recency, how="inner", on="customer_id")
rfm = pd.merge(rfm, frequency, how="inner", on="customer_id")
rfm.head()

Unnamed: 0,customer_id,monetary,monetary_log,date,recency_days,frequency
0,12346,77183.6,11.253955,2011-01-18,325,1
1,12347,4310.0,8.368925,2011-12-07,2,182
2,12348,1797.24,7.494564,2011-09-25,75,31
3,12349,1757.55,7.472245,2011-11-21,18,73
4,12350,334.4,5.815324,2011-02-02,310,17


<h4><b>Quantile RFM Scoring (5-point scale)</b></h4>

In [29]:
# Quintile labels: 1 = lowest fifth, 5 = highest fifth of the scored column.
quintile_labels = [1, 2, 3, 4, 5]

# Recency: fewer days since the last purchase is better, so the labels are
# reversed (smallest recency_days quintile gets 5).
rfm["r_score"] = pd.qcut(rfm["recency_days"], q=5, labels=quintile_labels[::-1])

# Frequency and (log) monetary: larger is better, labels run 1 -> 5.
for score_col, source_col in (("f_score", "frequency"), ("m_score", "monetary_log")):
    rfm[score_col] = pd.qcut(rfm[source_col], q=5, labels=quintile_labels)

# qcut returns categoricals; cast to int to get numeric summary statistics.
rfm[["r_score", "f_score", "m_score"]].astype(int).describe()

Unnamed: 0,r_score,f_score,m_score
count,4338.0,4338.0,4338.0
mean,3.006455,2.978792,3.0
std,1.41322,1.4293,1.41454
min,1.0,1.0,1.0
25%,2.0,2.0,2.0
50%,3.0,3.0,3.0
75%,4.0,4.0,4.0
max,5.0,5.0,5.0


<h4><b>Building RFM code</b></h4>

In [30]:
# Concatenate the three scores as text in R, F, M order (e.g. 5, 5, 5 -> "555").
rfm["rfm_code"] = rfm["r_score"].astype("str").str.cat(
    [rfm["f_score"].astype("str"), rfm["m_score"].astype("str")]
)
rfm.head()

Unnamed: 0,customer_id,monetary,monetary_log,date,recency_days,frequency,r_score,f_score,m_score,rfm_code
0,12346,77183.6,11.253955,2011-01-18,325,1,1,1,5,115
1,12347,4310.0,8.368925,2011-12-07,2,182,5,5,5,555
2,12348,1797.24,7.494564,2011-09-25,75,31,2,3,4,234
3,12349,1757.55,7.472245,2011-11-21,18,73,4,4,4,444
4,12350,334.4,5.815324,2011-02-02,310,17,1,2,2,122


<h4><b>Implementing Segmentation Logic</b></h4>

In [56]:
def rfm_segmentation(row, active_min_r=3, high_value_min=4):
    """Assign one of four segment labels to a customer from its RFM scores.

    Parameters
    ----------
    row : mapping
        Anything indexable by "r_score", "f_score" and "m_score" (for example
        a DataFrame row passed by ``DataFrame.apply(..., axis=1)``). Each
        value must be convertible with ``int()``.
    active_min_r : int, default 3
        Minimum recency score for a customer to count as "Active".
    high_value_min : int, default 4
        Minimum frequency OR monetary score for a customer to count as
        "High-Value"; reaching it on either score is sufficient.

    Returns
    -------
    str
        "Active High-Value", "Active Low-Value", "Inactive High-Value" or
        "Inactive Low-Value".
    """
    r = int(row["r_score"])
    f = int(row["f_score"])
    m = int(row["m_score"])

    recency_label = "Active" if r >= active_min_r else "Inactive"
    # High value if either frequency or monetary reaches the threshold.
    value_label = "High-Value" if max(f, m) >= high_value_min else "Low-Value"
    return f"{recency_label} {value_label}"
    

# Label every customer by running rfm_segmentation once per row.
rfm["segments"] = rfm.apply(func=rfm_segmentation, axis="columns")
rfm.head()


Unnamed: 0,customer_id,monetary,monetary_log,date,recency_days,frequency,r_score,f_score,m_score,rfm_code,segments
0,12346,77183.6,11.253955,2011-01-18,325,1,1,1,5,115,Inactive High-Value
1,12347,4310.0,8.368925,2011-12-07,2,182,5,5,5,555,Active High-Value
2,12348,1797.24,7.494564,2011-09-25,75,31,2,3,4,234,Inactive High-Value
3,12349,1757.55,7.472245,2011-11-21,18,73,4,4,4,444,Active High-Value
4,12350,334.4,5.815324,2011-02-02,310,17,1,2,2,122,Inactive Low-Value


<h4><b>Creating Segmentation Summary</b></h4>

In [58]:
# Per-segment size, revenue, recency and frequency statistics, ordered from
# the highest-revenue segment down and rounded to 3 decimals for display.
segment_summary = (
    rfm.groupby("segments")
    .agg(
        no_of_customers=pd.NamedAgg(column="customer_id", aggfunc="count"),
        total_revenue=pd.NamedAgg(column="monetary", aggfunc="sum"),
        avg_revenue=pd.NamedAgg(column="monetary", aggfunc="mean"),
        median_revenue=pd.NamedAgg(column="monetary", aggfunc="median"),
        avg_recency=pd.NamedAgg(column="recency_days", aggfunc="mean"),
        median_recency=pd.NamedAgg(column="recency_days", aggfunc="median"),
        avg_frequency=pd.NamedAgg(column="frequency", aggfunc="mean"),
        median_frequency=pd.NamedAgg(column="frequency", aggfunc="median"),
    )
    .sort_values(by="total_revenue", ascending=False)
    .round(3)
)

# Each segment's share of the summed segment revenue, as a percentage.
segment_summary["revenue_share%"] = (
    segment_summary["total_revenue"]
    .div(segment_summary["total_revenue"].sum())
    .mul(100)
    .round(2)
)

segment_summary


Unnamed: 0_level_0,no_of_customers,total_revenue,avg_revenue,median_revenue,avg_recency,median_recency,avg_frequency,median_frequency,revenue_share%
segments,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Active High-Value,1646,7174640.87,4358.834,1907.17,22.296,17.0,189.249,116.0,80.51
Inactive High-Value,449,902587.241,2010.217,1285.04,152.902,127.0,83.929,71.0,10.13
Inactive Low-Value,1278,440036.992,344.317,304.83,205.201,200.0,19.185,16.0,4.94
Active Low-Value,965,394142.801,408.438,365.76,32.907,30.0,25.055,24.0,4.42
