In [41]:
# Setup the database and save the data
import duckdb
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

df = pd.read_csv(
    '../data/ground-truth/data.csv', 
    sep=";", 
    decimal=',', 
    low_memory=False,
    parse_dates=['LastPaymentDate', 'PenultimatePaymentDate', 'Date of Birth'], 
    dayfirst=True
)
df.columns = df.columns.str.replace(' ', '_').str.lower()

df.dtypes


customer_number                    int64
gender                            object
date_of_birth             datetime64[ns]
postcode                          object
count2015                          int64
sum2015                          float64
merchandise2015                    int64
count2016                          int64
sum2016                          float64
merchandise2016                    int64
count2017                          int64
sum2017                          float64
merchandiese2017                   int64
count2018                          int64
sum2018                          float64
merchandiese2018                   int64
count2019                          int64
sum2019                          float64
merchandise2019                    int64
lastpaymentdate           datetime64[ns]
penultimatepaymentdate    datetime64[ns]
dtype: object

In [68]:
df_quantiles = duckdb.query("""
WITH summary_data AS (
    SELECT
        customer_number,
        sum(sum2015 + sum2016 + sum2017 + sum2018 + sum2019) AS total_sum,
        sum(count2015 + count2016 + count2017 + count2018 + count2019) AS total_count,
        (SELECT MAX(lastpaymentdate) from df) - lastpaymentdate AS days_since_last_payment,
    FROM df
    GROUP BY 
        customer_number,
        lastpaymentdate
),
quantiles AS (
    SELECT
        quantile(total_sum, 0.25) AS total_sum_1,
        quantile(total_sum, 0.5) AS total_sum_2,
        quantile(total_sum, 0.75) AS total_sum_3,
        quantile(total_count, 0.25) AS total_count_1,
        quantile(total_count, 0.5) AS total_count_2,
        quantile(total_count, 0.75) AS total_count_3,
        quantile(days_since_last_payment, 0.25) AS days_since_last_payment_1,
        quantile(days_since_last_payment, 0.5) AS days_since_last_payment_2,
        quantile(days_since_last_payment, 0.75) AS days_since_last_payment_3,
    FROM summary_data
)
SELECT
    customer_number,
    CASE
        WHEN total_sum < total_sum_1 THEN 1
        WHEN total_sum < total_sum_2 THEN 2
        WHEN total_sum < total_sum_3 THEN 3
        ELSE 4
    END AS monetary_quantile,
    CASE
        WHEN total_count < total_count_1 THEN 1
        WHEN total_count < total_count_2 THEN 2
        WHEN total_count < total_count_3 THEN 3
        ELSE 4
    END AS frequency_quantile,
    -- Reversed the order of the quantiles for recency
    CASE
        WHEN days_since_last_payment < days_since_last_payment_1 THEN 4
        WHEN days_since_last_payment < days_since_last_payment_2 THEN 3
        WHEN days_since_last_payment < days_since_last_payment_3 THEN 2
        ELSE 1
    END AS recency_quantile,
FROM summary_data, quantiles
""").df()
df_quantiles

Unnamed: 0,customer_number,monetary_quantile,frequency_quantile,recency_quantile
0,307205,1,2,1
1,307210,2,1,1
2,307211,3,3,4
3,307215,2,1,1
4,307226,3,4,1
...,...,...,...,...
406729,200827,1,2,1
406730,203046,2,3,3
406731,203350,4,4,4
406732,201770,1,1,2


In [72]:
df_segmented = duckdb.query("""
SELECT
    CASE
        WHEN monetary_quantile >= 4 AND frequency_quantile >= 4 AND recency_quantile >= 4 THEN 'Champions'
        WHEN monetary_quantile >= 4 AND frequency_quantile >= 4 AND recency_quantile <= 1 THEN 'Lost Champions'
        WHEN monetary_quantile >= 3 AND frequency_quantile >= 3 AND recency_quantile >= 3 THEN 'Loyal Customers'
        WHEN monetary_quantile >= 3 AND frequency_quantile >= 2 AND recency_quantile >= 4 THEN 'Potential Loyalists'
        WHEN monetary_quantile >= 2 AND frequency_quantile >= 1 AND recency_quantile >= 4 THEN 'New Customers'
        WHEN monetary_quantile >= 2 AND frequency_quantile >= 1 AND recency_quantile >= 3 THEN 'Promising'
        WHEN monetary_quantile >= 2 AND frequency_quantile >= 1 AND recency_quantile >= 2 THEN 'Hibernating'
        WHEN monetary_quantile >= 1 AND frequency_quantile >= 1 AND recency_quantile >= 1 THEN 'Lost'
    END AS customer_segment,
FROM df_quantiles
""").df()
df_segmented

Unnamed: 0,customer_segment
0,Lost
1,Lost
2,Loyal Customers
3,Lost
4,Lost
...,...
406729,Lost
406730,Promising
406731,Champions
406732,Lost


In [75]:
duckdb.query("""
SELECT
    customer_segment,
    COUNT(*) AS count
FROM df_segmented
GROUP BY customer_segment
""").df()

Unnamed: 0,customer_segment,count
0,Lost,161658
1,Loyal Customers,90474
2,Champions,35749
3,Hibernating,65566
4,Promising,27339
5,New Customers,19856
6,Potential Loyalists,2757
7,Lost Champions,3335
