# Overview

Notebook ini digunakan untuk analisa customer seperti **RFM Analysis** dan **Churn Analysis**

# Libaries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
from datetime import datetime, timedelta
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.colors import sample_colorscale

from lifelines import KaplanMeierFitter

# For notebook display
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

import plotly.io as pio
pio.renderers.default = 'notebook_connected'

# Load Data

In [2]:
df = pd.read_parquet('data/preprocessed/cleaned_data.parquet')

# RFM Analysis

Salah satu hal yang common untuk dianalisa adalah, bagaimana komposisi customer kita? berapa banyak top customer, berapa yang new customer, berapa yang lost, etc

Hal itu dapat di achieve oleh RFM analysis (Recency Frequency Monetary)

In [3]:
df = df[~df['order_id_cancelled']]

# Calculate RFM metrics per customer
now = df['order_date'].max() + timedelta(days=1) # Latest date untuk recency
rfm = (
    df.groupby('customer_id')
      .agg(
          recency=('order_date', lambda x: (now - x.max()).days), # Recency
          frequency=('order_id', 'nunique'), # Frequency
          monetary=('adjusted_gmv', 'sum') # Monetary
      )
      .reset_index()
)

# Rank metric tersebut
rfm['recency_rank'] = rfm['recency'].rank(method='first')
rfm['frequency_rank'] = rfm['frequency'].rank(method='first')
rfm['monetary_rank'] = rfm['monetary'].rank(method='first')

# Bagi jadi 4 quartil
try: # In case data tidak cukup, wrap dengan try except
    # Recency makin kecil makin bagus (4)
    rfm['R'] = pd.qcut(rfm['recency_rank'], 4, labels=[4,3,2,1]).astype(int)
except ValueError:
    rfm['R'] = 1

try:
    # Frequency makin besar makin bagus (4)
    rfm['F'] = pd.qcut(rfm['frequency_rank'], 4, labels=[1,2,3,4]).astype(int)
except ValueError:
    rfm['F'] = 1

try:
    # Monetary makin besar makin bagus (4)
    rfm['M'] = pd.qcut(rfm['monetary_rank'], 4, labels=[1,2,3,4]).astype(int)
except ValueError:
    rfm['M'] = 1

# Kalau data terlalu sedikit, tidak bisa pakai RFM
if rfm['recency'].nunique() <= 1 or rfm['frequency'].nunique() <= 1 or rfm['monetary'].nunique() <= 1:
    print("Not enough variation in RFM data to compute segments.")
    

# RFM score nya dari sum score
rfm['RFM_Score'] = rfm[['R','F','M']].sum(axis=1)

# Penamaan manual untuk segment
def name_segment(row):
    if row['R']==4 and row['F']==4 and row['M']==4:
        return 'Champions'
    if row['R']>=3 and row['F']>=3 and row['M']>=3:
        return 'Loyal Customers'
    if row['R']==4:
        return 'Recent Customers'
    if row['F']>=3:
        return 'Frequent Customers'
    if row['M']>=3:
        return 'Big Spenders'
    if row['R']==1 and row['F']==1 and row['M']==1:
        return 'Lost Low Value'
    return 'Needs Attention'
rfm['Segment'] = rfm.apply(name_segment, axis=1)

# Compute range dari masing masing segment
segment_stats = (
    rfm.groupby('Segment')
       .agg(
           recency_min=('recency','min'), recency_max=('recency','max'),
           freq_min=('frequency','min'), freq_max=('frequency','max'),
           mon_min=('monetary','min'), mon_max=('monetary','max'),
           cust_count=('customer_id','count')
       )
).reset_index()

# Total customer untuk persentase
total_customers = len(rfm)
# Persentase
segment_stats['Pct_Customers'] = (segment_stats['cust_count'] / total_customers * 100).round(1).astype(str) + '%'

# Merge back data segment ke rfm
rfm = rfm.merge(segment_stats, on='Segment', how='left')

# Buat kolom untuk label
rfm['Recency_Range'] = rfm.apply(lambda x: f"{int(x.recency_min)}–{int(x.recency_max)} days", axis=1)
rfm['Frequency_Range'] = rfm.apply(lambda x: f"{int(x.freq_min)}–{int(x.freq_max)} orders", axis=1)
rfm['Monetary_Range'] = rfm.apply(lambda x: f"£{x.mon_min:,.0f}–£{x.mon_max:,.0f}", axis=1)
rfm['Customer_Pct'] = rfm['Pct_Customers']

# Untuk keperluan count customer
rfm['count'] = 1

# Plotly Treemap
fig = px.treemap(
    rfm,
    path=['Segment'], # Group yang mau di gambarkan
    values='count', # Fokus ke count customer per segment
    color='RFM_Score',
    title='Customer RFM Segments',
    color_continuous_scale=[[0, 'lightgreen'], [1, 'darkgreen']],
    # Hover text
    custom_data=['Recency_Range','Frequency_Range','Monetary_Range','Customer_Pct']
)
# Custom hover template untuk persentase customer
fig.update_traces(
    hovertemplate=(
        '<b>%{label}</b><br>' +
        '%{customdata[0]}<br>' +
        '%{customdata[1]}<br>' +
        '%{customdata[2]}<br>' +
        'Share of Customers: %{customdata[3]}<extra></extra>'
    )
)

# Layouting
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25), height=500)

fig.show()

Error importing optional module jupyterlab
Traceback (most recent call last):
  File "C:\Users\PC\anaconda3\envs\Kalbe-test\Lib\site-packages\_plotly_utils\optional_imports.py", line 28, in get_module
    return import_module(name)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\PC\anaconda3\envs\Kalbe-test\Lib\importlib\__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<frozen importlib._bootstrap>", line 1204, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1176, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1147, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 690, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 940, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "C:\Users\PC\anaconda3\envs\Kalbe-test\Lib\site-packages\jupyterlab

# Order Frequency

Pertanyaan yang muncul selanjutnya adalah, kok mayoritas customer kita need attention? emang seberapa sering sih customer belanja di toko kita? mari kita plot frekuensi pembelian customer

In [4]:
# Hitung ada berapa unique order per customer
freq = df.groupby('customer_id')['order_id'].nunique()
# Set custom range untuk bar nya
bins = [1,2,4,7,11, np.inf]
labels = ['1','2-3','4-6','7-10','11+']

# Kelompokan data ke custom range tersebut
freq_binned = pd.cut(freq, bins=bins, labels=labels, right=False)

# Hitung count per label (ada berapa customer yang 1 kali pembelian, 2-3 kali pembelian, dst
dist = freq_binned.value_counts().reindex(labels).reset_index()
dist.columns = ['Order Frequency', 'Count']

# Plot dengan barplot
fig = px.bar(
    dist,
    x='Order Frequency',
    y='Count',
    title='Order Frequency Distribution',
    text='Count',
    height=500
)
fig.update_traces(marker_color='darkgreen', textposition='outside')

fig.show()

# New Customer Trend

Hal selanjutnya yang mungkin dicek adalah, bagaimana akuisisi new customer kita? dan bagaimana pula customer existing kita? berapa proporsinya?

In [5]:
df['month'] = df['order_date'].dt.to_period('M').dt.to_timestamp()

# Cari first order month dari customer, untuk flagging 'new'
first_orders = df.groupby('customer_id')['month'].min()
df['first_month'] = df['customer_id'].map(first_orders)

# Cocokan mana yang jadi pembelian pertama
new_customers = df[df['month'] == df['first_month']]

# Hitung monthly unique new customer
new_monthly = new_customers.groupby('month')['customer_id'].nunique().reset_index(name='new_customers')

# Hitung monthly unique customer
total_monthly = df.groupby('month')['customer_id'].nunique().reset_index(name='total_customers')

# Existing customers = total - new (agar tidak double perhitungan)
# Gabungkan total bulanan unique customer dan new customer
merged = pd.merge(total_monthly, new_monthly, on='month', how='left').fillna(0)
# Selisih sebagai existing
merged['existing_customers'] = merged['total_customers'] - merged['new_customers']

# ubah jadi long form (month, tipe (new/existing), customers (count)) agar mudah di plot
area_data = merged[['month', 'new_customers', 'existing_customers']].melt(
    id_vars='month', var_name='type', value_name='customers')

# Plotly area chart
fig = px.area(
    area_data,
    x='month',
    y='customers',
    color='type', # Grup berdasarkan tipe
    title='Monthly New vs Existing Customers',
    labels={'month': 'Month', 'customers': 'Customers', 'type': 'Customer Type'},
    height=500,
    color_discrete_map={ # Miripkan palette, hijau untuk customer
        'new_customers': 'lightgreen',
        'existing_customers': 'darkgreen'
    }
)
fig.update_traces( # Hover text
    hovertemplate='<b>%{fullData.name}</b><br>%{x|%b %Y}<br>Customers: %{y:,}<extra></extra>'
)
fig.update_layout(legend_title_text='Customer Type', xaxis=dict(tickformat='%b %Y'))

fig.show()

# Days between Purchase

Salah satu strategi yang bisa dilakukan untuk customer yang need attention adalah, strategi reminder. Namun berapa lama seorang customer harus di reminder setelah pembelanjaan terakhirnya? hal itu bisa dijawab dengan survival analysis

In [6]:
# Ambil unique customer id dan tanggal pembelian (untuk menghindari pembelian di hari yang sama)
df = df[['customer_id', 'order_date_only']].drop_duplicates()
# Sort per customer berdasarkan tanggal
df = df.sort_values(['customer_id', 'order_date_only'])

# Convert ke datetime
df['order_date_only'] = pd.to_datetime(df['order_date_only'])

# Hitung pembelian sebelumnya per customer
df['prev_date'] = df.groupby('customer_id')['order_date_only'].shift(1)

# Hitung selisih harinya
df['interval'] = (df['order_date_only'] - df['prev_date']).dt.days

# Gunakan konsep survival agar semua data dipakai dan tidak bias
# Misal customer baru 1 kali pembelian, pembelian pertamanya 1 bulan lalu
# Berarti days between purchase nya adalah 30 hari++ (>30 hari, tapi tidak tahu kapan / censored)

# Get latest date
analysis_date = df['order_date_only'].max()

# Get pembelian terakhir (tidak punya interval)
last_orders = df[df['interval'].isnull()]

# Hitung intervalnya dengan tanggal sekarang
last_orders['interval'] = (analysis_date - last_orders['order_date_only']).dt.days

# Flag sebagai censored data
last_orders['event_observed'] = 0  

# Yang intervalnya tidak null, berarti observed data
observed = df.dropna(subset=['interval']).copy()
observed['event_observed'] = 1

# Combine them
survival_df = pd.concat([
    observed[['interval', 'event_observed']],
    last_orders[['interval', 'event_observed']]
])

# Fit Kaplan-Meier untuk estimasi survival function
kmf = KaplanMeierFitter()
kmf.fit(durations=survival_df['interval'], event_observed=survival_df['event_observed'])

# Hitung Cumulative Distribution Function (CDF)
# CDF = 1 - Survival
surv_df = kmf.survival_function_.reset_index()
surv_df.columns = ['days_between', 'survival_rate']
surv_df['cdf'] = 1 - surv_df['survival_rate']

# Filter untuk 3 bulan pertama
surv_df = surv_df[surv_df['days_between'] <= 90]

# Cari median nya
median_day = kmf.median_survival_time_

# Plot
fig = go.Figure()

# Add line dengan plotly scatter
fig.add_trace(go.Scatter(
    x=surv_df['days_between'],
    y=surv_df['cdf'],
    mode='lines',
    name='CDF',
    line=dict(color='darkgreen', width=3)
))

# Horizontal 50% line (median)
fig.add_trace(go.Scatter(
    x=[0, surv_df['days_between'].max()],
    y=[0.5, 0.5],
    mode='lines',
    name='50% Mark',
    line=dict(color='gray', dash='dash'),
    showlegend=False
))

# Vertical line di median
if np.isfinite(median_day):
    # Vertical line at median
    fig.add_trace(go.Scatter(
        x=[median_day, median_day],
        y=[0, 0.5],
        mode='lines',
        line=dict(color='gray', dash='dash'),
        name=f'Median: {int(median_day)} days',
    ))
else:
    # Add annotation kalau data tidak cukup
    fig.add_annotation(
        x=surv_df['days_between'].max(),
        y=0.5,
        text="Median not reached",
        showarrow=False,
        font=dict(color="gray", size=12),
        xanchor='right'
    )

# Layouting
fig.update_layout(
    title='Cumulative Distribution of Days Between Purchases',
    xaxis_title='Days Between Purchases',
    yaxis_title='Proportion',
    height=500,
    hovermode='x unified'
)
fig.update_yaxes(tickformat='.0%', range=[0, 1])
fig.update_xaxes(range=[0, 90])

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

