In [3]:
# import
import pandas as pd

In [18]:
#load csv and convert few datetime objects
fact =pd.read_csv("Cleaned/olist_fact_sales.csv",parse_dates=['order_purchase_timestamp',
                                                                              'order_approved_at',
                                                                              'order_delivered_carrier_date',
                                                                              'order_delivered_customer_date',
                                                                              'order_estimated_delivery_date'],encoding='utf-8')

In [20]:
#drop unwanted column
fact.drop(columns='Unnamed: 0',inplace=True)

In [21]:
fact.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 25 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_carrier_date   97658 non-null  datetime64[ns]
 6   order_delivered_customer_date  96476 non-null  datetime64[ns]
 7   order_estimated_delivery_date  99441 non-null  datetime64[ns]
 8   order_approved_flag            99441 non-null  bool          
 9   order_shipped_flag             99441 non-null  bool          
 10  order_delivered_flag           99441 non-null  bool          
 11  item_revenue   

In [27]:
# reference date
ref_date = fact['order_purchase_timestamp'].max() + pd.Timedelta(days=1)

In [28]:
ref_date

Timestamp('2018-10-18 17:30:18')

In [40]:
# RFM analysis
rfm=(fact.groupby('customer_id').agg(Recency=('order_purchase_timestamp',lambda x: (ref_date - x.max()).days),
                               Frequency =('order_id','nunique'),
                               Monetary=('total_payment_value', 'sum')).reset_index())

In [41]:
rfm.head(3)

Unnamed: 0,customer_id,Recency,Frequency,Monetary
0,00012a2ce6f8dcda20d059ce98491703,338,1,114.74
1,000161a058600d5901f007fab4c27140,459,1,67.41
2,0001fd6190edaaf884bcaf3d49edf079,597,1,195.42


In [42]:
#sanity check
rfm.describe()

Unnamed: 0,Recency,Frequency,Monetary
count,99441.0,99441.0,99441.0
mean,290.900192,1.0,160.988648
std,153.667316,0.0,221.950728
min,1.0,1.0,0.0
25%,167.0,1.0,62.01
50%,272.0,1.0,105.29
75%,401.0,1.0,176.97
max,773.0,1.0,13664.08


In [80]:
# customer segments
rfm['R_score'] =pd.qcut(x=rfm['Recency'],q=5,labels=range(5,0,-1))
rfm['M_score']=pd.qcut(x=rfm['Monetary'],q=5,labels=range(1,6,1))
rfm['F_score'] = 1

def rm_segment(row):
    if row['R_score'] >= 4 and row['M_score'] >= 4:
        return 'Champions'
    elif row['R_score'] >= 4 and row['M_score'] <= 3:
        return 'New Customers'
    elif row['R_score'] <= 2 and row['M_score'] >= 4:
        return 'At Risk High Value'
    elif row['R_score'] <= 2 and row['M_score'] <= 2:
        return 'Low Value'
    else:
        return 'Regular'

rfm['customer_segment'] = rfm.apply(rm_segment,axis=1)

In [82]:
# sanity check
rfm['customer_segment'].value_counts()

customer_segment
Regular               27744
New Customers         23425
Low Value             16405
Champions             16402
At Risk High Value    15465
Name: count, dtype: int64

In [89]:
# sanity check 
rfm.groupby('customer_segment')[['Recency','Monetary']].mean()

Unnamed: 0_level_0,Recency,Monetary
customer_segment,Unnamed: 1_level_1,Unnamed: 2_level_1
At Risk High Value,448.628193,303.302618
Champions,143.003902,297.736188
Low Value,449.455349,54.774681
New Customers,141.488111,71.683107
Regular,322.813834,139.023671


In [91]:
rfm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   customer_id       99441 non-null  object  
 1   Recency           99441 non-null  int64   
 2   Frequency         99441 non-null  int64   
 3   Monetary          99441 non-null  float64 
 4   R_score           99441 non-null  category
 5   M_score           99441 non-null  category
 6   F_score           99441 non-null  int64   
 7   customer_segment  99441 non-null  object  
dtypes: category(2), float64(1), int64(3), object(2)
memory usage: 4.7+ MB


In [92]:
# export  file

rfm.to_csv("Cleaned/Olist_RFM_analysis.csv")
