In [1]:
#load libraries
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
#load data
df = pd.read_excel('PS_BA-Pt.2-Dataset.xlsx',sheet_name="data", index_col=None)

In [4]:
df.head()

Unnamed: 0,cartId,memberId,dateReceived,dateShipped,shippingService,value,cost,currency,itemCount,newsLetter,Year,Month,language
0,50000001,21167367,2018-10-08,2018-10-12,Expedited,307,107,GBP,1,1,2018,10,EN
1,50000031,21167119,2017-12-16,2017-12-20,Expedited,1049,237,CAD,4,0,2017,12,OT
2,50000061,16460067,2018-09-13,2018-09-14,Expedited,276,145,CAD,12,1,2018,9,CN
3,50000071,20686798,2018-11-04,2018-11-05,Ground,152,68,CAD,5,1,2018,11,CN
4,50000081,21171967,2018-05-27,2018-05-31,Express,450,219,EUR,19,1,2018,5,EN


In [5]:
df.dtypes

cartId                      int64
memberId                    int64
dateReceived       datetime64[ns]
dateShipped        datetime64[ns]
shippingService            object
value                       int64
cost                        int64
currency                   object
itemCount                   int64
newsLetter                  int64
Year                        int64
Month                       int64
language                   object
dtype: object

# build donor rfm model

In [7]:
#find min and max date
df['dateShipped'].min()
df['dateShipped'].max()

Timestamp('2018-12-01 00:00:00')

In [8]:
import datetime as dt
NOW = dt.datetime(2020,6,1)
df['dateShipped'] = pd.to_datetime(df['dateShipped'])

In [9]:
#rfm model
rfmTable = df.groupby('memberId').agg({'dateShipped': lambda x: (NOW - x.max()).days,
                                        'memberId': lambda x: len(x),  
                                        'value': lambda x: x.sum()})

rfmTable['dateShipped'] = rfmTable['dateShipped'].astype(int)
rfmTable.rename(columns={'dateShipped': 'recency', 
                         'memberId': 'frequency', 
                         'value': 'monetary_value'}, inplace=True)
rfmTable.head()

Unnamed: 0_level_0,recency,frequency,monetary_value
memberId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
37189,648,5,39289
38072,615,6,2939
44644,597,8,6255
48359,566,7,10443
53315,593,5,6252


In [10]:
#split quantiles
quantiles = rfmTable.quantile(q=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9])
quantiles = quantiles.to_dict()

In [11]:
segmented_rfm = rfmTable

In [20]:
def RScore(x,p,d):
    if x <= d[p][0.10]:
        return 1
    elif x <= d[p][0.20]:
        return 2
    elif x <= d[p][0.30]: 
        return 3
    elif x <= d[p][0.40]: 
        return 4
    elif x <= d[p][0.50]:
        return 5
    elif x <= d[p][0.60]: 
        return 6
    elif x <= d[p][0.70]: 
        return 7
    elif x <= d[p][0.80]:
        return 8
    elif x <= d[p][0.90]: 
        return 9
    else:
        return 10
    
def FMScore(x,p,d):
    if x <= d[p][0.10]:
        return 10
    elif x <= d[p][0.20]:
        return 9
    elif x <= d[p][0.30]: 
        return 8
    elif x <= d[p][0.40]: 
        return 7
    elif x <= d[p][0.50]:
        return 6
    elif x <= d[p][0.60]: 
        return 5
    elif x <= d[p][0.70]: 
        return 4
    elif x <= d[p][0.80]:
        return 3
    elif x <= d[p][0.90]: 
        return 2
    else:
        return 1

In [21]:
#segment the data
segmented_rfm['r_quartile'] = segmented_rfm['recency'].apply(RScore, args=('recency',quantiles,))
segmented_rfm['f_quartile'] = segmented_rfm['frequency'].apply(FMScore, args=('frequency',quantiles,))
segmented_rfm['m_quartile'] = segmented_rfm['monetary_value'].apply(FMScore, args=('monetary_value',quantiles,))
segmented_rfm.head()

Unnamed: 0_level_0,recency,frequency,monetary_value,r_quartile,f_quartile,m_quartile,RFMScore
memberId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
37189,648,5,39289,8,6,2,85.09.0
38072,615,6,2939,6,5,6,66.05.0
44644,597,8,6255,5,2,3,59.08.0
48359,566,7,10443,2,3,3,28.08.0
53315,593,5,6252,5,6,4,55.07.0


In [22]:
#add rfm
segmented_rfm['RFMScore'] = segmented_rfm.r_quartile.map(str) + segmented_rfm.f_quartile.map(str) + segmented_rfm.m_quartile.map(str)
segmented_rfm.head()

Unnamed: 0_level_0,recency,frequency,monetary_value,r_quartile,f_quartile,m_quartile,RFMScore
memberId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
37189,648,5,39289,8,6,2,862
38072,615,6,2939,6,5,6,656
44644,597,8,6255,5,2,3,523
48359,566,7,10443,2,3,3,233
53315,593,5,6252,5,6,4,564


In [23]:
#output
from datetime import date
from datetime import datetime
from datetime import timedelta 

current_time = datetime.now()
info = current_time.strftime('%m%d%Y')
output_filename = ("rfm_" + str(info[:4]) + str(info[6:]) + ".csv")
segmented_rfm.to_csv(output_filename, sep=',', header=True)