In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from IPython.display import display

# RFM Analysis
- RFM (Recency, Frequency, Monetary) analysis is a marketing technique used to determine which customers are the best ones
- RFM examines
    + **Recency**: how recently a customer has purchased
    + **Frequency**: how often they purchase
    + **Monetary**: how much the customer spends

- Customers are assigned a ranking score of 1,2,3, or 4 (4 = highest) for each RFM parameter

In [2]:
# Load the sales export and keep only the four columns the RFM analysis uses.
df = pd.read_csv('./datasets/sales_data.csv')[
    ['CUSTOMERNAME', 'ORDERNUMBER', 'ORDERDATE', 'SALES']
]

df.head(3)

Unnamed: 0,CUSTOMERNAME,ORDERNUMBER,ORDERDATE,SALES
0,Land of Toys Inc.,10107,2/24/2003 0:00,2871.0
1,Reims Collectables,10121,5/7/2003 0:00,2765.9
2,Lyon Souveniers,10134,7/1/2003 0:00,3884.34


## Recency
- Recency = how recently a customer has purchased a product
- Algorithm: **recency(x) = most recent date - recent date(x)** (in days)
    + **recent date(x)**: the date of the last order of customer x
    + **most recent date**: the date of the most recent order in the dataset
    - smaller recency = higher score

In [3]:
# Parse the ORDERDATE strings into datetimes so they can be compared and subtracted
df['ORDERDATE'] = pd.to_datetime(df['ORDERDATE'])

# Date of each customer's last order (a Series indexed by CUSTOMERNAME)
recent_dates_df = df.groupby('CUSTOMERNAME')['ORDERDATE'].max()

# Latest order date anywhere in the dataset; the reference point for recency
most_recent_date = df['ORDERDATE'].max()

# Recency in whole days. Vectorized timedelta arithmetic replaces the
# per-element Python lambda and yields the same integer day counts.
recency = (most_recent_date - recent_dates_df).dt.days

In [4]:
# Preview recency: days between each customer's last order and the latest order in the dataset
recency.head(3)

CUSTOMERNAME
AV Stores, Co.        195
Alpha Cognac           64
Amica Models & Co.    264
Name: ORDERDATE, dtype: int64

## Frequency
- Frequency = A measure of how often a customer purchases a product
- Algorithm
    + Calculate the total number of times a customer has made an order

In [5]:
############ Note: We have to group twice ############
# One customer can make multiple orders
# One order can contain multiple products

In [6]:
# First grouping pass: number of rows recorded for each (customer, order) pair
frequency = (
    df.groupby(['CUSTOMERNAME', 'ORDERNUMBER'])
    .size()
)

frequency.head(10)

CUSTOMERNAME             ORDERNUMBER
AV Stores, Co.           10110          16
                         10306          17
                         10332          18
Alpha Cognac             10136           3
                         10178          12
                         10397           5
Amica Models & Co.       10280          17
                         10293           9
Anna's Decorations, Ltd  10148          14
                         10169          13
dtype: int64

In [7]:
# Number of distinct orders per customer.
# Both grouping passes are recomputed from `df` rather than re-grouping the
# `frequency` variable in place, so this cell returns the same result however
# many times it is run (re-grouping its own output would collapse every
# customer to 1).
frequency = (
    df.groupby(['CUSTOMERNAME', 'ORDERNUMBER'])
    .size()                           # one entry per (customer, order) pair
    .groupby(level='CUSTOMERNAME')
    .size()                           # count those entries per customer
)

frequency.head(10)

CUSTOMERNAME
AV Stores, Co.                  3
Alpha Cognac                    3
Amica Models & Co.              2
Anna's Decorations, Ltd         4
Atelier graphique               3
Australian Collectables, Ltd    3
Australian Collectors, Co.      5
Australian Gift Network, Co     3
Auto Assoc. & Cie.              2
Auto Canal Petit                3
dtype: int64

## Monetary
- measure of how much the customer spent

In [8]:
# Total sales value per customer
monetary = (
    df.groupby('CUSTOMERNAME')['SALES']
    .sum()
)

monetary.head()

CUSTOMERNAME
AV Stores, Co.             157807.81
Alpha Cognac                70488.44
Amica Models & Co.          94117.26
Anna's Decorations, Ltd    153996.13
Atelier graphique           24179.96
Name: SALES, dtype: float64

## Summary

In [9]:
# Assemble the three metrics into one table. The row index comes from
# `recency`; `frequency` and `monetary` are aligned to it.
rfm_table = recency.to_frame(name='recency').assign(
    frequency=frequency,
    monetary=monetary,
)

rfm_table.head()

Unnamed: 0_level_0,recency,frequency,monetary
CUSTOMERNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"AV Stores, Co.",195,3,157807.81
Alpha Cognac,64,3,70488.44
Amica Models & Co.,264,2,94117.26
"Anna's Decorations, Ltd",83,4,153996.13
Atelier graphique,187,3,24179.96


#### Assign scores

In [10]:
def assign_score(x, quantile_values, attribute):
    '''
    Map one RFM value to a quartile score from 1 to 4 (4 = best).

    recency: lower value = higher score
    frequency and monetary: higher value = higher score

    Input:
        x = the value to score
        quantile_values = table indexed by 0.25, 0.5 and 0.75 with one
            column per attribute, read as quantile_values.loc[q, attribute]
        attribute = 'recency', 'frequency', or 'monetary'

    Output:
        int from 1 to 4; a value equal to a quartile boundary falls into
        the lower-valued bucket

    Raises:
        ValueError if attribute is not one of the three names above
    '''
    # Fail loudly on a misspelled attribute instead of silently returning None
    if attribute not in ('recency', 'frequency', 'monetary'):
        raise ValueError(
            "attribute must be 'recency', 'frequency', or 'monetary', "
            "got {!r}".format(attribute))

    # Quartile bucket: 1 holds the smallest values, 4 the largest.
    # Each elif only runs when the previous bound was exceeded, so a
    # lower-bound check is unnecessary.
    if x <= quantile_values.loc[0.25, attribute]:
        bucket = 1
    elif x <= quantile_values.loc[0.5, attribute]:
        bucket = 2
    elif x <= quantile_values.loc[0.75, attribute]:
        bucket = 3
    else:
        bucket = 4

    # recency is the only attribute where smaller is better, so flip the scale
    if attribute == 'recency':
        return 5 - bucket

    # frequency and monetary
    return bucket


# Quartile boundaries (25th, 50th and 75th percentiles) of every RFM column
quantile_df = rfm_table.quantile(q=[0.25, 0.5, 0.75])

# Score each attribute against its own quartiles
rfm_table['r_score'] = rfm_table['recency'].apply(
    lambda value: assign_score(value, quantile_df, 'recency'))
rfm_table['f_score'] = rfm_table['frequency'].apply(
    lambda value: assign_score(value, quantile_df, 'frequency'))
rfm_table['m_score'] = rfm_table['monetary'].apply(
    lambda value: assign_score(value, quantile_df, 'monetary'))

# Overall RFM score is the plain sum of the three component scores
rfm_table['rfm_score'] = (
    rfm_table['r_score']
    + rfm_table['f_score']
    + rfm_table['m_score']
)

# Keep only the score columns, highest total first
rfm_table = (
    rfm_table[['r_score', 'f_score', 'm_score', 'rfm_score']]
    .sort_values(by='rfm_score', ascending=False)
)

In [11]:
# Show the ten highest-scoring customers (rfm_table is sorted by rfm_score, descending)
rfm_table.head(10)

Unnamed: 0_level_0,r_score,f_score,m_score,rfm_score
CUSTOMERNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
La Rochelle Gifts,4,4,4,12
The Sharp Gifts Warehouse,4,4,4,12
Mini Gifts Distributors Ltd.,4,4,4,12
Reims Collectables,4,4,4,12
Salzburg Collectables,4,4,4,12
Souveniers And Things Co.,4,4,4,12
Euro Shopping Channel,4,4,4,12
Danish Wholesale Imports,4,4,4,12
Diecast Classics Inc.,4,4,4,12
Technics Stores Inc.,3,4,4,11
