In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
data = pd.DataFrame({'CustomerID': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7},
 'Spend': {0: 137, 1: 335, 2: 172, 3: 355, 4: 303, 5: 233, 6: 244, 7: 229}})


In [3]:
# Create a spend quartile with 4 groups - a range between 1 and 5
spend_quartile = pd.qcut(data['Spend'], q=4, labels=range(1, 5))

# Assign the quartile values to the Spend_Quartile column in data
data['Spend_Quartile'] = spend_quartile

# Print data with sorted Spend values
(data.sort_values('Spend'))

Unnamed: 0,CustomerID,Spend,Spend_Quartile
0,0,137,1
2,2,172,1
7,7,229,2
5,5,233,2
6,6,244,3
4,4,303,3
1,1,335,4
3,3,355,4


In [4]:
data = pd.DataFrame({'CustomerID': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7},
 'Recency_Days': {0: 37,
  1: 235,
  2: 396,
  3: 72,
  4: 255,
  5: 393,
  6: 203,
  7: 133}})

In [5]:
# Store labels from 4 to 1 in a decreasing order
r_labels = list(range(4, 0, -1))

# Create a spend quartile with 4 groups and pass the previously created labels 
recency_quartiles = pd.qcut(data['Recency_Days'], q=4, labels=r_labels)

# Assign the quartile values to the Recency_Quartile column in `data`
data['Recency_Quartile'] = recency_quartiles 

# Print `data` with sorted Recency_Days values
(data.sort_values('Recency_Days'))

Unnamed: 0,CustomerID,Recency_Days,Recency_Quartile
0,0,37,4
3,3,72,4
7,7,133,3
6,6,203,3
1,1,235,2
4,4,255,2
5,5,393,1
2,2,396,1


In [6]:
# Calculate Recency, Frequency and Monetary value for each customer
import datetime as dt 
online = pd.read_csv('online.csv', parse_dates=['InvoiceDate'], index_col=0)
online['TotalSum'] = online['Quantity'] * online['UnitPrice']
snapshot_date = dt.datetime(2011, 12, 10)
datamart = online.groupby(['CustomerID']).agg({
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days,
    'InvoiceNo': 'count',
    'TotalSum': 'sum'})

# Rename the columns 
datamart.rename(columns={'InvoiceDate': 'Recency',
                         'InvoiceNo': 'Frequency',
                         'TotalSum': 'MonetaryValue'}, inplace=True)

# Print top 5 rows
print(datamart.head())

            Recency  Frequency  MonetaryValue
CustomerID                                   
12747             2         27         992.82
12748             0        967        7522.06
12749             3         37         813.45
12820             3         17         268.02
12822            70          9         146.15


In [7]:
# Create labels for Recency and Frequency
r_labels = range(3, 0, -1)
f_labels = range(1, 4)

# Assign these labels to three equal percentile groups 
r_groups = pd.qcut(datamart['Recency'], q=3, labels=r_labels)

# Assign these labels to three equal percentile groups 
f_groups = pd.qcut(datamart['Frequency'], q=3, labels=f_labels)

# Create new columns R and F 
datamart = datamart.assign(R=r_groups.values, F=f_groups.values)

# Print the results
print(datamart.head())

            Recency  Frequency  MonetaryValue  R  F
CustomerID                                         
12747             2         27         992.82  3  3
12748             0        967        7522.06  3  3
12749             3         37         813.45  3  3
12820             3         17         268.02  3  3
12822            70          9         146.15  2  2


In [8]:
    # Create labels for MonetaryValue
m_labels = range(1, 4)

# Assign these labels to three equal percentile groups 
m_groups = pd.qcut(datamart['MonetaryValue'], q=3, labels=m_labels)

# Create new column M
datamart = datamart.assign(M=m_groups)

# Calculate RFM_Score
datamart['RFM_Score'] = datamart[['R','F','M']].sum(axis=1)
(datamart['RFM_Score'].head())

CustomerID
12747    9
12748    9
12749    9
12820    9
12822    6
Name: RFM_Score, dtype: int64

In [9]:
datamart[datamart.RFM_Score ==9].MonetaryValue.mean()

1342.1453533568904

In [10]:
# Define rfm_level function
def rfm_level(df):
    if df['RFM_Score'] >= 10:
        return 'Top'
    elif ((df['RFM_Score'] >= 6) and (df['RFM_Score'] < 10)):
        return 'Middle'
    else:
        return 'Low'

# Create a new variable RFM_Level
datamart['RFM_Level'] = datamart.apply(rfm_level, axis=1)

# Print the header with top 5 rows to the console
(datamart.head())

Unnamed: 0_level_0,Recency,Frequency,MonetaryValue,R,F,M,RFM_Score,RFM_Level
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12747,2,27,992.82,3,3,3,9,Middle
12748,0,967,7522.06,3,3,3,9,Middle
12749,3,37,813.45,3,3,3,9,Middle
12820,3,17,268.02,3,3,3,9,Middle
12822,70,9,146.15,2,2,2,6,Middle


In [12]:
# Calculate average values for each RFM_Level, and return a size of each segment 
rfm_level_agg = datamart.groupby('RFM_Level').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
  
  	# Return the size of each segment
    'MonetaryValue': ['mean', 'count']
}).round(1)

# Print the aggregated dataset
(rfm_level_agg)

Unnamed: 0_level_0,Recency,Frequency,MonetaryValue,MonetaryValue
Unnamed: 0_level_1,mean,mean,mean,count
RFM_Level,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Low,160.9,4.5,87.1,1651
Middle,39.8,30.9,616.9,2050
