In [6]:
#importing the libraries needed for this project
import pandas as pd
from datetime import datetime as dt
from datetime import timedelta
import plotly.express as px
import plotly.graph_objects as go
import plotly.colors

In [2]:
# Read online_retail.csv (path relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='online_retail.csv')

In [3]:
# Discard rows with no CustomerID; the later per-customer grouping needs one.
data = data.dropna(subset=['CustomerID'])

In [4]:
# Parse InvoiceDate into datetimes and derive the per-row amount
# (Quantity multiplied by UnitPrice) as TotalAmount.
data = data.assign(
    InvoiceDate=pd.to_datetime(data['InvoiceDate']),
    TotalAmount=data['Quantity'].mul(data['UnitPrice']),
)

In [7]:
# Snapshot date: exactly one day after the latest invoice timestamp in the data.
reference_date = data['InvoiceDate'].max() + pd.Timedelta(days=1)

In [8]:
# Display the snapshot date so it can be checked by eye.
reference_date

Timestamp('2011-12-10 12:50:00')

In [10]:
# Build one row per customer with built-in (vectorised) groupby reductions
# instead of per-group Python lambdas. The source column names are kept so the
# result has the same columns as before: InvoiceDate, InvoiceNo, TotalAmount.
rfm = data.groupby('CustomerID').agg({
    'InvoiceDate': 'max',    # latest invoice timestamp for the customer
    'InvoiceNo': 'size',     # number of rows for the customer
    'TotalAmount': 'sum'     # summed TotalAmount for the customer
    })

# Turn the latest invoice timestamp into whole days before reference_date.
rfm['InvoiceDate'] = (reference_date - rfm['InvoiceDate']).dt.days

# NOTE(review): 'size' counts rows per customer, exactly as len(x) did. If the
# file holds one row per invoice line, this is a line-item count rather than a
# count of distinct invoices - confirm which definition of frequency is wanted.

In [11]:
# Give the aggregated columns their RFM names, then preview the first rows.
rfm = rfm.rename(
    columns={
        'InvoiceDate': 'Recency',
        'InvoiceNo': 'Frequency',
        'TotalAmount': 'Value',
    }
)
rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Value
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346.0,326,2,0.0
12347.0,2,182,4310.0
12348.0,75,31,1797.24
12349.0,19,73,1757.55
12350.0,310,17,334.4


In [12]:
# Quartile cut points (25th, 50th, 75th percentile) for the three raw metrics.
# The columns are selected explicitly so this cell stays re-runnable after
# score and label columns (including string columns) have been added to rfm;
# computing quantiles over the whole frame would then fail or pick up
# columns that are not metrics.
quantiles = rfm[['Recency', 'Frequency', 'Value']].quantile(q=[0.25, 0.5, 0.75])

#Assigning the RFM Scores 
def RScore(x,p,d):
    """Return a quartile score from 1 to 4 for a single metric value.

    x -- the value to score.
    p -- metric name used to look up the cut points; 'Recency' is scored in
         reverse (small values score high), every other name scores in the
         natural direction (large values score high).
    d -- mapping such that d[p][0.25], d[p][0.50] and d[p][0.75] give the
         quartile cut points for metric p.
    """
    cut_points = d[p]

    # Find the first quartile whose cut point is not exceeded (1, 2 or 3);
    # values above every cut point fall into bucket 4.
    for bucket, q in enumerate((0.25, 0.50, 0.75), start=1):
        if x <= cut_points[q]:
            break
    else:
        bucket = 4

    # Recency is reversed: the lowest bucket gets the highest score.
    return 5 - bucket if p == 'Recency' else bucket

# Score every customer on each metric against that metric's own quartiles.
# The metric name and the cut-point table are forwarded to RScore by keyword.
rfm['R'] = rfm['Recency'].apply(RScore, p='Recency', d=quantiles)
rfm['F'] = rfm['Frequency'].apply(RScore, p='Frequency', d=quantiles)
rfm['M'] = rfm['Value'].apply(RScore, p='Value', d=quantiles)

rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Value,R,F,M
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12346.0,326,2,0.0,1,1,1
12347.0,2,182,4310.0,4,4,4
12348.0,75,31,1797.24,2,2,4
12349.0,19,73,1757.55,3,3,4
12350.0,310,17,334.4,1,1,2


In [14]:
# Combine the three scores in two ways: a concatenated three-character code
# (for example '444') and their numeric total.
rfm['RFM_Segment'] = rfm['R'].map(str) + rfm['F'].map(str) + rfm['M'].map(str)
rfm['RFM_Score'] = rfm['R'] + rfm['F'] + rfm['M']

In [15]:
# Preview the table with the new RFM_Segment and RFM_Score columns.
rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Value,R,F,M,RFM_Segment,RFM_Score
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12346.0,326,2,0.0,1,1,1,111,3
12347.0,2,182,4310.0,4,4,4,444,12
12348.0,75,31,1797.24,2,2,4,224,8
12349.0,19,73,1757.55,3,3,4,334,10
12350.0,310,17,334.4,1,1,2,112,4


In [16]:
#Assigning labels based on the RFM Score

segment_labels = ['Low Value', 'Mid Value', 'High Value']

def assign_segment(score):
    """Map a total RFM score to a value tier.

    Scores below 5 are 'Low Value', scores from 5 up to (not including) 9 are
    'Mid Value', and everything else is 'High Value'.
    """
    # Each pair is (exclusive upper bound, tier); the first bound the score
    # stays under decides the tier.
    for upper_bound, tier in ((5, 'Low Value'), (9, 'Mid Value')):
        if score < upper_bound:
            return tier
    return 'High Value'

# Translate each customer's total RFM score into its value tier, then preview.
rfm['RFM_Segment_Label'] = rfm['RFM_Score'].map(assign_segment)

rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Value,R,F,M,RFM_Segment,RFM_Score,RFM_Segment_Label
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
12346.0,326,2,0.0,1,1,1,111,3,Low Value
12347.0,2,182,4310.0,4,4,4,444,12,High Value
12348.0,75,31,1797.24,2,2,4,224,8,Mid Value
12349.0,19,73,1757.55,3,3,4,334,10,High Value
12350.0,310,17,334.4,1,1,2,112,4,Low Value


In [47]:
# Number of customers per value tier as a two-column frame
# ('RFM_Segment', 'Count'), ordered by tier name.
segment_counts = (
    rfm['RFM_Segment_Label']
    .value_counts()
    .rename_axis('RFM_Segment')
    .reset_index(name='Count')
    .sort_values('RFM_Segment')
)

In [48]:
# Bar chart of the number of customers in each value tier.
# The keys of `labels` must match column names exactly: the x column is
# 'RFM_Segment', so that is the key that renames the axis and legend title.

fig = px.bar(segment_counts,
             x='RFM_Segment',
             y='Count',
             title='RFM Customer Segmentation',
             labels={'RFM_Segment': 'RFM Segment', 'Count': 'Number of Customers'},
             color='RFM_Segment',
             color_discrete_sequence=px.colors.qualitative.Pastel
             )
fig.show()

In [22]:
# Start every customer with an empty label, then fill it in from the total
# RFM score using half-open bands [lower, upper).
rfm['RFM_Customer_Segments'] = ''
rfm_total = rfm['RFM_Score']

for band_lower, band_upper, band_name in (
    (3, 4, 'Lost'),
    (4, 5, "Can't Lose"),
    (5, 6, 'At Risk'),
    (6, 9, 'Potential Loyal'),
):
    in_band = (rfm_total >= band_lower) & (rfm_total < band_upper)
    rfm.loc[in_band, 'RFM_Customer_Segments'] = band_name

# The top band has no upper limit.
rfm.loc[rfm_total >= 9, 'RFM_Customer_Segments'] = 'VIP/Loyal'

# Customers per segment as a Series indexed by segment name, sorted by name.
segment_counts = rfm['RFM_Customer_Segments'].value_counts().sort_index()

In [23]:
# Count customers for every (value tier, customer segment) combination and
# list the largest groups first.
segment_product_counts = (
    rfm.groupby(['RFM_Segment_Label', 'RFM_Customer_Segments'])
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

In [24]:
# Two-level treemap: value tier as the outer level, customer segment nested
# inside it, sized by the 'Count' column and coloured by value tier.
treemap_segment_product = px.treemap(
    data_frame=segment_product_counts,
    path=['RFM_Segment_Label', 'RFM_Customer_Segments'],
    values='Count',
    title='RFM Customer Segments by Value',
    color='RFM_Segment_Label',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
treemap_segment_product.show()


In [31]:
# Rows of rfm whose customer segment is exactly 'VIP/Loyal'.
vip_segment = rfm.loc[rfm['RFM_Customer_Segments'].eq('VIP/Loyal')]

In [32]:
# Box plots of the raw Recency, Frequency and Value columns for the
# VIP/Loyal segment, one box per metric on a shared y-axis.
fig = go.Figure(data=[
    go.Box(y=vip_segment[metric], name=metric)
    for metric in ('Recency', 'Frequency', 'Value')
])

# Title and axis label so the figure can be read on its own, and an explicit
# show() so rendering does not depend on the cell's last-expression repr.
fig.update_layout(title='Distribution of RFM Values within the VIP/Loyal Segment',
                  yaxis_title='Metric value')
fig.show()

In [34]:
# Pairwise correlation between the R, F and M scores inside the VIP/Loyal segment.
correlation_matrix = vip_segment[['R', 'F', 'M']].corr()
heatmap = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale= 'RdBu',
    # Pin the colour range to the full correlation range so the midpoint of
    # the diverging scale always means "no correlation".
    zmin=-1,
    zmax=1,
    colorbar=dict(title='Correlation')
))
# The data plotted is the 'VIP/Loyal' segment; the title names it accordingly.
heatmap.update_layout(title='RFM VIP/Loyal Segment Correlation Matrix')

heatmap.show()

In [36]:
# Bar chart of customers per segment with the VIP/Loyal bar highlighted.
# Expects segment_counts to be the Series indexed by segment name (the result
# of value_counts() on 'RFM_Customer_Segments').
pastel_colors = plotly.colors.qualitative.Pastel
vip_color = 'rgb(158, 202, 225)'

# One colour per bar. The highlighted label must match the assigned segment
# name exactly ('VIP/Loyal'); the palette index wraps so a segment count
# larger than the palette cannot raise IndexError.
bar_colors = [vip_color if segment == 'VIP/Loyal' else pastel_colors[i % len(pastel_colors)]
              for i, segment in enumerate(segment_counts.index)]

fig = go.Figure(data=[go.Bar(x=segment_counts.index, y=segment_counts.values, marker=dict(color=bar_colors))])

fig.update_traces(marker_line_color='rgb(8, 48, 107)',
                  marker_line_width=1.5, opacity=0.6)

#Update the layout
fig.update_layout(title='Comparison of RFM Segments',
                  xaxis_title='RFM Segments',
                  yaxis_title='Number of Customers',
                  showlegend=False)
fig.show()

In [40]:
# Compare the mean R, F and M scores of every customer segment side by side.

segment_scores = rfm.groupby('RFM_Customer_Segments')[['R', 'F', 'M']].mean().reset_index()

# One bar trace per score column: (column, legend name, bar colour).
fig = go.Figure(data=[
    go.Bar(
        x=segment_scores['RFM_Customer_Segments'],
        y=segment_scores[score_column],
        name=legend_name,
        marker_color=bar_color,
    )
    for score_column, legend_name, bar_color in (
        ('R', 'Recency Score', 'rgb(158, 202, 225)'),
        ('F', 'Frequency Score', 'rgb(94, 158, 217)'),
        ('M', 'Monetary Value Score', 'rgb(32, 102, 148)'),
    )
])

# Grouped bars with titles and a visible legend.
fig.update_layout(
    title='Comparison of RFM segments based on Recency, Frequency and Monetary Value Scores',
    xaxis_title='RFM Segments',
    yaxis_title='Score',
    barmode='group',
    showlegend=True,
)

fig.show()