## Customer Segmentation



### Recency

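Let's start with recency: the number of days between each customer's most recent purchase and the latest invoice date in the dataset.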

In [8]:
import warnings
warnings.filterwarnings('ignore')

# import libraries
from datetime import datetime, timedelta
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division

import plotly.graph_objs as go
import plotly.offline as pyoff

# Read the raw transactions (decoded with the 'unicode_escape' codec)
df_data = pd.read_csv('data/uk_retail.csv', encoding='unicode_escape')

# Parse the textual InvoiceDate column into datetimes so date arithmetic works later on
df_data = df_data.assign(InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']))

# The analysis is restricted to United Kingdom rows; renumber them from 0
uk_rows = df_data.query("Country=='United Kingdom'")
df_uk = uk_rows.reset_index(drop=True)

In [9]:
# Per-customer frame that accumulates CustomerID plus every segmentation score.
# It is seeded from all customers in df_data; the inner merge further down keeps
# only those that also appear in df_max_purchase (i.e. in df_uk).
df_user = pd.DataFrame({'CustomerID': df_data['CustomerID'].unique()})

# Most recent invoice date of every UK customer
df_max_purchase = (
    df_uk.groupby('CustomerID')['InvoiceDate']
    .max()
    .reset_index()
    .rename(columns={'InvoiceDate': 'MaxPurchaseDate'})
)

# Observation point = latest purchase date across all customers;
# Recency = whole days between that point and the customer's last purchase
observation_date = df_max_purchase['MaxPurchaseDate'].max()
days_since_last_purchase = observation_date - df_max_purchase['MaxPurchaseDate']
df_max_purchase['Recency'] = days_since_last_purchase.dt.days

# Attach Recency to the user frame (default inner join on CustomerID)
df_user = df_user.merge(df_max_purchase[['CustomerID', 'Recency']], on='CustomerID')

# Histogram of Recency across customers
plot_data = [go.Histogram(x=df_user['Recency'])]
plot_layout = go.Layout(title='Recency')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)


#### Clustering based on recency

In [10]:
from sklearn.cluster import KMeans

# Elbow method: fit K-Means for k = 1..9 on Recency and record the inertia
# (within-cluster sum of squared distances) for each k.
sse = {}

# The feature matrix is kept separate from the frame that stores labels.
# Previously the labels were written into the same frame that was passed to
# fit(), so every fit for k >= 2 clustered on [Recency, previous labels]
# instead of Recency alone, which distorted the inertia curve.
recency_features = df_user[['Recency']]

# Explicit copy: storing labels must not write into a slice of df_user.
df_recency = recency_features.copy()

for k in range(1, 10):
    # random_state pins centroid initialisation so the curve is reproducible
    kmeans = KMeans(n_clusters=k, max_iter=1000, random_state=42).fit(recency_features)
    df_recency["clusters"] = kmeans.labels_  # labels of the most recent fit
    sse[k] = kmeans.inertia_

# Create a Plotly trace: one point per k
trace = go.Scatter(
    x=list(sse.keys()),
    y=list(sse.values()),
    mode='lines+markers',
    name='SSE'
)

# Create a Plotly layout
layout = go.Layout(
    title='Elbow Plot',
    xaxis=dict(title='Number of Clusters'),
    yaxis=dict(title='Inertia'),
    hovermode='closest'
)

# Create a Plotly figure
fig = go.Figure(data=[trace], layout=layout)

# Plot the figure
fig.show()

From the elbow plot, 3 looks like the optimal number of clusters. Depending on business requirements, we can go ahead with fewer or more clusters. We will select 4 for this example:

In [12]:
#build 4 clusters for recency and add it to dataframe
# random_state pins the centroid initialisation so that cluster membership
# (and the statistics reported below) is reproducible after a kernel restart.
recency_values = df_user[['Recency']]
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(recency_values)
df_user['RecencyCluster'] = kmeans.predict(recency_values)

#function for ordering cluster numbers
def order_cluster(cluster_field_name, target_field_name,df,ascending):
    """Relabel clusters so that label order follows the cluster means.

    The mean of ``target_field_name`` is computed per cluster; clusters are
    sorted by that mean and renumbered 0, 1, 2, ... in the sorted order.

    Parameters
    ----------
    cluster_field_name : str
        Column of ``df`` holding the current (arbitrary) cluster labels.
    target_field_name : str
        Numeric column whose per-cluster mean defines the ordering.
    df : pandas.DataFrame
        Input frame; it is not modified.
    ascending : bool
        True  -> the cluster with the smallest mean becomes 0.
        False -> the cluster with the largest mean becomes 0.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index in which ``cluster_field_name``
        holds the reordered labels (int64) and is the last column. Rows keep
        their original relative order; rows whose cluster label is missing
        are dropped.
    """
    # Mean of the target per cluster, then the labels sorted by that mean
    cluster_means = df.groupby(cluster_field_name)[target_field_name].mean()
    ordered_labels = cluster_means.sort_values(ascending=ascending).index
    rank_by_label = {label: rank for rank, label in enumerate(ordered_labels)}

    # Mapping the labels directly avoids a temporary 'index' column, which
    # would clash with any column of that name already present in df.
    ranks = df[cluster_field_name].map(rank_by_label)
    has_rank = ranks.notna()  # a missing label has no group, hence no rank

    df_final = (
        df.loc[has_rank]
        .drop(columns=[cluster_field_name])
        .reset_index(drop=True)
    )
    # Re-append as the last column, aligned by position
    df_final[cluster_field_name] = ranks[has_rank].astype('int64').to_numpy()
    return df_final

# Renumber recency clusters by descending mean Recency:
# cluster 0 has the highest mean Recency, the last cluster the lowest.
df_user = order_cluster(
    cluster_field_name='RecencyCluster',
    target_field_name='Recency',
    df=df_user,
    ascending=False,
)

Let's look at the descriptive statistics of the clusters:

In [19]:
# Summary statistics of Recency within each ordered recency cluster
recency_by_cluster = df_user.groupby("RecencyCluster")['Recency']
recency_by_cluster.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
RecencyCluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,478.0,304.393305,41.183489,245.0,266.25,300.0,336.0,373.0
1,568.0,184.625,31.753602,132.0,156.75,184.0,211.25,244.0
2,954.0,77.679245,22.850898,48.0,59.0,72.5,93.0,131.0
3,1950.0,17.488205,13.237058,0.0,6.0,16.0,28.0,47.0


### Frequency

In [20]:
# Frequency = number of non-null InvoiceDate entries (i.e. transaction rows) per UK customer.
# NOTE(review): this counts rows, not distinct invoices - confirm that is the intended definition.
df_frequency = (
    df_uk.groupby('CustomerID')['InvoiceDate']
    .count()
    .reset_index()
    .rename(columns={'InvoiceDate': 'Frequency'})
)

# Attach Frequency to the user frame (default inner join on CustomerID)
df_user = df_user.merge(df_frequency, on='CustomerID')

# Histogram of Frequency; customers with 1000 or more rows are left out of the plot only
frequency_to_plot = df_user.query('Frequency < 1000')['Frequency']
plot_data = [go.Histogram(x=frequency_to_plot)]
plot_layout = go.Layout(title='Frequency')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [22]:
#k-means
# random_state pins the centroid initialisation so that cluster membership
# (and the statistics displayed below) is reproducible after a kernel restart.
frequency_values = df_user[['Frequency']]
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(frequency_values)
df_user['FrequencyCluster'] = kmeans.predict(frequency_values)

#order the frequency cluster
# ascending=True: cluster 0 has the lowest mean Frequency, the last cluster the highest
df_user = order_cluster('FrequencyCluster', 'Frequency',df_user,True)

#see details of each cluster
df_user.groupby('FrequencyCluster')['Frequency'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
FrequencyCluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3496.0,49.525744,44.954212,1.0,15.0,33.0,73.0,190.0
1,429.0,331.221445,133.85651,191.0,228.0,287.0,399.0,803.0
2,22.0,1313.136364,505.934524,872.0,988.5,1140.0,1452.0,2782.0
3,3.0,5917.666667,1805.062418,4642.0,4885.0,5128.0,6555.5,7983.0
