<a href="https://colab.research.google.com/github/cinnData/DataSci/blob/main/Notebooks/retail.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Example - RFM-based segmentation in online retail transaction data

### Importing the data

In [None]:
import pandas as pd

In [None]:
# Base URL of the data folder and full address of the zipped CSV file.
path = 'https://raw.githubusercontent.com/cinnData/DataSci/main/Data/'
filename = path + 'retail.csv.zip'
# Read InvoiceNo as text. The column is queried with the `.str` accessor
# below, and read_csv's chunked type inference can otherwise leave a mix of
# integers and strings in it, for which `.str.contains` returns NaN on the
# integer entries.
df = pd.read_csv(filename, dtype={'InvoiceNo': str})

In [None]:
# Preview the first five rows of the raw transactions.
df.head(n=5)

In [None]:
# Cross-tabulate invoices whose number contains 'C' against rows with a
# negative quantity.
has_c = df['InvoiceNo'].str.contains('C')
negative_qty = df['Quantity'].lt(0)
pd.crosstab(has_c, negative_qty)

### Q1. New column with the number of days since the invoice was generated

In [None]:
# Parse the invoice timestamps and truncate them to midnight so that only the
# calendar date is kept. `astype('datetime64[D]')` is rejected by
# pandas >= 2.0, which supports only the 's', 'ms', 'us' and 'ns' resolutions.
# This form is also safe to re-run on an already converted column.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate']).dt.normalize()

In [None]:
# Latest invoice date in the data. Series.max() is vectorised and skips
# missing dates, whereas the builtin max() walks the column element by element
# and gives an order-dependent answer when NaT values are present.
max_date = df['InvoiceDate'].max()
max_date

In [None]:
# Whole days elapsed between each invoice and the latest invoice date.
elapsed = max_date - df['InvoiceDate']
df['Diff'] = elapsed.dt.days
df.head()

### Q2a. Creating recency and frequency data

In [None]:
# Per customer: smallest day difference and number of rows.
RF = df.groupby('CustomerID').agg(
    min=('Diff', 'min'),
    count=('Diff', 'count'),
)
RF.head()

In [None]:
# Relabel the two aggregate columns (positionally) with their RFM names.
RF = RF.set_axis(['Recency', 'Frequency'], axis='columns')

### Q2b. Creating monetary data

In [None]:
# Amount of each row: quantity times unit price.
df['Monetary'] = df['Quantity'].mul(df['UnitPrice'])

In [None]:
# Total amount per customer.
M = df['Monetary'].groupby(df['CustomerID']).sum()
M.head()

### Q2c. Joining the two data sets

In [None]:
# Combine recency/frequency with the monetary totals, matching on the
# customer index and keeping only customers present in both.
RFM = RF.join(M, how='inner')
RFM.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the RFM table.
RFM.describe(percentiles=[0.25, 0.5, 0.75])

### Q3b. Normalization

In [None]:
import numpy as np

In [None]:
def normalize(x):
    """Min-max scale `x` so that its values lie in the [0, 1] range.

    Parameters
    ----------
    x : array-like (e.g. a NumPy array or a pandas Series)
        Numeric values to rescale.

    Returns
    -------
    Same kind of object as `x`, holding ``(x - min) / (max - min)``.
    A constant input (max == min) is mapped to all zeros instead of the
    NaN values that a 0/0 division would produce.
    """
    lowest = np.min(x)
    spread = np.max(x) - lowest
    # Zero spread means every value is identical; divide by 1 so the result is
    # a clean vector of zeros rather than NaN.
    if np.ndim(spread) == 0 and spread == 0:
        spread = 1
    return (x - lowest) / spread

In [None]:
# Normalize each of the three RFM measures column by column. The columns are
# selected explicitly because later cells add segment columns to RFM; without
# the selection, re-running this cell would also try to normalize those.
RFM1 = RFM[['Recency', 'Frequency', 'Monetary']].apply(normalize, axis=0)

### Q3c. 8-cluster analysis

In [None]:
import scipy.cluster.vq as cluster

In [None]:
# k-means starts from randomly chosen centroids. Fixing the seed keeps the
# eight centres, and therefore the segment numbering used below, identical
# from one run of the notebook to the next.
center = cluster.kmeans(RFM1, 8, seed=0)[0]
center

In [None]:
# Assign every customer to its closest centre; vq returns the codes together
# with the distances, of which only the codes are kept as segment labels.
label, _nearest_dist = cluster.vq(RFM1, center)
RFM['Segment'] = label
RFM.head()

In [None]:
# Number of customers in each segment, largest segment first.
RFM['Segment'].value_counts(sort=True, ascending=False)

### Q4a. Binarization

In [None]:
# Flag, for each RFM measure, whether the customer is above the median:
# '1' if above, '0' otherwise (stored as strings).
for measure in ['Recency', 'Frequency', 'Monetary']:
    above_median = RFM[measure] > RFM[measure].median()
    RFM['Bin' + measure] = above_median.astype(int).astype(str)
RFM.head()

### Q4b. Compare this partition with that of the preceding question

In [None]:
# Concatenate the three binary flags into a three-character segment code.
RFM['BinSegment'] = RFM['BinRecency'].str.cat(
    [RFM['BinFrequency'], RFM['BinMonetary']]
)
RFM.head()

In [None]:
# Compare the two partitions: k-means segments in rows, binary codes in columns.
pd.crosstab(index=RFM['Segment'], columns=RFM['BinSegment'])