# RFM SEGMENTATION

In [23]:
import datetime as dt
import pandas as pd
pd.set_option('display.max_columns', None)

In [None]:
# Read the "Year 2010-2011" sheet of the workbook into `df_`, then work on a
# separate copy (`df`) so the frame as loaded stays available unchanged.
df_ = pd.read_excel("online_retail_II.xlsx", sheet_name="Year 2010-2011")
df = df_.copy()

In [None]:
# Keep only rows whose Invoice value does not contain the letter "C"
# (presumably cancellation invoices -- confirm against the dataset docs).
# Non-string / missing invoice values are treated as "no match" via na=False.
# The explicit .copy() makes `df` an independent frame, so adding a column
# below (and later in-place operations on `df`) do not act on a slice of the
# previous frame and do not raise SettingWithCopyWarning.
df = df[~df["Invoice"].str.contains("C", na=False)].copy()
# Line-level revenue: units sold times unit price.
df["TotalPrice"] = df["Quantity"] * df["Price"]

In [None]:
def check_rfm(dataframe):
    """Print a quick exploratory summary of a transactions DataFrame.

    The report is written to stdout, one section at a time, with a line of
    ``#`` characters between consecutive sections.  Nothing is returned.

    Columns read from ``dataframe``: ``Description``, ``Quantity``,
    ``Invoice``, ``TotalPrice`` and ``Country``.

    Sections, in order: null counts per column, number of distinct
    descriptions, row count per description, top five descriptions by summed
    quantity, number of distinct invoices, top five invoices by mean
    ``TotalPrice`` of their rows, top five countries by summed ``TotalPrice``,
    and ``describe()`` with extended percentiles (transposed).
    """
    divider = "########################################################"
    quantiles = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]

    def top_five(key, column, how):
        # Aggregate `column` per `key`, largest value first, first five rows.
        aggregated = dataframe.groupby(key).agg({column: how})
        return aggregated.sort_values(column, ascending=False).head()

    # Each entry pairs a heading with a zero-argument callable so that every
    # statistic is computed only when its section is about to be printed.
    report = (
        ("How many nulls are there?\n",
         lambda: dataframe.isnull().sum()),
        ("What is the unique number of products?\n",
         lambda: dataframe["Description"].nunique()),
        ("How many pieces of which product are there?\n",
         lambda: dataframe["Description"].value_counts()),
        ("What is the most ordered product?\n",
         lambda: top_five("Description", "Quantity", "sum")),
        ("How many invoices have been issued?\n",
         lambda: dataframe["Invoice"].nunique()),
        ("How much money is earned on average per invoice?\n",
         lambda: top_five("Invoice", "TotalPrice", "mean")),
        ("Which country made how much?\n",
         lambda: top_five("Country", "TotalPrice", "sum")),
        ("What is the distribution of values?\n",
         lambda: dataframe.describe(percentiles=quantiles).T),
    )

    for position, (heading, compute) in enumerate(report):
        if position:
            print(divider)
        print(heading, compute())
# Print the exploratory summary for the transactions frame `df`.
check_rfm(df)

In [None]:
# Remove every row that has at least one missing value; `df` is modified in place.
df.dropna(inplace=True)

In [None]:
# Show the most recent invoice timestamp in the data (cell output only);
# presumably used to pick the reference date for the Recency metric -- confirm.
df["InvoiceDate"].max()

In [None]:
# RFM METRICS
# Reference date against which Recency is measured (fixed, not derived from the data).
today_date = dt.datetime(2011, 12, 11)
# One row per customer:
#   InvoiceDate -> whole days between today_date and the customer's latest invoice
#   Invoice     -> number of rows for the customer (rows, not distinct invoices)
#   TotalPrice  -> summed TotalPrice
rfm = df.groupby('Customer ID').agg({'InvoiceDate': lambda date: (today_date - date.max()).days,
                                     'Invoice': lambda num: len(num),
                                     'TotalPrice': lambda TotalPrice: TotalPrice.sum()})
rfm.columns = ['Recency', 'Frequency', 'Monetary']
# Each comparison must be parenthesised: `&` binds tighter than `>`, so without
# the parentheses the Frequency condition is silently never applied.
# .copy() yields an independent frame so later column assignments on `rfm`
# do not operate on a slice.
rfm = rfm[(rfm["Monetary"] > 0) & (rfm["Frequency"] > 0)].copy()
rfm.head()

In [None]:
# RFM SCORES
# Split each metric into five quantile bins.  Recency labels run 5..1, so the
# smallest Recency values get score 5; Frequency and Monetary labels run 1..5.
rfm["RecencyScore"] = pd.qcut(rfm["Recency"], q=5, labels=[5, 4, 3, 2, 1])
rfm["FrequencyScore"] = pd.qcut(rfm["Frequency"], q=5, labels=[1, 2, 3, 4, 5])
rfm["MonetaryScore"] = pd.qcut(rfm["Monetary"], q=5, labels=[1, 2, 3, 4, 5])
# Three-character code: recency digit, then frequency digit, then monetary digit.
rfm["RFM_SCORE"] = rfm["RecencyScore"].astype(str).str.cat(
    [rfm["FrequencyScore"].astype(str), rfm["MonetaryScore"].astype(str)]
)
rfm.head()

In [None]:
# Regex patterns over the two-digit "<RecencyScore><FrequencyScore>" code,
# each mapped to the segment name that replaces it.
seg_map = {
    r'[1-2][1-2]': 'Hibernating',
    r'[1-2][3-4]': 'At_Risk',
    r'[1-2]5': 'Cant_Loose',
    r'3[1-2]': 'About_to_Sleep',
    r'33': 'Need_Attention',
    r'[3-4][4-5]': 'Loyal_Customers',
    r'41': 'Promising',
    r'51': 'New_Customers',
    r'[4-5][2-3]': 'Potential_Loyalists',
    r'5[4-5]': 'Champions',
}
# Build the two-digit code and translate it to a segment name in one step
# (the Monetary score is not used for segment assignment).
rfm['Segment'] = (
    rfm['RecencyScore'].astype(str) + rfm['FrequencyScore'].astype(str)
).replace(seg_map, regex=True)
rfm.head()

In [None]:
# Collect the index values (customer ids) of all rows in the "Loyal_Customers"
# segment into a single-column frame and write it to CSV.
new_df = pd.DataFrame(
    {"Loyal_Customers": rfm.loc[rfm["Segment"] == "Loyal_Customers"].index}
)
new_df.to_csv("Loyal_Customers.csv")

In [None]:
# Create RFM Metrics and Scores Functions
def rfm_metrics(dataframe):

    today_date = dt.datetime(2011, 12, 11)
    rfm = dataframe.groupby('Customer ID').agg({'InvoiceDate': lambda date: (today_date - date.max()).days,
                                                'Invoice': lambda num: len(num),
                                                'TotalPrice': lambda TotalPrice: TotalPrice.sum()})
    rfm.columns = ['Recency', 'Frequency', 'Monetary']
    rfm = rfm[(rfm["Monetary"]) > 0 & (rfm["Frequency"] > 0)]
    return rfm

# Rebuild `rfm` from `df` with the function version (rebinds the module-level `rfm`).
rfm = rfm_metrics(df)


def rfm_scores(rfm):
    """Add quintile scores and a combined RFM code to ``rfm``.

    Reads the ``Recency``, ``Frequency`` and ``Monetary`` columns and writes
    ``RecencyScore``, ``FrequencyScore``, ``MonetaryScore`` (five quantile
    bins each) plus ``RFM_SCORE``, the three scores concatenated as text in
    that order.  Recency labels run 5..1, so the smallest Recency values get
    score 5; the other two metrics are labelled 1..5.

    The frame is modified in place and the same object is returned.
    """
    low_to_high = [1, 2, 3, 4, 5]
    # score column -> (metric column, labels for the bins in ascending order)
    plan = {
        "RecencyScore": ("Recency", low_to_high[::-1]),
        "FrequencyScore": ("Frequency", low_to_high),
        "MonetaryScore": ("Monetary", low_to_high),
    }
    for score_col, (metric_col, bin_labels) in plan.items():
        rfm[score_col] = pd.qcut(rfm[metric_col], 5, labels=bin_labels)

    recency_txt, frequency_txt, monetary_txt = (
        rfm[score_col].astype(str) for score_col in plan
    )
    rfm["RFM_SCORE"] = recency_txt + frequency_txt + monetary_txt
    return rfm

# Add the score columns to `rfm` (the function also returns the same frame).
rfm = rfm_scores(rfm)


def rfm_segments(rfm):
    """Label each row of ``rfm`` with a named customer segment.

    The ``RecencyScore`` and ``FrequencyScore`` columns are converted to text
    and joined into a two-character code, which is then translated to a
    segment name through a table of regular expressions.  The result is
    stored in a ``Segment`` column; the Monetary score is not consulted.

    The frame is modified in place and the same object is returned.
    """
    # (regex over "<recency><frequency>", segment name)
    rules = (
        (r'[1-2][1-2]', 'Hibernating'),
        (r'[1-2][3-4]', 'At_Risk'),
        (r'[1-2]5', 'Cant_Loose'),
        (r'3[1-2]', 'About_to_Sleep'),
        (r'33', 'Need_Attention'),
        (r'[3-4][4-5]', 'Loyal_Customers'),
        (r'41', 'Promising'),
        (r'51', 'New_Customers'),
        (r'[4-5][2-3]', 'Potential_Loyalists'),
        (r'5[4-5]', 'Champions'),
    )
    seg_map = dict(rules)

    recency_digit = rfm['RecencyScore'].astype(str)
    frequency_digit = rfm['FrequencyScore'].astype(str)
    codes = recency_digit + frequency_digit
    rfm['Segment'] = codes.replace(seg_map, regex=True)
    return rfm

# Add the `Segment` column to `rfm` (the function also returns the same frame).
rfm = rfm_segments(rfm)