In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Customer Segmentation with RFM**

# What is RFM?


RFM (Recency, Frequency, Monetary) is a method for measuring customer value. An RFM analysis shows which customers are the most valuable to your business: the ones who bought most recently, buy most often, and spend the most. The first step is to calculate the three metrics described below.

![image.png](attachment:e6cbfb14-897a-47cc-842e-87eb5722e3dd.png)

**Recency:** The value that indicates how much time has passed since a customer's last activity or transaction with the brand. The activity is usually a purchase, but sometimes variations are used such as the last visit to a website or the use of a mobile app.

**Frequency:** How often a customer makes a transaction within a given period. Customers who transact more often may be more loyal than others.

**Monetary:** It is the value that shows how much a customer has spent in a certain period of time. Dividing the value of the spend by frequency shows us the average purchase amount.

# Dataset and Story

An e-commerce company wants to segment its customers and determine marketing strategies according to these segments. The company believes that marketing activities specific to customer segments that exhibit common behaviors will increase revenue. For example, it is desired to organize different campaigns for new customers and different campaigns to retain very profitable customers for the company.

The dataset named Online Retail includes the sales of a UK-based online store between 01/12/2009-09/12/2011. This company's product catalog includes souvenirs. The majority of the company's customers are corporate customers.

**Variables of the data set:**

* Invoice (InvoiceNo) : The number of the invoice, unique for each purchase. Refund invoice numbers contain "C"
* StockCode : Unique code for each item
* Description : Name of the item
* Quantity : The number of items within the invoice
* InvoiceDate : Date and time of the purchase
* Price (UnitPrice) : Price of a single item, in pounds sterling
* Customer ID (CustomerID) : Unique id number for each customer
* Country : The country where the customer lives

# Load Dataset


In [None]:
!pip install xlrd
!pip install openpyxl
!pip install lifetimes
!pip install missingno
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import datetime as dt
import warnings
# Notebook-wide settings: silence all warnings, never truncate the column
# list when displaying frames, and render floats with two decimals.
warnings.filterwarnings(action="ignore")
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format

In [None]:
# Download the workbook from Google Drive and save it as online_retail_II.xlsx in the working directory.
# NOTE(review): --no-check-certificate disables TLS certificate verification; confirm this is acceptable.
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1zv4Mk2V0ylzbhl_iqQSfflanRztfh_Zz' -O online_retail_II.xlsx

In [None]:
# Load the "Year 2010-2011" sheet of the Online Retail II workbook and keep an
# untouched copy of the raw frame next to the working one.
df = pd.read_excel(io="./online_retail_II.xlsx", sheet_name="Year 2010-2011")
df_copy = df.copy(deep=True)
df.head(n=5)

In [None]:
#Checking Variables

def check_df(dataframe):
    """Print a quick structural overview of ``dataframe``.

    Prints, in order: the shape, the column dtypes, the first and last three
    rows, the per-column count of missing values, and selected quantiles
    (0, 5, 50, 95, 99 and 100 %) of the numeric columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise. It is not modified.

    Returns
    -------
    None
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(3))
    print("##################### Tail #####################")
    print(dataframe.tail(3))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # numeric_only=True restricts the computation to numeric columns. Since
    # pandas 2.0 the default is False, and object columns then make
    # DataFrame.quantile raise a TypeError.
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)

# Print the structural overview of the freshly loaded frame.
check_df(df)

Check the descriptive statistics of the numerical variables. Compare the 75% and 99% values, and then the 99% and max values: the large gaps suggest that there are some outliers.

In [None]:
# Summary statistics with extra tail percentiles, one row per numeric column.
df.describe(percentiles=[0.01, 0.25, 0.50, 0.75, 0.99]).transpose()

In [None]:
# missingno heatmap of df, used here to look at how missing values relate across columns.
# Author's reading of the output: there is no specific correlation between missing values.

msno.heatmap(df)

In [None]:
# Distribution of Quantity before outlier capping.
# Pass the series as `x=`: a positional argument is interpreted as `data` by
# seaborn >= 0.12, which changes the orientation of the plot.
sns.boxplot(x=df["Quantity"]);

In [None]:
# Distribution of Price before outlier capping.
# Pass the series as `x=`: a positional argument is interpreted as `data` by
# seaborn >= 0.12, which changes the orientation of the plot.
sns.boxplot(x=df["Price"]);

# Data Preprocessing

Outlier values are capped (only very slightly) without damaging the data. The threshold function returns both a lower and an upper limit, but only the upper limit is applied: values above it are replaced with the limit itself. We do this for Quantity and Price.

In [None]:
def outlier_thresholds(dataframe, variable, q1=0.01, q3=0.99):
    """Return the lower and upper outlier limits for one column.

    The limits are the ``q1`` and ``q3`` quantiles of the column pushed
    outwards by 1.5 times the distance between them. With the defaults the
    1st and 99th percentiles are used (not the quartiles), so only extreme
    values fall outside the limits.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    variable : str
        Name of the column to analyse.
    q1, q3 : float, optional
        Lower and upper quantile, with ``0 <= q1 < q3 <= 1``.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.

    Raises
    ------
    ValueError
        If ``q1`` and ``q3`` are not ordered quantiles inside [0, 1].
    """
    if not 0 <= q1 < q3 <= 1:
        raise ValueError("q1 and q3 must satisfy 0 <= q1 < q3 <= 1")
    lower_quantile = dataframe[variable].quantile(q1)
    upper_quantile = dataframe[variable].quantile(q3)
    spread = upper_quantile - lower_quantile
    up_limit = upper_quantile + 1.5 * spread
    low_limit = lower_quantile - 1.5 * spread
    return low_limit, up_limit


def replace_with_thresholds(dataframe, variable):
    """Cap the values of ``variable`` that exceed the upper outlier limit.

    The upper limit comes from ``outlier_thresholds``. Values above it are
    overwritten with the limit itself; the lower limit is deliberately not
    applied. ``dataframe`` is modified in place and nothing is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify in place.
    variable : str
        Name of the numeric column to cap.
    """
    _, up_limit = outlier_thresholds(dataframe, variable)
    too_high = dataframe[variable] > up_limit
    if not too_high.any():
        # Nothing exceeds the limit, so the column is left untouched.
        return
    # Writing a fractional limit into an integer column relies on a silent
    # upcast that recent pandas versions deprecate; widen the column explicitly.
    column_is_int = pd.api.types.is_integer_dtype(dataframe[variable])
    if column_is_int and not float(up_limit).is_integer():
        dataframe[variable] = dataframe[variable].astype("float64")
    dataframe.loc[too_high, variable] = up_limit

In [None]:
# Cap the upper tail of both numeric columns; df is modified in place.
replace_with_thresholds(df,"Quantity")
replace_with_thresholds(df,"Price")

In [None]:
# Same summary as before, now computed after the upper tails were capped.

df.describe(percentiles=[0.01, 0.25, 0.50, 0.75, 0.99]).transpose()

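Rows with missing values are dropped, cancelled invoices (those whose invoice number contains "C") and rows with a non-positive quantity are excluded, and a new variable, TotalPrice (Quantity × Price), is created.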

In [None]:
def data_prep(dataframe):
    """Return a cleaned copy of the transactions with a ``TotalPrice`` column.

    Steps:
      1. drop every row that has a missing value in any column;
      2. drop rows whose ``Invoice`` contains the letter "C";
      3. keep only rows with ``Quantity`` greater than zero;
      4. add ``TotalPrice = Quantity * Price``.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``Invoice``, ``Quantity`` and ``Price``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows plus the ``TotalPrice`` column.
    """
    # Non-mutating dropna: the caller's frame must not change as a side effect.
    cleaned = dataframe.dropna(axis=0)
    # Cast to str for the test only, so the `.str` accessor also works when
    # the column holds integers (mixed or all-integer invoice numbers).
    contains_c = cleaned["Invoice"].astype(str).str.contains("C", na=False)
    cleaned = cleaned[~contains_c & (cleaned["Quantity"] > 0)]
    # assign() builds a new frame, avoiding a write into a slice of the input.
    return cleaned.assign(TotalPrice=cleaned["Quantity"] * cleaned["Price"])

In [None]:
# Rebind df to the frame returned by data_prep, then print its overview.
df= data_prep(df)
check_df(df)

After capping the outliers and cleaning the data, the variables look as follows.

In [None]:
# Distribution of Quantity after capping and cleaning.
# Pass the series as `x=`: a positional argument is interpreted as `data` by
# seaborn >= 0.12, which changes the orientation of the plot.
sns.boxplot(x=df["Quantity"]);

In [None]:
# Distribution of Price after capping and cleaning.
# Pass the series as `x=`: a positional argument is interpreted as `data` by
# seaborn >= 0.12, which changes the orientation of the plot.
sns.boxplot(x=df["Price"]);

# Creating RFM Metrics & Calculating RFM Scores

In [None]:
# Latest invoice timestamp in the data; evaluated for reference only, the
# value is not used below.
df["InvoiceDate"].max()
# Fixed analysis date that plays the role of "now" when measuring recency.
today_date = dt.datetime(year=2011, month=12, day=11)

# One row per customer:
#   recency   - whole days between today_date and the customer's last invoice
#   frequency - number of distinct invoices
#   monetary  - sum of TotalPrice
rfm = df.groupby("Customer ID").agg(
    recency=("InvoiceDate", lambda dates: (today_date - dates.max()).days),
    frequency=("Invoice", lambda invoices: invoices.nunique()),
    monetary=("TotalPrice", lambda amounts: amounts.sum()),
)

# Keep only customers whose total spend is positive.
rfm = rfm.loc[rfm["monetary"] > 0]
rfm.head()

We split each RFM metric into five groups with the qcut() function, which bins data into discrete intervals based on sample quantiles. A high recency value means that the customer has drifted away from us, so the recency labels are assigned in decreasing order: the most recent customers get the highest score.

In [None]:
# Quintile scores for the three metrics. Recency labels run from 5 down to 1,
# so the smallest recency values receive the highest score. Frequency is
# ranked first (ties ordered by appearance), presumably so that its many
# repeated values still yield distinct bin edges.
rfm = rfm.assign(
    recency_score=pd.qcut(rfm["recency"], q=5, labels=[5, 4, 3, 2, 1]),
    frequency_score=pd.qcut(rfm["frequency"].rank(method="first"), q=5, labels=[1, 2, 3, 4, 5]),
    monetary_score=pd.qcut(rfm["monetary"], q=5, labels=[1, 2, 3, 4, 5]),
)


In order to see the RFM values together, we defined a variable named "RFM_SCORE" and combined the RFM values by converting them to strings. If we tried to combine without converting, we might get an error because it would not combine the two categorical variables. (We do not include monetary_score)

In [None]:
# Two-character code: the recency score followed by the frequency score
# (the monetary score is left out).
rfm["RFM_SCORE"] = rfm["recency_score"].astype(str).add(rfm["frequency_score"].astype(str))

# Naming & Analysing RFM Segments

In [None]:
# Regular expressions matched against the two-character RFM_SCORE
# (first character: recency score, second character: frequency score),
# each mapped to the name of the segment it stands for.
seg_map = {
    r'[1-2][1-2]': 'hibernating',
    r'[1-2][3-4]': 'at_Risk',
    r'[1-2]5': 'cant_lose',
    r'3[1-2]': 'about_to_sleep',
    r'33': 'need_attention',
    r'[3-4][4-5]': 'loyal_customers',
    r'41': 'promising',
    r'51': 'new_customers',
    r'[4-5][2-3]': 'potential_loyalists',
    r'5[4-5]': 'champions'
}


# Replace each score code with its segment name via regex substitution.
rfm['segment'] = rfm['RFM_SCORE'].replace(seg_map, regex=True)

# Per-segment mean and row count of recency, frequency and monetary.
rfm[["segment", "recency", "frequency", "monetary"]].groupby("segment").agg(["mean", "count"])


Then segments are created.

**So why are we doing this?**

We look for answers to these questions;

- Who is our most profitable customer?
- What is it they appreciate in my products or services?
- Who are my new customers?
- How do I attract new customers to the company?

The answers to the questions are hidden in the segmentation.

With better RFM segmentation, we’ll be able to address certain segments in a personalized manner, based on their needs and preferences.

Browse the scheme to more easily understand segmentation.

![image.png](attachment:cf8c52ed-6b35-4bf3-b7ef-caa20ba0a0f3.png)

In [None]:
# Bar chart of frequency per segment on a wide 15x7 inch canvas.
fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(data=rfm, x="segment", y="frequency", ax=ax)

Now we can see the customer segments that we want to focus on. For the customer segment that they want to focus on, a marketing strategy can be chosen easily.

In [None]:
# Collect the customer IDs (the index of rfm) of the "loyal_customers"
# segment into a one-column frame that can be exported to Excel.

loyal_customers = rfm.loc[rfm["segment"] == "loyal_customers"]

new_df = pd.DataFrame({"new_customer_id": loyal_customers.index})
new_df.head()

In [None]:
# Write the loyal-customer IDs to an Excel file (the row index is written too).
new_df.to_excel(excel_writer="loyal_customers.xlsx", index=True)