In [134]:
import datetime
import os
import sys

import numpy as np
import pandas as pd
#Loading utils for required funcs
%run {os.path.join('..', 'src', 'utils.py')}

## RFM
##### Recency, Frequency, Monetary value (RFM) is a marketing-analysis model that segments a company's customer base by purchasing patterns or habits. In particular, it evaluates each customer's Recency (how long ago they made a purchase), Frequency (how often they make purchases), and Monetary value (how much money they spend).


In [135]:
# Load ../data/potentials.csv (path relative to the notebook's working directory) into rfm_data
rfm_data = pd.read_csv(os.path.join("..","data","potentials.csv"))

In [136]:
# Row mask: not a promotion, and both the current product price and the
# membership length are strictly positive
condition = (
    rfm_data['is_promotion'].ne(1)
    & rfm_data['current_products_price'].gt(0)
    & rfm_data['membership_length'].gt(0)
)

# Keep the matching rows as an independent copy of the loaded frame
rfm_data = rfm_data.loc[condition].copy()

In [137]:
#filling the NaN budget_value with 1
# The filled column is assigned back rather than calling fillna(..., inplace=True)
# on the column selection: that chained in-place form warns on recent pandas and
# leaves the DataFrame unchanged when Copy-on-Write is enabled.
rfm_data['budget_value'] = rfm_data['budget_value'].fillna(1)

###  Step 1 - > Recency

In [138]:
# All "days since ..." values are measured up to the day the notebook is run,
# so the results depend on the execution date
reference_date = datetime.date.today()

In [139]:
# creating extra columns: whole days elapsed between reference_date and each
# "last activity" timestamp.
# `.dt.days` is used instead of `.astype('timedelta64[D]')`: pandas >= 2.0 only
# supports the 's', 'ms', 'us' and 'ns' timedelta resolutions and raises on 'D'.
# Rows with a missing timestamp yield NaN.
reference_ts = pd.to_datetime(reference_date)
for source_col, target_col in (
    ('last_call', 'days_since_last_call'),
    ('last_touch', 'days_since_last_touch'),
    ('last_seen_at', 'days_since_last_seen'),
):
    rfm_data[target_col] = (reference_ts - pd.to_datetime(rfm_data[source_col])).dt.days

In [140]:
# Fill NaN values in 'days_since_last_call' with the maximum value from the column,
# so a missing value is treated like the longest gap observed in the data.
# The result is assigned back instead of using fillna(..., inplace=True) on a column
# selection, which warns on recent pandas and is a no-op under Copy-on-Write.
max_last_call = rfm_data['days_since_last_call'].max()
rfm_data['days_since_last_call'] = rfm_data['days_since_last_call'].fillna(max_last_call)

# Fill NaN values in 'days_since_last_touch' with the maximum value from the column
max_last_touch = rfm_data['days_since_last_touch'].max()
rfm_data['days_since_last_touch'] = rfm_data['days_since_last_touch'].fillna(max_last_touch)

# NOTE(review): 'days_since_last_seen' is not filled in this cell - confirm that is intended.

In [141]:
# Min-max scaling of "budget_value": rescale relative to the observed minimum and maximum
min_val, max_val = rfm_data['budget_value'].min(), rfm_data['budget_value'].max()

# Shift by the minimum, then divide by the observed range (max - min)
rfm_data['budget_value'] = rfm_data['budget_value'].sub(min_val).div(max_val - min_val)

In [142]:
# Min-max scaling of "lead_read_gap_min": rescale relative to the observed minimum and maximum
min_val, max_val = rfm_data['lead_read_gap_min'].min(), rfm_data['lead_read_gap_min'].max()

# Shift by the minimum, then divide by the observed range (max - min)
rfm_data['lead_read_gap_min'] = rfm_data['lead_read_gap_min'].sub(min_val).div(max_val - min_val)

### Step 2 - > Frequency Part-Monetary Part
##### For **Frequency**, we can use the following columns: *lead_count*, *view_count*, *call_count* and *touch_count*; for **Monetary**, the necessary columns are *budget_value* and *current_products_price*.

In [143]:
# Keep only the identifier and the columns used by the RFM calculations.
# .copy() makes the result an independent frame, so the columns added to it later
# are not written to a slice of the wider frame (avoids SettingWithCopyWarning).
rfm_data = rfm_data[['provider_id', 'lead_count','view_count','image_count','video_count','discount_count','review_count','touch_count','call_count',
                    'membership_length','budget_value','current_products_price','lead_read_gap_min',
                    'days_since_last_call', 'days_since_last_touch','days_since_last_seen']].copy()

In [144]:
# Preview the first two rows of the trimmed frame that the RFM columns are computed from
rfm_data.head(2)

Unnamed: 0,provider_id,lead_count,view_count,image_count,video_count,discount_count,review_count,touch_count,call_count,membership_length,budget_value,current_products_price,lead_read_gap_min,days_since_last_call,days_since_last_touch,days_since_last_seen
2,117,505,10606,98,11,0,109,13,5,96,0.023067,51288.0,0.031348,132.0,74.0,55.0
6,143,176,2029,89,1,2,37,6,3,108,0.023067,110664.0,0.005032,67.0,67.0,55.0


In [145]:
# Function to calculate Monetary column
def calculate_monetary(df):
    """
    Add a 'Monetary' column to ``df`` in place.

    The value is the element-wise product of 'current_products_price' and
    'budget_value'. Nothing is returned; ``df`` itself is modified.
    """
    price = df['current_products_price']
    budget = df['budget_value']
    df['Monetary'] = price.mul(budget)

# Function to calculate Frequency column
def calculate_frequency(df):
    """
    Add a 'Frequency' column to ``df`` in place.

    The score is the sum of the image, video, discount, review, lead and view
    counts, plus 1.5 times the sum of the touch and call counts, all divided by
    'membership_length'. Nothing is returned; ``df`` itself is modified.
    """
    unit_weight_cols = ['image_count', 'video_count', 'discount_count',
                        'review_count', 'lead_count', 'view_count']

    # Accumulate left to right with plain addition so missing values propagate
    activity_total = df[unit_weight_cols[0]]
    for col in unit_weight_cols[1:]:
        activity_total = activity_total + df[col]

    # Touches and calls carry a 1.5x weight
    weighted_contacts = 1.5 * (df['touch_count'] + df['call_count'])

    df['Frequency'] = (activity_total + weighted_contacts) / df['membership_length']

# Function to calculate Recency column
def calculate_recency(df):
    """
    Add a 'Recency' column to ``df`` in place.

    For every row, the smallest of 'days_since_last_touch',
    'days_since_last_seen' and 'days_since_last_call' is taken and multiplied
    by that row's 'lead_read_gap_min'. Missing day values are skipped when
    taking the minimum. Nothing is returned; ``df`` itself is modified.
    """
    day_columns = ['days_since_last_touch', 'days_since_last_seen', 'days_since_last_call']

    # Row-wise minimum (axis=1): each row's own most recent interaction.
    # A column-wise minimum would collapse to one scalar shared by all rows,
    # making Recency a constant multiple of 'lead_read_gap_min'.
    days_since_latest_activity = df[day_columns].min(axis=1)

    df['Recency'] = days_since_latest_activity * df['lead_read_gap_min']


In [146]:
# Add the Monetary, Frequency and Recency columns to rfm_data (each function mutates it in place)
for add_score_column in (calculate_monetary, calculate_frequency, calculate_recency):
    add_score_column(rfm_data)

In [147]:
# Split the frame in two: keep the provider ids aside in rfm_providers and reduce
# rfm_data to the three RFM columns. Both selections keep the same row index.
# NOTE: rfm_data is overwritten here, so this cell cannot be re-run on its own
# ('provider_id' is no longer present afterwards).
rfm_providers=rfm_data[["provider_id"]]
rfm_data = rfm_data[["Recency","Monetary","Frequency"]]

In [148]:
# 25th, 50th and 75th percentiles of each rfm_data column; used as the score cut-offs below
quantiles = rfm_data.quantile(q=[0.25,0.5,0.75])
quantiles

Unnamed: 0,Recency,Monetary,Frequency
0.25,0.187269,94.39186,15.869792
0.5,1.503232,322.475307,37.971354
0.75,7.550439,1320.259357,96.21875


In [149]:
# Show the same cut-offs as a nested dict of the form {column: {quantile: value}}
quantiles.to_dict()

{'Recency': {0.25: 0.1872691597414589,
  0.5: 1.5032317636195751,
  0.75: 7.550438596491228},
 'Monetary': {0.25: 94.39186042933774,
  0.5: 322.47530743905065,
  0.75: 1320.2593565062207},
 'Frequency': {0.25: 15.869791666666666,
  0.5: 37.971354166666664,
  0.75: 96.21875}}

In [150]:
def RScore(x,p,d):
    """
    Map a recency value to a quartile score where smaller values score higher.

    x: the value to score.
    p: key of the metric inside ``d`` (e.g. 'Recency').
    d: nested mapping indexed as d[p][q] for q in 0.25, 0.50, 0.75.

    Returns 4 if x is at or below the 0.25 cut-off, 3 up to the 0.50 cut-off,
    2 up to the 0.75 cut-off and 1 otherwise. A value for which every
    comparison is false (such as NaN) therefore gets 1.
    """
    cutoffs = d[p]
    for score, q in ((4, 0.25), (3, 0.50), (2, 0.75)):
        if x <= cutoffs[q]:
            return score
    return 1
def FMScore(x,p,d):
    """
    Map a frequency or monetary value to a quartile score where larger values score higher.

    x: the value to score.
    p: key of the metric inside ``d`` (e.g. 'Frequency' or 'Monetary').
    d: nested mapping indexed as d[p][q] for q in 0.25, 0.50, 0.75.

    Returns 1 if x is at or below the 0.25 cut-off, 2 up to the 0.50 cut-off,
    3 up to the 0.75 cut-off and 4 otherwise. A value for which every
    comparison is false (such as NaN) therefore gets 4.
    """
    cutoffs = d[p]
    for score, q in enumerate((0.25, 0.50, 0.75), start=1):
        if x <= cutoffs[q]:
            return score
    return 4

In [151]:
# Score every row against the quartile cut-offs.
# Work on a copy: a plain `rfm_segmentation = rfm_data` is only an alias, so the new
# columns would also appear in rfm_data (a column selection of a wider frame), which
# triggers SettingWithCopyWarning and makes the quantiles cell non-repeatable.
rfm_segmentation = rfm_data.copy()
# Recency: lower is better (RScore); Frequency and Monetary: higher is better (FMScore)
rfm_segmentation['R_Quartile'] = rfm_segmentation['Recency'].apply(RScore, args=('Recency',quantiles,))
rfm_segmentation['F_Quartile'] = rfm_segmentation['Frequency'].apply(FMScore, args=('Frequency',quantiles,))
rfm_segmentation['M_Quartile'] = rfm_segmentation['Monetary'].apply(FMScore, args=('Monetary',quantiles,))

In [152]:
# Preview the first rows together with their R/F/M quartile scores
rfm_segmentation.head()

Unnamed: 0,Recency,Monetary,Frequency,R_Quartile,F_Quartile,M_Quartile
2,1.724146,1183.055334,118.291667,2,4,3
6,0.276777,2552.675781,21.736111,3,2,4
7,0.060942,168.884932,28.807292,4,2,2
9,0.263446,677.585534,49.095238,3,3,3
10,1.441655,5060.786306,52.923611,3,3,4


In [153]:
# Concatenate the three quartile scores as text, in R-F-M order (e.g. 2, 4, 3 -> "243")
rfm_segmentation['RFMScore'] = (
    rfm_segmentation['R_Quartile'].map(str)
    + rfm_segmentation['F_Quartile'].map(str)
    + rfm_segmentation['M_Quartile'].map(str)
)
rfm_segmentation.head()

Unnamed: 0,Recency,Monetary,Frequency,R_Quartile,F_Quartile,M_Quartile,RFMScore
2,1.724146,1183.055334,118.291667,2,4,3,243
6,0.276777,2552.675781,21.736111,3,2,4,324
7,0.060942,168.884932,28.807292,4,2,2,422
9,0.263446,677.585534,49.095238,3,3,3,333
10,1.441655,5060.786306,52.923611,3,3,4,334


In [154]:
# Numeric overall score: the plain sum of the three quartile scores
rfm_segmentation['RFMScore_num'] = (
    rfm_segmentation['R_Quartile']
    + rfm_segmentation['F_Quartile']
    + rfm_segmentation['M_Quartile']
)
rfm_segmentation.head()

Unnamed: 0,Recency,Monetary,Frequency,R_Quartile,F_Quartile,M_Quartile,RFMScore,RFMScore_num
2,1.724146,1183.055334,118.291667,2,4,3,243,9
6,0.276777,2552.675781,21.736111,3,2,4,324,9
7,0.060942,168.884932,28.807292,4,2,2,422,8
9,0.263446,677.585534,49.095238,3,3,3,333,9
10,1.441655,5060.786306,52.923611,3,3,4,334,10


In [155]:
# Replace the index with a fresh 0..n-1 range (drop=True discards the old labels)
# ahead of the index-based merge with rfm_providers
rfm_segmentation = rfm_segmentation.reset_index(drop=True)

In [157]:
# Same index reset for the provider ids, so both frames share a 0..n-1 index in the same row order
rfm_providers = rfm_providers.reset_index(drop=True)

In [160]:
# Re-attach provider_id as the first column by joining the two frames on their index
rfm_segmentation = rfm_providers.merge(rfm_segmentation, left_index=True, right_index=True)

In [162]:
#Saving the feature engineering results as CSV file
# The path is built with os.path.join (imported at the top and used for the input
# CSV) instead of Path, which is not imported anywhere in the notebook cells.
rfm_segmentation.to_csv(os.path.join("..","data","rfm_segmentation.csv"),index=False)