In [14]:
import sys
import os
import datetime
import numpy as np
#Loading utils for required funcs
%run {os.path.join('..', 'src', 'utils.py')}

## RFM
##### Recency, Frequency, Monetary value (RFM) is a model used in marketing analysis that segments a company's consumer base by their purchasing patterns or habits. In particular, it evaluates customers Recency (how long ago they made a purchase), Frequency (how often they make purchases), and Monetary value (how much money they spend).


In [15]:
#read our dataset
rfm_data = pd.read_csv(os.path.join("..","data","potentials.csv"))

In [16]:
# Boolean condition to filter rows
condition = (rfm_data['is_promotion'] != 1) & (rfm_data['current_products_price'] > 0) & (rfm_data['membership_length'] > 0)

# Filter the DataFrame based on the condition
rfm_data = rfm_data[condition].copy()

In [17]:
#filling the NaN budget_value with 1
rfm_data['budget_value'].fillna(1, inplace=True)

###  Step 1 - > Recency

In [18]:
#we will count information until today
reference_date = datetime.datetime.today().date()

In [19]:
# creating extra columns
rfm_data['days_since_last_call'] = (pd.to_datetime(reference_date) - pd.to_datetime(rfm_data['last_call'])).astype('timedelta64[D]')
rfm_data['days_since_last_touch'] = (pd.to_datetime(reference_date) - pd.to_datetime(rfm_data['last_touch'])).astype('timedelta64[D]')
rfm_data['days_since_last_seen'] = (pd.to_datetime(reference_date) - pd.to_datetime(rfm_data['last_seen_at'])).astype('timedelta64[D]')

In [20]:
# Fill NaN values in 'days_since_last_call' with the maximum value from the column
max_last_call = rfm_data['days_since_last_call'].max()
rfm_data['days_since_last_call'].fillna(max_last_call, inplace=True)

# Fill NaN values in 'days_since_last_touch' with the maximum value from the column
max_last_touch = rfm_data['days_since_last_touch'].max()
rfm_data['days_since_last_touch'].fillna(max_last_touch, inplace=True)

In [21]:
columns_to_normalize = ['budget_value', 'lead_read_gap_min', 'lead_count','view_count','image_count','video_count','discount_count','review_count','touch_count','call_count']  # List of columns to normalize

for column in columns_to_normalize:
    min_val = rfm_data[column].min()
    max_val = rfm_data[column].max()
    rfm_data[column] = (rfm_data[column] - min_val) / (max_val - min_val)

### Step 2 - > Frequency Part-Monetary Part
##### For **frequency**, we can use the following  : *lead_count*,*view_count*, *call_count* and *touch_count* and for **Monetary** our necessary columns will be : *budget_value*,*current_product_price*

In [22]:
rfm_data = rfm_data[['provider_id', 'lead_count','view_count','image_count','video_count','discount_count','review_count','touch_count','call_count',
                    'membership_length','budget_value','current_products_price','lead_read_gap_min',
                    'days_since_last_call', 'days_since_last_touch','days_since_last_seen']]

In [23]:
#our finalized dataframe to work on can be seen here: 
rfm_data.head(2)

Unnamed: 0,provider_id,lead_count,view_count,image_count,video_count,discount_count,review_count,touch_count,call_count,membership_length,budget_value,current_products_price,lead_read_gap_min,days_since_last_call,days_since_last_touch,days_since_last_seen
2,117,0.324134,0.444696,0.080592,0.157143,0.0,0.147297,0.108333,0.147059,96,0.023067,51288.0,0.031348,137.0,79.0,60.0
6,143,0.112965,0.085073,0.073191,0.014286,0.222222,0.05,0.05,0.088235,108,0.023067,110664.0,0.005032,72.0,72.0,60.0


In [24]:
# Function to calculate Monetary column
def calculate_monetary(df):
    """
    this function will be used to create a monetary score and assign it to a seperate column created and named as Monetary
    """
    df['Monetary'] = df['current_products_price'] * df['budget_value']

# Function to calculate Frequency column
def calculate_frequency(df):
    """
    this function will be used to create a frequency score and assign it to a seperate column created and named as Frequency
    """
    df['Frequency'] = (df['image_count'] + df['video_count'] + df['discount_count'] + df['review_count'] + df['lead_count'] + df['view_count'] +
                      (1.5 * (df['touch_count'] + df['call_count']))) / df['membership_length']

# Function to calculate Recency column
def calculate_recency(df):
    """
    this function will be used to create a recency score and assign it to a seperate column created and named as Recency
    """
    min_last_touch = df['days_since_last_touch'].min()
    min_last_seen = df['days_since_last_seen'].min()
    min_last_call = df['days_since_last_call'].min()

    df['Recency'] = np.minimum.reduce([min_last_touch, min_last_seen, min_last_call]) * df['lead_read_gap_min']


In [25]:
# Calculate the columns using the defined functions
calculate_monetary(rfm_data)
calculate_frequency(rfm_data)
calculate_recency(rfm_data)

In [26]:
#now let's create our final RFM dataset to evaluate :
rfm_providers=rfm_data[["provider_id"]]
rfm_data = rfm_data[["Recency","Monetary","Frequency"]]

In [27]:
quantiles = rfm_data.quantile(q=[0.25,0.5,0.75])
quantiles

Unnamed: 0,Recency,Monetary,Frequency
0.25,0.204294,94.39186,0.009279
0.5,1.639889,322.475307,0.017514
0.75,8.236842,1320.259357,0.034462


In [28]:
quantiles.to_dict()

{'Recency': {0.25: 0.20429362880886426,
  0.5: 1.6398891966759002,
  0.75: 8.236842105263158},
 'Monetary': {0.25: 94.39186042933774,
  0.5: 322.47530743905065,
  0.75: 1320.2593565062207},
 'Frequency': {0.25: 0.009279177225368272,
  0.5: 0.01751447891206222,
  0.75: 0.034462434318311176}}

In [29]:
def RScore(x,p,d):
    if x <= d[p][0.25]:
        return 4
    elif x <= d[p][0.50]:
        return 3
    elif x <= d[p][0.75]: 
        return 2
    else:
        return 1
def FMScore(x,p,d):
    if x <= d[p][0.25]:
        return 1
    elif x <= d[p][0.50]:
        return 2
    elif x <= d[p][0.75]: 
        return 3
    else:
        return 4

In [30]:
rfm_segmentation = rfm_data
rfm_segmentation['R_Quartile'] = rfm_segmentation['Recency'].apply(RScore, args=('Recency',quantiles,))
rfm_segmentation['F_Quartile'] = rfm_segmentation['Frequency'].apply(FMScore, args=('Frequency',quantiles,))
rfm_segmentation['M_Quartile'] = rfm_segmentation['Monetary'].apply(FMScore, args=('Monetary',quantiles,))

In [31]:
rfm_segmentation.head()

Unnamed: 0,Recency,Monetary,Frequency,R_Quartile,F_Quartile,M_Quartile
2,1.880886,1183.055334,0.01601,2,2,3
6,0.301939,2552.675781,0.007084,3,1,4
7,0.066482,168.884932,0.013442,4,2,2
9,0.287396,677.585534,0.008841,3,1,3
10,1.572715,5060.786306,0.008249,3,1,4


In [32]:
rfm_segmentation['RFMScore'] = rfm_segmentation.R_Quartile.map(str) \
                            + rfm_segmentation.F_Quartile.map(str) \
                            + rfm_segmentation.M_Quartile.map(str)
rfm_segmentation.head()

Unnamed: 0,Recency,Monetary,Frequency,R_Quartile,F_Quartile,M_Quartile,RFMScore
2,1.880886,1183.055334,0.01601,2,2,3,223
6,0.301939,2552.675781,0.007084,3,1,4,314
7,0.066482,168.884932,0.013442,4,2,2,422
9,0.287396,677.585534,0.008841,3,1,3,313
10,1.572715,5060.786306,0.008249,3,1,4,314


In [33]:
rfm_segmentation['RFMScore_num'] = rfm_segmentation.R_Quartile \
                            + rfm_segmentation.F_Quartile \
                            + rfm_segmentation.M_Quartile
rfm_segmentation.head()

Unnamed: 0,Recency,Monetary,Frequency,R_Quartile,F_Quartile,M_Quartile,RFMScore,RFMScore_num
2,1.880886,1183.055334,0.01601,2,2,3,223,7
6,0.301939,2552.675781,0.007084,3,1,4,314,8
7,0.066482,168.884932,0.013442,4,2,2,422,8
9,0.287396,677.585534,0.008841,3,1,3,313,7
10,1.572715,5060.786306,0.008249,3,1,4,314,8


In [34]:
rfm_segmentation = rfm_segmentation.reset_index(drop=True)

In [35]:
rfm_providers = rfm_providers.reset_index(drop=True)

In [36]:
rfm_segmentation = pd.merge(rfm_providers, rfm_segmentation, left_index=True, right_index=True)

In [37]:
#Saving the feature engineering results as CSV file
rfm_segmentation.to_csv(Path("..","data","rfm_segmentation.csv"),index=False)