In [1]:
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd

import sys
sys.path.insert(1, '../src')
from feature_extractor import preprocess_features, aggregate_customer_features, iv_woe

In [2]:
def calculate_rfm(df, snaphot_date=None):
    """
    Calculate RFM (Recency, Frequency, Monetary) features per customer.

    Parameters:
    df (DataFrame): Transaction-level data containing 'CustomerId', 'TransactionId',
        'Amount', and a datetime-typed 'TransactionStartTime' column.
    snaphot_date (datetime, optional): Reference date for the recency calculation.
        Defaults to one day after the latest 'TransactionStartTime' in ``df``.
        The parameter keeps its original spelling so existing keyword callers still work.

    Returns:
    DataFrame: One row per customer with the columns
        'CustomerId', 'Recency' (whole days between the snapshot date and the
        customer's latest transaction), 'Frequency' (count of non-null
        'TransactionId' values) and 'Monetary' (sum of 'Amount').
    """
    if snaphot_date is None:
        # Snapshot one day past the last transaction so the most recent
        # customer gets a recency of 1 rather than 0.
        snaphot_date = df['TransactionStartTime'].max() + timedelta(days=1)

    # Named aggregation produces the final column names directly, so no
    # rename step or positional column overwrite is needed afterwards.
    rfm = (
        df.groupby('CustomerId')
        .agg(
            Recency=('TransactionStartTime', lambda ts: (snaphot_date - ts.max()).days),
            Frequency=('TransactionId', 'count'),
            Monetary=('Amount', 'sum'),
        )
        .reset_index()
    )

    return rfm

In [None]:
def scale_rfm(rfm_df):
    """
    Standardize the three RFM columns with a freshly fitted StandardScaler.

    Parameters:
    rfm_df (DataFrame): Must provide the 'Recency', 'Frequency', and 'Monetary' columns;
        any other columns are ignored.

    Returns:
    tuple: (scaled_rfm, scaler)
        scaled_rfm (ndarray): The standardized RFM values, one row per input row,
            columns in the order Recency, Frequency, Monetary.
        scaler (StandardScaler): The scaler fitted on those columns, returned so the
            same transformation can be reused later.
    """
    feature_columns = ['Recency', 'Frequency', 'Monetary']
    fitted_scaler = StandardScaler()
    standardized = fitted_scaler.fit_transform(rfm_df[feature_columns])
    return standardized, fitted_scaler

In [None]:
def create_rfm_clusters(scaled_rfm, random_state=42, n_clusters=3):
    """
    Perform KMeans clustering on scaled RFM features.

    Parameters:
    scaled_rfm (ndarray): Scaled RFM features as a numpy array.
    random_state (int): Random state for KMeans clustering (default=42).
    n_clusters (int): Number of clusters to form (default=3, the value that was
        previously hard-coded).

    Returns:
    ndarray: Cluster labels for each customer.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    # fit_predict fits the model and returns the label of each input row in one step.
    return kmeans.fit_predict(scaled_rfm)

In [None]:
def assign_proxy_label(rfm_df, clusters):
    """
    Assigns a proxy label for high-risk customers based on RFM cluster profiles.

    The cluster with the lowest mean Frequency (ties broken by the lowest mean
    Monetary) is treated as the high-risk segment.

    Parameters:
    rfm_df (DataFrame): DataFrame containing 'Recency', 'Frequency' and 'Monetary'
        for each customer. It is modified in place: the 'cluster' and
        'is_high_risk' columns are added to it.
    clusters (array-like): Cluster labels assigned to each customer, aligned with
        the rows of ``rfm_df``.

    Returns:
    DataFrame: The same ``rfm_df`` object with added 'cluster' and 'is_high_risk'
        (1 for the high-risk cluster, 0 otherwise) columns.
    """
    rfm_df['cluster'] = clusters

    # Compute mean values per cluster to identify the least engaged one
    cluster_profile = (
        rfm_df.groupby('cluster')[['Recency', 'Frequency', 'Monetary']]
        .mean()
        .reset_index()
    )
    print(cluster_profile)

    # Assumption: lowest frequency & monetary = high risk.
    # Read the label from the 'cluster' column rather than the row index: after
    # reset_index() the index is only a row position, which equals the cluster
    # label only when the labels happen to be exactly 0..k-1.
    high_risk_cluster = (
        cluster_profile.sort_values(by=['Frequency', 'Monetary'])['cluster'].iloc[0]
    )
    rfm_df['is_high_risk'] = (rfm_df['cluster'] == high_risk_cluster).astype(int)

    return rfm_df

In [6]:
# Load the raw transactions and convert the timestamp column to datetime in one chain.
df = pd.read_csv('../data/raw/data.csv').assign(
    TransactionStartTime=lambda raw: pd.to_datetime(raw['TransactionStartTime'])
)

## RFM Segmentation and High-Risk Labeling

In this section, we perform RFM (Recency, Frequency, Monetary) analysis to segment customers based on their transaction behaviors. The workflow includes:

- Calculating RFM features for each customer using transaction data.
- Scaling RFM features for clustering.
- Applying KMeans clustering to group customers into segments.
- Assigning a proxy label (`is_high_risk`) that flags the least-engaged cluster — the one with the lowest mean Frequency, with ties broken by the lowest mean Monetary — as the high-risk segment.

This segmentation enables targeted analysis and risk modeling in subsequent steps.

In [7]:
rfm = calculate_rfm(df)
scale_rfm, scaler = scale_rfm(rfm)


In [8]:
clusters = create_rfm_clusters(scale_rfm)
rfm_labeled = assign_proxy_label(rfm, clusters)
df_labeled = df.merge(rfm_labeled[['CustomerId', 'is_high_risk']], on='CustomerId', how='left')

   cluster    Recency    Frequency      Monetary
0        0  61.859846     7.726699  8.172379e+04
1        1  29.000000  4091.000000 -1.049000e+08
2        2  12.716076    34.807692  2.726546e+05


In [9]:
# Fixed seed so the displayed sample is the same on every Restart & Run All.
df_labeled.sample(5, random_state=42)

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,is_high_risk
55492,TransactionId_99187,BatchId_35543,AccountId_318,SubscriptionId_3087,CustomerId_647,UGX,256,ProviderId_6,ProductId_3,airtime,ChannelId_3,1000.0,1000,2019-01-09 19:58:04+00:00,2,0,0
42344,TransactionId_53717,BatchId_8953,AccountId_4841,SubscriptionId_3829,CustomerId_647,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-10.0,10,2018-12-27 14:19:07+00:00,2,0,0
50985,TransactionId_72182,BatchId_27582,AccountId_3369,SubscriptionId_3665,CustomerId_3802,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,10000.0,10000,2019-01-04 20:32:46+00:00,2,0,0
80575,TransactionId_20002,BatchId_67943,AccountId_4841,SubscriptionId_3829,CustomerId_2217,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-5000.0,5000,2019-02-01 08:48:29+00:00,2,0,0
81362,TransactionId_17742,BatchId_114134,AccountId_4840,SubscriptionId_3829,CustomerId_1922,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-1000.0,1000,2019-02-01 13:17:05+00:00,2,0,0


In [10]:
# Label distribution at transaction level (one row per transaction, not per customer).
df_labeled['is_high_risk'].value_counts()

is_high_risk
0    84636
1    11026
Name: count, dtype: int64

In [11]:
# preprocess_features is imported from feature_extractor (source not shown here);
# only its first return value is kept, the second is discarded.
processed_df, _ = preprocess_features(df_labeled)

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult',
       'is_high_risk'],
      dtype='object')


In [12]:
# Persist the processed frame; index=False keeps the row index out of the CSV.
processed_df.to_csv('../data/processed/processed_data.csv', index=False)

In [13]:
# Re-check the label distribution on the processed frame to compare it with the
# counts shown for df_labeled before preprocessing.
processed_df['is_high_risk'].value_counts()

is_high_risk
0    84636
1    11026
Name: count, dtype: int64

In [14]:
# iv_woe is imported from feature_extractor (source not shown here) and is called with
# 'is_high_risk' as the target column.
# NOTE(review): presumably returns a per-variable IV summary and a per-bin WoE table — confirm.
new_df, woe_df = iv_woe(processed_df, 'is_high_risk')

Information value of Amount is 0.02353
Information value of Value is 0.048139
Information value of ProductCategory_airtime is 2.670338
Information value of ProductCategory_data_bundles is 1.689457


Information value of ProductCategory_financial_services is 0.032647
Information value of ProductCategory_movies is 0.018344
Information value of ProductCategory_other is 0.018443
Information value of ProductCategory_ticket is 2.3e-05
Information value of ProductCategory_transport is 0.012851
Information value of ProductCategory_tv is 0.016609
Information value of ProductCategory_utility_bill is 0.000601
Information value of ProviderId_ProviderId_1 is 0.022983
Information value of ProviderId_ProviderId_2 is 0.000283
Information value of ProviderId_ProviderId_3 is 0.00093
Information value of ProviderId_ProviderId_4 is 0.001152
Information value of ProviderId_ProviderId_5 is 0.02462
Information value of ProviderId_ProviderId_6 is 8.8e-05
Information value of ProductId_ProductId_1 is 0.006249
Information value of ProductId_ProductId_10 is 0.002392
Information value of ProductId_ProductId_11 is 0.004222
Information value of ProductId_ProductId_12 is 0.013927
Information value of ProductId_

In [15]:
# Takes the first 15 rows of woe_df and orders those rows by IV, highest first.
# NOTE(review): head() runs before the sort, so this ranks only the first 15 rows,
# not the 15 highest-IV rows of the whole table — confirm this is intended.
woe_df.head(15).sort_values(by='IV', ascending=False)

Unnamed: 0,Variable,Cutoff,N,Events,% of Events,Non-Events,% of Non-Events,WoE,IV
1,Value,1.0,15049,1102,0.099946,13947,0.164788,0.500034,0.032423
1,Amount,"(11.0, 13.0]",10486,1569,0.1423,8917,0.105357,-0.300582,0.011104
3,Value,3.0,11039,1560,0.141484,9479,0.111997,-0.233711,0.006891
4,Value,4.0,27716,3622,0.328496,24094,0.284678,-0.143167,0.006273
6,Amount,"(19.0, 23.0]",5472,796,0.072193,4676,0.055248,-0.267504,0.004533
3,Amount,"(14.0, 16.0]",13591,1347,0.122166,12244,0.144667,0.169053,0.003804
5,Amount,"(17.0, 19.0]",11464,1204,0.109196,10260,0.121225,0.1045,0.001257
4,Amount,"(16.0, 17.0]",7291,764,0.069291,6527,0.077118,0.107031,0.000838
2,Amount,"(13.0, 14.0]",5179,663,0.060131,4516,0.053358,-0.119497,0.000809
9,Amount,"(9.0, 11.0]",12212,1315,0.119264,10897,0.128751,0.076547,0.000726
