In [73]:
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

import sys
sys.path.insert(1, '../src')
from feature_extractor import preprocess_features, aggregate_customer_features, iv_woe

In [74]:
def extract_customer(df, snaphot_date=None):
    """
    Aggregate transaction rows into one feature row per customer.

    Parameters:
    df (DataFrame): Transactions with 'CustomerId', 'TransactionId',
        'TransactionStartTime' (datetime), 'Amount', 'Value', 'ProviderId',
        'ProductId', 'ChannelId' and 'FraudResult' columns.
    snaphot_date (datetime-like, optional): Reference date that recency is
        measured against. Defaults to one day after the latest transaction
        in `df`.

    Returns:
    DataFrame: One row per 'CustomerId' with recency (whole days), transaction
        counts, amount/value sum/mean/std, distinct provider/product/channel
        counts, and fraud count/rate.
    """
    reference_date = snaphot_date
    if reference_date is None:
        reference_date = df['TransactionStartTime'].max() + timedelta(days=1)

    def days_since_last(timestamps):
        # Whole days between the reference date and the customer's latest transaction.
        return (reference_date - timestamps.max()).days

    # Output column -> (source column, aggregation). Key order is the output column order.
    # 'frequency'/'total_transactions' and 'monetary'/'total_amount' are the
    # same aggregation under two names.
    aggregations = {
        'recency': ('TransactionStartTime', days_since_last),
        'frequency': ('TransactionId', 'count'),
        'monetary': ('Amount', 'sum'),
        'total_transactions': ('TransactionId', 'count'),
        'total_amount': ('Amount', 'sum'),
        'avg_amount': ('Amount', 'mean'),
        'std_amount': ('Amount', 'std'),
        'total_value': ('Value', 'sum'),
        'avg_value': ('Value', 'mean'),
        'std_value': ('Value', 'std'),
        'unique_providers': ('ProviderId', 'nunique'),
        'unique_products': ('ProductId', 'nunique'),
        'unique_channels': ('ChannelId', 'nunique'),
        'fraud_count': ('FraudResult', 'sum'),
        'fraud_rate': ('FraudResult', 'mean'),
    }

    per_customer = df.groupby('CustomerId').agg(**aggregations)
    return per_customer.reset_index()

In [75]:
def scaled_customers(rfm_df):
    """
    Standardize every numeric column of a customer feature table.

    Non-numeric columns are ignored; all numeric columns (not only the
    recency/frequency/monetary ones) are passed to StandardScaler.

    Parameters:
    rfm_df (DataFrame): Customer-level feature table.

    Returns:
    tuple: (scaled_rfm, scaler)
        scaled_rfm (ndarray): Scaled values, one column per numeric column of
            `rfm_df`, in the original column order.
        scaler (StandardScaler): The scaler fitted on those columns.
    """
    # Restrict to numeric dtypes; anything else is left out of the scaling.
    numeric_names = rfm_df.select_dtypes(include=np.number).columns
    scaler = StandardScaler()
    scaled_rfm = scaler.fit_transform(rfm_df[numeric_names])
    return scaled_rfm, scaler

In [76]:
def create_rfm_clusters(scaled_rfm, random_state=42, n_clusters=3):
    """
    Perform KMeans clustering on scaled customer features.

    Parameters:
    scaled_rfm (ndarray): Scaled feature matrix, one row per customer.
    random_state (int): Seed passed to KMeans for reproducible results (default=42).
    n_clusters (int): Number of clusters to fit (default=3, the value that
        was previously hard-coded).

    Returns:
    ndarray: Cluster label for each row of `scaled_rfm`.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    return kmeans.fit_predict(scaled_rfm)

In [77]:
def assign_proxy_label(rfm_df, clusters):
    """
    Assigns a proxy label for high-risk customers based on cluster profiles.

    The cluster with the lowest mean frequency (ties broken by lowest mean
    monetary) is treated as the high-risk segment.

    Parameters:
    rfm_df (DataFrame): Customer features; must contain 'recency',
        'frequency' and 'monetary' columns. It is not modified.
    clusters (array-like): Cluster label for each row of `rfm_df`. Labels may
        be any values (they do not need to be 0..k-1).

    Returns:
    DataFrame: A copy of `rfm_df` with an added integer 'is_high_risk' column
        (1 for rows in the high-risk cluster, 0 otherwise).
    """
    # Work on a copy so the caller's frame does not silently gain
    # 'cluster' / 'is_high_risk' columns (which would then leak into any
    # later numeric-column scaling of the same frame).
    labeled = rfm_df.copy()
    labeled['cluster'] = clusters

    # Compute mean values per cluster to identify the least engaged one.
    cluster_profile = (
        labeled.groupby('cluster')[['recency', 'frequency', 'monetary']]
        .mean()
        .reset_index()
    )
    print(cluster_profile)

    # Lowest frequency & monetary = high risk. Read the label from the
    # 'cluster' column: after reset_index() the index is only a row position
    # and matches the label only when labels happen to be 0..k-1.
    ranked = cluster_profile.sort_values(by=['frequency', 'monetary'])
    high_risk_cluster = ranked['cluster'].iloc[0]

    labeled['is_high_risk'] = (labeled['cluster'] == high_risk_cluster).astype(int)
    return labeled.drop(columns=['cluster'])

In [78]:
# Load the raw transaction data (path is relative to the notebook's directory).
df = pd.read_csv('../data/raw/data.csv')
# Parse timestamps up front: extract_customer does datetime arithmetic on this column.
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

## RFM Segmentation and High-Risk Labeling

In this section, we perform RFM (Recency, Frequency, Monetary) analysis to segment customers based on their transaction behaviors. The workflow includes:

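- Calculating RFM features, plus additional per-customer aggregates (amount/value statistics, provider/product/channel diversity, fraud counts), from the transaction data.
- Scaling all numeric customer features for clustering.
- Applying KMeans clustering to group customers into three segments.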
- Assigning a proxy label (`is_high_risk`) to identify high-risk customer segments based on cluster profiles.

This segmentation enables targeted analysis and risk modeling in subsequent steps.

In [79]:
# Build the per-customer feature table from the raw transactions.
customers = extract_customer(df)
# The 'std' aggregations can come back NaN (presumably for customers with a
# single transaction); treat those as zero spread so scaling does not see NaN.
customers['std_amount'] = customers['std_amount'].fillna(0)
customers['std_value'] = customers['std_value'].fillna(0)
# NOTE(review): this rebinds the name `scaled_customers` from the function to
# the returned array, so re-running this cell without first re-running the
# cell that defines the function raises TypeError (ndarray is not callable).
# Later cells read the array under this same name, so renaming needs a
# coordinated change.
scaled_customers, scaler = scaled_customers(customers)
# Wrap the scaled array back into a DataFrame, reusing the same numeric-column
# selection the scaler used so the column names line up with the array.
numeric_cols = customers.select_dtypes(include=np.number).columns
processed_customers = pd.DataFrame(scaled_customers, columns=numeric_cols)


In [80]:
# Sanity check: column names, dtypes and non-null counts of the scaled feature frame.
processed_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3742 entries, 0 to 3741
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   recency             3742 non-null   float64
 1   frequency           3742 non-null   float64
 2   monetary            3742 non-null   float64
 3   total_transactions  3742 non-null   float64
 4   total_amount        3742 non-null   float64
 5   avg_amount          3742 non-null   float64
 6   std_amount          3742 non-null   float64
 7   total_value         3742 non-null   float64
 8   avg_value           3742 non-null   float64
 9   std_value           3742 non-null   float64
 10  unique_providers    3742 non-null   float64
 11  unique_products     3742 non-null   float64
 12  unique_channels     3742 non-null   float64
 13  fraud_count         3742 non-null   float64
 14  fraud_rate          3742 non-null   float64
dtypes: float64(15)
memory usage: 438.6 KB


In [93]:
# Cluster customers on the scaled features and derive the proxy risk label.
clusters = create_rfm_clusters(scaled_customers)
labeled_customers = assign_proxy_label(customers, clusters)

# Propagate the customer-level label to every transaction of that customer.
df_labeled = df.merge(labeled_customers[['CustomerId', 'is_high_risk']], on='CustomerId', how='left')

# Attach the label to the scaled feature frame. `assign` overwrites an existing
# 'is_high_risk' column, so re-running this cell does not append a duplicate
# column the way pd.concat(axis=1) did. The rows of `processed_customers` and
# `labeled_customers` are in the same customer order, so the values are
# attached by position; a length mismatch raises instead of padding with NaN.
processed_customers = processed_customers.assign(
    is_high_risk=labeled_customers['is_high_risk'].to_numpy()
)

   cluster    recency   frequency      monetary
0        0  23.523810   36.777381  1.462872e+05
1        1  21.333333  109.000000  6.482298e+07
2        2  47.894996    2.178835  6.524201e+04


In [106]:
# Spot-check a few scaled rows together with their label.
# NOTE(review): no random_state is passed, so the displayed rows change on every run.
processed_customers.sample(5)

Unnamed: 0,recency,frequency,monetary,total_transactions,total_amount,avg_amount,std_amount,total_value,avg_value,std_value,unique_providers,unique_products,unique_channels,fraud_count,fraud_rate,is_high_risk
163,-0.643963,0.004495,-0.036256,0.004495,-0.036256,-0.076927,-0.102379,-0.061379,-0.09217,-0.09831,1.280259,0.843513,0.450075,-0.066617,-0.086096,0
2077,-0.496445,-0.170914,-0.059161,-0.170914,-0.059161,-0.086437,-0.124144,-0.088375,-0.103317,-0.117741,0.392594,1.333185,0.450075,-0.066617,-0.086096,0
934,-0.570204,-0.150278,0.205475,-0.150278,0.205475,0.302058,0.217944,0.17562,0.284342,0.226766,-1.382737,-1.115175,-1.404749,-0.066617,-0.086096,1
900,-0.791481,-0.19155,-0.068657,-0.19155,-0.068657,-0.106335,-0.123838,-0.087756,-0.099395,-0.114918,-1.382737,-0.625503,-1.404749,-0.066617,-0.086096,1
2017,1.826966,-0.253459,-0.059529,-0.253459,-0.059529,-0.034087,-0.140432,-0.089524,-0.052297,-0.131508,-1.382737,-1.115175,-1.404749,-0.066617,-0.086096,1


In [83]:
# Label distribution at transaction level (one row per transaction, so
# customers with many transactions weigh more here than in the customer-level counts).
df_labeled['is_high_risk'].value_counts()

is_high_risk
0    93006
1     2656
Name: count, dtype: int64

In [84]:
# Run the project's transaction-level preprocessing (helper imported from
# ../src/feature_extractor); the second return value is discarded.
# NOTE(review): the column Index shown in the output is not printed by this
# cell's own code, presumably a debug print inside the helper; verify.
processed_df, _ = preprocess_features(df_labeled)

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult',
       'is_high_risk'],
      dtype='object')


In [100]:
# Persist both processed tables without the pandas index.
processed_df.to_csv('../data/processed/processed_data.csv', index=False)
# NOTE(review): processed_customers holds only the scaled numeric columns plus
# 'is_high_risk'; there is no CustomerId column, so rows in this file cannot
# be joined back to customers.
processed_customers.to_csv('../data/processed/processed_customers.csv', index=False)

In [101]:
# Label distribution at customer level (one row per customer).
processed_customers['is_high_risk'].value_counts()

is_high_risk
0    2523
1    1219
Name: count, dtype: int64

In [102]:
# Compute Weight of Evidence / Information Value per feature against the proxy label
# (helper imported from ../src/feature_extractor).
# NOTE(review): 'is_high_risk' is derived from clusters fitted on these same
# features, so high IV values here are partly circular. 'frequency' /
# 'total_transactions' and 'monetary' / 'total_amount' are identical
# aggregations, which is why their IVs match.
new_df, woe_df = iv_woe(processed_customers, 'is_high_risk')

Information value of recency is 0.944513
Information value of frequency is 6.214178
Information value of monetary is 1.640208
Information value of total_transactions is 6.214178
Information value of total_amount is 1.640208
Information value of avg_amount is 1.063364
Information value of std_amount is 3.114451
Information value of total_value is 2.052103
Information value of avg_value is 1.192368
Information value of std_value is 2.953377
Information value of unique_providers is 10.476192
Information value of unique_products is 5.453999
Information value of unique_channels is 8.559467
Information value of fraud_count is 0.0
Information value of fraud_rate is 0.0


In [103]:
# Show WoE bins ordered by IV.
# NOTE(review): head(15) is applied before the sort, so this ranks only the
# first 15 rows of woe_df rather than returning the 15 highest-IV bins overall;
# if the latter is intended, sort first and then take head(15).
woe_df.head(15).sort_values(by='IV', ascending=False)

Unnamed: 0,Variable,Cutoff,N,Events,% of Events,Non-Events,% of Non-Events,WoE,IV
0,frequency,"(-0.0987, 0.00449]",371,5,0.004102,366,0.145065,3.565778,0.502645
1,frequency,"(-0.161, -0.0987]",340,14,0.011485,326,0.129211,2.420422,0.284948
9,recency,"(1.606, 2.196]",341,238,0.195242,103,0.040824,-1.56496,0.241657
5,recency,"(-1.1239999999999999, -1.087]",501,55,0.045119,446,0.176774,1.365568,0.179783
3,frequency,"(-0.212, -0.192]",368,31,0.025431,337,0.133571,1.658678,0.17937
8,recency,"(1.008, 1.606]",408,249,0.204266,159,0.06302,-1.175967,0.1661
4,recency,"(-1.087, -0.976]",372,35,0.028712,337,0.133571,1.537317,0.161202
2,frequency,"(-0.192, -0.161]",291,23,0.018868,268,0.106223,1.728075,0.150956
2,recency,"(-0.865, -0.607]",395,54,0.044299,341,0.135157,1.115481,0.10135
4,frequency,"(-0.233, -0.212]",444,72,0.059065,372,0.147444,0.91481,0.08085
