# Feature Engineering Pipeline

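This notebook demonstrates a robust, automated, and reproducible feature engineering workflow using custom transformers and scikit-learn pipelines. The transformers are implemented in `src/feature_engineering.py`; the RFM-based proxy target (`is_high_risk`) is engineered inline in the final cells of this notebook.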

In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
# --- Imports for proxy target variable engineering: RFM & high-risk label ---

import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


In [2]:
# Make the notebook's parent directory importable so that the `src` package
# (src/feature_engineering.py) resolves from here.
# The membership check keeps repeated runs of this cell from stacking
# duplicate entries onto sys.path.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
from src.feature_engineering import (
    DateTimeFeatureExtractor,
    AggregateFeatures,
    DataFrameImputer,
    CategoricalEncoder,
    NumericScaler,
    build_feature_pipeline,
)

# Load Raw Data
# Read the raw transaction export and preview its first rows.
raw_data_path = '../data/raw/data.csv'
df = pd.read_csv(filepath_or_buffer=raw_data_path)
df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [4]:
# Extract DateTime Features
# Run the project's DateTimeFeatureExtractor on the transaction timestamp
# column, then preview the calendar columns it is expected to add.
datetime_extractor = DateTimeFeatureExtractor(datetime_col='TransactionStartTime')
df = datetime_extractor.fit_transform(df)

datetime_preview_cols = ['year', 'month', 'day', 'hour']
df[datetime_preview_cols].head()

Unnamed: 0,year,month,day,hour
0,2018,11,15,2
1,2018,11,15,2
2,2018,11,15,2
3,2018,11,15,3
4,2018,11,15,3


In [5]:
# Aggregate Customer Features
# Run the project's AggregateFeatures transformer keyed on CustomerId / Amount,
# then preview the aggregate columns alongside the customer key.
agg_features = AggregateFeatures(amount_col='Amount', customer_id_col='CustomerId')
df = agg_features.fit_transform(df)

aggregate_preview_cols = [
    'CustomerId',
    'total_transaction_amount',
    'average_transaction_amount',
    'transaction_count',
    'std_transaction_amount',
]
df[aggregate_preview_cols].head()

Unnamed: 0,CustomerId,total_transaction_amount,average_transaction_amount,transaction_count,std_transaction_amount
0,CustomerId_4406,109921.75,923.712185,119,3042.294251
1,CustomerId_4406,109921.75,923.712185,119,3042.294251
2,CustomerId_4683,1000.0,500.0,2,0.0
3,CustomerId_988,228727.2,6019.136842,38,17169.24161
4,CustomerId_988,228727.2,6019.136842,38,17169.24161


In [6]:
# Encode Categorical Variables
# NOTE(review): the identifier columns (AccountId, SubscriptionId, CustomerId)
# are one-hot encoded along with the genuine categoricals. The processed frame
# loaded later in this notebook reports 11062 columns, presumably driven by
# these identifiers -- confirm this is intended. FraudResult is encoded here
# as well -- confirm it is not needed as a single label column downstream.
categorical_cols = [
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'FraudResult',
]
categorical_encoder = CategoricalEncoder(encoding='onehot', categorical_cols=categorical_cols)
df = categorical_encoder.fit_transform(df)
df.head()

Unnamed: 0,TransactionId,BatchId,CurrencyCode,CountryCode,Amount,Value,TransactionStartTime,PricingStrategy,year,month,...,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,FraudResult_0,FraudResult_1
0,TransactionId_76871,BatchId_36123,UGX,256,1000.0,1000,2018-11-15 02:18:49+00:00,2,2018,11,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,TransactionId_73770,BatchId_15642,UGX,256,-20.0,20,2018-11-15 02:19:08+00:00,2,2018,11,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,TransactionId_26203,BatchId_53941,UGX,256,500.0,500,2018-11-15 02:44:21+00:00,2,2018,11,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,TransactionId_380,BatchId_102363,UGX,256,20000.0,21800,2018-11-15 03:32:55+00:00,2,2018,11,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,TransactionId_28195,BatchId_38780,UGX,256,-644.0,644,2018-11-15 03:34:21+00:00,2,2018,11,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [7]:
# Scale Numeric Features
# Pass the listed numeric columns through the project's NumericScaler in
# 'standard' mode and preview the result.
numeric_cols = [
    'Amount',
    'Value',
    'PricingStrategy',
    'day',
    'hour',
]
numeric_scaler = NumericScaler(scaling='standard', numeric_cols=numeric_cols)
df = numeric_scaler.fit_transform(df)
df.head()

Unnamed: 0,TransactionId,BatchId,CurrencyCode,CountryCode,Amount,Value,TransactionStartTime,PricingStrategy,year,month,...,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,FraudResult_0,FraudResult_1
0,TransactionId_76871,BatchId_36123,UGX,256,-0.046371,-0.072291,2018-11-15 02:18:49+00:00,-0.349252,2018,11,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,TransactionId_73770,BatchId_15642,UGX,256,-0.054643,-0.080251,2018-11-15 02:19:08+00:00,-0.349252,2018,11,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,TransactionId_26203,BatchId_53941,UGX,256,-0.050426,-0.076352,2018-11-15 02:44:21+00:00,-0.349252,2018,11,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,TransactionId_380,BatchId_102363,UGX,256,0.107717,0.096648,2018-11-15 03:32:55+00:00,-0.349252,2018,11,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,TransactionId_28195,BatchId_38780,UGX,256,-0.059704,-0.075183,2018-11-15 03:34:21+00:00,-0.349252,2018,11,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [8]:
# Build and Run Feature Engineering Pipeline (currently disabled)
# try:
#     pipeline = build_feature_pipeline()
#     df_raw = pd.read_csv(raw_data_path)
#     df_processed = pipeline.fit_transform(df_raw)
#     df_processed.head()
# except Exception as e:
#     import traceback
#     traceback.print_exc()

In [9]:
# Save Processed Data (currently disabled)
# processed_data_path = '../data/processed/model_ready.csv'
# df_processed.to_csv(processed_data_path, index=False)
# print(f"Processed data saved to {processed_data_path}")

In [10]:
# Load the model-ready dataset from disk. The file must already exist: the
# cells that would build and save it in this notebook are commented out.
model_ready_csv = '../data/processed/model_ready.csv'
df_processed = pd.read_csv(filepath_or_buffer=model_ready_csv)

In [12]:
# Show the column labels of the loaded frame (pandas abbreviates a long Index in its repr).
df_processed.columns

Index(['TransactionId', 'BatchId', 'CurrencyCode', 'CountryCode', 'Amount',
       'Value', 'TransactionStartTime', 'PricingStrategy', 'year', 'month',
       ...
       'ProductCategory_ticket', 'ProductCategory_transport',
       'ProductCategory_tv', 'ProductCategory_utility_bill',
       'ChannelId_ChannelId_1', 'ChannelId_ChannelId_2',
       'ChannelId_ChannelId_3', 'ChannelId_ChannelId_5', 'FraudResult_0.0',
       'FraudResult_1.0'],
      dtype='object', length=11062)

In [14]:
# 1. Calculate RFM metrics (Recency, Frequency, Monetary) per customer

# Recover the original CustomerId from the one-hot encoded columns if necessary.
# One-hot columns are named '<column>_<value>' (e.g. 'ChannelId_ChannelId_3' for
# the raw value 'ChannelId_3'), so only the leading 'CustomerId_' belongs to the
# encoder. The remainder (e.g. 'CustomerId_4406') is the original identifier and
# is kept intact so it still matches the ids in the raw data.
if 'CustomerId' not in df_processed.columns:
    onehot_prefix = 'CustomerId_'
    customer_id_cols = [col for col in df_processed.columns if col.startswith('CustomerId_CustomerId_')]
    if not customer_id_cols:
        raise ValueError(
            "Cannot recover CustomerId: no 'CustomerId' column and no "
            "'CustomerId_CustomerId_*' one-hot columns in df_processed."
        )
    # idxmax(axis=1) yields, per row, the name of the column holding the largest
    # value, i.e. the one-hot column set to 1 for that transaction.
    df_processed['CustomerId'] = (
        df_processed[customer_id_cols].idxmax(axis=1).str[len(onehot_prefix):]
    )

df_processed['TransactionStartTime'] = pd.to_datetime(df_processed['TransactionStartTime'])

# The snapshot is one day after the latest transaction, so the most recently
# active customer gets Recency == 1 rather than 0.
snapshot_date = df_processed['TransactionStartTime'].max() + pd.Timedelta(days=1)

# NOTE(review): the NumericScaler cell earlier in this notebook standardizes
# 'Amount' in `df`; if model_ready.csv stores the same scaled column, Monetary is
# a sum of z-scores rather than of currency amounts -- confirm against the pipeline.
rfm = (
    df_processed.groupby('CustomerId')
    .agg(
        Recency=('TransactionStartTime', lambda ts: (snapshot_date - ts.max()).days),  # days since last transaction
        Frequency=('TransactionId', 'count'),  # number of transactions
        Monetary=('Amount', 'sum'),            # total transacted amount
    )
    .reset_index()
)


In [15]:
# 2. Scale RFM features
# Standardize the three metrics before clustering; the result is a NumPy array
# whose rows line up with the rows of `rfm`.
rfm_feature_cols = ['Recency', 'Frequency', 'Monetary']
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm.loc[:, rfm_feature_cols])


In [16]:

# 3. KMeans clustering
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' (announced with a FutureWarning beforehand), so leaving it implicit
# makes the resulting clusters depend on the installed version even with a
# fixed random_state.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
rfm['cluster'] = kmeans.fit_predict(rfm_scaled)


In [17]:

# 4. Identify high-risk cluster (lowest Frequency & Monetary, highest Recency)
cluster_stats = rfm.groupby('cluster')[['Recency', 'Frequency', 'Monetary']].mean()

# A multi-key sort is lexicographic: on continuous cluster means the second and
# third keys only matter for exact ties, so sorting by
# ['Frequency', 'Monetary', 'Recency'] alone effectively selects on Frequency.
# To honour all three criteria, rank the clusters on each metric
# (rank 1 = most "high-risk" on that metric) and pick the lowest combined rank.
risk_rank = (
    cluster_stats['Frequency'].rank(ascending=True)
    + cluster_stats['Monetary'].rank(ascending=True)
    + cluster_stats['Recency'].rank(ascending=False)
)

# Ties on the combined rank fall back to the Frequency -> Monetary -> Recency order.
high_risk_cluster = (
    cluster_stats.assign(risk_rank=risk_rank)
    .sort_values(
        ['risk_rank', 'Frequency', 'Monetary', 'Recency'],
        ascending=[True, True, True, False],
    )
    .index[0]
)


In [18]:

# 5. Assign is_high_risk label
# 1 for customers in the selected high-risk cluster, 0 for everyone else.
in_high_risk_cluster = rfm['cluster'].eq(high_risk_cluster)
rfm['is_high_risk'] = in_high_risk_cluster.astype(int)


In [None]:

# 6. Merge is_high_risk back to main processed dataset
# This notebook writes its result back to the same CSV it loads, so on a second
# run (or when this cell is re-executed) df_processed already carries an
# 'is_high_risk' column and a plain merge would emit 'is_high_risk_x' /
# 'is_high_risk_y' instead. Drop any stale label first so the cell is idempotent.
risk_labels = rfm[['CustomerId', 'is_high_risk']]
df_processed = (
    df_processed
    .drop(columns=['is_high_risk'], errors='ignore')
    # many_to_one: each transaction row maps to exactly one customer label;
    # pandas raises if CustomerId is duplicated in risk_labels.
    .merge(risk_labels, on='CustomerId', how='left', validate='many_to_one')
)


In [19]:

# 7. Save the updated processed data
# Written without the index; this overwrites the model-ready CSV in place.
processed_data_path = '../data/processed/model_ready.csv'
df_processed.to_csv(path_or_buf=processed_data_path, index=False)
print("Added is_high_risk target variable and saved updated data.")

Added is_high_risk target variable and saved updated data.
