In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns

from src.feature_engineering import CustomerAggregateFeature, DateTimeFeatureExtractor, build_full_pipeline

In [2]:
# Load raw data
df = pd.read_csv("../data/raw/data - data.csv")
df.head()


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


### Feature Engineering
🎯 Goal:
Transform raw transaction data into model-ready features using:
- Customer-level aggregate features
- Time-based features
- Categorical encoding
- Scaling/normalization
- Missing value handling

In [3]:
# Assume 'TransactionStartTime' is a datetime column
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'], errors='coerce')
df['TransactionHour'] = df['TransactionStartTime'].dt.hour
df['TransactionDay'] = df['TransactionStartTime'].dt.day
df['TransactionMonth'] = df['TransactionStartTime'].dt.month
df['TransactionYear'] = df['TransactionStartTime'].dt.year

df = df.to_csv("../data/processed/processed_feature.csv", index=False)

In [4]:
df = pd.read_csv("../data/processed/processed_feature.csv")

In [5]:
print("\n🔢 Summary Statistics:")
print(df.describe(include='all').T)


🔢 Summary Statistics:
                        count unique                        top   freq  \
TransactionId           95662  95662        TransactionId_76871      1   
BatchId                 95662  94809              BatchId_67019     28   
AccountId               95662   3633             AccountId_4841  30893   
SubscriptionId          95662   3627        SubscriptionId_3829  32630   
CustomerId              95662   3742            CustomerId_7343   4091   
CurrencyCode            95662      1                        UGX  95662   
CountryCode           95662.0    NaN                        NaN    NaN   
ProviderId              95662      6               ProviderId_4  38189   
ProductId               95662     23                ProductId_6  32635   
ProductCategory         95662      9         financial_services  45405   
ChannelId               95662      4                ChannelId_3  56935   
Amount                95662.0    NaN                        NaN    NaN   
Value          

In [6]:
required = ['AccountId', 'Value', 'Amount', 'TransactionStartTime', 'ChannelId', 'PricingStrategy']
missing = set(required) - set(df.columns)
if missing:
    print(f"Warning: Missing columns {missing}. Pipeline may fail.")

In [7]:
# Build and apply the full pipeline
pipeline = build_full_pipeline()
X_processed = pipeline.fit_transform(df)
#
print(X_processed.shape)

AttributeError: 'DebugTransformer' object has no attribute 'fit'

In [8]:
# First check what columns you actually have
print("Existing columns:", df.columns.tolist())

# Check if 'TransactionHour' exists
if 'TransactionHour' not in df.columns:
    print("ERROR: 'TransactionHour' column is missing")

Existing columns: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy', 'FraudResult', 'TransactionHour', 'TransactionDay', 'TransactionMonth', 'TransactionYear']


### Proxy Target Variable Engineering

🎯 Goal:
Create a binary target column (is_high_risk) by identifying disengaged customers using RFM (Recency, Frequency, Monetary) and clustering.

In [9]:
from src.proxy_target_engineering import ProxyTargetEngineer

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   TransactionId         95662 non-null  object 
 1   BatchId               95662 non-null  object 
 2   AccountId             95662 non-null  object 
 3   SubscriptionId        95662 non-null  object 
 4   CustomerId            95662 non-null  object 
 5   CurrencyCode          95662 non-null  object 
 6   CountryCode           95662 non-null  int64  
 7   ProviderId            95662 non-null  object 
 8   ProductId             95662 non-null  object 
 9   ProductCategory       95662 non-null  object 
 10  ChannelId             95662 non-null  object 
 11  Amount                95662 non-null  float64
 12  Value                 95662 non-null  int64  
 13  TransactionStartTime  95662 non-null  object 
 14  PricingStrategy       95662 non-null  int64  
 15  FraudResult        

In [11]:
df["TransactionHour"] = pd.to_datetime(df["TransactionHour"])
df["TransactionDay"] = pd.to_datetime(df["TransactionDay"])
df["TransactionMonth"] = pd.to_datetime(df["TransactionMonth"])
df["TransactionYear"] = pd.to_datetime(df["TransactionYear"])

In [12]:
# Build and apply proxy target logic
proxy = ProxyTargetEngineer(snapshot_date='2023-12-31')
target_df = proxy.generate_target(df)

# Merge with preprocessed feature data
final_data = df.merge(target_df, on='CustomerId', how='left')

TypeError: Cannot subtract tz-naive and tz-aware datetime-like objects.