# Feature Engineering Pipeline

In [None]:
import sys
import os
import importlib.util

def _load_module_from_path(module_name, file_path):
    """Execute the Python source file at ``file_path`` and return it as a module.

    Args:
        module_name: Name assigned to the module; also used as its key in
            ``sys.modules``.
        file_path: Path of the ``.py`` file to load.

    Returns:
        The executed module object.

    Raises:
        FileNotFoundError: If ``file_path`` does not point to an existing file.
        ImportError: If no import spec or loader can be created for the file.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Module source not found: {file_path}")
    module_spec = importlib.util.spec_from_file_location(module_name, file_path)
    if module_spec is None or module_spec.loader is None:
        raise ImportError(f"Cannot create an import spec for: {file_path}")
    module = importlib.util.module_from_spec(module_spec)
    # Register the module before executing it, as the importlib recipe for
    # importing a source file directly recommends; this lets code that looks
    # the module up by name (e.g. pickling of its classes) find it.
    sys.modules[module_name] = module
    try:
        module_spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module registered.
        sys.modules.pop(module_name, None)
        raise
    return module


# Project root is the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)
print("Project Root:", project_root)

# Load src/data_loader.py and expose its load_data function.
loader_path = os.path.join(project_root, "src", "data_loader.py")
data_loader = _load_module_from_path("data_loader", loader_path)
load_data = data_loader.load_data

# Load src/feature_engineering_pipeline.py under the name "task3_pipeline".
pipeline_path = os.path.join(project_root, "src", "feature_engineering_pipeline.py")
task3_pipeline = _load_module_from_path("task3_pipeline", pipeline_path)


Project Root: c:\Users\hp\Desktop\AI projects\bati-bank-credit-scoring


# Load Dataset

In [6]:
# Build the dataset path from project_root instead of a machine-specific
# absolute path, so the notebook runs on any checkout of the project.
raw_data_path = os.path.join(project_root, "data", "raw", "data.csv")
df = load_data(raw_data_path)
df.head()


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


# Prepare target and columns

In [16]:
# Roles of the key columns used by the rest of the notebook.
target = "FraudResult"
customer_col = "CustomerId"
transaction_col = "Amount"
timestamp_col = "TransactionStartTime"

# Numeric features: every numeric column except the label itself.
# list.remove raises ValueError if the target is not among the numeric columns.
numeric_cols = list(df.select_dtypes(include="number").columns)
numeric_cols.remove(target)

# Categorical features: object-typed columns, minus the identifier columns
# and the raw timestamp, none of which are used as categorical features.
non_feature_cols = {
    "TransactionId",
    "BatchId",
    "AccountId",
    "SubscriptionId",
    customer_col,
    timestamp_col,
}
categorical_cols = [
    name
    for name in df.select_dtypes(include="object").columns
    if name not in non_feature_cols
]

print("Categorical Columns:", categorical_cols)
print("Numeric Columns:", numeric_cols)



Categorical Columns: ['CurrencyCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId']
Numeric Columns: ['CountryCode', 'Amount', 'Value', 'PricingStrategy']


# Build Task 3 pipeline

In [17]:
# Separate the label from the feature frame.
y = df[target]
X = df.drop(columns=[target])

# Assemble the Task 3 pipeline from the column roles chosen above.
pipeline_args = (
    numeric_cols,
    categorical_cols,
    customer_col,
    transaction_col,
    timestamp_col,
)
pipeline = task3_pipeline.build_task3_pipeline(*pipeline_args)

# Fit on the full dataset and keep the transformed feature matrix.
X_task3 = pipeline.fit_transform(X, y)
print("Task 3 Main Feature Matrix Shape:", X_task3.shape)


Task 3 Main Feature Matrix Shape: (95662, 47)


# Fit and transform main features

In [18]:
# The pipeline was already fitted on (X, y) in the previous cell, which also
# produced X_task3. Re-running fit_transform on the same inputs only repeats
# that work, so this cell reuses the existing result and reports on it.
print("Task 3 pipeline executed successfully!")
print("Main feature matrix shape:", X_task3.shape)


Task 3 pipeline executed successfully!
Main feature matrix shape: (95662, 47)


# WoE + IV features

In [20]:
target = "FraudResult"

# NOTE(review): presumably the minimum Information Value a feature needs to be
# kept by the transformer — confirm against WOEIVTransformer's implementation.
IV_THRESHOLD = 0.02

# Candidate features: all categorical columns followed by all numeric columns.
woe_input_cols = categorical_cols + numeric_cols

# Create the WoE + IV transformer, then fit it and transform in one step.
woe_iv = task3_pipeline.WOEIVTransformer(target_col=target, iv_threshold=IV_THRESHOLD)
X_woe_iv = woe_iv.fit_transform(df[woe_input_cols], df[target])

print("WoE + IV features shape:", X_woe_iv.shape)
print("\nInformation Value (IV) for features:\n")
# Show the IV values from highest to lowest.
iv_ranked = woe_iv.iv_values_.sort_values(ascending=False)
print(iv_ranked)


  grouped = df2.groupby('bin')[self.target_col].agg(['count', 'sum'])
  grouped = df2.groupby('bin')[self.target_col].agg(['count', 'sum'])
  grouped = df2.groupby('bin')[self.target_col].agg(['count', 'sum'])
  grouped = df2.groupby('bin')[self.target_col].agg(['count', 'sum'])


WoE + IV features shape: (95662, 7)

Information Value (IV) for features:

Value              10.112496
Amount              9.914119
ProductId           5.017794
ProviderId          3.329529
ProductCategory     1.463272
ChannelId           1.350916
PricingStrategy     0.085529
dtype: float64
