### Importing Libraries & Annd Dataset Loading

In [10]:
import pandas as pd
import numpy as np
import os

# Load dataset 
data_path = "../dataset/paysim_sample_100k.csv"
df = pd.read_csv(data_path)

print("Dataset loaded:", df.shape)
df.head()


Dataset loaded: (100000, 11)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,13,PAYMENT,25062.87,C1287300038,218829.1,193766.23,M345828317,0.0,0.0,0,0
1,9,PAYMENT,12035.89,C1458652905,21524.0,9488.11,M516109369,0.0,0.0,0,0
2,1,TRANSFER,1147953.82,C1512273713,0.0,0.0,C657736958,4708479.3,5579568.65,0,0
3,9,CASH_OUT,185024.72,C1697166189,463.0,0.0,C1411830789,45453.0,1117043.99,0,0
4,11,PAYMENT,11511.0,C2056406361,298939.55,287428.54,M1051396154,0.0,0.0,0,0


### Log transform & balance change features

In [11]:
# Log-transform amount to reduce skew
df["amount_log"] = np.log1p(df["amount"])

# Balance changes for sender & receiver
df["orig_balance_change"] = df["oldbalanceOrg"] - df["newbalanceOrig"]
df["dest_balance_change"] = df["newbalanceDest"] - df["oldbalanceDest"]

# Flag mismatched balances (amount inconsistency)
df["balance_mismatch_orig"] = ((df["orig_balance_change"] - df["amount"]).abs() > 1e-9).astype(int)
df["balance_mismatch_dest"] = ((df["dest_balance_change"] - df["amount"]).abs() > 1e-9).astype(int)

# Flag zero-balance anomalies
df["orig_zero_but_amount"] = ((df["oldbalanceOrg"] == 0) & (df["amount"] > 0)).astype(int)
df["dest_zero_but_amount"] = ((df["oldbalanceDest"] == 0) & (df["amount"] > 0)).astype(int)

# Balance ratios (guarding divide-by-zero)
df["orig_balance_ratio"] = df["newbalanceOrig"] / (df["oldbalanceOrg"] + 1e-9)
df["dest_balance_ratio"] = df["newbalanceDest"] / (df["oldbalanceDest"] + 1e-9)

# Transaction type encoding (one-hot)
type_dummies = pd.get_dummies(df["type"], prefix="type")
df = pd.concat([df, type_dummies], axis=1)

print("Basic transaction-level features created:", len(type_dummies.columns) + 7)
df.head()


Basic transaction-level features created: 12


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,...,balance_mismatch_dest,orig_zero_but_amount,dest_zero_but_amount,orig_balance_ratio,dest_balance_ratio,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,13,PAYMENT,25062.87,C1287300038,218829.1,193766.23,M345828317,0.0,0.0,0,...,1,0,1,0.885468,0.0,False,False,False,True,False
1,9,PAYMENT,12035.89,C1458652905,21524.0,9488.11,M516109369,0.0,0.0,0,...,1,0,1,0.440815,0.0,False,False,False,True,False
2,1,TRANSFER,1147953.82,C1512273713,0.0,0.0,C657736958,4708479.3,5579568.65,0,...,1,1,0,0.0,1.185004,False,False,False,False,True
3,9,CASH_OUT,185024.72,C1697166189,463.0,0.0,C1411830789,45453.0,1117043.99,0,...,1,0,0,0.0,24.575803,False,True,False,False,False
4,11,PAYMENT,11511.0,C2056406361,298939.55,287428.54,M1051396154,0.0,0.0,0,...,1,0,1,0.961494,0.0,False,False,False,True,False


### Sender-based aggregated features

In [12]:

sender_stats = df.groupby("nameOrig").agg({
    "amount": ["mean", "std", "max", "count"],
    "balance_mismatch_orig": "mean",
    "orig_zero_but_amount": "mean"
}).reset_index()


sender_stats.columns = ["nameOrig",
                        "sender_amount_mean", "sender_amount_std", "sender_amount_max",
                        "sender_tx_count", "sender_balance_mismatch_ratio",
                        "sender_zero_but_ratio"]

# Merge back into main df
df = df.merge(sender_stats, on="nameOrig", how="left")

print("Sender features added")
df.head()


Sender features added


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,...,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,sender_amount_mean,sender_amount_std,sender_amount_max,sender_tx_count,sender_balance_mismatch_ratio,sender_zero_but_ratio
0,13,PAYMENT,25062.87,C1287300038,218829.1,193766.23,M345828317,0.0,0.0,0,...,False,False,True,False,25062.87,,25062.87,1,0.0,0.0
1,9,PAYMENT,12035.89,C1458652905,21524.0,9488.11,M516109369,0.0,0.0,0,...,False,False,True,False,12035.89,,12035.89,1,0.0,0.0
2,1,TRANSFER,1147953.82,C1512273713,0.0,0.0,C657736958,4708479.3,5579568.65,0,...,False,False,False,True,1147953.82,,1147953.82,1,1.0,1.0
3,9,CASH_OUT,185024.72,C1697166189,463.0,0.0,C1411830789,45453.0,1117043.99,0,...,True,False,False,False,185024.72,,185024.72,1,1.0,0.0
4,11,PAYMENT,11511.0,C2056406361,298939.55,287428.54,M1051396154,0.0,0.0,0,...,False,False,True,False,11511.0,,11511.0,1,1.0,0.0


### Receiver-based aggregated features

In [13]:

dest_stats = df.groupby("nameDest").agg({
    "amount": ["mean", "std", "count"],
    "dest_zero_but_amount": "mean"
}).reset_index()

dest_stats.columns = ["nameDest",
                      "dest_amount_mean", "dest_amount_std",
                      "dest_tx_count", "dest_zero_but_ratio"]

df = df.merge(dest_stats, on="nameDest", how="left")

print(" Receiver features added")
df.head()


 Receiver features added


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,...,sender_amount_mean,sender_amount_std,sender_amount_max,sender_tx_count,sender_balance_mismatch_ratio,sender_zero_but_ratio,dest_amount_mean,dest_amount_std,dest_tx_count,dest_zero_but_ratio
0,13,PAYMENT,25062.87,C1287300038,218829.1,193766.23,M345828317,0.0,0.0,0,...,25062.87,,25062.87,1,0.0,0.0,25062.87,,1,1.0
1,9,PAYMENT,12035.89,C1458652905,21524.0,9488.11,M516109369,0.0,0.0,0,...,12035.89,,12035.89,1,0.0,0.0,12035.89,,1,1.0
2,1,TRANSFER,1147953.82,C1512273713,0.0,0.0,C657736958,4708479.3,5579568.65,0,...,1147953.82,,1147953.82,1,1.0,1.0,258726.772,333454.931711,10,0.1
3,9,CASH_OUT,185024.72,C1697166189,463.0,0.0,C1411830789,45453.0,1117043.99,0,...,185024.72,,185024.72,1,1.0,0.0,167767.633333,27190.450516,3,0.0
4,11,PAYMENT,11511.0,C2056406361,298939.55,287428.54,M1051396154,0.0,0.0,0,...,11511.0,,11511.0,1,1.0,0.0,11511.0,,1,1.0


### Temporal / step-based features

In [14]:

# Transaction velocity within same step (hour/batch)
df["tx_per_step_orig"] = df.groupby(["nameOrig", "step"])["amount"].transform("count")
df["amount_per_step_orig"] = df.groupby(["nameOrig", "step"])["amount"].transform("sum")
df["avg_amount_per_step_orig"] = df["amount_per_step_orig"] / (df["tx_per_step_orig"] + 1e-9)

print("Temporal (step-based) features added")
df[["nameOrig", "step", "tx_per_step_orig", "amount_per_step_orig", "avg_amount_per_step_orig"]].head()


Temporal (step-based) features added


Unnamed: 0,nameOrig,step,tx_per_step_orig,amount_per_step_orig,avg_amount_per_step_orig
0,C1287300038,13,1,25062.87,25062.87
1,C1458652905,9,1,12035.89,12035.89
2,C1512273713,1,1,1147953.82,1147954.0
3,C1697166189,9,1,185024.72,185024.7
4,C2056406361,11,1,11511.0,11511.0


### Derived ratios and relationships

In [15]:
df["balance_gap_ratio"] = (df["orig_balance_change"] - df["dest_balance_change"]) / (df["amount"] + 1e-9)
df["relative_amount_to_mean_sender"] = df["amount"] / (df["sender_amount_mean"] + 1e-9)
df["amount_to_balance_orig_ratio"] = df["amount"] / (df["oldbalanceOrg"] + 1e-9)
df["amount_balance_gap"] = np.abs(df["orig_balance_change"] - df["dest_balance_change"])

print(" Ratio and gap-based features added")
df.head()


 Ratio and gap-based features added


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,...,dest_amount_std,dest_tx_count,dest_zero_but_ratio,tx_per_step_orig,amount_per_step_orig,avg_amount_per_step_orig,balance_gap_ratio,relative_amount_to_mean_sender,amount_to_balance_orig_ratio,amount_balance_gap
0,13,PAYMENT,25062.87,C1287300038,218829.1,193766.23,M345828317,0.0,0.0,0,...,,1,1.0,1,25062.87,25062.87,1.0,1.0,0.1145317,25062.87
1,9,PAYMENT,12035.89,C1458652905,21524.0,9488.11,M516109369,0.0,0.0,0,...,,1,1.0,1,12035.89,12035.89,1.0,1.0,0.5591846,12035.89
2,1,TRANSFER,1147953.82,C1512273713,0.0,0.0,C657736958,4708479.3,5579568.65,0,...,333454.931711,10,0.1,1,1147953.82,1147954.0,-0.758819,1.0,1147954000000000.0,871089.35
3,9,CASH_OUT,185024.72,C1697166189,463.0,0.0,C1411830789,45453.0,1117043.99,0,...,27190.450516,3,0.0,1,185024.72,185024.7,-5.789107,1.0,399.6214,1071127.99
4,11,PAYMENT,11511.0,C2056406361,298939.55,287428.54,M1051396154,0.0,0.0,0,...,,1,1.0,1,11511.0,11511.0,1.000001,1.0,0.03850611,11511.01


### Fraud pattern flags & domain logic

In [16]:
df["is_large_transfer"] = ((df["amount"] > 1e6) & (df["type"].isin(["TRANSFER", "CASH_OUT"]))).astype(int)
df["is_same_sender_receiver"] = (df["nameOrig"] == df["nameDest"]).astype(int)
df["is_merchant_dest"] = df["nameDest"].str.startswith("M").astype(int)
df["is_customer_dest"] = df["nameDest"].str.startswith("C").astype(int)

print(" Fraud pattern flags created")
df[["amount", "type", "is_large_transfer", "is_same_sender_receiver", "is_merchant_dest"]].head()


 Fraud pattern flags created


Unnamed: 0,amount,type,is_large_transfer,is_same_sender_receiver,is_merchant_dest
0,25062.87,PAYMENT,0,0,1
1,12035.89,PAYMENT,0,0,1
2,1147953.82,TRANSFER,1,0,0
3,185024.72,CASH_OUT,0,0,0
4,11511.0,PAYMENT,0,0,1


### Interaction features

In [17]:
df["amount_to_sender_mean_ratio"] = df["amount"] / (df["sender_amount_mean"] + 1e-9)
df["amount_to_balance_gap_ratio"] = df["amount"] / (df["amount_balance_gap"] + 1e-9)
df["sender_activity_intensity"] = df["sender_tx_count"] / (df["step"] + 1e-9)

print("Interaction features created")
df.head()


Interaction features created


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,...,relative_amount_to_mean_sender,amount_to_balance_orig_ratio,amount_balance_gap,is_large_transfer,is_same_sender_receiver,is_merchant_dest,is_customer_dest,amount_to_sender_mean_ratio,amount_to_balance_gap_ratio,sender_activity_intensity
0,13,PAYMENT,25062.87,C1287300038,218829.1,193766.23,M345828317,0.0,0.0,0,...,1.0,0.1145317,25062.87,0,0,1,0,1.0,1.0,0.076923
1,9,PAYMENT,12035.89,C1458652905,21524.0,9488.11,M516109369,0.0,0.0,0,...,1.0,0.5591846,12035.89,0,0,1,0,1.0,1.0,0.111111
2,1,TRANSFER,1147953.82,C1512273713,0.0,0.0,C657736958,4708479.3,5579568.65,0,...,1.0,1147954000000000.0,871089.35,1,0,0,1,1.0,1.317837,1.0
3,9,CASH_OUT,185024.72,C1697166189,463.0,0.0,C1411830789,45453.0,1117043.99,0,...,1.0,399.6214,1071127.99,0,0,0,1,1.0,0.172738,0.111111
4,11,PAYMENT,11511.0,C2056406361,298939.55,287428.54,M1051396154,0.0,0.0,0,...,1.0,0.03850611,11511.01,0,0,1,0,1.0,0.999999,0.090909


### Label encoding

In [18]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["type_encoded"] = le.fit_transform(df["type"])

print("âœ… Encoded transaction type as numeric")
df[["type", "type_encoded"]].head()


âœ… Encoded transaction type as numeric


Unnamed: 0,type,type_encoded
0,PAYMENT,3
1,PAYMENT,3
2,TRANSFER,4
3,CASH_OUT,1
4,PAYMENT,3


### Summary & Export

In [19]:
new_features = [
    col for col in df.columns
    if any(x in col for x in [
        "amount_log", "balance_", "ratio", "orig_", "dest_", "sender_", "tx_",
        "is_", "gap", "interaction", "encoded"
    ])
]

print("ðŸ§© Total new features added:", len(new_features))
print("Sample of engineered columns:\n", new_features[:15])

output_path = "../dataset/paysim_feature_engineered.csv"
df.to_csv(output_path, index=False)
print(f"Feature-engineered dataset saved to: {output_path}")


ðŸ§© Total new features added: 31
Sample of engineered columns:
 ['amount_log', 'orig_balance_change', 'dest_balance_change', 'balance_mismatch_orig', 'balance_mismatch_dest', 'orig_zero_but_amount', 'dest_zero_but_amount', 'orig_balance_ratio', 'dest_balance_ratio', 'sender_amount_mean', 'sender_amount_std', 'sender_amount_max', 'sender_tx_count', 'sender_balance_mismatch_ratio', 'sender_zero_but_ratio']
Feature-engineered dataset saved to: ../dataset/paysim_feature_engineered.csv
