# AML Synthetic Data — Feature Engineering
## Temporal Features | Account/Customer Velocity Roll-ups | Running Balance | Flow Tracking
---

In [2]:
import warnings
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)
OUTPUT_DIR = Path('./outputs')
print('✅ Libraries loaded')

✅ Libraries loaded


## 0. Load Data

In [3]:
# ── B-3: Load Stage-1 Data (from Notebook 2a) ─────────────────────────────────
df = pd.read_parquet(OUTPUT_DIR / 'txns_stage1.parquet')
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Ensure timestamp is datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Sort by timestamp globally (critical for rolling/cumulative ops)
df = df.sort_values('timestamp').reset_index(drop=True)

print(f"Loaded {len(df):,} transactions | {df['customer_id'].nunique():,} customers | {df['sender_account_id'].nunique():,} sender accounts")
df.head(3)

Loaded 429,039 transactions | 4,074 customers | 8,500 sender accounts


Unnamed: 0,transaction_id,customer_id,sender_account_id,receiver_account_id,beneficiary_id,device_id,timestamp,amount,channel,debit_credit,transaction_type,cash_flag,synthetic_flow_id,flow_depth,hop_number,time_since_origin_ts,fraud_type,label,avg_balance,account_open_days,kyc_level,country_risk,income_bracket,customer_risk_rating,pep_flag,occupation,industry,account_type,home_lat,home_lon,home_city,shared_kyc_id,shared_phone_hash,shared_email_hash,device_age_days,rooted_flag,os_type,vpn_flag,emulator_flag,beneficiary_type,beneficiary_country_risk,ip_address,ip_risk_score,geo_lat,geo_lon
0,T400357,C3013,A8276,,B3027,D3593,2025-09-01 00:00:53,6939.94,mobile,debit,UPI,0,,,,NaT,normal,0,19832.46,558,medium,medium,low,low,0,self_employed,unknown,current,28.61,77.21,Delhi,,,,463,0,android,0,0,crypto,high,10.113.106.226,0.05,28.67,77.2
1,T56222,C1559,A1154,A5708,,D2663,2025-09-01 00:02:48,2814.15,web,debit,online_transfer,0,,,,NaT,normal,0,11328.25,411,medium,high,medium,low,0,self_employed,real_estate,savings,26.91,75.79,Jaipur,,,,1038,0,ios,0,0,,,10.227.193.228,0.1,26.94,75.83
2,T312648,C1825,A6455,,B5161,D3054,2025-09-01 00:03:05,1751.31,atm,debit,cash_withdrawal,1,,,,NaT,normal,0,14795.57,1225,low,medium,medium,medium,1,salaried,healthcare,savings,28.61,77.21,Delhi,,,,479,0,android,0,0,individual,low,162.247.211.166,0.8,2.22,155.97


## 1. Temporal Decomposition Features

In [4]:
# ─── 1. Temporal Features ────────────────────────────────────────────────────
df['txn_hour']           = df['timestamp'].dt.hour
df['txn_day_of_week']    = df['timestamp'].dt.dayofweek          # 0=Mon, 6=Sun
df['txn_day_of_month']   = df['timestamp'].dt.day
df['txn_month']          = df['timestamp'].dt.month
df['txn_year']           = df['timestamp'].dt.year
df['txn_quarter']        = df['timestamp'].dt.quarter

# Boolean flags
df['is_weekend']         = df['txn_day_of_week'].isin([5, 6]).astype(int)    # Sat/Sun
df['is_night']           = df['txn_hour'].between(22, 23) | df['txn_hour'].between(0, 5)
df['is_night']           = df['is_night'].astype(int)                        # 22:00–05:59
df['is_business_hours']  = (df['txn_hour'].between(9, 17) & (df['is_weekend'] == 0)).astype(int)  # 09:00–17:59 weekday
df['is_early_morning']   = df['txn_hour'].between(0, 8).astype(int)          # 00:00–08:59

# Date only column (used later for daily resets)
df['txn_date']           = df['timestamp'].dt.date

print("✅ Temporal features created:")
temporal_cols = ['txn_hour','txn_day_of_week','txn_day_of_month','txn_month',
                 'is_weekend','is_night','is_business_hours','is_early_morning']
df[temporal_cols].describe()

✅ Temporal features created:


Unnamed: 0,txn_hour,txn_day_of_week,txn_day_of_month,txn_month,is_weekend,is_night,is_business_hours,is_early_morning
count,429039.0,429039.0,429039.0,429039.0,429039.0,429039.0,429039.0,429039.0
mean,12.74,2.95,15.77,10.54,0.28,0.13,0.43,0.2
std,5.24,2.0,8.78,1.12,0.45,0.33,0.5,0.4
min,0.0,0.0,1.0,9.0,0.0,0.0,0.0,0.0
25%,9.0,1.0,8.0,10.0,0.0,0.0,0.0,0.0
50%,13.0,3.0,16.0,11.0,0.0,0.0,0.0,0.0
75%,17.0,5.0,23.0,12.0,1.0,0.0,1.0,0.0
max,23.0,6.0,31.0,12.0,1.0,1.0,1.0,1.0


## 2. Derive Debit / Credit Flag at Account Level

> **Key Design:** A transaction has a `sender_account_id` and `receiver_account_id`. For **sender**: it's a **debit (outflow)**. For **receiver**: it's a **credit (inflow)**. We melt the table into a long format so each account appears once per transaction side.

In [6]:
# ─── Build account-level event log (debit/credit perspective) ────────────────
# Each transaction generates TWO rows: one for sender (debit), one for receiver (credit)

base_cols = ['transaction_id', 'customer_id', 'timestamp', 'amount',
             'channel', 'transaction_type', 'txn_date',
             'txn_hour', 'is_weekend', 'is_night', 'is_business_hours']

# Sender side → DEBIT (outflow from sender)
sender_df = df[base_cols + ['sender_account_id']].copy()
sender_df.rename(columns={'sender_account_id': 'account_id'}, inplace=True)
sender_df['debit_credit'] = 'debit'
sender_df['signed_amount'] = -df['amount'].values  # outflow is negative

# Receiver side → CREDIT (inflow to receiver)
# ✅ FIX: customer_id is already in base_cols — do NOT add it again
receiver_df = df[base_cols + ['receiver_account_id']].copy()
receiver_df.rename(columns={'receiver_account_id': 'account_id'}, inplace=True)
receiver_df['debit_credit'] = 'credit'
receiver_df['signed_amount'] = df['amount'].values  # inflow is positive

# Create account-level customer mapping (account → customer)
# An account is owned by a customer; use sender mapping as primary
acct_cust_map = df[['sender_account_id', 'customer_id']].drop_duplicates()
acct_cust_map.columns = ['account_id', 'account_owner_customer_id']

# Sanity check before concat
assert sender_df.columns.is_unique, f"Duplicate cols in sender_df: {sender_df.columns[sender_df.columns.duplicated()].tolist()}"
assert receiver_df.columns.is_unique, f"Duplicate cols in receiver_df: {receiver_df.columns[receiver_df.columns.duplicated()].tolist()}"

# Combine
acct_events = pd.concat([sender_df, receiver_df], ignore_index=True)
acct_events = acct_events.sort_values('timestamp').reset_index(drop=True)

# Merge account owner
acct_events = acct_events.merge(acct_cust_map, on='account_id', how='left')

print(f"✅ Account-level event log: {len(acct_events):,} rows")
acct_events[['transaction_id','account_id','account_owner_customer_id','debit_credit','amount','signed_amount','timestamp']].head(6)

✅ Account-level event log: 858,078 rows


Unnamed: 0,transaction_id,account_id,account_owner_customer_id,debit_credit,amount,signed_amount,timestamp
0,T400357,A8276,C3013,debit,6939.94,-6939.94,2025-09-01 00:00:53
1,T400357,,,credit,6939.94,6939.94,2025-09-01 00:00:53
2,T56222,A1154,C1559,debit,2814.15,-2814.15,2025-09-01 00:02:48
3,T56222,A5708,C4667,credit,2814.15,2814.15,2025-09-01 00:02:48
4,T312648,A6455,C1825,debit,1751.31,-1751.31,2025-09-01 00:03:05
5,T312648,,,credit,1751.31,1751.31,2025-09-01 00:03:05


## 3. Account-Level Rolling Velocity Features

Windows: **1hr, 24hr, 7day, 30day** — count of txns + sum of inflow + sum of outflow. **Current transaction is inclusive.**

In [8]:
# ─── Account-Level Rolling Velocity (sorted by account + timestamp) ───────────

def compute_account_rolling_velocity(acct_events: pd.DataFrame) -> pd.DataFrame:
    """
    For each account, compute rolling counts and amounts over 1h/24h/7d/30d windows.
    Current transaction is inclusive (window is [t - W, t]).
    """
    acct_events = acct_events.sort_values(['account_id', 'timestamp']).reset_index(drop=True)
    
    windows = {
        '1h'  : pd.Timedelta(hours=1),
        '24h' : pd.Timedelta(hours=24),
        '7d'  : pd.Timedelta(days=7),
        '30d' : pd.Timedelta(days=30),
    }
    
    n_rows = len(acct_events)
    
    # Pre-allocate result arrays — one slot per row in acct_events
    result_cols = {}
    for label in windows:
        result_cols[f'acct_txn_count_{label}']     = np.zeros(n_rows, dtype=np.int32)
        result_cols[f'acct_inflow_amt_{label}']    = np.zeros(n_rows, dtype=np.float64)
        result_cols[f'acct_outflow_amt_{label}']   = np.zeros(n_rows, dtype=np.float64)
        result_cols[f'acct_inflow_count_{label}']  = np.zeros(n_rows, dtype=np.int32)
        result_cols[f'acct_outflow_count_{label}'] = np.zeros(n_rows, dtype=np.int32)
    
    # Iterate once per account — fill all windows together
    for acct_id, grp in acct_events.groupby('account_id', sort=False):
        grp       = grp.sort_values('timestamp')
        idx_arr   = grp.index.values          # actual row positions in acct_events
        ts_arr    = grp['timestamp'].values
        amt_arr   = grp['amount'].values
        dc_arr    = grp['debit_credit'].values
        n         = len(grp)
        
        for i in range(n):
            t_end = ts_arr[i]
            row_idx = idx_arr[i]              # position to write results into
            
            for label, window in windows.items():
                t_start  = t_end - np.timedelta64(int(window.total_seconds()), 's')
                mask     = (ts_arr >= t_start) & (ts_arr <= t_end)
                in_mask  = mask & (dc_arr == 'credit')
                out_mask = mask & (dc_arr == 'debit')
                
                result_cols[f'acct_txn_count_{label}'][row_idx]     = mask.sum()
                result_cols[f'acct_inflow_amt_{label}'][row_idx]    = amt_arr[in_mask].sum()
                result_cols[f'acct_outflow_amt_{label}'][row_idx]   = amt_arr[out_mask].sum()
                result_cols[f'acct_inflow_count_{label}'][row_idx]  = in_mask.sum()
                result_cols[f'acct_outflow_count_{label}'][row_idx] = out_mask.sum()
    
    for col, vals in result_cols.items():
        acct_events[col] = vals
    
    return acct_events


print("Computing account-level rolling velocities (may take a moment)...")
acct_events = compute_account_rolling_velocity(acct_events)
print("✅ Account-level velocity features done")

acct_vel_cols = [c for c in acct_events.columns if c.startswith('acct_')]
acct_events[['account_id','timestamp','debit_credit','amount'] + acct_vel_cols].head(10)

Computing account-level rolling velocities (may take a moment)...
✅ Account-level velocity features done


Unnamed: 0,account_id,timestamp,debit_credit,amount,acct_txn_count_1h,acct_inflow_amt_1h,acct_outflow_amt_1h,acct_inflow_count_1h,acct_outflow_count_1h,acct_txn_count_24h,acct_inflow_amt_24h,acct_outflow_amt_24h,acct_inflow_count_24h,acct_outflow_count_24h,acct_txn_count_7d,acct_inflow_amt_7d,acct_outflow_amt_7d,acct_inflow_count_7d,acct_outflow_count_7d,acct_txn_count_30d,acct_inflow_amt_30d,acct_outflow_amt_30d,acct_inflow_count_30d,acct_outflow_count_30d
0,A0,2025-09-02 13:07:27,debit,1501.54,1,0.0,1501.54,0,1,1,0.0,1501.54,0,1,1,0.0,1501.54,0,1,1,0.0,1501.54,0,1
1,A0,2025-09-04 10:03:58,credit,8347.69,1,8347.69,0.0,1,0,1,8347.69,0.0,1,0,2,8347.69,1501.54,1,1,2,8347.69,1501.54,1,1
2,A0,2025-09-05 13:14:15,debit,1225.28,1,0.0,1225.28,0,1,1,0.0,1225.28,0,1,3,8347.69,2726.82,1,2,3,8347.69,2726.82,1,2
3,A0,2025-09-08 10:02:27,credit,9877.97,1,9877.97,0.0,1,0,1,9877.97,0.0,1,0,4,18225.66,2726.82,2,2,4,18225.66,2726.82,2,2
4,A0,2025-09-09 02:34:58,debit,161.86,1,0.0,161.86,0,1,2,9877.97,161.86,1,1,5,18225.66,2888.68,2,3,5,18225.66,2888.68,2,3
5,A0,2025-09-09 17:34:06,debit,5029.27,1,0.0,5029.27,0,1,2,0.0,5191.13,0,2,5,18225.66,6416.41,2,3,6,18225.66,7917.95,2,4
6,A0,2025-09-11 09:06:30,debit,5319.92,1,0.0,5319.92,0,1,1,0.0,5319.92,0,1,6,18225.66,11736.33,2,4,7,18225.66,13237.87,2,5
7,A0,2025-09-15 19:23:32,credit,2508.73,1,2508.73,0.0,1,0,1,2508.73,0.0,1,0,4,2508.73,10511.05,1,3,8,20734.39,13237.87,3,5
8,A0,2025-09-16 15:22:25,credit,4758.61,1,4758.61,0.0,1,0,2,7267.34,0.0,2,0,4,7267.34,10349.19,2,2,9,25493.0,13237.87,4,5
9,A0,2025-09-18 01:12:51,debit,533.87,1,0.0,533.87,0,1,1,0.0,533.87,0,1,4,7267.34,5853.79,2,2,10,25493.0,13771.74,4,6


## 4. Customer-Level Rolling Velocity Features

> A customer can have 2-3 accounts. Any txn across any of their accounts (debit OR credit) counts as a customer-level txn.

In [9]:
# ─── Customer-Level Rolling Velocity ─────────────────────────────────────────
# Use account_owner_customer_id for receiver side if available, else customer_id
# Fill nulls with customer_id (sender's customer)
acct_events['cust_id_for_rollup'] = acct_events['account_owner_customer_id'].fillna(acct_events['customer_id'])

def compute_customer_rolling_velocity(acct_events: pd.DataFrame) -> pd.DataFrame:
    """
    For each customer, compute rolling counts and amounts over 1h/24h/7d/30d windows
    across ALL accounts they own.
    """
    acct_sorted = acct_events.sort_values(['cust_id_for_rollup', 'timestamp']).reset_index(drop=True)
    
    windows = {
        '1h'  : pd.Timedelta(hours=1),
        '24h' : pd.Timedelta(hours=24),
        '7d'  : pd.Timedelta(days=7),
        '30d' : pd.Timedelta(days=30),
    }
    
    result_cols = {k: [] for k in [
        *[f'cust_txn_count_{w}' for w in windows],
        *[f'cust_inflow_amt_{w}' for w in windows],
        *[f'cust_outflow_amt_{w}' for w in windows],
        *[f'cust_inflow_count_{w}' for w in windows],
        *[f'cust_outflow_count_{w}' for w in windows],
    ]}
    
    for cust_id, grp in acct_sorted.groupby('cust_id_for_rollup', sort=False):
        grp = grp.sort_values('timestamp')
        ts_arr  = grp['timestamp'].values
        amt_arr = grp['amount'].values
        dc_arr  = grp['debit_credit'].values
        n = len(grp)
        
        for label, window in windows.items():
            txn_count_list     = []
            inflow_amt_list    = []
            outflow_amt_list   = []
            inflow_count_list  = []
            outflow_count_list = []
            
            for i in range(n):
                t_end   = ts_arr[i]
                t_start = t_end - np.timedelta64(int(window.total_seconds()), 's')
                mask     = (ts_arr >= t_start) & (ts_arr <= t_end)
                in_mask  = mask & (dc_arr == 'credit')
                out_mask = mask & (dc_arr == 'debit')
                
                txn_count_list.append(mask.sum())
                inflow_amt_list.append(amt_arr[in_mask].sum())
                outflow_amt_list.append(amt_arr[out_mask].sum())
                inflow_count_list.append(in_mask.sum())
                outflow_count_list.append(out_mask.sum())
            
            result_cols[f'cust_txn_count_{label}'].extend(txn_count_list)
            result_cols[f'cust_inflow_amt_{label}'].extend(inflow_amt_list)
            result_cols[f'cust_outflow_amt_{label}'].extend(outflow_amt_list)
            result_cols[f'cust_inflow_count_{label}'].extend(inflow_count_list)
            result_cols[f'cust_outflow_count_{label}'].extend(outflow_count_list)
    
    for col, vals in result_cols.items():
        acct_sorted[col] = vals
    
    return acct_sorted


print("Computing customer-level rolling velocities...")
acct_events = compute_customer_rolling_velocity(acct_events)
print("✅ Customer-level velocity features done")

cust_vel_cols = [c for c in acct_events.columns if c.startswith('cust_')]
acct_events[['cust_id_for_rollup','account_id','timestamp','debit_credit','amount'] + cust_vel_cols].head(10)

Computing customer-level rolling velocities...
✅ Customer-level velocity features done


Unnamed: 0,cust_id_for_rollup,account_id,timestamp,debit_credit,amount,cust_id_for_rollup.1,cust_txn_count_1h,cust_txn_count_24h,cust_txn_count_7d,cust_txn_count_30d,cust_inflow_amt_1h,cust_inflow_amt_24h,cust_inflow_amt_7d,cust_inflow_amt_30d,cust_outflow_amt_1h,cust_outflow_amt_24h,cust_outflow_amt_7d,cust_outflow_amt_30d,cust_inflow_count_1h,cust_inflow_count_24h,cust_inflow_count_7d,cust_inflow_count_30d,cust_outflow_count_1h,cust_outflow_count_24h,cust_outflow_count_7d,cust_outflow_count_30d
0,C0,A6471,2025-09-01 14:01:04,debit,7011.86,C0,2,2,2,2,7011.86,7011.86,7011.86,7011.86,7011.86,7011.86,7011.86,7011.86,1,1,1,1,1,1,1,1
1,C0,,2025-09-01 14:01:04,credit,7011.86,C0,2,2,2,2,7011.86,7011.86,7011.86,7011.86,7011.86,7011.86,7011.86,7011.86,1,1,1,1,1,1,1,1
2,C0,A1507,2025-09-02 09:42:26,debit,3093.91,C0,1,3,3,3,0.0,7011.86,7011.86,7011.86,3093.91,10105.77,10105.77,10105.77,0,1,1,1,1,2,2,2
3,C0,A6471,2025-09-03 13:38:26,debit,3431.29,C0,2,2,5,5,3431.29,3431.29,10443.15,10443.15,3431.29,3431.29,13537.06,13537.06,1,1,2,2,1,1,3,3
4,C0,,2025-09-03 13:38:26,credit,3431.29,C0,2,2,5,5,3431.29,3431.29,10443.15,10443.15,3431.29,3431.29,13537.06,13537.06,1,1,2,2,1,1,3,3
5,C0,A1507,2025-09-04 01:42:35,debit,13628.43,C0,2,4,7,7,13628.43,17059.72,24071.58,24071.58,13628.43,17059.72,27165.49,27165.49,1,2,3,3,1,2,4,4
6,C0,,2025-09-04 01:42:35,credit,13628.43,C0,2,4,7,7,13628.43,17059.72,24071.58,24071.58,13628.43,17059.72,27165.49,27165.49,1,2,3,3,1,2,4,4
7,C0,A6471,2025-09-06 22:10:02,debit,1534.3,C0,1,1,8,8,0.0,0.0,24071.58,24071.58,1534.3,1534.3,28699.79,28699.79,0,0,3,3,1,1,5,5
8,C0,A6471,2025-09-07 10:40:17,debit,2029.16,C0,2,3,10,10,2029.16,2029.16,26100.74,26100.74,2029.16,3563.46,30728.95,30728.95,1,1,4,4,1,2,6,6
9,C0,,2025-09-07 10:40:17,credit,2029.16,C0,2,3,10,10,2029.16,2029.16,26100.74,26100.74,2029.16,3563.46,30728.95,30728.95,1,1,4,4,1,2,6,6


## 5. Running Balance Features

| Column | Description |
|--------|-------------|
| `balance_before_txn` | Balance just before the current transaction |
| `running_balance_txn_amount` | Signed amount (+credit / –debit) |
| `balance_after_txn` | balance_before + running_balance_txn_amount |
| `current_balance` | Latest known balance (clamped ≥ 0) |
| `bal_ratio` | balance_after_txn / current_balance |
| `cumulative_daily_balance_change` | Net balance change since start of the day (resets each day per account) |

In [47]:
# ─── Running Balance (per account, sorted by timestamp) ──────────────────────

def compute_running_balance(acct_events: pd.DataFrame) -> pd.DataFrame:
    """
    Compute running balance per account — no capping, no floor, all values as-is.
    Starting balance is seeded high enough that normal outflows don't immediately
    push the account into deep negatives from transaction 1.

    Seed logic per account:
      seed = max(avg_balance, max_single_outflow_for_account × 10, GLOBAL_MIN_SEED)
    """

    # ── Build avg_balance map from BOTH sender AND receiver ───────────────────
    sender_map   = df.groupby('sender_account_id')['avg_balance'].first().to_dict()
    receiver_map = df.groupby('receiver_account_id')['avg_balance'].first().to_dict()
    combined_bal_map = {**receiver_map, **sender_map}  # sender takes priority

    # ── Build per-account max outflow map ─────────────────────────────────────
    # For each account, find the single largest debit amount it ever sends.
    # We seed the balance at 10× this value so even the biggest outflow
    # leaves the account with a positive balance on day 1.
    max_outflow_map = (
        df.groupby('sender_account_id')['amount'].max().to_dict()
    )

    # Global minimum seed — no account starts below this regardless of avg_balance
    # Set to 10× the 95th percentile of all transaction amounts in the dataset
    p95_amount   = float(np.percentile(df['amount'].values, 95))
    GLOBAL_MIN_SEED = p95_amount * 10

    print(f"   p95 transaction amount : ₹{p95_amount:,.2f}")
    print(f"   GLOBAL_MIN_SEED        : ₹{GLOBAL_MIN_SEED:,.2f}")

    def get_seed(acct_id):
        avg_bal     = float(combined_bal_map.get(acct_id, GLOBAL_MIN_SEED))
        max_out     = float(max_outflow_map.get(acct_id, 0))
        # Seed = highest of: avg_balance, 10× max single outflow, global min
        return max(avg_bal, max_out * 10, GLOBAL_MIN_SEED)

    acct_events = acct_events.sort_values(['account_id', 'timestamp']).reset_index(drop=True)
    n_rows = len(acct_events)

    # Pre-allocate result arrays
    balance_before_arr = np.zeros(n_rows, dtype=np.float64)
    signed_amt_arr     = np.zeros(n_rows, dtype=np.float64)
    balance_after_arr  = np.zeros(n_rows, dtype=np.float64)
    cum_daily_arr      = np.zeros(n_rows, dtype=np.float64)

    for acct_id, grp in acct_events.groupby('account_id', sort=False):
        grp      = grp.sort_values('timestamp')
        idx_arr  = grp.index.values
        amt_arr  = grp['amount'].values
        dc_arr   = grp['debit_credit'].values
        date_arr = grp['txn_date'].values

        start_bal     = get_seed(acct_id)
        current_bal   = start_bal
        day_start_bal = start_bal
        current_date  = None

        for i in range(len(grp)):
            row_idx  = idx_arr[i]
            txn_date = date_arr[i]
            signed   = amt_arr[i] if dc_arr[i] == 'credit' else -amt_arr[i]

            if txn_date != current_date:
                day_start_bal = current_bal
                current_date  = txn_date

            bal_before = current_bal           # no capping
            bal_after  = current_bal + signed  # pure arithmetic — no floor, no clip

            cum_daily  = bal_after - day_start_bal

            balance_before_arr[row_idx] = bal_before
            signed_amt_arr[row_idx]     = signed
            balance_after_arr[row_idx]  = bal_after
            cum_daily_arr[row_idx]      = cum_daily

            current_bal = bal_after

    acct_events['balance_before_txn']             = balance_before_arr
    acct_events['running_balance_txn_amount']     = signed_amt_arr
    acct_events['balance_after_txn']              = balance_after_arr
    acct_events['cumulative_daily_balance_change'] = cum_daily_arr

    # current_balance = last known balance_after_txn — no clip, no abs
    latest_bal_map = (
        acct_events.sort_values('timestamp')
        .groupby('account_id')['balance_after_txn']
        .last()
        .to_dict()
    )
    acct_events['current_balance'] = acct_events['account_id'].map(latest_bal_map)

    # Balance ratio — guard div/0 only
    acct_events['bal_ratio_after_to_current'] = np.where(
        acct_events['current_balance'] != 0,
        acct_events['balance_after_txn'] / acct_events['current_balance'],
        np.nan
    )

    return acct_events


print("Computing running balances...")
acct_events = compute_running_balance(acct_events)
print("✅ Running balance features done")


# ─── Split into sender_ and receiver_ prefixed columns ───────────────────────

bal_feature_cols = [
    'balance_before_txn',
    'running_balance_txn_amount',
    'balance_after_txn',
    'cumulative_daily_balance_change',
    'current_balance',
    'bal_ratio_after_to_current',
]

sender_bal = (
    acct_events[acct_events['debit_credit'] == 'debit']
    [['transaction_id', 'account_id'] + bal_feature_cols]
    .rename(columns={c: f'sender_{c}' for c in bal_feature_cols})
    .rename(columns={'account_id': 'sender_account_id'})
)

receiver_bal = (
    acct_events[acct_events['debit_credit'] == 'credit']
    [['transaction_id', 'account_id'] + bal_feature_cols]
    .rename(columns={c: f'receiver_{c}' for c in bal_feature_cols})
    .rename(columns={'account_id': 'receiver_account_id'})
)

# Drop any previously merged balance columns before re-merging
cols_to_drop = [c for c in df.columns if any(x in c for x in [
    'sender_balance_before', 'sender_running_balance', 'sender_balance_after',
    'sender_current_balance', 'sender_bal_ratio', 'sender_cumulative_daily',
    'receiver_balance_before', 'receiver_running_balance', 'receiver_balance_after',
    'receiver_current_balance', 'receiver_bal_ratio', 'receiver_cumulative_daily',
])]
df = df.drop(columns=cols_to_drop, errors='ignore')

df = df.merge(sender_bal,   on=['transaction_id', 'sender_account_id'],   how='left')
df = df.merge(receiver_bal, on=['transaction_id', 'receiver_account_id'], how='left')

print(f"✅ Sender & receiver balance columns added to df: {df.shape}")


# ─── Verification ─────────────────────────────────────────────────────────────
sample_acct = acct_events['account_id'].iloc[0]
sample = (acct_events[acct_events['account_id'] == sample_acct]
          .sort_values('timestamp').reset_index(drop=True))

print(f"\nSample balance chain for account: {sample_acct}")
print(sample[['timestamp', 'debit_credit', 'amount',
              'balance_before_txn', 'running_balance_txn_amount',
              'balance_after_txn', 'cumulative_daily_balance_change']].head(10).to_string())

# Chain integrity
breaks = sum(
    abs(sample.loc[i, 'balance_after_txn'] - sample.loc[i+1, 'balance_before_txn']) > 0.01
    for i in range(len(sample) - 1)
)
print(f"\nBalance chain breaks (expect 0): {breaks}")

# Balance distribution summary
print(f"\nBalance distribution (balance_after_txn):")
print(f"  Min    : ₹{acct_events['balance_after_txn'].min():>15,.2f}")
print(f"  p1     : ₹{np.percentile(acct_events['balance_after_txn'], 1):>15,.2f}")
print(f"  Median : ₹{acct_events['balance_after_txn'].median():>15,.2f}")
print(f"  p99    : ₹{np.percentile(acct_events['balance_after_txn'], 99):>15,.2f}")
print(f"  Max    : ₹{acct_events['balance_after_txn'].max():>15,.2f}")
print(f"\nRows with negative balance_after_txn : {(acct_events['balance_after_txn'] < 0).sum():,}")
print(f"Rows with balance_before_txn == 0    : {(acct_events['balance_before_txn'] == 0).sum():,}")

Computing running balances...
   p95 transaction amount : ₹38,090.26
   GLOBAL_MIN_SEED        : ₹380,902.60
✅ Running balance features done
✅ Sender & receiver balance columns added to df: (429039, 68)

Sample balance chain for account: A0
            timestamp debit_credit  amount  balance_before_txn  running_balance_txn_amount  balance_after_txn  cumulative_daily_balance_change
0 2025-09-02 13:07:27        debit 1501.54           380902.60                    -1501.54          379401.06                         -1501.54
1 2025-09-04 10:03:58       credit 8347.69           379401.06                     8347.69          387748.75                          8347.69
2 2025-09-05 13:14:15        debit 1225.28           387748.75                    -1225.28          386523.47                         -1225.28
3 2025-09-08 10:02:27       credit 9877.97           386523.47                     9877.97          396401.44                          9877.97
4 2025-09-09 02:34:58        debit  161.86  

## 6. Flow Tracking Features

These features (`synthetic_flow_id`, `flow_depth`, `hop_number`, `time_since_origin_ts`) are already injected in the raw data (per spec). We pass them through and compute `time_since_origin_ts` for rows where it's missing.

In [48]:
# ─── Flow Tracking — Carry through + compute time_since_origin_ts ────────────
# synthetic_flow_id, flow_depth, hop_number already exist in raw data
# time_since_origin_ts: seconds since hop_number==1 for that flow

flow_df = df[['transaction_id', 'synthetic_flow_id', 'flow_depth',
              'hop_number', 'time_since_origin_ts', 'timestamp']].copy()

# For rows with a valid flow, compute origin_timestamp from hop_number == 1
# If time_since_origin_ts is already populated, trust it; else derive it.
flow_rows = flow_df[flow_df['synthetic_flow_id'].notna()].copy()

# Get origin timestamp for each flow (hop_number == 1)
origin_ts = (
    flow_rows[flow_rows['hop_number'] == 1]
    .groupby('synthetic_flow_id')['timestamp']
    .first()
    .rename('origin_ts')
)
flow_rows = flow_rows.merge(origin_ts, on='synthetic_flow_id', how='left')

# Compute derived time_since_origin (seconds)
flow_rows['time_since_origin_derived'] = (
    (flow_rows['timestamp'] - flow_rows['origin_ts'])
    .dt.total_seconds()
    .fillna(0)
    .astype(int)
)

# Fill missing time_since_origin_ts from raw with derived
flow_rows['time_since_origin_ts'] = flow_rows['time_since_origin_ts'].fillna(
    flow_rows['time_since_origin_derived']
)

# Merge back to main df
df = df.merge(
    flow_rows[['transaction_id', 'time_since_origin_ts']].rename(
        columns={'time_since_origin_ts': 'time_since_origin_ts_final'}
    ),
    on='transaction_id', how='left'
)
df['time_since_origin_ts'] = df['time_since_origin_ts_final'].fillna(df['time_since_origin_ts'])
df.drop(columns=['time_since_origin_ts_final'], inplace=True)

print("✅ Flow tracking features verified/computed")
flow_summary = df[df['synthetic_flow_id'].notna()][['synthetic_flow_id','flow_depth','hop_number','time_since_origin_ts']]
print(f"   Flow transactions: {len(flow_summary):,}")
flow_summary.head(10)

✅ Flow tracking features verified/computed
   Flow transactions: 9,712


Unnamed: 0,synthetic_flow_id,flow_depth,hop_number,time_since_origin_ts
203279,FLOW_00047,10.0,3.0,2025-12-19 13:37:54
203281,FLOW_00030,10.0,2.0,2025-11-04 20:25:19
203282,FLOW_00030,10.0,2.0,2025-11-04 20:25:19
203283,FLOW_00008,10.0,9.0,2025-12-21 12:43:23
203296,FLOW_00012,10.0,10.0,2025-12-12 01:37:51
203307,FLOW_00004,10.0,8.0,2025-11-30 02:06:08
203309,FLOW_00054,10.0,6.0,2025-11-04 18:27:04
203310,FLOW_00054,10.0,6.0,2025-11-04 18:27:04
203311,FLOW_00003,10.0,6.0,2025-12-25 01:50:28
203312,FLOW_00059,10.0,10.0,2025-11-13 23:42:37


## 7. Merge All Features Back to Transaction-Level

Aggregate account-level features back to the **sender** and **receiver** perspective on the original transaction.

In [49]:
# ─── Merge back to transaction-level ─────────────────────────────────────────

# ✅ FIX: Drop any previously merged sender_/receiver_ balance + velocity columns
# from df before re-merging to avoid duplicate column conflicts.
cols_to_drop = [c for c in df.columns if any(x in c for x in [
    'sender_balance_before', 'sender_running_balance', 'sender_balance_after',
    'sender_current_balance', 'sender_bal_ratio', 'sender_cumulative_daily',
    'receiver_balance_before', 'receiver_running_balance', 'receiver_balance_after',
    'receiver_current_balance', 'receiver_bal_ratio', 'receiver_cumulative_daily',
    'sender_acct_txn_count', 'sender_acct_inflow', 'sender_acct_outflow',
    'sender_cust_txn_count', 'sender_cust_inflow', 'sender_cust_outflow',
    'receiver_acct_txn_count', 'receiver_acct_inflow', 'receiver_acct_outflow',
])]
df = df.drop(columns=cols_to_drop, errors='ignore')
print(f"Dropped {len(cols_to_drop)} previously merged columns from df")

# ── Sender-side features (debit events) ──────────────────────────────────────
sender_feats = acct_events[acct_events['debit_credit'] == 'debit'].copy()
sender_feats = sender_feats.rename(columns={
    col: f'sender_{col}' for col in sender_feats.columns
    if col not in ['transaction_id', 'timestamp', 'account_id', 'customer_id',
                   'amount', 'debit_credit', 'txn_date']
})
sender_feats = sender_feats.rename(columns={'account_id': 'sender_account_id'})

# ── Receiver-side features (credit events) ────────────────────────────────────
receiver_feats = acct_events[acct_events['debit_credit'] == 'credit'].copy()
receiver_feats = receiver_feats.rename(columns={
    col: f'receiver_{col}' for col in receiver_feats.columns
    if col not in ['transaction_id', 'timestamp', 'account_id', 'customer_id',
                   'amount', 'debit_credit', 'txn_date']
})
receiver_feats = receiver_feats.rename(columns={'account_id': 'receiver_account_id'})

# ── Select columns to merge ───────────────────────────────────────────────────
SENDER_KEYWORDS   = ['acct_txn_count', 'acct_inflow', 'acct_outflow',
                     'cust_txn_count', 'cust_inflow', 'cust_outflow',
                     'balance_before', 'running_balance_txn', 'balance_after',
                     'current_balance', 'bal_ratio', 'cumulative_daily']

RECEIVER_KEYWORDS = ['acct_txn_count', 'acct_inflow', 'acct_outflow',
                     'balance_before', 'running_balance_txn', 'balance_after',
                     'current_balance', 'bal_ratio', 'cumulative_daily']

sender_merge_cols = ['transaction_id', 'sender_account_id'] + [
    c for c in sender_feats.columns
    if any(x in c for x in SENDER_KEYWORDS)
]

receiver_merge_cols = ['transaction_id', 'receiver_account_id'] + [
    c for c in receiver_feats.columns
    if any(x in c for x in RECEIVER_KEYWORDS)
]

# ✅ Deduplicate merge col lists (safety guard)
sender_merge_cols   = list(dict.fromkeys(sender_merge_cols))
receiver_merge_cols = list(dict.fromkeys(receiver_merge_cols))

# ── Final merge ───────────────────────────────────────────────────────────────
final_df = df.merge(
    sender_feats[[c for c in sender_merge_cols if c in sender_feats.columns]],
    on=['transaction_id', 'sender_account_id'],
    how='left',
    validate='many_to_one'   # each transaction should match at most one sender row
)
final_df = final_df.merge(
    receiver_feats[[c for c in receiver_merge_cols if c in receiver_feats.columns]],
    on=['transaction_id', 'receiver_account_id'],
    how='left',
    validate='many_to_one'
)

print(f"✅ Final feature set: {final_df.shape[1]} columns x {len(final_df):,} rows")
print("\nColumn list:")
print(list(final_df.columns))

Dropped 12 previously merged columns from df
✅ Final feature set: 128 columns x 429,039 rows

Column list:
['transaction_id', 'customer_id', 'sender_account_id', 'receiver_account_id', 'beneficiary_id', 'device_id', 'timestamp', 'amount', 'channel', 'debit_credit', 'transaction_type', 'cash_flag', 'synthetic_flow_id', 'flow_depth', 'hop_number', 'time_since_origin_ts', 'fraud_type', 'label', 'avg_balance', 'account_open_days', 'kyc_level', 'country_risk', 'income_bracket', 'customer_risk_rating', 'pep_flag', 'occupation', 'industry', 'account_type', 'home_lat', 'home_lon', 'home_city', 'shared_kyc_id', 'shared_phone_hash', 'shared_email_hash', 'device_age_days', 'rooted_flag', 'os_type', 'vpn_flag', 'emulator_flag', 'beneficiary_type', 'beneficiary_country_risk', 'ip_address', 'ip_risk_score', 'geo_lat', 'geo_lon', 'txn_hour', 'txn_day_of_week', 'txn_day_of_month', 'txn_month', 'txn_year', 'txn_quarter', 'is_weekend', 'is_night', 'is_business_hours', 'is_early_morning', 'txn_date', 'se

## 8. Test Cases & Validation

In [28]:
# ═══════════════════════════════════════════════════════════════════
# TEST CASE 1: Temporal Features Sanity Check
# ═══════════════════════════════════════════════════════════════════
print("=" * 60)
print("TEST 1: Temporal Feature Sanity")
print("=" * 60)

# All hours 0-23
assert df['txn_hour'].between(0, 23).all(), "FAIL: txn_hour out of range"

# Night = hour in {22,23,0,1,2,3,4,5}
night_mask = df['is_night'] == 1
night_hours_ok = df.loc[night_mask, 'txn_hour'].isin([22, 23, 0, 1, 2, 3, 4, 5]).all()
assert night_hours_ok, "FAIL: night flag mismatch"

# Business hours only on weekdays
biz_mask = df['is_business_hours'] == 1
assert (df.loc[biz_mask, 'is_weekend'] == 0).all(), "FAIL: business_hours on weekend"
assert df.loc[biz_mask, 'txn_hour'].between(9, 17).all(), "FAIL: business_hours wrong hours"

# No overlap between night and business hours
overlap = ((df['is_night'] == 1) & (df['is_business_hours'] == 1)).sum()
assert overlap == 0, f"FAIL: {overlap} rows have both night and business_hours = 1"

print("✅ TEST 1 PASSED: All temporal feature checks OK")
print(f"   Night txns: {night_mask.sum():,} | Business hr txns: {biz_mask.sum():,}")

TEST 1: Temporal Feature Sanity
✅ TEST 1 PASSED: All temporal feature checks OK
   Night txns: 53,981 | Business hr txns: 185,317


In [29]:
# ═══════════════════════════════════════════════════════════════════
# TEST CASE 2: Velocity Roll-up Validation
# Pick a random account and verify 24h count manually
# ═══════════════════════════════════════════════════════════════════
print("=" * 60)
print("TEST 2: Account Velocity Roll-up Manual Verification")
print("=" * 60)

# Pick an account with enough transactions
busy_accts = acct_events.groupby('account_id').size()
test_acct  = busy_accts[busy_accts >= 5].index[0]
acct_slice = acct_events[acct_events['account_id'] == test_acct].sort_values('timestamp')

# For the 5th transaction, manually compute 24h count
ref_row = acct_slice.iloc[4]  # 5th event
ref_ts  = ref_row['timestamp']
window  = pd.Timedelta(hours=24)

manual_mask  = (acct_slice['timestamp'] >= ref_ts - window) & (acct_slice['timestamp'] <= ref_ts)
manual_count = manual_mask.sum()
feature_val  = int(ref_row['acct_txn_count_24h'])

print(f"   Account: {test_acct}")
print(f"   Reference timestamp: {ref_ts}")
print(f"   Manual 24h count: {manual_count} | Feature value: {feature_val}")
assert manual_count == feature_val, f"FAIL: count mismatch ({manual_count} vs {feature_val})"
print("✅ TEST 2 PASSED: 24h account velocity matches manual calculation")

# Verify inflow/outflow split
manual_inflow  = acct_slice.loc[manual_mask & (acct_slice['debit_credit'] == 'credit'), 'amount'].sum()
manual_outflow = acct_slice.loc[manual_mask & (acct_slice['debit_credit'] == 'debit'),  'amount'].sum()
feat_inflow  = ref_row['acct_inflow_amt_24h']
feat_outflow = ref_row['acct_outflow_amt_24h']

print(f"   Manual inflow: {manual_inflow:.2f} | Feature: {feat_inflow:.2f}")
print(f"   Manual outflow: {manual_outflow:.2f} | Feature: {feat_outflow:.2f}")
assert abs(manual_inflow - feat_inflow) < 0.01, "FAIL: inflow amt mismatch"
assert abs(manual_outflow - feat_outflow) < 0.01, "FAIL: outflow amt mismatch"
print("✅ TEST 2b PASSED: Inflow/Outflow amounts match")

TEST 2: Account Velocity Roll-up Manual Verification
   Account: A0
   Reference timestamp: 2025-09-09 02:34:58
   Manual 24h count: 2 | Feature value: 2
✅ TEST 2 PASSED: 24h account velocity matches manual calculation
   Manual inflow: 9877.97 | Feature: 9877.97
   Manual outflow: 161.86 | Feature: 161.86
✅ TEST 2b PASSED: Inflow/Outflow amounts match


In [30]:
# ═══════════════════════════════════════════════════════════════════
# TEST CASE 3: Running Balance Validation
# ═══════════════════════════════════════════════════════════════════
print("=" * 60)
print("TEST 3: Running Balance Validation")
print("=" * 60)

# For same test account, check balance chain
bal_slice = acct_events[acct_events['account_id'] == test_acct].sort_values('timestamp').reset_index(drop=True)
bal_view = bal_slice[['timestamp','debit_credit','amount','balance_before_txn',
                       'running_balance_txn_amount','balance_after_txn',
                       'cumulative_daily_balance_change']].head(10)
print(bal_view.to_string())

# Check: balance_before[i+1] == balance_after[i] (same day consecutive)
errors = 0
for i in range(len(bal_slice) - 1):
    after_i  = bal_slice.loc[i, 'balance_after_txn']
    before_next = bal_slice.loc[i + 1, 'balance_before_txn']
    # same day check: after_i should equal before_next if consecutive
    if abs(after_i - before_next) > 0.01:
        errors += 1
        if errors <= 3:
            print(f"   ⚠️  Row {i}: balance_after={after_i:.2f}, next balance_before={before_next:.2f}")

if errors == 0:
    print("✅ TEST 3 PASSED: Balance chain is consistent (balance_after[t] = balance_before[t+1])")
else:
    print(f"   ⚠️  {errors} balance chain breaks (check for negative clamping)")

# Check no negative balances
neg_bal = (acct_events['balance_after_txn'] < 0).sum()
assert neg_bal == 0, f"FAIL: {neg_bal} rows with negative balance_after_txn"
print("✅ TEST 3b PASSED: No negative balances after clamping")

TEST 3: Running Balance Validation
            timestamp debit_credit  amount  balance_before_txn  running_balance_txn_amount  balance_after_txn  cumulative_daily_balance_change
0 2025-09-02 13:07:27        debit 1501.54            11418.49                    -1501.54            9916.95                         -1501.54
1 2025-09-04 10:03:58       credit 8347.69             9916.95                     8347.69           18264.64                          8347.69
2 2025-09-05 13:14:15        debit 1225.28            18264.64                    -1225.28           17039.36                         -1225.28
3 2025-09-08 10:02:27       credit 9877.97            17039.36                     9877.97           26917.33                          9877.97
4 2025-09-09 02:34:58        debit  161.86            26917.33                     -161.86           26755.47                          -161.86
5 2025-09-09 17:34:06        debit 5029.27            26755.47                    -5029.27           21726.

In [31]:
# ═══════════════════════════════════════════════════════════════════
# TEST CASE 4: Customer-level velocity — cross-account aggregation
# ═══════════════════════════════════════════════════════════════════
print("=" * 60)
print("TEST 4: Customer Velocity — Cross-Account Aggregation")
print("=" * 60)

# Find a customer with multiple accounts
multi_acct_custs = acct_cust_map.groupby('account_owner_customer_id')['account_id'].count()
if len(multi_acct_custs[multi_acct_custs > 1]) > 0:
    test_cust = multi_acct_custs[multi_acct_custs > 1].index[0]
    cust_accts = acct_cust_map[acct_cust_map['account_owner_customer_id'] == test_cust]['account_id'].tolist()
    print(f"   Customer {test_cust} owns accounts: {cust_accts}")
    
    cust_slice = acct_events[acct_events['cust_id_for_rollup'] == test_cust].sort_values('timestamp')
    ref_row    = cust_slice.iloc[-1]  # last event
    ref_ts     = ref_row['timestamp']
    
    manual_24h = ((cust_slice['timestamp'] >= ref_ts - pd.Timedelta(hours=24)) &
                  (cust_slice['timestamp'] <= ref_ts)).sum()
    feat_24h   = int(ref_row['cust_txn_count_24h'])
    
    print(f"   Manual 24h cust txn count: {manual_24h} | Feature: {feat_24h}")
    assert manual_24h == feat_24h, f"FAIL: customer 24h count mismatch"
    print("✅ TEST 4 PASSED: Customer-level cross-account velocity is correct")
else:
    print("   (No multi-account customers found in dataset — test skipped)")

TEST 4: Customer Velocity — Cross-Account Aggregation
   Customer C0 owns accounts: ['A6471', 'A1507']
   Manual 24h cust txn count: 4 | Feature: 4
✅ TEST 4 PASSED: Customer-level cross-account velocity is correct


In [32]:
# ═══════════════════════════════════════════════════════════════════
# TEST CASE 5: Inclusive window check (1hr edge case)
# ═══════════════════════════════════════════════════════════════════
print("=" * 60)
print("TEST 5: Velocity Window Inclusivity")
print("=" * 60)

# For any account with at least 2 transactions, first txn 1h count should be >=1
first_txns = acct_events.sort_values('timestamp').groupby('account_id').first()
assert (first_txns['acct_txn_count_1h'] >= 1).all(), "FAIL: some accounts show 0 count for own first txn"
print("✅ TEST 5 PASSED: All first transactions count themselves in 1h window")

# Summary statistics
print("\n📊 Velocity Feature Summary:")
vel_cols = ['acct_txn_count_1h','acct_txn_count_24h','acct_txn_count_7d','acct_txn_count_30d',
            'cust_txn_count_1h','cust_txn_count_24h','cust_txn_count_7d','cust_txn_count_30d']
acct_events[[c for c in vel_cols if c in acct_events.columns]].describe()

TEST 5: Velocity Window Inclusivity
✅ TEST 5 PASSED: All first transactions count themselves in 1h window

📊 Velocity Feature Summary:


Unnamed: 0,acct_txn_count_1h,acct_txn_count_24h,acct_txn_count_7d,acct_txn_count_30d,cust_txn_count_1h,cust_txn_count_24h,cust_txn_count_7d,cust_txn_count_30d
count,858078.0,858078.0,858078.0,858078.0,858078.0,858078.0,858078.0,858078.0
mean,0.74,1.13,3.5,11.39,1.78,3.96,17.2,61.35
std,0.54,1.03,2.97,9.18,0.77,2.45,9.8,37.42
min,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,1.0,2.0,10.0,32.0
50%,1.0,1.0,4.0,14.0,2.0,3.0,16.0,55.0
75%,1.0,2.0,6.0,19.0,2.0,5.0,23.0,83.0
max,6.0,11.0,28.0,44.0,13.0,27.0,75.0,248.0


## 9. Save Output

In [50]:
final_df.head(2)

Unnamed: 0,transaction_id,customer_id,sender_account_id,receiver_account_id,beneficiary_id,device_id,timestamp,amount,channel,debit_credit,transaction_type,cash_flag,synthetic_flow_id,flow_depth,hop_number,time_since_origin_ts,fraud_type,label,avg_balance,account_open_days,kyc_level,country_risk,income_bracket,customer_risk_rating,pep_flag,occupation,industry,account_type,home_lat,home_lon,home_city,shared_kyc_id,shared_phone_hash,shared_email_hash,device_age_days,rooted_flag,os_type,vpn_flag,emulator_flag,beneficiary_type,beneficiary_country_risk,ip_address,ip_risk_score,geo_lat,geo_lon,txn_hour,txn_day_of_week,txn_day_of_month,txn_month,txn_year,txn_quarter,is_weekend,is_night,is_business_hours,is_early_morning,txn_date,sender_acct_txn_count_1h,sender_acct_inflow_amt_1h,sender_acct_outflow_amt_1h,sender_acct_inflow_count_1h,sender_acct_outflow_count_1h,sender_acct_txn_count_24h,sender_acct_inflow_amt_24h,sender_acct_outflow_amt_24h,sender_acct_inflow_count_24h,sender_acct_outflow_count_24h,sender_acct_txn_count_7d,sender_acct_inflow_amt_7d,sender_acct_outflow_amt_7d,sender_acct_inflow_count_7d,sender_acct_outflow_count_7d,sender_acct_txn_count_30d,sender_acct_inflow_amt_30d,sender_acct_outflow_amt_30d,sender_acct_inflow_count_30d,sender_acct_outflow_count_30d,sender_cust_txn_count_1h,sender_cust_txn_count_24h,sender_cust_txn_count_7d,sender_cust_txn_count_30d,sender_cust_inflow_amt_1h,sender_cust_inflow_amt_24h,sender_cust_inflow_amt_7d,sender_cust_inflow_amt_30d,sender_cust_outflow_amt_1h,sender_cust_outflow_amt_24h,sender_cust_outflow_amt_7d,sender_cust_outflow_amt_30d,sender_cust_inflow_count_1h,sender_cust_inflow_count_24h,sender_cust_inflow_count_7d,sender_cust_inflow_count_30d,sender_cust_outflow_count_1h,sender_cust_outflow_count_24h,sender_cust_outflow_count_7d,sender_cust_outflow_count_30d,sender_balance_before_txn,sender_running_balance_txn_amount,sender_balance_after_txn,sender_cumulative_daily_balance_change,sender_current_balance,sender_bal_ratio_after_to_current,receiver_acct_txn_count_1h,receiver_acct_inflow_amt_1h,receiver_acct_outflow_amt_1h,receiver_acct_inflow_count_1h,receiver_acct_outflow_count_1h,receiver_acct_txn_count_24h,receiver_acct_inflow_amt_24h,receiver_acct_outflow_amt_24h,receiver_acct_inflow_count_24h,receiver_acct_outflow_count_24h,receiver_acct_txn_count_7d,receiver_acct_inflow_amt_7d,receiver_acct_outflow_amt_7d,receiver_acct_inflow_count_7d,receiver_acct_outflow_count_7d,receiver_acct_txn_count_30d,receiver_acct_inflow_amt_30d,receiver_acct_outflow_amt_30d,receiver_acct_inflow_count_30d,receiver_acct_outflow_count_30d,receiver_balance_before_txn,receiver_running_balance_txn_amount,receiver_balance_after_txn,receiver_cumulative_daily_balance_change,receiver_current_balance,receiver_bal_ratio_after_to_current
0,T400357,C3013,A8276,,B3027,D3593,2025-09-01 00:00:53,6939.94,mobile,debit,UPI,0,,,,NaT,normal,0,19832.46,558,medium,medium,low,low,0,self_employed,unknown,current,28.61,77.21,Delhi,,,,463,0,android,0,0,crypto,high,10.113.106.226,0.05,28.67,77.2,0,0,1,9,2025,3,0,1,0,1,2025-09-01,1,0.0,6939.94,0,1,1,0.0,6939.94,0,1,1,0.0,6939.94,0,1,1,0.0,6939.94,0,1,2,2,2,2,6939.94,6939.94,6939.94,6939.94,6939.94,6939.94,6939.94,6939.94,1,1,1,1,1,1,1,1,608107.2,-6939.94,601167.26,-6939.94,286367.13,2.1,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,,
1,T56222,C1559,A1154,A5708,,D2663,2025-09-01 00:02:48,2814.15,web,debit,online_transfer,0,,,,NaT,normal,0,11328.25,411,medium,high,medium,low,0,self_employed,real_estate,savings,26.91,75.79,Jaipur,,,,1038,0,ios,0,0,,,10.227.193.228,0.1,26.94,75.83,0,0,1,9,2025,3,0,1,0,1,2025-09-01,1,0.0,2814.15,0,1,1,0.0,2814.15,0,1,1,0.0,2814.15,0,1,1,0.0,2814.15,0,1,1,1,1,1,0.0,0.0,0.0,0.0,2814.15,2814.15,2814.15,2814.15,0,0,0,0,1,1,1,1,380902.6,-2814.15,378088.45,-2814.15,436050.9,0.87,1,2814.15,0.0,1,0,1,2814.15,0.0,1,0,1,2814.15,0.0,1,0,1,2814.15,0.0,1,0,380902.6,2814.15,383716.75,2814.15,511481.35,0.75


In [51]:
final_df[(final_df['sender_account_id']=="A8276") | (final_df['receiver_account_id']=="A8276")  ].to_excel("Check_A8276_V4.xlsx", index=False)

In [35]:
# ─── Save enriched datasets ────────────────────────────────────────

# 1. Transaction-level enriched (with sender/receiver features merged)
final_df.to_parquet('transactions_enriched.parquet', index=False)
print(f"✅ Saved: transactions_enriched.parquet ({final_df.shape})")

# 2. Account-level event log (long format with all features)
acct_events.to_parquet('account_events_enriched.parquet', index=False)
print(f"✅ Saved: account_events_enriched.parquet ({acct_events.shape})")

# ─── Summary of all new features ──────────────────────────────────
new_temporal = [c for c in final_df.columns if c.startswith('txn_') or c.startswith('is_')]
new_acct_vel = [c for c in acct_events.columns if c.startswith('acct_')]
new_cust_vel = [c for c in acct_events.columns if c.startswith('cust_')]
new_balance  = ['balance_before_txn','running_balance_txn_amount','balance_after_txn',
                'current_balance','bal_ratio_after_to_current','cumulative_daily_balance_change']

print("\n📋 FEATURE ENGINEERING SUMMARY")
print(f"   Temporal features      : {len(new_temporal)} columns")
print(f"   Account velocity feats : {len(new_acct_vel)} columns")
print(f"   Customer velocity feats: {len(new_cust_vel)} columns")
print(f"   Balance features       : {len(new_balance)} columns")
print(f"   Flow features          : 4 (passed through / derived from raw data)")
print(f"\n   TOTAL new features: ~{len(new_temporal)+len(new_acct_vel)+len(new_cust_vel)+len(new_balance)+4}")

✅ Saved: transactions_enriched.parquet ((429039, 140))
✅ Saved: account_events_enriched.parquet ((858078, 62))

📋 FEATURE ENGINEERING SUMMARY
   Temporal features      : 11 columns
   Account velocity feats : 20 columns
   Customer velocity feats: 21 columns
   Balance features       : 6 columns
   Flow features          : 4 (passed through / derived from raw data)

   TOTAL new features: ~62
