# 💳 SmartSentry AML — Notebook 2c: Rule-Based Features
---
Applies the 50-rule AML Rule Engine, populates the Fraud Intensity Score (FIS),
then runs final validation and saves the completed transactions table.

**Rules produced** (50 total)  
Amount, cash, temporal, account/customer velocity, balance, KYC/risk, beneficiary,
device/IP, channel, occupation/industry, and combined-signal rules.  
Rule columns: `rule_<name>` (0/1), `rule_trigger_count`, `max_rule_severity`,
`weighted_rule_score`, `fraud_intensity_score`.

**FIS formula (V6)**  
`FIS = min(rule_trigger_count × 2.5, 60) + label × 25 + (min(ratio, 10) / 10) × 15`, capped at 100, where `ratio` is `amount_to_balance_ratio`.

**Input  :** `outputs/txns_stage2.parquet`  (Notebook 2b)  
**Output :** `outputs/transactions.csv`  (final deliverable)


In [None]:
# ── C-1: Imports ──────────────────────────────────────────────────────────────
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

# Silence every warning category for the remainder of the notebook session.
warnings.simplefilter('ignore')

# Directory holding both the upstream inputs and this notebook's deliverable.
OUTPUT_DIR = Path('outputs')
print('✅ Libraries loaded')


In [None]:
# ── C-2: FIS Constants ────────────────────────────────────────────────────────
# Re-declared locally (rather than imported from Notebook 2a) so that this
# notebook can run on a fresh kernel without 2a having been executed first.

FIS_RULE_MULT = 2.5       # points added per triggered rule
FIS_RULE_CAP = 60.0       # upper bound of the rule component
FIS_LABEL_BONUS = 25.0    # points added when label == 1
FIS_RATIO_SCALE = 15.0    # weight of the amount-to-balance-ratio component
FIS_SCALE = 100           # upper bound of the final score

# Human-readable rendering of the formula, assembled from the constants above.
_fis_formula = (
    f'clip(trigger_count × {FIS_RULE_MULT}, {FIS_RULE_CAP}) '
    f'+ label × {FIS_LABEL_BONUS} + (ratio/10) × {FIS_RATIO_SCALE}  [cap {FIS_SCALE}]'
)

print('✅ FIS constants loaded')
print(f'   Formula: {_fis_formula}')


In [None]:
# ── C-3: Load Stage-2 Data & Reference Tables ─────────────────────────────────
# Transactions produced by Notebook 2b; `timestamp` is coerced to datetime.
stage2_path = OUTPUT_DIR / 'txns_stage2.parquet'
result = pd.read_parquet(stage2_path)
result['timestamp'] = pd.to_datetime(result['timestamp'])

# Reference tables used by the FK integrity checks in cell C-7.
customers = pd.read_csv(OUTPUT_DIR / 'customers.csv')
accounts = pd.read_csv(OUTPUT_DIR / 'accounts.csv')
devices = pd.read_csv(OUTPUT_DIR / 'devices.csv')

# Account ids persisted as dormant; used by the dormancy-flag validation.
dormant_df = pd.read_csv(OUTPUT_DIR / 'dormant_account_set.csv')
dormant_account_set = set(dormant_df['account_id'])

n_rows, n_cols = result.shape
fraud_rate = result["label"].mean()
print(f'✅ Stage-2 data loaded: {n_rows:,} rows × {n_cols} cols')
print(f'   Fraud rate : {fraud_rate:.4f}')
print(f'   Dormant set: {len(dormant_account_set):,} accounts')


In [None]:
# ── C-4: AML Rule Engine — 50 Rules ───────────────────────────────────────────
# RULES is an ordered list of (name, severity, condition_lambda) triples.
#   name                   →  the engine stores the outcome in column `rule_<name>`.
#   severity ∈ {1, 2, 3}   →  per-rule weight behind max_rule_severity and
#                             weighted_rule_score.
#   condition_lambda(row)  →  receives one transaction row and returns a truthy
#                             value when the rule fires; the engine wraps the
#                             result in int(bool(...)).
#
# Column access convention inside the lambdas:
#   r['col']              →  column is required; a missing column raises.
#   r.get('col', default) →  column is optional; `default` is chosen so the
#                             rule does not fire when the column is absent.
#
# Rule categories (labels match the section markers inside the list):
#   [A] Amount / cash thresholds
#   [B] Temporal signals
#   [C] Account-level velocity
#   [D] Customer-level velocity
#   [E] Balance & ratio signals
#   [F] KYC / customer risk profile
#   [G] Beneficiary risk
#   [H] Device / IP signals
#   [I] Channel signals
#   [J] Cash structuring patterns
#   [K] Occupation / industry
#   [L] Combined multi-signal rules

RULES = [
    # [A] Amount & cash ─────────────────────────────────────────────────────
    ('large_cash_deposit',         3, lambda r: r['cash_flag']==1 and r['amount']>50_000),
    ('cash_just_below_threshold',  3, lambda r: r['cash_flag']==1 and 8_000<=r['amount']<=9_999),
    ('high_value_transfer',        2, lambda r: r['amount']>100_000),
    ('micro_transaction',          1, lambda r: r['amount']<10),
    ('round_amount_large',         1, lambda r: r.get('is_round_amount',0)==1 and r['amount']>=10_000),

    # [B] Temporal ───────────────────────────────────────────────────────────
    ('night_transaction',          2, lambda r: r.get('is_night',0)==1),
    ('weekend_high_value',         2, lambda r: r.get('is_weekend',0)==1 and r['amount']>20_000),
    ('dormant_account_activation', 3, lambda r: r.get('dormancy_flag',0)==1 and r['amount']>5_000),

    # [C] Account-level velocity ─────────────────────────────────────────────
    # NOTE(review): rapid_burst (>3 in 1hr) fires on every row where
    # high_acct_velocity_1hr (>5 in 1hr) fires, so such rows count twice in
    # rule_trigger_count — confirm this double weighting is intended.
    ('high_acct_velocity_1hr',     3, lambda r: r.get('txn_count_last_1hr',0)>5),
    ('high_acct_velocity_24hr',    2, lambda r: r.get('txn_count_last_24hr',0)>20),
    ('high_acct_velocity_7d',      2, lambda r: r.get('txn_count_last_7d',0)>60),
    ('high_acct_volume_24hr',      2, lambda r: r.get('total_amount_last_24hr',0)>500_000),
    ('high_acct_volume_7d',        2, lambda r: r.get('total_amount_last_7d',0)>2_000_000),
    ('amount_spike_30d',           3, lambda r: r.get('amount_zscore_30d',0)>4),
    ('rapid_burst',                3, lambda r: r.get('txn_count_last_1hr',0)>3),

    # [D] Customer-level velocity ────────────────────────────────────────────
    ('high_cust_velocity_1hr',     3, lambda r: r.get('cust_txn_count_last_1hr',0)>8),
    ('high_cust_velocity_24hr',    2, lambda r: r.get('cust_txn_count_last_24hr',0)>30),
    ('high_cust_volume_24hr',      2, lambda r: r.get('cust_total_amount_last_24hr',0)>1_000_000),
    ('high_cust_volume_30d',       2, lambda r: r.get('cust_total_amount_last_30d',0)>3_000_000),

    # [E] Balance & ratio ────────────────────────────────────────────────────
    # NOTE(review): the first two rules in this section test KYC level and
    # account age rather than a balance or ratio; only the third uses
    # amount_to_balance_ratio.
    ('low_kyc_high_amount',        3, lambda r: r.get('kyc_level')=='low' and r['amount']>30_000),
    ('new_account_large_txn',      3, lambda r: r.get('account_open_days',9999)<60 and r['amount']>10_000),
    ('high_amount_to_balance',     2, lambda r: r.get('amount_to_balance_ratio',0)>3),

    # [F] KYC / customer risk ────────────────────────────────────────────────
    ('very_high_risk_customer',    3, lambda r: r.get('customer_risk_rating')=='very_high'),
    ('pep_high_value',             3, lambda r: r.get('pep_flag',0)==1 and r['amount']>10_000),
    ('high_risk_country_sender',   2, lambda r: r.get('country_risk')=='high'),
    ('corporate_large_cash',       2, lambda r: r.get('account_type') in ('corporate','business')
                                                  and r['cash_flag']==1 and r['amount']>20_000),

    # [G] Beneficiary risk ───────────────────────────────────────────────────
    ('high_risk_beneficiary',      3, lambda r: r.get('high_risk_beneficiary',0)==1),
    ('crypto_transfer',            2, lambda r: r.get('beneficiary_type')=='crypto'),
    ('offshore_transfer',          3, lambda r: r.get('beneficiary_type')=='offshore'),
    ('high_risk_bene_country',     2, lambda r: r.get('beneficiary_country_risk')=='high'),

    # [H] Device / IP ────────────────────────────────────────────────────────
    ('rooted_device',              2, lambda r: r.get('rooted_flag',0)==1),
    ('vpn_proxy_detected',         2, lambda r: r.get('vpn_flag',0)==1),
    ('emulator_detected',          3, lambda r: r.get('emulator_flag',0)==1),
    ('new_device_large_txn',       1, lambda r: r.get('device_age_days',9999)<90 and r['amount']>10_000),

    # [I] Channel ────────────────────────────────────────────────────────────
    ('atm_high_withdrawal',        2, lambda r: r['channel']=='atm' and r['amount']>20_000),
    ('branch_night_txn',           2, lambda r: r['channel']=='branch' and r.get('is_night',0)==1),

    # [J] Cash structuring patterns ──────────────────────────────────────────
    # NOTE(review): structuring_pattern (cash, 8,500–9,999) is a strict subset
    # of cash_just_below_threshold (cash, 8,000–9,999) in section [A], so any
    # row matching it triggers both rules.
    ('structuring_pattern',        3, lambda r: r['cash_flag']==1 and 8_500<=r['amount']<=9_999),
    ('multiple_small_cash',        3, lambda r: r['cash_flag']==1
                                                  and r.get('txn_count_last_7d',0)>5 and r['amount']<10_000),

    # [K] Occupation / industry ──────────────────────────────────────────────
    ('high_risk_industry',         1, lambda r: r.get('industry') in ('real_estate','construction','unknown')),
    ('student_high_value',         2, lambda r: r.get('occupation')=='student' and r['amount']>50_000),
    ('unemployed_large_transfer',  2, lambda r: r.get('occupation')=='unemployed' and r['amount']>20_000),
    ('freelancer_offshore',        2, lambda r: r.get('occupation')=='freelancer'
                                                  and r.get('beneficiary_type')=='offshore'),

    # [L] Combined multi-signal rules ────────────────────────────────────────
    ('pep_crypto_transfer',        3, lambda r: r.get('pep_flag',0)==1 and r.get('beneficiary_type')=='crypto'),
    ('very_high_risk_offshore',    3, lambda r: r.get('customer_risk_rating')=='very_high'
                                                  and r.get('beneficiary_type')=='offshore'),
    ('low_kyc_offshore',           3, lambda r: r.get('kyc_level')=='low' and r.get('beneficiary_type')=='offshore'),
    ('low_income_large_txn',       2, lambda r: r.get('income_bracket')=='low' and r['amount']>50_000),
    ('vpn_offshore',               3, lambda r: r.get('vpn_flag',0)==1 and r.get('beneficiary_type')=='offshore'),
    ('emulator_crypto',            3, lambda r: r.get('emulator_flag',0)==1 and r.get('beneficiary_type')=='crypto'),
    ('new_account_offshore',       3, lambda r: r.get('account_open_days',9999)<90
                                                  and r.get('beneficiary_type')=='offshore'),
    ('new_acct_high_cust_velocity',3, lambda r: r.get('account_open_days',9999)<90
                                                  and r.get('cust_txn_count_last_24hr',0)>10),
]

# ── Rule evaluation ──────────────────────────────────────────────────────────
# Sanity-check the rule table before spending time on evaluation.
assert len(RULES) == 50, f'Expected 50 rules, got {len(RULES)}'

rule_cols = [f'rule_{n}' for n, _, _ in RULES]
# A duplicated name would overwrite its own column and be double counted below.
assert len(set(rule_cols)) == len(rule_cols), 'Duplicate rule names in RULES'

# Evaluate each rule row by row (DataFrame.apply, axis=1) into a 0/1 column.
# A rule that cannot be evaluated (e.g. a required column is missing) still
# falls back to an all-zero column so the pipeline completes, but the failure
# is recorded and reported instead of being swallowed silently.
severity_map = {}
failed_rules = {}
for rule_name, severity, condition in RULES:
    col = f'rule_{rule_name}'
    try:
        result[col] = result.apply(lambda row, c=condition: int(bool(c(row))), axis=1)
    except Exception as exc:  # best-effort boundary: keep going, but report it
        result[col] = 0
        failed_rules[rule_name] = f'{type(exc).__name__}: {exc}'
    severity_map[col] = severity

# Reported with print() because warnings are globally suppressed in this notebook.
if failed_rules:
    print(f'⚠️  {len(failed_rules)} rule(s) could not be evaluated and were set to 0:')
    for failed_name, reason in failed_rules.items():
        print(f'   {failed_name:<45}  {reason}')
    print()

# Per-row aggregates. Multiplying the 0/1 matrix by the per-column severities
# (aligned on column labels) yields the severity of each fired rule, 0 otherwise.
severity_hits = result[rule_cols].mul(pd.Series(severity_map))

result['rule_trigger_count']  = result[rule_cols].sum(axis=1).astype(int)
result['max_rule_severity']   = severity_hits.max(axis=1).astype(int)
result['weighted_rule_score'] = severity_hits.sum(axis=1).astype(int)

total_firings  = int(result['rule_trigger_count'].sum())
print(f'✅ Rule engine complete: {len(RULES)} rules | {total_firings:,} total firings | '
      f'avg {result["rule_trigger_count"].mean():.2f}/txn')
print()
print('Top 10 most-triggered rules:')
top10 = (
    result[rule_cols].sum()
    .sort_values(ascending=False)
    .head(10)
    .rename('firings')
    .reset_index()
    .rename(columns={'index': 'rule'})
)
for _, row in top10.iterrows():
    print(f'   {row["rule"]:<45}  {int(row["firings"]):>8,}')


In [None]:
# ── C-5: Fraud Intensity Score (FIS) ─────────────────────────────────────────
# FIS aggregates three components into a [0, 100] score:
#   rule_component  = clip(rule_trigger_count × FIS_RULE_MULT, max=FIS_RULE_CAP)
#   label_component = label × FIS_LABEL_BONUS
#   ratio_component = clip(amount_to_balance_ratio, 0–10) / 10 × FIS_RATIO_SCALE
# This replaces the NaN placeholder set in Notebook 2b.

rule_comp  = (result['rule_trigger_count'].fillna(0) * FIS_RULE_MULT).clip(upper=FIS_RULE_CAP)
label_comp = result['label'] * FIS_LABEL_BONUS
# Clip the ratio on BOTH sides, as documented above: without the lower bound a
# negative ratio would subtract from the score and could push FIS below 0.
# (Whether negative ratios occur upstream is not visible here — the clip makes
# the documented [0, 100] range hold either way.)
ratio_comp = (
    result['amount_to_balance_ratio'].fillna(0).clip(lower=0, upper=10) / 10.0
) * FIS_RATIO_SCALE

result['fraud_intensity_score'] = (
    (rule_comp + label_comp + ratio_comp).clip(lower=0, upper=FIS_SCALE).round(2)
)

fis_fraud = result.loc[result['label']==1, 'fraud_intensity_score'].mean()
fis_legit = result.loc[result['label']==0, 'fraud_intensity_score'].mean()

print(f'✅ FIS populated (V6): mean fraud = {fis_fraud:.2f} | mean legit = {fis_legit:.2f}')
print()
print('FIS distribution:')
# Bands are contiguous: [0, 20], (20, 40], (40, 60], (60, 80], (80, 100].
# FIS is a float rounded to 2 decimals, so integer-gapped bands such as
# 0–20 / 21–40 would silently drop scores like 20.5 from every band.
fis = result['fraud_intensity_score']
n_rows = len(result)
for lo, hi, band in [(0,20,'very_low'),(20,40,'low'),(40,60,'medium'),(60,80,'high'),(80,100,'critical')]:
    above_lo = (fis >= lo) if lo == 0 else (fis > lo)   # only the first band includes its lower edge
    cnt = int((above_lo & (fis <= hi)).sum())
    share = cnt / n_rows if n_rows else 0.0             # guard against an empty frame
    bar = '█' * int(share * 50)
    print(f'   {lo:>3}–{hi:<3}  {band:<10}  {cnt:>8,}  ({share*100:5.1f}%)  {bar}')


In [None]:
# ── C-6: Final Transactions Summary ──────────────────────────────────────────
# Take an independent copy of the enriched frame and print an overview of it:
# size, key-column null rates, receiver/beneficiary exclusivity, dormancy,
# rolling-feature ranges and balance statistics.
final_txns = result.copy()

summary_rule = '══════════════════════════════════════════════════════════════'
total_rows, total_cols = final_txns.shape
print(summary_rule)
print('  FINAL TRANSACTIONS TABLE SUMMARY — V7')
print(summary_rule)
print(f'  Rows      : {total_rows:,}')
print(f'  Columns   : {total_cols}')
print(f'  Fraud rate: {final_txns["label"].mean():.4f}')
print()

# Share of missing values in each primary / foreign key column.
pk_fk_cols = ['transaction_id','customer_id','sender_account_id',
              'receiver_account_id','beneficiary_id','device_id']
null_share = final_txns[pk_fk_cols].isna().mean() * 100
print('  PK / FK columns:')
for key_col in pk_fk_cols:
    print(f'    {key_col:<30}  {null_share[key_col]:.1f}% null')

# Exactly one of receiver_account_id / beneficiary_id is expected per row.
has_receiver = final_txns['receiver_account_id'].notna()
has_beneficiary = final_txns['beneficiary_id'].notna()
both    = (has_receiver & has_beneficiary).sum()
neither = (~has_receiver & ~has_beneficiary).sum()
both_icon = "✅" if both==0 else "❌"
neither_icon = "✅" if neither==0 else "❌"
print(f'\n  XOR (receiver ⊕ beneficiary):  BOTH={both} {both_icon}  '
      f'NEITHER={neither} {neither_icon}')

print('\n  Dormancy:')
print(f'    dormancy_flag=1  : {final_txns["dormancy_flag"].sum():,}')
print(f'    dormant accounts : {len(dormant_account_set):,}')

# (column, display caption) pairs for the min/max overview.
rolling_features = [
    ('account_open_days',       'account_open_days (dynamic)'),
    ('txn_count_last_24hr',     'txn_count_last_24hr (acc)'),
    ('cust_txn_count_last_24hr','cust_txn_count_last_24hr'),
    ('cust_unique_accounts_30d','cust_unique_accounts_30d'),
]
print('\n  Rolling feature ranges:')
for feature, caption in rolling_features:
    values = final_txns[feature]
    print(f'    {caption:<35}  min={values.min()}, max={values.max()}')

running_amount = final_txns['running_balance_txn_amount']
n_pos = (running_amount >  0).sum()
n_neg = (running_amount <  0).sum()
print('\n  Balance stats:')
print(f'    balance_after_txn mean : ₹{final_txns["balance_after_txn"].mean():,.2f}')
print(f'    running_balance_txn_amt: {n_pos:,} positive / {n_neg:,} negative')
print(f'    time_since_origin_txn  : {final_txns["time_since_origin_txn"].notna().sum():,} non-null')


In [None]:
# ── C-7: PK / FK Integrity & V6-V7 Fix Validation ────────────────────────────
# Builds a {description: passed} mapping of integrity checks over final_txns,
# prints one ✅/❌ line per check and an overall verdict.


def _balance_continuity_ok(txns):
    """Return True when, per sender account in timestamp order, every
    balance_before_txn equals the previous balance_after_txn (2-dp rounding)."""
    ordered = txns.sort_values(['sender_account_id', 'timestamp'])
    prev_after = ordered.groupby('sender_account_id')['balance_after_txn'].shift(1)
    has_prev = prev_after.notna()   # the first txn of each account has no predecessor
    before = ordered.loc[has_prev, 'balance_before_txn'].round(2)
    return (before == prev_after[has_prev].round(2)).all()


report_rule = '══════════════════════════════════════════════'
print(report_rule)
print('  FK & FIX INTEGRITY REPORT — V7')
print(report_rule)

# Reusable row masks.
has_receiver = final_txns['receiver_account_id'].notna()
has_beneficiary = final_txns['beneficiary_id'].notna()
non_dormant_sender = ~final_txns['sender_account_id'].isin(dormant_account_set)
is_dormant_smurfing = final_txns['fraud_type'] == 'dormant_smurfing'
fis = final_txns['fraud_intensity_score']

checks = {
    'transaction_id is unique':
        final_txns['transaction_id'].is_unique,
    'customer_id FK valid':
        final_txns['customer_id'].isin(customers['customer_id']).all(),
    'sender_account_id FK valid':
        final_txns['sender_account_id'].isin(accounts['account_id']).all(),
    'device_id FK valid':
        final_txns['device_id'].isin(devices['device_id']).all(),
    # Passes only when every row has exactly one of the two ids populated.
    'receiver XOR beneficiary':
        (has_receiver != has_beneficiary).all(),
    '[FIX 2] dormancy_flag=0 for non-dormant accts':
        final_txns.loc[non_dormant_sender, 'dormancy_flag'].eq(0).all(),
    '[FIX 2] dormancy_flag has non-zero values':
        final_txns['dormancy_flag'].sum() > 0,
    '[FIX 3] txn_velocity_cumulative >= 1':
        (final_txns['txn_velocity_cumulative'] >= 1).all(),
    '[FIX 4] txn_count_last_1hr >= 1':
        (final_txns['txn_count_last_1hr'] >= 1).all(),
    '[FIX 4] cust_txn_count_last_1hr >= 1':
        (final_txns['cust_txn_count_last_1hr'] >= 1).all(),
    '[FIX 8] running_bal has negative values':
        (final_txns['running_balance_txn_amount'] < 0).any(),
    '[FIX 9] balance_after_txn >= 0':
        (final_txns['balance_after_txn'] >= 0).all(),
    '[FIX 9] balance_before_txn >= 0':
        (final_txns['balance_before_txn'] >= 0).all(),
    '[FIX V7] balance continuity (no phantom jumps)':
        _balance_continuity_ok(final_txns),
    '[FIX 10] account_open_days dynamic (max > static)':
        final_txns['account_open_days'].max() > accounts['account_open_days'].max(),
    '[FIX 11] cust_unique_accounts_30d >= 1':
        (final_txns['cust_unique_accounts_30d'] >= 1).all(),
    '[FIX 1]  FIS in [0, 100]':
        ((fis >= 0) & (fis <= 100)).all(),
    'is_night / is_business_hours exclusive':
        ((final_txns['is_night']==1) & (final_txns['is_business_hours']==1)).sum()==0,
    # Trivially true when no dormant_smurfing rows exist.
    'dormant_smurfing cash_flag=1':
        final_txns.loc[is_dormant_smurfing, 'cash_flag'].eq(1).all()
        if is_dormant_smurfing.any() else True,
}

for description, outcome in checks.items():
    marker = '✅' if outcome else '❌'
    print(f'  {marker}  {description}')
all_ok = all(checks.values())

verdict = '✅ ALL CHECKS PASSED (V7)' if all_ok else '❌ FAILURES DETECTED'
print()
print(f"  Overall: {verdict}")


In [None]:
# ── C-8: Save Final Transactions ──────────────────────────────────────────────
# Write the final deliverable, then list every CSV and Parquet file that now
# sits in the output directory together with its size.
transactions_path = OUTPUT_DIR / 'transactions.csv'
final_txns.to_csv(transactions_path, index=False)

size_kb = transactions_path.stat().st_size / 1024
n_rows, n_cols = final_txns.shape
print('✅ transactions.csv saved')
print(f'   {n_rows:,} rows × {n_cols} cols | {size_kb:,.0f} KB')
print()
print('All outputs in ./outputs/:')
# CSV files first, then Parquet files; each group sorted by path.
for icon, pattern in (('📄', '*.csv'), ('📦', '*.parquet')):
    for f in sorted(OUTPUT_DIR.glob(pattern)):
        sz = f.stat().st_size / 1024
        print(f'  {icon} {f.name:<35}  {sz:>8,.0f} KB')
