# 🏦 SmartSentry AML — Notebook 1: Customer & Account Generator
---
**Purpose:** Generate the four reference / entity tables that every other table links back to.

| Table | Primary Key | Foreign Key |
|---|---|---|
| `customers` | `customer_id` | — |
| `accounts` | `account_id` | `customer_id → customers` |
| `devices` | `device_id` | — |
| `beneficiaries` | `beneficiary_id` | — |

**Output files (in `./outputs/`):**
- `customers.csv`
- `accounts.csv`
- `devices.csv`
- `beneficiaries.csv`
- `customer_feature_catalogue.csv`

In [None]:
# ── CELL 1: Imports ──────────────────────────────────────────────────────
import random
import warnings
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Silence every Python warning for the rest of the session so cell output
# stays readable.
# NOTE(review): this is a blanket filter — deprecation notices raised by any
# library are hidden as well.
warnings.filterwarnings("ignore")

# Output directory — all CSVs written here.
# parents=True lets OUTPUT_DIR be repointed at a nested path (for example
# "./data/outputs") without a FileNotFoundError; exist_ok=True keeps re-runs
# of this cell harmless.
OUTPUT_DIR = Path("./outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("✅ Libraries loaded")
print(f"📁 Output directory: {OUTPUT_DIR.resolve()}")

In [None]:
# ── CELL 2: Configuration ────────────────────────────────────────────────
# All parameters live here — edit this cell to change data characteristics.
# No magic numbers anywhere else in the notebook.
#
# Every categorical field is described by a pair of parallel lists:
# <NAME>S holds the category labels and <NAME>_PROBS the sampling weight of
# the label at the same position. The sanity checks at the bottom of the
# cell raise ValueError as soon as an edit breaks one of those pairs.

# ─── Random Seed ──────────────────────────────────────────────────────────
# Both generators are seeded because the notebook draws from numpy's global
# RNG (table columns) and from the stdlib `random` module (per-account maps).
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# ─── Population Sizes ─────────────────────────────────────────────────────
NUM_CUSTOMERS     = 20_000    # Unique customer entities
NUM_ACCOUNTS      = 25_000    # Bank accounts (customers may have >1)
NUM_DEVICES       = 18_000    # Unique device fingerprints
NUM_BENEFICIARIES = 30_000    # External payment destinations

# ─── Customer Distributions ───────────────────────────────────────────────
CUSTOMER_AGE_MIN  = 21
CUSTOMER_AGE_MAX  = 70

# KYC verification depth
KYC_LEVELS = ["low", "medium", "high"]
KYC_PROBS  = [0.20,  0.50,    0.30]

# CDD risk rating
RISK_RATINGS = ["low",  "medium", "high", "very_high"]
RISK_PROBS   = [0.50,    0.30,   0.15,    0.05]

# PEP = Politically Exposed Person (triggers EDD)
PEP_PREVALENCE = 0.03

# Declared occupation
OCCUPATIONS = [
    "salaried",            # Regular employment income
    "self_employed",       # Own business, variable income
    "business_owner",      # Company director / proprietor
    "student",             # Low income, high-risk for synthetic fraud
    "retired",             # Fixed income, dormancy risk
    "government_employee", # Stable income, PEP-adjacent risk
    "freelancer",          # Variable income, offshore exposure
    "unemployed",          # High-risk: no income source
]
OCCUPATION_PROBS = [0.30, 0.15, 0.12, 0.08, 0.10, 0.10, 0.08, 0.07]

# Industry sector
INDUSTRIES = [
    "finance",       "retail",        "healthcare",  "technology",
    "real_estate",   "manufacturing", "education",   "hospitality",
    "construction",  "unknown",
]
INDUSTRY_PROBS = [0.12, 0.14, 0.10, 0.13, 0.09, 0.10, 0.08, 0.08, 0.07, 0.09]

# Account type (from design doc Section 4.2)
ACCOUNT_TYPES = ["retail", "corporate", "savings", "current", "business"]
ACCOUNT_PROBS = [0.35,    0.15,       0.25,     0.15,     0.10]

# Income bracket
INCOME_BRACKETS = ["low",  "medium", "high"]
INCOME_PROBS    = [0.30,    0.50,   0.20]

# Home country / jurisdiction risk
COUNTRY_RISKS = ["low",  "medium", "high"]
COUNTRY_PROBS = [0.60,    0.30,   0.10]

# Days as a customer (tenure)
CUSTOMER_SINCE_DAYS_MIN = 30
CUSTOMER_SINCE_DAYS_MAX = 3650

# ─── Account Distributions ────────────────────────────────────────────────
# Balance: lognormal → realistic wealth distribution
# median balance ≈ ₹22,026 (exp(10))
AVG_BALANCE_LOG_MEAN  = 10
AVG_BALANCE_LOG_SIGMA = 1

ACCOUNT_OPEN_DAYS_MIN = 30
ACCOUNT_OPEN_DAYS_MAX = 2000

# ─── Device Distributions ─────────────────────────────────────────────────
OS_TYPES  = ["android", "ios", "windows", "unknown"]
OS_PROBS  = [0.45,      0.40,  0.10,     0.05]

DEVICE_AGE_MIN      = 30
DEVICE_AGE_MAX      = 1500
ROOTED_DEVICE_RATE  = 0.05    # Jailbroken/rooted — elevated risk
VPN_USAGE_RATE      = 0.08    # VPN/proxy detected — geo-masking
EMULATOR_RATE       = 0.03    # Emulated environment — strong fraud signal

# ─── Beneficiary Distributions ────────────────────────────────────────────
BENE_TYPES  = ["individual", "merchant", "crypto", "offshore"]
BENE_PROBS  = [0.60,         0.30,       0.05,     0.05]

# crypto and offshore are high-risk exit node types
HIGH_RISK_BENE_TYPES = {"crypto", "offshore"}

BENE_COUNTRY_RISKS = ["low",  "medium", "high"]
BENE_COUNTRY_PROBS = [0.65,    0.25,   0.10]

# Per-account pre-assigned beneficiaries for legit transactions
BENE_PER_ACCOUNT_MIN = 2
BENE_PER_ACCOUNT_MAX = 5

# ─── Sanity Checks ────────────────────────────────────────────────────────
# Fail fast on a bad edit to this cell instead of deep inside a generator
# cell. Underscore-prefixed names are scratch variables for the checks only.
_DISTRIBUTIONS = {
    "KYC_LEVELS / KYC_PROBS":                  (KYC_LEVELS, KYC_PROBS),
    "RISK_RATINGS / RISK_PROBS":               (RISK_RATINGS, RISK_PROBS),
    "OCCUPATIONS / OCCUPATION_PROBS":          (OCCUPATIONS, OCCUPATION_PROBS),
    "INDUSTRIES / INDUSTRY_PROBS":             (INDUSTRIES, INDUSTRY_PROBS),
    "ACCOUNT_TYPES / ACCOUNT_PROBS":           (ACCOUNT_TYPES, ACCOUNT_PROBS),
    "INCOME_BRACKETS / INCOME_PROBS":          (INCOME_BRACKETS, INCOME_PROBS),
    "COUNTRY_RISKS / COUNTRY_PROBS":           (COUNTRY_RISKS, COUNTRY_PROBS),
    "OS_TYPES / OS_PROBS":                     (OS_TYPES, OS_PROBS),
    "BENE_TYPES / BENE_PROBS":                 (BENE_TYPES, BENE_PROBS),
    "BENE_COUNTRY_RISKS / BENE_COUNTRY_PROBS": (BENE_COUNTRY_RISKS, BENE_COUNTRY_PROBS),
}
for _label, (_categories, _probs) in _DISTRIBUTIONS.items():
    # One probability per category, in the same order.
    if len(_categories) != len(_probs):
        raise ValueError(
            f"{_label}: {len(_categories)} categories but {len(_probs)} probabilities"
        )
    # np.isclose tolerates float rounding in sums such as 0.1 + 0.2 + 0.7.
    if min(_probs) < 0 or not np.isclose(sum(_probs), 1.0):
        raise ValueError(
            f"{_label}: probabilities must be non-negative and sum to 1 "
            f"(got {sum(_probs):.4f})"
        )

_RATES = {
    "PEP_PREVALENCE":     PEP_PREVALENCE,
    "ROOTED_DEVICE_RATE": ROOTED_DEVICE_RATE,
    "VPN_USAGE_RATE":     VPN_USAGE_RATE,
    "EMULATOR_RATE":      EMULATOR_RATE,
}
for _label, _rate in _RATES.items():
    # Each rate is used as a Bernoulli probability by the generator cells.
    if not 0.0 <= _rate <= 1.0:
        raise ValueError(f"{_label} must lie in [0, 1] (got {_rate})")

_RANGES = {
    "CUSTOMER_AGE":        (CUSTOMER_AGE_MIN, CUSTOMER_AGE_MAX),
    "CUSTOMER_SINCE_DAYS": (CUSTOMER_SINCE_DAYS_MIN, CUSTOMER_SINCE_DAYS_MAX),
    "ACCOUNT_OPEN_DAYS":   (ACCOUNT_OPEN_DAYS_MIN, ACCOUNT_OPEN_DAYS_MAX),
    "DEVICE_AGE":          (DEVICE_AGE_MIN, DEVICE_AGE_MAX),
    "BENE_PER_ACCOUNT":    (BENE_PER_ACCOUNT_MIN, BENE_PER_ACCOUNT_MAX),
}
for _label, (_low, _high) in _RANGES.items():
    if _low > _high:
        raise ValueError(f"{_label}_MIN ({_low}) exceeds {_label}_MAX ({_high})")

print("✅ Configuration loaded")
print(f"   Customers: {NUM_CUSTOMERS:,} | Accounts: {NUM_ACCOUNTS:,} | "
      f"Devices: {NUM_DEVICES:,} | Beneficiaries: {NUM_BENEFICIARIES:,}")

In [None]:
# ── CELL 3: Feature Catalogue ────────────────────────────────────────────
# Data dictionary for the columns generated in this notebook.
# Written to customer_feature_catalogue.csv in OUTPUT_DIR for reference.

# Entries are grouped by owning table; each entry is
# (column_name, data_type, description).
_CATALOGUE_BY_TABLE = {
    # ── CUSTOMERS TABLE ───────────────────────────────────────────────────
    "customers": [
        ("customer_id", "string",
         "PK. Unique customer identifier. Format: C0, C1, …"),
        ("age", "integer",
         "Customer age in years. Range: 21–70."),
        ("customer_risk_rating", "categorical",
         "CDD risk tier: low | medium | high | very_high."),
        ("pep_flag", "binary",
         "Politically Exposed Person: 1=PEP. Triggers EDD."),
        ("occupation", "categorical",
         "Declared occupation: salaried | self_employed | business_owner | student | retired | government_employee | freelancer | unemployed."),
        ("industry", "categorical",
         "Business sector: finance | retail | healthcare | technology | real_estate | manufacturing | education | hospitality | construction | unknown."),
        ("account_type", "categorical",
         "Primary account type: retail | corporate | savings | current | business."),
        ("kyc_level", "categorical",
         "KYC depth: low=basic ID | medium=standard | high=full EDD."),
        ("income_bracket", "categorical",
         "Declared income range: low | medium | high."),
        ("country_risk", "categorical",
         "Home jurisdiction risk: low | medium | high (FATF-aligned)."),
        ("customer_since_days", "integer",
         "Days since customer relationship established. Range: 30–3650."),
    ],
    # ── ACCOUNTS TABLE ────────────────────────────────────────────────────
    "accounts": [
        ("account_id", "string",
         "PK. Unique account identifier. Format: A0, A1, …"),
        ("avg_balance", "float",
         "Average account balance in INR (lognormal). Denominator for amount_to_balance_ratio."),
        ("account_open_days", "integer",
         "Days since account was opened. <60 days = high-risk for identity fraud."),
    ],
    # ── DEVICES TABLE ─────────────────────────────────────────────────────
    "devices": [
        ("device_id", "string",
         "PK. Unique device fingerprint. Format: D0, D1, …"),
        ("device_age_days", "integer",
         "Age of device in days. Range: 30–1500."),
        ("rooted_flag", "binary",
         "1 = rooted (Android) / jailbroken (iOS). Tampering indicator."),
        ("os_type", "categorical",
         "Operating system: android | ios | windows | unknown."),
        ("vpn_flag", "binary",
         "1 = VPN/proxy detected. Geo-masking indicator."),
        ("emulator_flag", "binary",
         "1 = emulated environment, not a physical device. Strong fraud signal."),
    ],
    # ── BENEFICIARIES TABLE ───────────────────────────────────────────────
    "beneficiaries": [
        ("beneficiary_id", "string",
         "PK. Unique external payment destination. Format: B0, B1, …"),
        ("beneficiary_type", "categorical",
         "Type: individual | merchant | crypto | offshore. crypto/offshore = high risk."),
        ("beneficiary_country_risk", "categorical",
         "Beneficiary jurisdiction risk: low | medium | high."),
    ],
}

# Flatten to the lookup shape {column_name: (table, data_type, description)},
# keeping the table-by-table order of the grouping above.
FEATURE_CATALOGUE = {
    column: (table, data_type, description)
    for table, entries in _CATALOGUE_BY_TABLE.items()
    for column, data_type, description in entries
}

# One row per documented column; shown as the cell output and saved to CSV.
cat_df = pd.DataFrame(
    [(column, *meta) for column, meta in FEATURE_CATALOGUE.items()],
    columns=["column_name", "table", "data_type", "description"],
)
cat_df.to_csv(OUTPUT_DIR / "customer_feature_catalogue.csv", index=False)
print(f"✅ Feature catalogue: {len(cat_df)} columns documented")
cat_df

In [None]:
# ── CELL 4: Generate Customers Table ─────────────────────────────────────
# PK: customer_id
# All CDD/KYC fields from design doc Section 4.2 are included.
#
# Builds one row per customer. Every attribute is drawn independently from
# the distribution configured in Cell 2, using numpy's global RNG.

n = NUM_CUSTOMERS

customers = pd.DataFrame({
    # ── Primary Key ──────────────────────────────────────────────────────
    "customer_id":          [f"C{i}" for i in range(n)],

    # ── Demographics ─────────────────────────────────────────────────────
    # np.random.randint excludes its upper bound, so +1 is needed for
    # CUSTOMER_AGE_MAX itself to be attainable (catalogue range: 21–70).
    "age":                  np.random.randint(CUSTOMER_AGE_MIN, CUSTOMER_AGE_MAX + 1, n),

    # ── Risk & Regulatory (Section 4.2) ──────────────────────────────────
    "customer_risk_rating": np.random.choice(RISK_RATINGS,    n, p=RISK_PROBS),
    "pep_flag":             np.random.choice([0, 1],          n, p=[1-PEP_PREVALENCE, PEP_PREVALENCE]),
    "occupation":           np.random.choice(OCCUPATIONS,     n, p=OCCUPATION_PROBS),
    "industry":             np.random.choice(INDUSTRIES,      n, p=INDUSTRY_PROBS),
    "account_type":         np.random.choice(ACCOUNT_TYPES,   n, p=ACCOUNT_PROBS),

    # ── KYC & Profile ─────────────────────────────────────────────────────
    "kyc_level":            np.random.choice(KYC_LEVELS,      n, p=KYC_PROBS),
    "income_bracket":       np.random.choice(INCOME_BRACKETS, n, p=INCOME_PROBS),
    "country_risk":         np.random.choice(COUNTRY_RISKS,   n, p=COUNTRY_PROBS),
    # Same exclusive-upper-bound adjustment (catalogue range: 30–3650).
    "customer_since_days":  np.random.randint(CUSTOMER_SINCE_DAYS_MIN, CUSTOMER_SINCE_DAYS_MAX + 1, n),
})

# ── Validation ────────────────────────────────────────────────────────────
assert customers["customer_id"].is_unique,        "❌ Duplicate customer_ids!"
assert customers["customer_id"].notna().all(),     "❌ NULL customer_ids!"
assert len(customers) == NUM_CUSTOMERS,            "❌ Row count mismatch!"
# Both configured bounds are inclusive.
assert customers["age"].between(CUSTOMER_AGE_MIN, CUSTOMER_AGE_MAX).all(), \
    "❌ age outside configured range!"
assert customers["customer_since_days"].between(
    CUSTOMER_SINCE_DAYS_MIN, CUSTOMER_SINCE_DAYS_MAX
).all(), "❌ customer_since_days outside configured range!"

print(f"✅ customers table: {len(customers):,} rows × {customers.shape[1]} cols")
print(f"   PEP customers  : {customers['pep_flag'].sum():,} ({customers['pep_flag'].mean()*100:.1f}%)")
print(f"   Very high risk : {(customers['customer_risk_rating']=='very_high').sum():,}")
print()
customers.head()

In [None]:
# ── CELL 5: Customer Distribution Summary ────────────────────────────────
# Prints the observed category counts for the main customer attributes so
# they can be eyeballed against the probabilities configured in Cell 2.

# (heading, customers column, row limit) — a limit of None prints every
# category; the leading "\n" in a heading is the blank line between sections.
_SUMMARY_SECTIONS = (
    ("── Risk Rating ──────────────────────",   "customer_risk_rating", None),
    ("\n── KYC Level ────────────────────────", "kyc_level",            None),
    ("\n── Top 5 Occupations ────────────────", "occupation",           5),
    ("\n── Income Bracket ───────────────────", "income_bracket",       None),
    ("\n── Country Risk ─────────────────────", "country_risk",         None),
)

for _heading, _column, _limit in _SUMMARY_SECTIONS:
    _counts = customers[_column].value_counts()
    if _limit is not None:
        # value_counts() sorts by frequency, so head() keeps the most common.
        _counts = _counts.head(_limit)
    print(_heading)
    print(_counts.to_string())

In [None]:
# ── CELL 6: Generate Accounts Table ──────────────────────────────────────
# PK: account_id
# FK: customer_id → customers.customer_id
#
# Customer risk attributes are DENORMALISED into accounts.
# This means transactions only need ONE join (to accounts) to get
# all customer context — no chaining through customers every time.

n = NUM_ACCOUNTS

accounts_base = pd.DataFrame({
    # ── Primary Key ──────────────────────────────────────────────────────
    "account_id":       [f"A{i}" for i in range(n)],

    # ── Foreign Key (links to customers table) ────────────────────────────
    # Owners are sampled with replacement, so one customer can own several
    # accounts.
    # NOTE(review): for the same reason some customers end up owning no
    # account at all — confirm that is acceptable for downstream notebooks.
    "customer_id":      np.random.choice(customers["customer_id"], n),

    # ── Account-Level Fields ──────────────────────────────────────────────
    "avg_balance":      np.round(
                            np.random.lognormal(AVG_BALANCE_LOG_MEAN, AVG_BALANCE_LOG_SIGMA, n), 2
                        ),
    # np.random.randint excludes its upper bound; +1 makes
    # ACCOUNT_OPEN_DAYS_MAX an attainable, inclusive maximum.
    "account_open_days": np.random.randint(ACCOUNT_OPEN_DAYS_MIN, ACCOUNT_OPEN_DAYS_MAX + 1, n),
})

# Denormalise customer risk/profile attributes for fast lookup
CUSTOMER_ATTRS = [
    "customer_id", "kyc_level", "country_risk", "income_bracket",
    "customer_risk_rating", "pep_flag", "occupation", "industry", "account_type",
]
# validate="many_to_one" makes pandas raise if customer_id is ever duplicated
# on the customers side, which would silently multiply account rows.
accounts = accounts_base.merge(
    customers[CUSTOMER_ATTRS], on="customer_id", how="left", validate="many_to_one"
)

# ── Validation ────────────────────────────────────────────────────────────
# Count orphans before asserting so the figure printed below is the value
# that was actually checked.
orphaned = (~accounts["customer_id"].isin(customers["customer_id"])).sum()
assert accounts["account_id"].is_unique
assert len(accounts) == NUM_ACCOUNTS, "❌ Row count mismatch!"
assert orphaned == 0, "❌ FK violation: orphan customer_ids!"

print(f"✅ accounts table  : {len(accounts):,} rows × {accounts.shape[1]} cols")
print(f"   FK violations  : {orphaned} (should be 0)")
print(f"   New accounts (<60d): {(accounts['account_open_days']<60).sum():,} "
      f"— high-risk for identity fraud")
print(f"   Avg balance    : ₹{accounts['avg_balance'].median():,.0f} (median)")
print()
accounts.head()

In [None]:
# ── CELL 7: Generate Devices Table ───────────────────────────────────────
# PK: device_id
# No FK — devices are linked to transactions via device_id column.
#
# Each account is assigned one 'home' device (account_device_map).
# Fraud injectors deliberately use a DIFFERENT device for ATO transactions,
# which is the key account takeover detection signal.

n = NUM_DEVICES

devices = pd.DataFrame({
    # ── Primary Key ──────────────────────────────────────────────────────
    "device_id":       [f"D{i}" for i in range(n)],

    # ── Device Attributes ─────────────────────────────────────────────────
    # np.random.randint excludes its upper bound, so +1 is needed for
    # DEVICE_AGE_MAX itself to be attainable (catalogue range: 30–1500).
    "device_age_days": np.random.randint(DEVICE_AGE_MIN, DEVICE_AGE_MAX + 1, n),
    # Each flag is an independent Bernoulli draw at its configured rate.
    "rooted_flag":     np.random.choice([0, 1], n, p=[1-ROOTED_DEVICE_RATE, ROOTED_DEVICE_RATE]),
    "os_type":         np.random.choice(OS_TYPES, n, p=OS_PROBS),
    "vpn_flag":        np.random.choice([0, 1], n, p=[1-VPN_USAGE_RATE, VPN_USAGE_RATE]),
    "emulator_flag":   np.random.choice([0, 1], n, p=[1-EMULATOR_RATE, EMULATOR_RATE]),
})

# Build account → device mapping (one home device per account).
# Devices are picked uniformly at random with the stdlib RNG, so a device can
# be the home device of more than one account.
device_list = devices["device_id"].tolist()
account_device_map = {acc: random.choice(device_list) for acc in accounts["account_id"]}

assert devices["device_id"].is_unique
# Both configured bounds are inclusive.
assert devices["device_age_days"].between(DEVICE_AGE_MIN, DEVICE_AGE_MAX).all(), \
    "❌ device_age_days outside configured range!"

print(f"✅ devices table   : {len(devices):,} rows × {devices.shape[1]} cols")
print(f"   Rooted devices  : {devices['rooted_flag'].sum():,} ({devices['rooted_flag'].mean()*100:.1f}%)")
print(f"   VPN users       : {devices['vpn_flag'].sum():,} ({devices['vpn_flag'].mean()*100:.1f}%)")
print(f"   Emulators       : {devices['emulator_flag'].sum():,} ({devices['emulator_flag'].mean()*100:.1f}%)")
print(f"   account_device_map: {len(account_device_map):,} entries")
print()
devices.head()

In [None]:
# ── CELL 8: Generate Beneficiaries Table ─────────────────────────────────
# PK: beneficiary_id
# No FK — linked to transactions via beneficiary_id column.
#
# Besides the table itself this cell derives two in-memory helpers:
#   high_risk_bene_pool       — ids whose type is crypto/offshore OR whose
#                               country risk is "high"
#   account_beneficiaries_map — a small list of known beneficiaries for
#                               every account (for legit transactions)

n = NUM_BENEFICIARIES

# Draw the columns one at a time, in the order type → country risk, so the
# numpy RNG is consumed in a fixed sequence.
_bene_ids = list(map("B{}".format, range(n)))
_bene_types = np.random.choice(BENE_TYPES, n, p=BENE_PROBS)
_bene_country_risks = np.random.choice(BENE_COUNTRY_RISKS, n, p=BENE_COUNTRY_PROBS)

beneficiaries = pd.DataFrame({
    "beneficiary_id":           _bene_ids,
    "beneficiary_type":         _bene_types,
    "beneficiary_country_risk": _bene_country_risks,
})

# A beneficiary is high-risk when EITHER condition holds.
_is_high_risk_type = beneficiaries["beneficiary_type"].isin(HIGH_RISK_BENE_TYPES)
_is_high_risk_country = beneficiaries["beneficiary_country_risk"].eq("high")
high_risk_mask = _is_high_risk_type | _is_high_risk_country
high_risk_bene_pool = beneficiaries["beneficiary_id"][high_risk_mask].tolist()

# For each account: first draw how many beneficiaries it knows
# (BENE_PER_ACCOUNT_MIN..BENE_PER_ACCOUNT_MAX inclusive), then sample that
# many distinct ids from the full list.
bene_list = beneficiaries["beneficiary_id"].tolist()
account_beneficiaries_map = {}
for _account_id in accounts["account_id"]:
    _known_count = random.randint(BENE_PER_ACCOUNT_MIN, BENE_PER_ACCOUNT_MAX)
    account_beneficiaries_map[_account_id] = random.sample(bene_list, k=_known_count)

assert beneficiaries["beneficiary_id"].is_unique

_pool_size = len(high_risk_bene_pool)
_crypto_count = beneficiaries["beneficiary_type"].eq("crypto").sum()
_offshore_count = beneficiaries["beneficiary_type"].eq("offshore").sum()

print(f"✅ beneficiaries   : {len(beneficiaries):,} rows × {beneficiaries.shape[1]} cols")
print(f"   High-risk pool  : {_pool_size:,} ({_pool_size/n*100:.1f}%)")
print(f"   Crypto          : {_crypto_count:,}")
print(f"   Offshore        : {_offshore_count:,}")
print(f"   account_beneficiaries_map: {len(account_beneficiaries_map):,} entries")
print()
beneficiaries.head()

In [None]:
# ── CELL 9: PK / FK Schema Validation ───────────────────────────────────
# Prints a pass/fail line for every key constraint, then a one-row-per-table
# overview. This cell only reports; it does not raise on a failed check.

_RULE = "══════════════════════════════════════════════════════"

print(_RULE)
print("  PK / FK INTEGRITY REPORT")
print(_RULE)

# Check description → boolean outcome.
checks = {
    "customers.customer_id  is unique"    : customers["customer_id"].is_unique,
    "accounts.account_id    is unique"    : accounts["account_id"].is_unique,
    "devices.device_id      is unique"    : devices["device_id"].is_unique,
    "beneficiaries.beneficiary_id unique" : beneficiaries["beneficiary_id"].is_unique,
    "accounts.customer_id → customers FK" : accounts["customer_id"].isin(customers["customer_id"]).all(),
}

for _description, _passed in checks.items():
    print(f"  {'✅' if _passed else '❌'}  {_description}")

# The overall verdict passes only when every individual check passed.
all_passed = all(checks.values())

print()
print(_RULE)
print("  TABLE SUMMARY")
print(_RULE)

# (table name, primary key, foreign key, DataFrame)
_TABLES = (
    ("customers",     "customer_id",    "—",                       customers),
    ("accounts",      "account_id",     "customer_id → customers", accounts),
    ("devices",       "device_id",      "—",                       devices),
    ("beneficiaries", "beneficiary_id", "—",                       beneficiaries),
)
summary = pd.DataFrame([
    {"table": _name, "pk": _pk, "fk": _fk, "rows": len(_frame), "cols": _frame.shape[1]}
    for _name, _pk, _fk, _frame in _TABLES
])
print(summary.to_string(index=False))
print()

_verdict = "✅ ALL CHECKS PASSED" if all_passed else "❌ FAILURES DETECTED"
print(f"  Overall: {_verdict}")

In [None]:
# ── CELL 10: Save Reference Tables ──────────────────────────────────────
# Writes the four entity tables to OUTPUT_DIR as CSV (no index column) and
# lists the size of each file.
# NOTE(review): only the four DataFrames are written here. The three helper
# objects named in the closing message exist in this kernel's memory only.

_EXPORTS = {
    "customers.csv":     customers,
    "accounts.csv":      accounts,
    "devices.csv":       devices,
    "beneficiaries.csv": beneficiaries,
}

# Write every file first, then report sizes.
for _fname, _table in _EXPORTS.items():
    _table.to_csv(OUTPUT_DIR / _fname, index=False)

print("✅ Saved:")
for _fname in _EXPORTS:
    _size_kb = (OUTPUT_DIR / _fname).stat().st_size / 1024
    print(f"   📄 {_fname:<30} {_size_kb:>7.1f} KB")

print()
for _line in (
    "Available in subsequent notebooks as:",
    "  customers            — customers.csv",
    "  accounts             — accounts.csv",
    "  devices              — devices.csv",
    "  beneficiaries        — beneficiaries.csv",
    "  account_device_map   — in-memory dict { account_id → device_id }",
    "  account_beneficiaries_map — in-memory dict { account_id → [bene_ids] }",
    "  high_risk_bene_pool  — in-memory list [beneficiary_ids]",
):
    print(_line)