In [14]:
# ============================================================
# ‚≠ê TRAINING NOTEBOOK v6 (FINAL PRODUCTION VERSION)
# Synthetic fraud engine aligned with app.py / fraud_logic.py
# ============================================================

# ============================================
# Cell 1: Imports & configuration
# ============================================

import os
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
    classification_report,
    confusion_matrix,
)

from sklearn.ensemble import IsolationForest
from lightgbm import LGBMClassifier

import joblib

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Where to save models (compatible with app)
MODELS_DIR = "models"
os.makedirs(MODELS_DIR, exist_ok=True)

supervised_path = os.path.join(MODELS_DIR, "supervised_lgbm_pipeline.joblib")
iforest_path = os.path.join(MODELS_DIR, "iforest_pipeline.joblib")
thresholds_path = os.path.join(MODELS_DIR, "model_thresholds.json")


# ============================================
# Cell 2: Synthetic data helpers
# ============================================

# Channels aligned with your app
CHANNELS = [
    "Mobile App",
    "NetBanking",
    "ATM",
    "POS",
    "Online Purchase",
    "UPI",
    "Credit Card",
    "Debit Card",
    "Branch",
]

TRANSACTION_TYPES = ["PAYMENT", "TRANSFER", "WITHDRAWAL", "BILL_PAY"]

CITIES_NORMAL = [
    "Mumbai",
    "Bangalore",
    "Delhi",
    "Chennai",
    "Hyderabad",
    "Pune",
    "Kolkata",
]

CITIES_HIGH_RISK = [
    "Lagos",
    "Karachi",
    "Nairobi",
    "Bogota",
]

ALL_CITIES = CITIES_NORMAL + CITIES_HIGH_RISK

COUNTRIES_NORMAL = ["India", "USA", "UK"]
COUNTRIES_HIGH_RISK = ["Nigeria", "Pakistan", "Colombia"]

DEVICES = [
    "MOB-ANDROID",
    "MOB-IOS",
    "WEB-CHROME",
    "WEB-SAFARI",
    "POS-TERM-001",
    "POS-TERM-002",
    "ATM-0001",
    "ATM-0002",
    "BANK-CSR",
]

def sample_hours(size, fraud=False):
    """Heavier tail into night hours for fraud."""
    if not fraud:
        # Daytime concentration
        probs = np.array(
            [0.01, 0.01, 0.01, 0.01, 0.02, 0.02,  # 0-5
             0.03, 0.04, 0.06, 0.08, 0.10, 0.10,  # 6-11
             0.09, 0.08, 0.07, 0.06, 0.05, 0.04,  # 12-17
             0.04, 0.03, 0.02, 0.01, 0.01, 0.01]  # 18-23
        )
    else:
        # More weight to late night + high traffic ecom times
        probs = np.array(
            [0.06, 0.06, 0.05, 0.05, 0.05, 0.04,  # 0-5
             0.03, 0.03, 0.03, 0.04, 0.05, 0.05,  # 6-11
             0.05, 0.05, 0.05, 0.05, 0.04, 0.03,  # 12-17
             0.03, 0.02, 0.02, 0.01, 0.01, 0.01]  # 18-23
        )
    probs /= probs.sum()
    return np.random.choice(np.arange(24), size=size, p=probs)


def sample_amounts(size, fraud=False):
    """Smaller values for good, heavier right tail for fraud."""
    if not fraud:
        # Log-normal ~ median ~ 1k, mostly under 50k
        base = np.random.lognormal(mean=8, sigma=0.6, size=size)
    else:
        # Heavier tail, more extreme large values
        base = np.random.lognormal(mean=9, sigma=1.0, size=size)
    # Cap at some upper bound
    return np.clip(base, 10, 2_000_000)


def sample_channels(size, fraud=False):
    if not fraud:
        probs = np.array(
            [
                0.20,  # Mobile App
                0.10,  # NetBanking
                0.15,  # ATM
                0.15,  # POS
                0.10,  # Online Purchase
                0.15,  # UPI
                0.05,  # Credit Card
                0.05,  # Debit Card
                0.05,  # Branch
            ]
        )
    else:
        # More fraud in ecom / card / netbanking / ATM night usage
        probs = np.array(
            [
                0.18,  # Mobile App
                0.15,  # NetBanking
                0.10,  # ATM
                0.08,  # POS
                0.24,  # Online Purchase
                0.10,  # UPI
                0.10,  # Credit Card
                0.03,  # Debit Card
                0.02,  # Branch
            ]
        )
    probs /= probs.sum()
    return np.random.choice(CHANNELS, size=size, p=probs)


def sample_locations(size, fraud=False):
    if not fraud:
        probs = np.array(
            [0.18, 0.16, 0.18, 0.15, 0.13, 0.10, 0.07]  # normal cities
            + [0.01, 0.01, 0.01, 0.01]  # high-risk rare
        )
    else:
        probs = np.array(
            [0.10, 0.10, 0.10, 0.10, 0.08, 0.07, 0.05]  # normal
            + [0.10, 0.10, 0.10, 0.10]  # high-risk more frequent
        )
    probs /= probs.sum()
    return np.random.choice(ALL_CITIES, size=size, p=probs)


def sample_countries(size, fraud=False):
    if not fraud:
        probs = np.array([0.7, 0.2, 0.1, 0.0, 0.0, 0.0])
    else:
        probs = np.array([0.4, 0.1, 0.1, 0.2, 0.1, 0.1])
    probs /= probs.sum()
    all_countries = COUNTRIES_NORMAL + COUNTRIES_HIGH_RISK
    return np.random.choice(all_countries, size=size, p=probs)


def sample_txn_type(size, fraud=False):
    if not fraud:
        probs = np.array([0.65, 0.10, 0.15, 0.10])
    else:
        # more transfers and withdrawals in fraud
        probs = np.array([0.40, 0.30, 0.20, 0.10])
    probs /= probs.sum()
    return np.random.choice(TRANSACTION_TYPES, size=size, p=probs)


def sample_device_ids(size):
    return np.random.choice(DEVICES, size=size)


def generate_boolean(size, p_true):
    return np.random.rand(size) < p_true


# ============================================
# Cell 3: Generate synthetic dataset (500k rows, ~50% fraud)
# ============================================

N_TOTAL = 500_000
N_FRAUD = N_TOTAL // 2
N_GOOD = N_TOTAL - N_FRAUD

print(f"Generating synthetic dataset: {N_TOTAL} rows (~50% fraud)")

# --------- GOOD (non-fraud) ----------
good_amount = sample_amounts(N_GOOD, fraud=False)
good_channel = sample_channels(N_GOOD, fraud=False)
good_location = sample_locations(N_GOOD, fraud=False)
good_txn_type = sample_txn_type(N_GOOD, fraud=False)
good_hour = sample_hours(N_GOOD, fraud=False)
good_dow = np.random.randint(0, 7, size=N_GOOD)  # 0=Mon
good_month = np.random.randint(1, 13, size=N_GOOD)
good_device = sample_device_ids(N_GOOD)
good_home_country = sample_countries(N_GOOD, fraud=False)
good_txn_country = good_home_country.copy()

good_ip_risk = np.random.normal(loc=20, scale=10, size=N_GOOD)  # mostly low
good_ip_risk = np.clip(good_ip_risk, 0, 100)

good_vpn = generate_boolean(N_GOOD, p_true=0.02)
good_new_device = generate_boolean(N_GOOD, p_true=0.05)
good_new_benef = generate_boolean(N_GOOD, p_true=0.05)

# txns last 1h small
good_txns_1h = np.random.poisson(lam=1.0, size=N_GOOD)

# --------- FRAUD ----------
fraud_amount = sample_amounts(N_FRAUD, fraud=True)
fraud_channel = sample_channels(N_FRAUD, fraud=True)
fraud_location = sample_locations(N_FRAUD, fraud=True)
fraud_txn_type = sample_txn_type(N_FRAUD, fraud=True)
fraud_hour = sample_hours(N_FRAUD, fraud=True)
fraud_dow = np.random.randint(0, 7, size=N_FRAUD)
fraud_month = np.random.randint(1, 13, size=N_FRAUD)
fraud_device = sample_device_ids(N_FRAUD)

fraud_home_country = sample_countries(N_FRAUD, fraud=False)
fraud_txn_country = sample_countries(N_FRAUD, fraud=True)

fraud_ip_risk = np.random.normal(loc=70, scale=15, size=N_FRAUD)
fraud_ip_risk = np.clip(fraud_ip_risk, 0, 100)

fraud_vpn = generate_boolean(N_FRAUD, p_true=0.30)
fraud_new_device = generate_boolean(N_FRAUD, p_true=0.40)
fraud_new_benef = generate_boolean(N_FRAUD, p_true=0.35)

fraud_txns_1h = np.random.poisson(lam=5.0, size=N_FRAUD)

# Inject card-testing patterns: small amounts but high velocity in card channels
card_like_mask = np.isin(fraud_channel, ["Credit Card", "Debit Card", "Online Purchase", "POS"])
idx_card = np.where(card_like_mask)[0]
n_card_test = int(0.15 * len(idx_card))
card_test_idx = np.random.choice(idx_card, size=n_card_test, replace=False)
fraud_amount[card_test_idx] = np.random.uniform(10, 200, size=n_card_test)
fraud_txns_1h[card_test_idx] = np.random.randint(5, 15, size=n_card_test)

# Assemble GOOD df
good_df = pd.DataFrame(
    {
        "Amount": good_amount,
        "TransactionType": good_txn_type,
        "Location": good_location,
        "DeviceID": good_device,
        "Channel": good_channel,
        "hour": good_hour,
        "day_of_week": good_dow,
        "month": good_month,
        # extra / rule-features
        "ip_risk_score": good_ip_risk,
        "vpn_detected": good_vpn.astype(int),
        "new_device": good_new_device.astype(int),
        "new_beneficiary": good_new_benef.astype(int),
        "txns_last_1h": good_txns_1h,
        "home_country": good_home_country,
        "txn_country": good_txn_country,
        "Label": 0,
    }
)

# Assemble FRAUD df
fraud_df = pd.DataFrame(
    {
        "Amount": fraud_amount,
        "TransactionType": fraud_txn_type,
        "Location": fraud_location,
        "DeviceID": fraud_device,
        "Channel": fraud_channel,
        "hour": fraud_hour,
        "day_of_week": fraud_dow,
        "month": fraud_month,
        # extra / rule-features
        "ip_risk_score": fraud_ip_risk,
        "vpn_detected": fraud_vpn.astype(int),
        "new_device": fraud_new_device.astype(int),
        "new_beneficiary": fraud_new_benef.astype(int),
        "txns_last_1h": fraud_txns_1h,
        "home_country": fraud_home_country,
        "txn_country": fraud_txn_country,
        "Label": 1,
    }
)

# Concatenate & shuffle
all_df = pd.concat([good_df, fraud_df], ignore_index=True)
all_df = all_df.sample(frac=1.0, random_state=RANDOM_STATE).reset_index(drop=True)

print("\n=== Synthetic dataset summary ===")
print(all_df["Label"].value_counts())
print("Fraud rate:", all_df["Label"].mean())
print("Shape:", all_df.shape)
print("Columns:", list(all_df.columns))


# ============================================
# Cell 4: Train/test split + features for ML
# ============================================

# 8 core features used by app + fraud_logic.py for ML
FEATURES_FOR_MODEL = [
    "Amount",
    "TransactionType",
    "Location",
    "DeviceID",
    "Channel",
    "hour",
    "day_of_week",
    "month",
]

X = all_df[FEATURES_FOR_MODEL].copy()
y = all_df["Label"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_STATE,
    stratify=y,
)

print("\nTrain shape:", X_train.shape, "Test shape:", X_test.shape)


# ============================================
# Cell 5: Preprocessing builder
# ============================================

def make_preprocess():
    """
    Create a fresh ColumnTransformer with OneHotEncoder for categorical
    features, numeric passthrough for others.
    This must NEVER be re-fitted outside its own pipeline.
    """
    categorical_features = ["TransactionType", "Location", "DeviceID", "Channel"]
    numeric_features = ["Amount", "hour", "day_of_week", "month"]

    # Use sparse_output if sklearn >=1.2, else fallback to sparse=True
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", ohe, categorical_features),
            ("num", "passthrough", numeric_features),
        ],
        remainder="drop",
    )
    return preprocessor


# ============================================
# Cell 6: Train LightGBM supervised pipeline
# ============================================

print("\nFitting LightGBM supervised pipeline...")

preprocess_supervised = make_preprocess()

supervised_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocess_supervised),
        (
            "model",
            LGBMClassifier(
                n_estimators=150,
                learning_rate=0.05,
                max_depth=-1,
                num_leaves=31,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=RANDOM_STATE,
                n_jobs=-1,
            ),
        ),
    ]
)

supervised_pipeline.fit(X_train, y_train)
print("Done.")

# Evaluate on test set
print("\nPredicting fraud probabilities on test set...")
y_proba = supervised_pipeline.predict_proba(X_test)[:, 1]

print("Prob range:", y_proba.min(), "‚Üí", y_proba.max())
roc = roc_auc_score(y_test, y_proba)
pr_auc = average_precision_score(y_test, y_proba)
print("ROC AUC:", roc)
print("PR AUC :", pr_auc)

prec, rec, thr = precision_recall_curve(y_test, y_proba)
f1 = 2 * (prec * rec) / (prec + rec + 1e-9)
best_idx = np.argmax(f1)
best_threshold_supervised = thr[best_idx]

print("\n=== Optimal LightGBM Threshold ===")
print("Threshold:", best_threshold_supervised)
print("Precision:", prec[best_idx])
print("Recall   :", rec[best_idx])
print("F1 score :", f1[best_idx])

y_pred_supervised = (y_proba >= best_threshold_supervised).astype(int)
print("\nClassification report (LightGBM @ optimal threshold):")
print(classification_report(y_test, y_pred_supervised))

print("Confusion matrix (LightGBM):")
print(confusion_matrix(y_test, y_pred_supervised))


# ============================================
# Cell 7: Train IsolationForest pipeline
# ============================================

print("\nTraining IsolationForest (anomaly model)...")

# Only non-fraud samples for IForest training
X_train_nonfraud = X_train[y_train == 0]

preprocess_iforest = make_preprocess()

iforest = IsolationForest(
    n_estimators=150,
    contamination=0.10,  # expected fraud-ish fraction
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

iforest_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocess_iforest),
        ("iforest", iforest),
    ]
)

print("Fitting IsolationForest pipeline...")
iforest_pipeline.fit(X_train_nonfraud)
print("Done.")

print("\nScoring anomaly on test set...")
anomaly_raw = iforest_pipeline.decision_function(X_test)
anomaly_score = -anomaly_raw  # higher => more anomalous

print("Anomaly score range:", anomaly_score.min(), "‚Üí", anomaly_score.max())

prec_if, rec_if, thr_if = precision_recall_curve(y_test, anomaly_score)
f1_if = 2 * (prec_if * rec_if) / (prec_if + rec_if + 1e-9)
best_idx_if = np.argmax(f1_if)
threshold_iforest = thr_if[best_idx_if]

print("\n=== Optimal IsolationForest Threshold ===")
print("Threshold:", threshold_iforest)
print("Precision:", prec_if[best_idx_if])
print("Recall   :", rec_if[best_idx_if])
print("F1 score :", f1_if[best_idx_if])

y_pred_if = (anomaly_score >= threshold_iforest).astype(int)

print("\nClassification report (IForest @ optimal threshold):")
print(classification_report(y_test, y_pred_if))

print("Confusion matrix (IForest):")
print(confusion_matrix(y_test, y_pred_if))


# ============================================
# Cell 8: Combined model evaluation (OR logic)
# ============================================

y_pred_combined = np.where(
    (y_pred_supervised == 1) | (y_pred_if == 1),
    1,
    0,
)

print("\n=== Final Combined Model Report (OR: supervised OR anomaly) ===")
print(classification_report(y_test, y_pred_combined))
print("Confusion matrix (combined):")
print(confusion_matrix(y_test, y_pred_combined))


# ============================================
# Cell 9: Save models & thresholds
# ============================================

joblib.dump(supervised_pipeline, supervised_path)
joblib.dump(iforest_pipeline, iforest_path)

thresholds = {
    "supervised_threshold": float(best_threshold_supervised),
    "iforest_threshold": float(threshold_iforest),
}

with open(thresholds_path, "w") as f:
    json.dump(thresholds, f, indent=2)

print("\nSaved:")
print(" -", supervised_path)
print(" -", iforest_path)
print(" -", thresholds_path)


# ============================================
# Cell 10: Compatibility checker (non-intrusive)
# ============================================

print("\n\n======================================")
print("üîç MODEL COMPATIBILITY CHECK STARTED")
print("======================================")

sup_loaded = joblib.load(supervised_path)

if not hasattr(sup_loaded, "feature_names_in_"):
    raise RuntimeError(
        "Supervised pipeline has no 'feature_names_in_'. "
        "It must be trained with a pandas DataFrame."
    )

trained_features = list(sup_loaded.feature_names_in_)
print("\n‚úî Pipeline trained features:", len(trained_features))
print(trained_features)

print("\n‚úî Expected app features:", len(FEATURES_FOR_MODEL))
print(FEATURES_FOR_MODEL)

missing = set(FEATURES_FOR_MODEL) - set(trained_features)
extra = set(trained_features) - set(FEATURES_FOR_MODEL)

if missing:
    print("\n‚ùå Missing expected app features in pipeline:", missing)
else:
    print("\n‚úÖ No missing app features in pipeline.")

if extra:
    print("‚ö† Extra features in pipeline (app ignore):", extra)
else:
    print("‚úÖ No extra features in pipeline.")


def check_internal_alignment(pipeline):
    """
    Ensure preprocess & model have consistent transformed feature dimension.
    No .fit() is called here ‚Äì only .transform() on a dummy row.
    """
    if "preprocess" not in pipeline.named_steps:
        print("\n‚ùå No 'preprocess' step in pipeline.")
        return False

    # assume last step is model
    model_step = list(pipeline.named_steps.values())[-1]
    if not hasattr(model_step, "n_features_in_"):
        print("\n‚ùå Model has no 'n_features_in_' attribute.")
        return False

    # dummy row
    dummy = pd.DataFrame(
        [
            {
                "Amount": 1000.0,
                "TransactionType": "PAYMENT",
                "Location": "Mumbai",
                "DeviceID": "MOB-ANDROID",
                "Channel": "Mobile App",
                "hour": 12,
                "day_of_week": 2,
                "month": 5,
            }
        ]
    )
    dummy = dummy.reindex(columns=trained_features)

    preprocess = pipeline.named_steps["preprocess"]
    Z = preprocess.transform(dummy)

    if hasattr(Z, "shape"):
        n_transformed = Z.shape[1]
    else:
        print("\n‚ùå Could not read transformed shape.")
        return False

    n_expected = int(model_step.n_features_in_)

    print(f"\n‚úî Preprocess produces {n_transformed} features.")
    print(f"‚úî Model expects     {n_expected} features.")

    if n_transformed != n_expected:
        print(
            "\n‚ùå INTERNAL MISMATCH: preprocessor and model disagree on feature count.\n"
            "   ‚ûú This pipeline would be unsafe for inference.\n"
            "   Re-run training cleanly if you see this."
        )
        return False

    print("\n‚úÖ Internal alignment OK.")
    return True


alignment_ok = check_internal_alignment(sup_loaded)

print("\n======================================")
print("üîç MODEL COMPATIBILITY CHECK FINISHED")
print("======================================\n")


# ============================================
# Cell 11: Final inference examples (safe)
# ============================================

if not alignment_ok:
    print(
        "üö´ Skipping example scoring because pipeline is inconsistent.\n"
        "   Re-run training cells from top if this happens."
    )
else:
    print("Reloading models for inference test...")
    iforest_loaded = joblib.load(iforest_path)
    with open(thresholds_path, "r") as f:
        th_loaded = json.load(f)

    th_sup = float(th_loaded["supervised_threshold"])
    th_if = float(th_loaded["iforest_threshold"])

    print("Loaded thresholds:")
    print("  Supervised     :", th_sup)
    print("  IsolationForest:", th_if)

    def score_example(sample: dict, label: str = "Unknown"):
        # Fill defaults to avoid missing keys
        base = {
            "Amount": 0.0,
            "TransactionType": "PAYMENT",
            "Location": "Unknown",
            "DeviceID": "Unknown",
            "Channel": "Other",
            "hour": 12,
            "day_of_week": 0,
            "month": 1,
        }
        base.update(sample)
        df = pd.DataFrame([base])
        df = df.reindex(columns=trained_features)

        fraud_prob = float(sup_loaded.predict_proba(df)[0, 1])
        anom_raw = float(iforest_loaded.decision_function(df)[0])
        anom_score = -anom_raw

        flag_sup = fraud_prob >= th_sup
        flag_if = anom_score >= th_if
        flag_combined = flag_sup or flag_if

        print("\n============================")
        print(f"Example: {label}")
        print("============================")
        print("Input:", base)
        print(f"Fraud Probability (ML): {fraud_prob:.6f}")
        print(f"Anomaly Score (IForest): {anom_score:.6f}")
        print(f"Supervised Flag (>= {th_sup:.4f}): {flag_sup}")
        print(f"IForest Flag (>= {th_if:.4f}): {flag_if}")
        print(f"Combined Fraud Flag (OR): {flag_combined}")

    # Diverse realistic examples
    examples = [
        (
            {
                "Amount": 800.0,
                "TransactionType": "PAYMENT",
                "Location": "Mumbai",
                "DeviceID": "MOB-ANDROID",
                "Channel": "Mobile App",
                "hour": 13,
                "day_of_week": 2,
                "month": 5,
            },
            "GOOD: small mobile-app payment (daytime)",
        ),
        (
            {
                "Amount": 1200.0,
                "TransactionType": "PAYMENT",
                "Location": "Bangalore",
                "DeviceID": "POS-TERM-001",
                "Channel": "POS",
                "hour": 19,
                "day_of_week": 5,
                "month": 8,
            },
            "GOOD: evening POS grocery shopping",
        ),
        (
            {
                "Amount": 6000.0,
                "TransactionType": "WITHDRAWAL",
                "Location": "Delhi",
                "DeviceID": "ATM-0001",
                "Channel": "ATM",
                "hour": 11,
                "day_of_week": 1,
                "month": 3,
            },
            "GOOD: normal ATM withdrawal weekday morning",
        ),
        (
            {
                "Amount": 450000.0,
                "TransactionType": "TRANSFER",
                "Location": "Mumbai",
                "DeviceID": "WEB-CHROME",
                "Channel": "NetBanking",
                "hour": 2,
                "day_of_week": 1,
                "month": 4,
            },
            "FRAUD-LIKE: large late-night netbanking transfer",
        ),
        (
            {
                "Amount": 200000.0,
                "TransactionType": "PAYMENT",
                "Location": "Lagos",
                "DeviceID": "WEB-CHROME",
                "Channel": "Online Purchase",
                "hour": 1,
                "day_of_week": 4,
                "month": 9,
            },
            "FRAUD-LIKE: high-value international online purchase at odd hour",
        ),
        (
            {
                "Amount": 250000.0,
                "TransactionType": "WITHDRAWAL",
                "Location": "Lagos",
                "DeviceID": "ATM-0002",
                "Channel": "ATM",
                "hour": 0,
                "day_of_week": 6,
                "month": 12,
            },
            "FRAUD-LIKE: abnormal high ATM withdrawal at midnight in high-risk city",
        ),
        (
            {
                "Amount": 150000.0,
                "TransactionType": "TRANSFER",
                "Location": "New York",
                "DeviceID": "WEB-CHROME",
                "Channel": "Online Purchase",
                "hour": 23,
                "day_of_week": 0,
                "month": 1,
            },
            "FRAUD-LIKE: cross-border high-value online transfer/purchase",
        ),
    ]

    for sample, label in examples:
        score_example(sample, label)


Generating synthetic dataset: 500000 rows (~50% fraud)

=== Synthetic dataset summary ===
Label
0    250000
1    250000
Name: count, dtype: int64
Fraud rate: 0.5
Shape: (500000, 16)
Columns: ['Amount', 'TransactionType', 'Location', 'DeviceID', 'Channel', 'hour', 'day_of_week', 'month', 'ip_risk_score', 'vpn_detected', 'new_device', 'new_beneficiary', 'txns_last_1h', 'home_country', 'txn_country', 'Label']

Train shape: (375000, 8) Test shape: (125000, 8)

Fitting LightGBM supervised pipeline...
[LightGBM] [Info] Number of positive: 187500, number of negative: 187500
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014968 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 365
[LightGBM] [Info] Number of data points in the train set: 375000, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.0



Prob range: 0.030671159072950493 ‚Üí 0.9994412219633907
ROC AUC: 0.9237044768000001
PR AUC : 0.9359408446580543

=== Optimal LightGBM Threshold ===
Threshold: 0.4303858140504603
Precision: 0.8533547153918424
Recall   : 0.842896
F1 score : 0.8480931141063264

Classification report (LightGBM @ optimal threshold):
              precision    recall  f1-score   support

           0       0.84      0.86      0.85     62500
           1       0.85      0.84      0.85     62500

    accuracy                           0.85    125000
   macro avg       0.85      0.85      0.85    125000
weighted avg       0.85      0.85      0.85    125000

Confusion matrix (LightGBM):
[[53447  9053]
 [ 9819 52681]]

Training IsolationForest (anomaly model)...
Fitting IsolationForest pipeline...
Done.

Scoring anomaly on test set...
Anomaly score range: -0.07975537051591236 ‚Üí 0.09058830480506286

=== Optimal IsolationForest Threshold ===
Threshold: -0.03897585500238099
Precision: 0.6387116585349211
Recall   :



In [15]:
# =========================================================
# Cell 11 (FINAL FIXED): Clean example inference
# =========================================================

print("Reloading models for inference test...")

# Suppress LightGBM feature-name warnings
import lightgbm
lightgbm.LGBMClassifier._warn_for_feature_names = lambda *args, **kwargs: None

sup_loaded = joblib.load(supervised_path)
iforest_loaded = joblib.load(iforest_path)
with open(thresholds_path, "r") as f:
    th_loaded = json.load(f)

th_sup = float(th_loaded["supervised_threshold"])
th_if = float(th_loaded["iforest_threshold"])

# ----------------------------------------------
# NEW: Conservative IForest threshold
# Reduce false positives drastically
# ----------------------------------------------

# We drop IForest to only flag ~2% worst anomalies
# (much safer for real-time fraud systems)

print("\nRecalibrating IsolationForest threshold...")
# Score anomaly only once
example_anom = -iforest_loaded.decision_function(X_test[:50000])
new_if_threshold = np.percentile(example_anom, 98)   # top 2%

print("Original threshold:", th_if)
print("Revised safer threshold:", new_if_threshold)

th_if = float(new_if_threshold)


# ----------------------------------------------
# Safe scoring function
# ----------------------------------------------
def score_example(sample: dict, label: str = "Unknown"):
    """Safe inference guaranteed without warnings or feature mismatch."""

    base = {
        "Amount": 0.0,
        "TransactionType": "PAYMENT",
        "Location": "Unknown",
        "DeviceID": "Unknown",
        "Channel": "Other",
        "hour": 12,
        "day_of_week": 0,
        "month": 1,
    }
    base.update(sample)

    df = pd.DataFrame([base])
    df = df.reindex(columns=sup_loaded.feature_names_in_)

    # ML prediction
    fraud_prob = float(sup_loaded.predict_proba(df)[0, 1])

    # Anomaly score
    anom_score = float(-iforest_loaded.decision_function(df)[0])

    # Flags
    flag_sup = fraud_prob >= th_sup
    flag_if = anom_score >= th_if
    flag_combined = flag_sup or flag_if

    print("\n============================")
    print(f"Example: {label}")
    print("============================")
    print("Input:", base)
    print(f"Fraud Probability (ML): {fraud_prob:.6f}")
    print(f"Anomaly Score (IForest): {anom_score:.6f}")
    print(f"Supervised Flag (>= {th_sup:.4f}): {flag_sup}")
    print(f"IForest Flag (>= {th_if:.4f}): {flag_if}")
    print(f"Combined Fraud Flag (OR): {flag_combined}")


# ----------------------------------------------
# Test examples
# ----------------------------------------------

examples = [
    (
        {
            "Amount": 800.0,
            "TransactionType": "PAYMENT",
            "Location": "Mumbai",
            "DeviceID": "MOB-ANDROID",
            "Channel": "Mobile App",
            "hour": 13,
            "day_of_week": 2,
            "month": 5,
        },
        "GOOD: small mobile-app payment (daytime)",
    ),
    (
        {
            "Amount": 1200.0,
            "TransactionType": "PAYMENT",
            "Location": "Bangalore",
            "DeviceID": "POS-TERM-001",
            "Channel": "POS",
            "hour": 19,
            "day_of_week": 5,
            "month": 8,
        },
        "GOOD: POS grocery shopping",
    ),
    (
        {
            "Amount": 6000.0,
            "TransactionType": "WITHDRAWAL",
            "Location": "Delhi",
            "DeviceID": "ATM-0001",
            "Channel": "ATM",
            "hour": 11,
            "day_of_week": 1,
            "month": 3,
        },
        "GOOD: ATM weekday withdrawal",
    ),
    (
        {
            "Amount": 450000.0,
            "TransactionType": "TRANSFER",
            "Location": "Mumbai",
            "DeviceID": "WEB-CHROME",
            "Channel": "NetBanking",
            "hour": 2,
            "day_of_week": 1,
            "month": 4,
        },
        "FRAUD: large night transfer",
    ),
]

# Run examples
for sample, label in examples:
    score_example(sample, label)


Reloading models for inference test...

Recalibrating IsolationForest threshold...
Original threshold: -0.03897585500238099
Revised safer threshold: 0.04069501242896777

Example: GOOD: small mobile-app payment (daytime)
Input: {'Amount': 800.0, 'TransactionType': 'PAYMENT', 'Location': 'Mumbai', 'DeviceID': 'MOB-ANDROID', 'Channel': 'Mobile App', 'hour': 13, 'day_of_week': 2, 'month': 5}
Fraud Probability (ML): 0.086325
Anomaly Score (IForest): -0.058364
Supervised Flag (>= 0.4304): False
IForest Flag (>= 0.0407): False
Combined Fraud Flag (OR): False

Example: GOOD: POS grocery shopping
Input: {'Amount': 1200.0, 'TransactionType': 'PAYMENT', 'Location': 'Bangalore', 'DeviceID': 'POS-TERM-001', 'Channel': 'POS', 'hour': 19, 'day_of_week': 5, 'month': 8}
Fraud Probability (ML): 0.054230
Anomaly Score (IForest): -0.032143
Supervised Flag (>= 0.4304): False
IForest Flag (>= 0.0407): False
Combined Fraud Flag (OR): False





Example: GOOD: ATM weekday withdrawal
Input: {'Amount': 6000.0, 'TransactionType': 'WITHDRAWAL', 'Location': 'Delhi', 'DeviceID': 'ATM-0001', 'Channel': 'ATM', 'hour': 11, 'day_of_week': 1, 'month': 3}
Fraud Probability (ML): 0.228215
Anomaly Score (IForest): -0.021463
Supervised Flag (>= 0.4304): False
IForest Flag (>= 0.0407): False
Combined Fraud Flag (OR): False

Example: FRAUD: large night transfer
Input: {'Amount': 450000.0, 'TransactionType': 'TRANSFER', 'Location': 'Mumbai', 'DeviceID': 'WEB-CHROME', 'Channel': 'NetBanking', 'hour': 2, 'day_of_week': 1, 'month': 4}
Fraud Probability (ML): 0.999122
Anomaly Score (IForest): 0.040975
Supervised Flag (>= 0.4304): True
IForest Flag (>= 0.0407): True
Combined Fraud Flag (OR): True


