# In [None]:  (Jupyter cell prompt left over from a notebook export)
# improved_synth_demo.py

import numpy as np
import pandas as pd

def generate_synthetic_demo(n_samples=100_000,
                            random_state=42,
                            raw_out="raw_cc_transactions.csv",
                            *,
                            intercept=2.0,
                            missing_rate=0.005):
    """Generate a synthetic card-transaction dataset with a fraud label.

    Seven numeric and four categorical features are sampled independently,
    a rule-based fraud score is computed from them, the score is passed
    through a logistic function, and a Bernoulli ``Class`` label is drawn
    from the resulting probability. A small fraction of three columns is
    then blanked out to simulate missing data.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    random_state : int or None
        Seed for a private legacy ``RandomState``. The same seed yields the
        same data as seeding the global NumPy generator would, but the
        global generator is left untouched.
    raw_out : str or None
        Destination CSV path. When ``None`` nothing is written or printed.
    intercept : float, keyword-only
        Value subtracted from the fraud score before the logistic
        transform; larger values make fraud rarer.
    missing_rate : float, keyword-only
        Per-cell probability of replacing a value with NaN in the
        ``DeviceTrust``, ``Txn24h`` and ``Channel`` columns.

    Returns
    -------
    pandas.DataFrame
        The generated data (features plus ``Class``), identical to what is
        written to ``raw_out``.

    Notes
    -----
    Every rule weight below is at least 8, so with the default intercept
    of 2.0 a row that trips even one rule has a fraud probability above
    sigmoid(6) ~ 0.9975, and a row that trips none sits near
    sigmoid(-2) ~ 0.12. The realised fraud rate is therefore far higher
    than a few percent; it is printed after the file is written. Raise
    ``intercept`` to obtain a rarer positive class.
    """
    # Private generator: same Mersenne Twister stream as np.random.seed(...)
    # followed by np.random.* calls, without clobbering global RNG state.
    rng = np.random.RandomState(random_state)

    # NOTE: the order of the draws below fixes the generated values for a
    # given seed; do not reorder them.

    # 1) Numeric features
    amount = rng.exponential(scale=500, size=n_samples)      # skewed dollars
    txn_24h = rng.poisson(lam=1.5, size=n_samples)           # transactions today
    avg_30d = rng.poisson(lam=20, size=n_samples)            # avg per 30 days
    tenure = rng.poisson(lam=12, size=n_samples)             # months with card
    ip_reputation = rng.beta(a=2, b=5, size=n_samples) * 10  # 0-10 risk score
    merchant_risk = rng.uniform(0, 10, size=n_samples)       # merchant risk 0-10
    device_trust = rng.beta(a=2, b=2, size=n_samples) * 10   # device trust 0-10

    # 2) Categorical features
    tx_type = rng.choice(
        ["purchase", "withdrawal", "transfer", "payment"],
        p=[0.75, 0.10, 0.10, 0.05], size=n_samples,
    )
    device_type = rng.choice(
        ["mobile", "desktop", "ATM", "POS", "web"],
        p=[0.5, 0.2, 0.05, 0.2, 0.05], size=n_samples,
    )
    channel = rng.choice(
        ["online", "in-store", "contactless", "chip"],
        p=[0.5, 0.3, 0.1, 0.1], size=n_samples,
    )
    card_present = rng.choice([0, 1], p=[0.3, 0.7], size=n_samples)

    # 3) Assemble DataFrame
    df = pd.DataFrame({
        "Amount":       amount,
        "Txn24h":       txn_24h,
        "Avg30d":       avg_30d,
        "Tenure":       tenure,
        "IPReputation": ip_reputation,
        "MerchantRisk": merchant_risk,
        "DeviceTrust":  device_trust,
        "TxType":       tx_type,
        "DeviceType":   device_type,
        "Channel":      channel,
        "CardPresent":  card_present,
    })

    # 4) Rule-based fraud score: each triggered rule adds its weight.
    sig = (
        (df["Amount"] > 2000).astype(float) * 25
        + (df["Txn24h"] > 2).astype(float) * 15
        + (df["Avg30d"] > 40).astype(float) * 10
        + (df["Tenure"] < 3).astype(float) * 10
        + (df["IPReputation"] > 6).astype(float) * 20
        + (df["MerchantRisk"] > 8).astype(float) * 15
        + (df["DeviceTrust"] < 4).astype(float) * 12
        + (df["CardPresent"] == 0).astype(float) * 30
        + (df["Channel"] == "online").astype(float) * 10
        + (df["DeviceType"] == "web").astype(float) * 8
    )
    # Small Gaussian jitter so the score is not perfectly discrete.
    sig += rng.normal(0, 0.1, size=n_samples)

    # 5) Logistic transform, then a Bernoulli draw per row.
    prob = 1 / (1 + np.exp(-(sig - intercept)))
    df["Class"] = (rng.rand(n_samples) < prob).astype(int)

    # 6) Inject missing values; an independent mask is drawn per column.
    #    The label was computed above, so it reflects the un-blanked values.
    for col in ["DeviceTrust", "Txn24h", "Channel"]:
        mask = rng.rand(n_samples) < missing_rate
        df.loc[mask, col] = np.nan

    # 7) Save and report the realised fraud rate.
    if raw_out is not None:
        df.to_csv(raw_out, index=False)
        print(f"Wrote → {raw_out} (fraud rate: {df['Class'].mean():.2%})")

    return df

if __name__ == "__main__":
    # Script entry point: build the demo dataset with the stock settings
    # (100k rows, seed 42) and write it to the working directory.
    demo_settings = {
        "n_samples": 100_000,
        "random_state": 42,
        "raw_out": "raw_cc_transactions.csv",
    }
    generate_synthetic_demo(**demo_settings)


# Recorded cell output (stale: the current print statement also reports the fraud rate):
# Wrote synthetic data → raw_cc_transactions.csv
