# In [11]:
# Credit Card Fraud Detection Predictive Models
# Synthetic Data Generator and PCA Transformation
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA

def generate_synthetic_demo(
    n_samples: int = 100_000,
    random_state: int = 42,
    raw_out: str = "SYNTH_RAW_DATA.csv",
    pca_out: str = "SYNTH_PCA_DATA.csv",
    fraud_intercept: float = 0.0,
) -> None:
    """Generate a synthetic card-transaction dataset and a PCA-projected copy.

    The pipeline runs in eight stages:

    1. Draw latent factors and sorted timestamps in ``[0, 172_800)``.
    2. Build correlated numeric features from the latent factors.
    3. Sample categorical features from fixed probability tables.
    4. Combine everything and add a ``CardPresent`` flag.
    5. Blank out a small random fraction of selected columns.
    6. Add low-amplitude Gaussian noise to the non-missing numeric values.
    7. Draw a binary ``Class`` label from a logistic model over
       threshold/indicator features.
    8. Mean-impute, one-hot encode, standardise, project onto 28 principal
       components, and write ``V1..V28`` plus ``Time``, ``Amount``, ``Class``.

    Args:
        n_samples: Number of rows to generate.
        random_state: Seed for the data generator and for the PCA estimator.
        raw_out: Path of the CSV that receives the raw (pre-PCA) table.
        pca_out: Path of the CSV that receives the PCA-transformed table.
        fraud_intercept: Constant added to the fraud logit. Every other term
            of the logit is a non-negative indicator weight (plus tiny
            noise), so with the default ``0.0`` each row's fraud probability
            is about 0.5 or higher. Pass a negative value to obtain a rarer
            positive class. The default keeps the historical output.

    Returns:
        None. Both tables are written to disk and a line is printed per file.
    """
    # A private legacy generator draws the same stream that
    # ``np.random.seed(random_state)`` + the module-level functions would,
    # but leaves the process-wide NumPy RNG untouched for the caller.
    rng = np.random.RandomState(random_state)

    # 1) COMMON SETUP
    n_latent = 5
    latent = rng.normal(0, 1, size=(n_samples, n_latent))
    times = rng.choice(np.arange(0, 172_800, dtype=int),
                       size=n_samples, replace=True)
    times.sort()
    hour = (times // 3600) % 24

    # 2) NUMERIC FEATURES WITH CORRELATION
    numeric_cols = [
        "Amount", "Age", "Tenure", "MerchantRisk", "DeviceTrust",
        "Txn24h", "Avg30d", "IPReputation", "Latitude", "Longitude", "DistFromHome"
    ]
    # Shared latent factors (random loadings) induce correlation between
    # columns; the extra N(0, 0.5) term is per-feature idiosyncratic noise.
    loadings = rng.uniform(-1, 1, size=(n_latent, len(numeric_cols)))
    num_data = latent.dot(loadings) + rng.normal(
        0, 0.5, size=(n_samples, len(numeric_cols))
    )
    df_num = pd.DataFrame(num_data, columns=numeric_cols)
    # Reshape selected columns from the raw factor scale.
    df_num["Amount"]    = np.exp(df_num["Amount"] * 0.5 + 3.5)
    df_num["Age"]       = np.clip(df_num["Age"] * 5 + 40, 18, 90)
    df_num["Tenure"]    = np.abs(df_num["Tenure"] * 10).astype(int)
    df_num["Txn24h"]    = np.abs(df_num["Txn24h"].round()).astype(int)
    df_num["Latitude"]  = np.clip(37 + df_num["Latitude"] * 5, 25, 50)
    df_num["Longitude"] = np.clip(-95 + df_num["Longitude"] * 10, -125, -67)

    # 3) CATEGORICAL FEATURES
    # Each entry maps a column name to (domain, sampling probabilities).
    cats = {
        "TxType":     (["purchase", "withdrawal", "transfer", "payment"], [0.7, 0.1, 0.1, 0.1]),
        "DeviceType": (["mobile", "desktop", "ATM", "POS", "web"],        [0.5, 0.2, 0.05, 0.2, 0.05]),
        "MerchantCat": ([
            "grocery", "electronics", "travel", "entertainment", "gas",
            "restaurant", "utilities", "clothing"], [1 / 8] * 8),
        "Channel":    (["online", "in-store", "contactless", "chip"],     [0.4, 0.4, 0.1, 0.1]),
    }
    df_cat = pd.DataFrame({
        col: rng.choice(domain, size=n_samples, p=probs)
        for col, (domain, probs) in cats.items()
    })

    # 4) COMBINE & ADD TIME/CardPresent
    raw = pd.concat([
        pd.Series(times, name="Time"),
        df_num.reset_index(drop=True),
        pd.Series(hour, name="Hour"),
        df_cat,
    ], axis=1)
    raw["CardPresent"] = (rng.rand(n_samples) < 0.7).astype(int)

    # 5) RE-INTRODUCE MISSINGNESS
    missing_rates = {
        "Amount": 0.01, "Age": 0.01, "MerchantRisk": 0.01, "DeviceTrust": 0.01,
        "Txn24h": 0.01, "Avg30d": 0.01, "IPReputation": 0.01,
        "Latitude": 0.005, "Longitude": 0.005, "DistFromHome": 0.01,
        "TxType": 0.005, "DeviceType": 0.005, "MerchantCat": 0.01, "Channel": 0.005,
    }
    for col, rate in missing_rates.items():
        # An integer column cannot hold NaN. Cast explicitly instead of
        # relying on pandas' silent upcast on assignment, which is deprecated.
        if pd.api.types.is_integer_dtype(raw[col]):
            raw[col] = raw[col].astype(float)
        mask = rng.rand(n_samples) < rate
        raw.loc[mask, col] = np.nan

    # 6) ADD MEASUREMENT NOISE (reduced)
    noise_cols = [
        "Amount", "Age", "MerchantRisk", "DeviceTrust",
        "Txn24h", "Avg30d", "IPReputation",
        "Latitude", "Longitude", "DistFromHome"
    ]
    for c in noise_cols:
        present = raw[c].notna()
        n_present = int(present.sum())
        if n_present == 0:
            continue
        # Noise scale is 0.5% of the column's own spread.
        sigma = raw.loc[present, c].std() * 0.005
        raw.loc[present, c] += rng.normal(0, sigma, size=n_present)

    # 7) SIMULATE FRAUD LABEL - STRONG DEPENDENCE ON NUMERICS & CATEGORICALS
    # Comparisons against NaN evaluate to False, so a missing value simply
    # contributes nothing to the logit.
    fraud_signal = fraud_intercept + (
        9.0 * (raw["Amount"] > 2000).astype(float) +
        2.0 * (raw["CardPresent"] == 0).astype(float) +
        2.0 * (raw["MerchantRisk"] > 3.0).astype(float) +
        1.5 * (raw["DeviceType"] == "web").astype(float) +
        1.5 * (raw["Channel"] == "online").astype(float) +
        1.0 * (raw["TxType"] == "withdrawal").astype(float) +
        1.0 * (raw["Hour"].isin([0, 1, 2, 3, 4, 23])).astype(float) +
        1.0 * (raw["Txn24h"] > 10).astype(float) +
        1.0 * (raw["IPReputation"] > 2.0).astype(float)
    )
    # Add tiny noise
    fraud_signal += rng.normal(0, 0.01, size=n_samples)
    fraud_prob = 1 / (1 + np.exp(-fraud_signal))
    raw["Class"] = (rng.rand(n_samples) < fraud_prob).astype(int)

    raw.to_csv(raw_out, index=False)
    print(f"• wrote raw → {raw_out}")

    # 8) PCA TRANSFORM (NUMERIC + OHE CATEGORICALS)
    n_components = 28
    cat_cols = ["TxType", "DeviceType", "MerchantCat", "Channel", "CardPresent"]
    # Numeric gaps are mean-imputed; categorical gaps become their own level.
    num = raw[numeric_cols].fillna(raw[numeric_cols].mean())
    cat_frame = raw[cat_cols].fillna("MISSING")
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    cat_mat = ohe.fit_transform(cat_frame)
    features = np.concatenate([num.values, cat_mat], axis=1)
    scaler = StandardScaler()
    scaled = scaler.fit_transform(features)
    pca = PCA(n_components=n_components, random_state=random_state)
    components = pca.fit_transform(scaled)
    pca_df = pd.DataFrame(
        components, columns=[f"V{i+1}" for i in range(n_components)]
    )
    # Time, Amount and Class are carried over untransformed.
    pca_df["Time"]   = raw["Time"].astype(int)
    pca_df["Amount"] = raw["Amount"]
    pca_df["Class"]  = raw["Class"]
    pca_df.to_csv(pca_out, index=False)
    print(f"• wrote PCA → {pca_out}")

if __name__ == "__main__":
    # Script entry point: build the full-size demo dataset with the default
    # seed and write both CSV files into the current working directory.
    demo_settings = {
        "n_samples": 478_324,
        "raw_out": "SYNTH_RAW_DATA.csv",
        "pca_out": "SYNTH_PCA_DATA.csv",
    }
    generate_synthetic_demo(**demo_settings)

# Output:
# • wrote raw → SYNTH_RAW_DATA.csv
# • wrote PCA → SYNTH_PCA_DATA.csv
