# In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA

def generate_synthetic_demo(
    n_samples: int = 100_000,
    random_state: int = 42,
    raw_out: str = "SYNTH_RAW_DATA.csv",
    pca_out: str = "SYNTH_PCA_DATA.csv",
    n_components: int = 28,
) -> None:
    """Generate a synthetic fraud-style dataset and a PCA-projected copy of it.

    The raw table is built from a handful of latent Gaussian factors that
    drive correlated numeric features, plus independently sampled categorical
    features, a sorted ``Time`` column, random missing values, small
    measurement noise and a binary ``Class`` label.  It is written to
    *raw_out*.  The same table is then mean-imputed, one-hot encoded,
    standardised and projected with PCA; the components (``V1`` ... ``Vk``)
    together with ``Time``, ``Amount`` and ``Class`` are written to *pca_out*.
    ``Amount`` is copied from the raw table, so it keeps its missing values.

    Args:
        n_samples: Number of rows to generate; must be at least 1.
        random_state: Seed for the private NumPy ``RandomState`` used for all
            sampling, and for the PCA estimator.
        raw_out: Path of the CSV receiving the raw table.
        pca_out: Path of the CSV receiving the PCA-projected table.
        n_components: Forwarded to ``PCA(n_components=...)``.  The default of
            28 matches the previously hard-coded value.

    Raises:
        ValueError: If *n_samples* is smaller than 1.  Constraints on
            *n_components* are enforced by ``PCA`` itself, which runs only
            after the raw CSV has already been written.
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be at least 1, got {n_samples!r}")

    # A private legacy RandomState is used instead of np.random.seed() so the
    # caller's global NumPy RNG is left untouched.  Every draw below happens
    # in a fixed order; do not reorder the sampling calls, or the generated
    # data for a given seed will change.
    rng = np.random.RandomState(random_state)

    # ─── 1) COMMON SETUP ───────────────────────────────────────────────
    # A few latent factors (e.g. underlying fraud drivers)
    n_latent = 5
    Z = rng.normal(0, 1, size=(n_samples, n_latent))

    # Time feature in seconds over a 48-hour window (172_800 s), sorted so it
    # still makes sense as a timestamp; Hour is the hour-of-day derived from it.
    times = rng.choice(np.arange(0, 172_800, dtype=int),
                       size=n_samples, replace=True)
    times.sort()
    Hour = (times // 3600) % 24

    # ─── 2) NUMERIC FEATURES WITH CORRELATION ─────────────────────────
    numeric_cols = [
        "Amount", "Age", "Tenure", "MerchantRisk", "DeviceTrust",
        "Txn24h", "Avg30d", "IPReputation", "Latitude", "Longitude", "DistFromHome"
    ]
    # Random loadings from latent factors → numeric features
    loadings = rng.uniform(-1, 1, size=(n_latent, len(numeric_cols)))
    # Linear mix of the latent factors plus independent Gaussian noise
    num_data = Z.dot(loadings) + rng.normal(0, 0.5, size=(n_samples, len(numeric_cols)))

    # Clip or transform to realistic ranges
    df_num = pd.DataFrame(num_data, columns=numeric_cols)
    df_num["Amount"]    = np.exp(df_num["Amount"] * 0.5 + 3.5)       # log-normal style
    df_num["Age"]       = np.clip(df_num["Age"] * 5 + 40, 18, 90)
    df_num["Tenure"]    = np.abs(df_num["Tenure"] * 10).astype(int)
    # Txn24h holds whole-number counts but is deliberately kept as float:
    # NaNs and float noise are written into it below, and assigning those
    # into an integer column through .loc is a deprecated silent upcast in
    # recent pandas.
    df_num["Txn24h"]    = np.abs(df_num["Txn24h"].round())
    df_num["Latitude"]  = np.clip(37 + df_num["Latitude"]*5, 25, 50)
    df_num["Longitude"] = np.clip(-95 + df_num["Longitude"]*10, -125, -67)
    # leave MerchantRisk, DeviceTrust, Avg30d, IPReputation, DistFromHome continuous

    # ─── 3) CATEGORICAL FEATURES ───────────────────────────────────────
    # Each entry maps a column name to (category values, sampling probabilities).
    cats = {
        "TxType":     (["purchase","withdrawal","transfer","payment"], [0.7,0.1,0.1,0.1]),
        "DeviceType": (["mobile","desktop","ATM","POS","web"],         [0.5,0.2,0.05,0.2,0.05]),
        "MerchantCat":(
            ["grocery","electronics","travel","entertainment","gas",
             "restaurant","utilities","clothing"],
            [1/8]*8
        ),
        "Channel":    (["online","in-store","contactless","chip"],      [0.4,0.4,0.1,0.1]),
    }
    cat_names = list(cats)
    df_cat = pd.DataFrame({
        col: rng.choice(domain, size=n_samples, p=probs)
        for col, (domain, probs) in cats.items()
    })

    # ─── 4) COMBINE & ADD TIME/CardPresent ────────────────────────────
    raw = pd.concat([
        pd.Series(times, name="Time"),
        df_num.reset_index(drop=True),
        pd.Series(Hour, name="Hour"),
        df_cat,
    ], axis=1)
    raw["CardPresent"] = (rng.rand(n_samples) < 0.7).astype(int)

    # ─── 5) RE-INTRODUCE MISSINGNESS ────────────────────────────────
    # Per-column probability that any given cell is blanked out.
    missing_rates = {
        "Amount": 0.01, "Age": 0.01, "MerchantRisk": 0.02, "DeviceTrust": 0.02,
        "Txn24h": 0.01, "Avg30d": 0.01, "IPReputation": 0.02,
        "Latitude": 0.005, "Longitude": 0.005, "DistFromHome": 0.01,
        "TxType": 0.005, "DeviceType": 0.005, "MerchantCat": 0.01, "Channel": 0.005,
    }
    for col, rate in missing_rates.items():
        mask = rng.rand(n_samples) < rate
        raw.loc[mask, col] = np.nan

    # ─── 6) ADD MEASUREMENT NOISE ────────────────────────────────────
    noise_cols = [
        "Amount","Age","MerchantRisk","DeviceTrust",
        "Txn24h","Avg30d","IPReputation",
        "Latitude","Longitude","DistFromHome"
    ]
    for c in noise_cols:
        present = raw[c].notna()
        if not present.any():
            continue
        observed = raw.loc[present, c]
        # Gaussian jitter with a standard deviation of 2% of the column's own
        # spread, applied only to the cells that are not missing.
        sigma = observed.std() * 0.02
        raw.loc[present, c] = observed + rng.normal(0, sigma, size=len(observed))

    # ─── 7) SIMULATE FRAUD LABEL ON LATENT FACTORS ────────────────────
    # Logistic score driven by latent factors 0 and 1, shifted upwards for
    # card-not-present rows.
    score = 0.8 * Z[:, 0] - 0.6 * Z[:, 1] + 1.2 * (raw["CardPresent"] == 0)
    prob  = 1 / (1 + np.exp(-score))
    raw["Class"] = (rng.rand(n_samples) < prob).astype(int)

    raw.to_csv(raw_out, index=False)
    print(f"• wrote raw → {raw_out}")

    # ─── 8) PCA TRANSFORM ─────────────────────────────────────────────
    # Missing categories become their own "MISSING" level before encoding.
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    cat_mat = ohe.fit_transform(raw[cat_names].fillna("MISSING"))
    cat_cols = ohe.get_feature_names_out(cat_names)

    # Everything that is neither categorical nor the label is treated as
    # numeric (this includes Time, Hour and CardPresent) and mean-imputed.
    num = raw.drop(columns=cat_names + ["Class"])
    num = num.fillna(num.mean())

    features = pd.concat([
        num.reset_index(drop=True),
        pd.DataFrame(cat_mat, columns=cat_cols)
    ], axis=1)

    Xs = StandardScaler().fit_transform(features)
    pca = PCA(n_components=n_components, random_state=random_state)
    PCs = pca.fit_transform(Xs)

    # Name the columns after the number of components actually produced.
    pca_df = pd.DataFrame(PCs, columns=[f"V{i+1}" for i in range(PCs.shape[1])])
    pca_df["Time"]   = raw["Time"].astype(int)
    pca_df["Amount"] = raw["Amount"]
    pca_df["Class"]  = raw["Class"]
    pca_df.to_csv(pca_out, index=False)
    print(f"• wrote PCA → {pca_out}")


if __name__ == "__main__":
    # Script entry point: generate the full-size demo dataset.  The seed is
    # left at the function's default; both output paths are relative.
    demo_kwargs = {
        "n_samples": 478_324,
        "raw_out": "SYNTH_RAW_DATA.csv",
        "pca_out": "SYNTH_PCA_DATA.csv",
    }
    generate_synthetic_demo(**demo_kwargs)
