## Creating Synthetic Data
#### for Fraud Detection Training

In [1]:
import numpy as np, pandas as pd
from datetime import datetime, timedelta
rng = np.random.default_rng(7)


MERCHANTS = [
    {"name":"Uber", "channel_probs":{"online":0.95,"in_store":0.05,"other":0.0},  "amt_shape":2.0,"amt_scale":4.0, "hour_bias":(18, 23)},
    {"name":"Starbucks", "channel_probs":{"online":0.10,"in_store":0.90,"other":0.0}, "amt_shape":2.5,"amt_scale":3.5, "hour_bias":(6, 10)},
    {"name":"Target", "channel_probs":{"online":0.40,"in_store":0.60,"other":0.0}, "amt_shape":3.0,"amt_scale":12.0,"hour_bias":(12, 19)},
    {"name":"Walmart", "channel_probs":{"online":0.30,"in_store":0.70,"other":0.0}, "amt_shape":3.0,"amt_scale":10.0,"hour_bias":(11, 20)},
    {"name":"Comcast", "channel_probs":{"online":0.85,"in_store":0.00,"other":0.15},"amt_shape":2.0,"amt_scale":50.0,"hour_bias":(8, 18)},
    {"name":"Airbnb", "channel_probs":{"online":0.99,"in_store":0.00,"other":0.01},"amt_shape":1.5,"amt_scale":150.0,"hour_bias":(9, 22)},
    {"name":"Delta", "channel_probs":{"online":0.99,"in_store":0.00,"other":0.01},"amt_shape":1.6,"amt_scale":180.0,"hour_bias":(8, 21)},
    {"name":"Apple", "channel_probs":{"online":0.80,"in_store":0.20,"other":0.0}, "amt_shape":2.0,"amt_scale":60.0,"hour_bias":(10, 22)},
    {"name":"Spotify", "channel_probs":{"online":1.00,"in_store":0.00,"other":0.0}, "amt_shape":1.2,"amt_scale":15.0,"hour_bias":(0, 23)},
    {"name":"DoorDash", "channel_probs":{"online":0.95,"in_store":0.05,"other":0.0}, "amt_shape":2.1, "amt_scale":12.0, "hour_bias":(17, 22)},
    {"name":"CVS", "channel_probs":{"online":0.15,"in_store":0.85,"other":0.0}, "amt_shape":2.6, "amt_scale":9.0, "hour_bias":(9, 20)},
    {"name":"Home Depot", "channel_probs":{"online":0.35,"in_store":0.65,"other":0.0}, "amt_shape":2.8, "amt_scale":35.0, "hour_bias":(10, 19)},
    {"name":"Lyft", "channel_probs":{"online":0.98,"in_store":0.02,"other":0.0}, "amt_shape":1.9, "amt_scale":5.0, "hour_bias":(18, 23)},
    {"name":"Netflix", "channel_probs":{"online":1.00,"in_store":0.00,"other":0.0}, "amt_shape":1.2, "amt_scale":16.0, "hour_bias":(0, 23)},
    {"name":"Gas Station", "channel_probs":{"online":0.05, "in_store":0.90, "other":0.05}, "amt_shape":2.5, "amt_scale":25.0, "hour_bias":(6, 22)}
]


In [2]:
# creating 150 accounts with 120 - 320 transaction across the span of a few months
NUMBER_OF_ACCOUNTS = 150

TXNS_PER_ACCOUNT = rng.integers(120, 320, size=NUMBER_OF_ACCOUNTS)
start_date = datetime(2025, 6, 1) ; end_date = datetime(2025, 10, 1)


rows = []

In [3]:
def sample_channel(probs):
    keys = list(probs.keys())
    vals = [probs[k] for k in keys]
    vals = np.array(vals, dtype=float)
    total = vals.sum()
    if total == 0:
        vals = np.ones_like(vals)
        total = vals.sum()

    # vals need to all add up to 1
    vals = vals / total
    try:
        return rng.choice(keys, p=vals)
    except Exception as e: 
        return "online"


for account_id in range(1, NUMBER_OF_ACCOUNTS + 1):

    # est some fav merchants and giving them a weight to show the users preference
    k = int(rng.integers(5, 10))
    fav_idx = rng.choice(len(MERCHANTS), size=k, replace=False if k <= len(MERCHANTS) else True)
    weights = rng.dirichlet(np.ones(k))

    # determining if this user typical hours
    spend_scale = float(rng.uniform(0.6, 1.8))
    hour_shift = int(rng.integers(-2, 3))

    number_txn = int(TXNS_PER_ACCOUNT[account_id - 1])
    days_span = (end_date - start_date).days

    for i in range(number_txn):
        m_i = int(rng.choice(range(k), p=weights))
        m = MERCHANTS[int(fav_idx[m_i])]

        # spreading txns across time period
        d_offset = int(rng.integers(0, days_span))
        d_day = start_date + timedelta(days=d_offset)

        # using the merchants hours and the users personal preference
        hb = m.get("hour_bias", (9, 21))
        center = (hb[0] + hb[1]) / 2.0 + hour_shift
        hour = int(np.clip(rng.normal(center, 2.5), 0, 23))
        minute = int(rng.integers(0, 60))  # add minutes so timestamps aren't all :00

        d = d_day + timedelta(hours=hour, minutes=minute)
        auth_back = int(rng.integers(0, 3))

        # give authorized a random earlier hour too
        auth_hour = int(np.clip(hour - int(rng.integers(0, 4)), 0, 23))
        auth_date = (d_day - timedelta(days=auth_back)) + timedelta(hours=auth_hour)

        # determining the amount with gamma distribution
        shape = float(m.get("amt_shape", 2.0)); scale = float(m.get("amt_scale", 10.0))
        amount_raw = rng.gamma(shape, scale)
        amount = float(np.round(max(1.0, amount_raw * spend_scale), 2))

        # setting channel probs
        channel = sample_channel(m.get("channel_probs", {"online": 1.0}))

        # pending status is evidently mostly something that occurs more wiht online payments
        if channel == "online":
            pending = bool(rng.random() < 0.08)
        else:
            pending = bool(rng.random() < 0.03)

        txn_id = f"txn_{account_id}_{i:05d}"
        plaid_id = f"plaid_{rng.integers(10**9, 10**10)}"

        # making rows mimic plaid data
        rows.append({
            "transaction_id": txn_id,
            "account_id": account_id,
            "user_id": int(rng.integers(1, 6)),
            "item_id": int(rng.integers(1, 30)),
            "plaid_transaction_id": plaid_id,
            "name": f"{m['name']} purchase",
            "merchant_name": m["name"],
            "amount": amount,
            "iso_currency_code": "USD",
            "date": d.isoformat(timespec="seconds"),
            "authorized_date": auth_date.isoformat(timespec="seconds"),
            "pending": pending,
            "payment_channel": channel,
            "removed": False,
            "is_anomaly": False,
        })

df = pd.DataFrame(rows)

## Injecting anomalies

In [4]:
# going to add 2% of anomalies for now
anomaly_rate = 0.02
NUMBER_ANOMALY = max(1, int(len(df) * anomaly_rate))

anomaly_idx = rng.choice(df.index, size=NUMBER_ANOMALY, replace=False)

# adding different types of anomalies
######
# high amount
new_high  = int(NUMBER_ANOMALY * 0.45)

# new or rare merchant with a large amount
new_merchant = int(NUMBER_ANOMALY * 0.30)

# odd hour spikes
new_odd = int(NUMBER_ANOMALY * 0.15)

# wrong or odd channel for merchant
new_flip = int(NUMBER_ANOMALY * 0.10)
rest = NUMBER_ANOMALY - (new_high + new_merchant + new_odd + new_flip)

# high amounts txns
idx = rng.choice(anomaly_idx, size=new_high, replace=False)
df.loc[idx, "amount"] = np.round(df.loc[idx, "amount"] * rng.uniform(25, 80, size=new_high), 2)
df.loc[idx, "is_anomaly"] = True

# big amount and a new merchant
idx_new = rng.choice(anomaly_idx, size=new_merchant, replace=False)
df.loc[idx_new, "merchant_name"] = rng.choice(
    ["UnfamiliarSeller", "WeirdSellerID", "SketchySellerHandle", "Merchant_99999", "Store_XYZ"],
    size=new_merchant
)
df.loc[idx_new, "payment_channel"] = "online"
df.loc[idx_new, "amount"] = np.round(df.loc[idx_new, "amount"] * rng.uniform(10, 30, size=new_merchant), 2)
df.loc[idx_new, "is_anomaly"] = True

# adding odd middle night hours
idx_odd = rng.choice(anomaly_idx, size=new_odd, replace=False)
df.loc[idx_odd, "payment_channel"] = "online"
df.loc[idx_odd, "name"] = "Night purchase"
dates_odd = pd.to_datetime(df.loc[idx_odd, "date"])
dates_odd = dates_odd.dt.floor("D") + pd.to_timedelta(rng.integers(2, 5, size=len(idx_odd)), unit="h")

df.loc[idx_odd, "date"] = dates_odd.dt.strftime("%Y-%m-%dT%H:%M:%S")
df.loc[idx_odd, "authorized_date"] = df.loc[idx_odd, "date"]
df.loc[idx_odd, "is_anomaly"] = True

# setting payment channel to opposite that it normally likely is
idx_flip = rng.choice(anomaly_idx, size=(new_flip + rest), replace=False)
df.loc[idx_flip, "payment_channel"] = "online"
df.loc[idx_flip, "amount"] = np.round(df.loc[idx_flip, "amount"] * rng.uniform(1.2, 2.0, size=(new_flip + rest)), 2)
df.loc[idx_flip, "is_anomaly"] = True



In [5]:
# saving to csv
df.to_csv("synthetic_plaid_transactions.csv", index=False)
print(df.shape, "rows; anomalies:", df["is_anomaly"].sum())

(33242, 15) rows; anomalies: 477
