<a href="https://colab.research.google.com/github/chandan-n-max/fraud_detection/blob/main/01_create_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import random
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

# Reproducibility: seed NumPy's global generator and the stdlib `random`
# module with the same value.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# -----------------------------
# CONFIGURATION
# -----------------------------
# N          -> number of transactions to generate
# fraud_rate -> share of rows to label as fraud (0.3% of 300,000 = 900 rows)
N, fraud_rate = 300_000, 0.003

# -----------------------------
# BASIC INFO
# -----------------------------
# Anchor the 180-day window to a fixed instant instead of datetime.now().
# With a wall-clock anchor every run yields different timestamps (and anything
# derived from them, such as the hour of day) even though the RNGs are seeded.
# This instant is consistent with the preview rows stored with the notebook;
# change it deliberately to shift the window.
end = datetime(2025, 10, 7, 11, 11, 41, 98143)
start = end - timedelta(days=180)

# The window length does not change per row, so compute it once.
window_seconds = (end - start).total_seconds()

# One uniform draw per transaction, truncated to whole seconds after `start`.
# A single rand(N) call consumes the seeded global generator in the same order
# as N successive scalar rand() calls, so the sampled offsets are unchanged.
offset_seconds = (np.random.rand(N) * window_seconds).astype(np.int64)
timestamps = [start + timedelta(seconds=s) for s in offset_seconds.tolist()]

num_customers = 60000

# For every transaction draw a customer number in [1, num_customers] and
# render it as "C" followed by six zero-padded digits.
customer_ids = []
for _ in range(N):
    customer_number = np.random.randint(1, num_customers + 1)
    customer_ids.append(f"C{customer_number:06d}")

# -----------------------------
# FEATURES
# -----------------------------
# NOTE: every draw below uses the global NumPy generator, so the statements
# are kept in their original order.


def _draw_category(weights):
    """Sample N labels from the global NumPy RNG given {label: probability}."""
    return np.random.choice(list(weights), N, p=list(weights.values()))


# Transaction amounts: log-normal draws rounded to two decimals.
amounts = np.random.lognormal(mean=6, sigma=1.2, size=N).round(2)

txn_types = _draw_category(
    {'POS': 0.4, 'ONLINE': 0.35, 'ATM': 0.15, 'TRANSFER': 0.1}
)
merchant_cats = _draw_category({
    'Grocery': 0.25,
    'Electronics': 0.15,
    'Travel': 0.1,
    'Dining': 0.15,
    'Utility': 0.12,
    'Mobile': 0.08,
    'Fashion': 0.1,
    'Healthcare': 0.05,
})
device_types = _draw_category({'Mobile': 0.6, 'Web': 0.35, 'ATM': 0.05})
countries = _draw_category(
    {'India': 0.75, 'USA': 0.1, 'UK': 0.06, 'UAE': 0.05, 'Singapore': 0.04}
)

# 1 for any country other than India, 0 otherwise.
not_india = countries != 'India'
is_international = not_india.astype(int)

# Transactions seen in the previous 24 hours: Poisson-distributed counts.
past24h_count = np.random.poisson(lam=1.2, size=N)

# 7-day average amount: normal noise centred on 60% of the overall mean amount
# (spread = 30% of the overall std), floored at 1 and rounded to two decimals.
avg7d_loc = amounts.mean() * 0.6
avg7d_scale = amounts.std() * 0.3
avg7d_raw = np.random.normal(loc=avg7d_loc, scale=avg7d_scale, size=N)
avg7d = np.round(np.maximum(1, avg7d_raw), 2)

# Hour of day per transaction, bucketed into four labels:
# [0, 6) Night, [6, 12) Morning, [12, 18) Afternoon, [18, 24) Evening.
hours = [ts.hour for ts in timestamps]
bucket_labels = np.array(['Night', 'Morning', 'Afternoon', 'Evening'])
bucket_index = np.searchsorted([6, 12, 18], hours, side='right')
time_of_day = bucket_labels[bucket_index]

# -----------------------------
# FRAUD SCORING LOGIC
# -----------------------------
# Baseline risk contributed by each merchant category.
base_risk = {
    'Grocery': 0.01,
    'Electronics': 0.03,
    'Travel': 0.05,
    'Dining': 0.02,
    'Utility': 0.005,
    'Mobile': 0.015,
    'Fashion': 0.02,
    'Healthcare': 0.008,
}
merchant_risk = np.array([base_risk[category] for category in merchant_cats])

# Additive score: constant floor + merchant risk + amount, international,
# night-time and high-activity (more than 5 transactions in 24h) terms.
# Terms are accumulated left to right, matching a single chained sum, so the
# floating-point result (and therefore the ranking) is unchanged.
extra_terms = (
    0.000001 * amounts,
    0.001 * is_international,
    0.001 * (np.asarray(time_of_day) == 'Night'),
    0.0003 * (past24h_count > 5),
)
prob = 0.0005 + merchant_risk
for term in extra_terms:
    prob = prob + term

# Rescale so the largest score equals 0.02.
prob = prob / prob.max() * 0.02

# -----------------------------
# LABELING FRAUD
# -----------------------------
# Number of rows to flag, clamped to [0, N] so that out-of-range settings of
# fraud_rate cannot produce a nonsensical slice below.
desired = max(0, min(N, int(N * fraud_rate)))
labels = np.zeros(N, dtype=int)

# Flag the `desired` highest-scoring rows. The slice starts at
# (size - desired) rather than -desired: when desired == 0, [-0:] is the same
# as [0:], which would select every row and label the whole dataset as fraud.
score_order = np.argsort(prob)
top_idx = score_order[score_order.size - desired:]
labels[top_idx] = 1

# -----------------------------
# BUILD FINAL DATAFRAME
# -----------------------------
# Sequential ids "TXN0000001", "TXN0000002", ... (seven zero-padded digits).
transaction_ids = [f"TXN{seq:07d}" for seq in range(1, N + 1)]

# (column name, values) pairs, listed in output column order.
column_items = [
    ("TransactionID", transaction_ids),
    ("CustomerID", customer_ids),
    ("Timestamp", timestamps),
    ("Amount", amounts),
    ("TransactionType", txn_types),
    ("MerchantCategory", merchant_cats),
    ("DeviceType", device_types),
    ("Country", countries),
    ("IsInternational", is_international),
    ("Past24hTxnCount", past24h_count),
    ("AvgTxnAmount7d", avg7d),
    ("Hour", hours),
    ("TimeOfDay", time_of_day),
    ("Fraud", labels),
]
df = pd.DataFrame(dict(column_items))

# Preview the first rows.
df.head()


Unnamed: 0,TransactionID,CustomerID,Timestamp,Amount,TransactionType,MerchantCategory,DeviceType,Country,IsInternational,Past24hTxnCount,AvgTxnAmount7d,Hour,TimeOfDay,Fraud
0,TXN0000001,C045155,2025-06-16 21:12:28.098143,375.0,ONLINE,Grocery,Mobile,India,0,1,567.99,21,Evening,0
1,TXN0000002,C059931,2025-09-28 14:16:49.098143,618.01,ONLINE,Healthcare,Mobile,India,0,1,836.8,14,Afternoon,0
2,TXN0000003,C017198,2025-08-20 05:24:30.098143,788.38,ONLINE,Mobile,Web,USA,1,1,547.47,5,Night,0
3,TXN0000004,C028254,2025-07-27 05:23:57.098143,388.4,POS,Fashion,Web,India,0,1,943.57,5,Night,0
4,TXN0000005,C015405,2025-05-08 13:11:42.098143,100.67,TRANSFER,Dining,Mobile,UAE,1,0,491.76,13,Afternoon,0


In [4]:
# Sanity check: frame dimensions, per-class row counts, and the fraud share
# expressed as a percentage (rounded to four decimals).
fraud_flags = df["Fraud"]
fraud_pct = round(fraud_flags.mean() * 100, 4)
df.shape, fraud_flags.value_counts(), fraud_pct


((300000, 14),
 Fraud
 0    299100
 1       900
 Name: count, dtype: int64,
 np.float64(0.3))

In [7]:
# Google Drive location that receives the generated dataset.
DRIVE_ROOT = "/content/drive/MyDrive"
DATA_DIR = f"{DRIVE_ROOT}/fraud_detection/data"

# Fail fast when Drive is not mounted (no mount step appears in this part of
# the notebook). Without this check os.makedirs would quietly create the whole
# path on the runtime's local disk, so the dataset would never reach Drive.
# NOTE(review): a directory created this way may also block a later Drive
# mount at /content/drive - confirm against the Colab documentation.
if not os.path.isdir(DRIVE_ROOT):
    raise FileNotFoundError(
        f"{DRIVE_ROOT} not found - mount Google Drive before running this cell."
    )

os.makedirs(DATA_DIR, exist_ok=True)


In [8]:

# Write the dataset to Drive as CSV, leaving out the DataFrame index column.
csv_path = "/content/drive/MyDrive/fraud_detection/data/synthetic_fraud_transactions_v2.csv"
df.to_csv(csv_path, index=False)
