In [None]:
import pandas as pd
import numpy as np
from faker import Faker

# Build synthetic per-user metadata (name, email, phone, ip, company) for the
# accounts appearing in a sample of the transaction log, then deliberately
# inject shared phones / IPs and a small "fraud ring" so downstream analysis
# has known overlaps to detect. The result is written to users_metadata.csv.

# Setup
SEED = 42  # The answer for all questions in life, the universe, and everything
SAMPLE_SIZE = 3000      # transactions sampled from the raw log
N_SHARED_PHONES = 10    # phone numbers that get reused ...
N_PHONE_USERS = 50      # ... by this many randomly chosen users
N_SHARED_IPS = 5        # IP addresses that get reused ...
N_IP_USERS = 30         # ... by this many randomly chosen users
FRAUD_RING_SIZE = 5     # users sharing phone + ip + company
FRAUD_RING_SEED = 99

fake = Faker()
# Faker draws from its own random generator, which np.random.seed() does not
# touch. Seed it explicitly so the generated identities are reproducible too.
Faker.seed(SEED)
# Drives every pandas .sample() call below that has no explicit random_state.
np.random.seed(SEED)

# Load transactions and sample
df = pd.read_csv("../data/raw/PS_20174392719_1491204439457_log.csv")
df = df.sample(SAMPLE_SIZE, random_state=SEED)

# Extract unique users: every account seen as origin or destination,
# in order of first appearance (origins first, then destinations).
all_users = pd.Series(df["nameOrig"].tolist() + df["nameDest"].tolist()).unique()

# Generate metadata: one fake identity per user, stored column-wise.
meta = {
    "user_id": [],
    "name": [],
    "email": [],
    "phone": [],
    "ip": [],
    "company": [],
}

for user_id in all_users:
    meta["user_id"].append(user_id)
    meta["name"].append(fake.name())
    meta["email"].append(fake.email())
    meta["phone"].append(fake.phone_number())
    meta["ip"].append(fake.ipv4_public())
    meta["company"].append(fake.company())

meta_df = pd.DataFrame(meta)

# Inject suspicious overlaps
# N_PHONE_USERS random users are reassigned, round-robin, to N_SHARED_PHONES
# phone numbers sampled from existing rows. The rows the numbers were sampled
# from keep their phone, so a shared number may have more holders than
# N_PHONE_USERS / N_SHARED_PHONES.
susp_phones = meta_df["phone"].sample(N_SHARED_PHONES).values
for i, idx in enumerate(meta_df.sample(N_PHONE_USERS).index):
    meta_df.at[idx, "phone"] = susp_phones[i % len(susp_phones)]

# Same scheme for IPs: N_IP_USERS random users spread over N_SHARED_IPS
# addresses sampled from existing rows.
susp_ips = meta_df["ip"].sample(N_SHARED_IPS).values
for i, idx in enumerate(meta_df.sample(N_IP_USERS).index):
    meta_df.at[idx, "ip"] = susp_ips[i % len(susp_ips)]

# Inject fraud ring (same phone + ip + company).
# fraud_ring is a snapshot of the chosen rows taken *before* they are
# overwritten; only its index is used for the injection itself.
fraud_ring = meta_df.sample(FRAUD_RING_SIZE, random_state=FRAUD_RING_SEED).copy()
shared_phone = fake.phone_number()
shared_ip = fake.ipv4_public()
shared_company = "Evil Corp"  # In honour of Mr. Robot

for idx in fraud_ring.index:
    meta_df.at[idx, "phone"] = shared_phone
    meta_df.at[idx, "ip"] = shared_ip
    meta_df.at[idx, "company"] = shared_company

# Save
meta_df.to_csv("../data/processed/users_metadata.csv", index=False)
print("Saved: users_metadata.csv")


Total unique users: 19917
Enriched user metadata saved to 'data/processed/users_metadata.csv'
