In [2]:
import pandas as pd
import numpy as np
from faker import Faker

fake = Faker()
# Faker uses its own random source, which np.random.seed() does not affect.
# Seed it too, so the generated names/emails/phones/IPs/companies are
# reproducible from run to run.
Faker.seed(42)
np.random.seed(42)

# -------- Config (tweak here) ----------
TXN_SAMPLE = 1000         # number of transactions sampled (small for speed)
N_PHONE_KEYS = 15         # background: how many phone values get reused
PHONE_GROUP_SIZE = 6      # background: users each reused phone is assigned to
N_IP_KEYS = 10            # background: how many IP values get reused
IP_GROUP_SIZE = 6         # background: users each reused IP is assigned to

N_RINGS = 8               # fraud rings sharing one phone + ip (+ company)
RING_SIZE = 5             # users per ring
RING_COMPANY = "Evil Corp"
# ---------------------------------------

# Read the raw transaction log and keep a fixed-seed random sample of it.
df = pd.read_csv("../data/raw/PS_20174392719_1491204439457_log.csv").sample(
    n=TXN_SAMPLE, random_state=42
)

# Every distinct account id seen on either side of a transaction,
# in first-appearance order (all origins first, then all destinations).
all_users = pd.Series(
    [*df["nameOrig"].tolist(), *df["nameDest"].tolist()]
).unique()
rng = np.random.default_rng(42)

# Base metadata: one column-list per field, filled one user at a time.
meta = {
    column: []
    for column in ("user_id", "name", "email", "phone", "ip", "company")
}

for u in all_users:
    # Tuple order matches the key order of `meta`; the fake values are
    # generated left to right (name, email, phone, ip, company).
    row = (
        u,
        fake.name(),
        fake.email(),
        fake.phone_number(),      # natural variation
        fake.ipv4_public(),
        fake.company(),
    )
    for column, value in zip(meta, row):
        meta[column].append(value)

meta_df = pd.DataFrame(meta)

# ---- Helpers -------------------------------------------------
def assign_shared_values(df, col, n_keys, group_size, seed):
    """Make ``n_keys`` existing values of ``col`` shared by several rows.

    ``n_keys`` values are sampled from ``df[col]`` itself (so the shared
    values keep the column's existing format), and each sampled value is
    written onto ``group_size`` rows chosen from a shuffled pool of index
    labels. ``df`` is modified in place; nothing is returned.

    The row a key was sampled from is not part of its group, so unless that
    row is overwritten by another assignment the key ends up on
    ``group_size + 1`` rows.

    Args:
        df: DataFrame to modify in place.
        col: Name of the column whose values are shared.
        n_keys: Number of distinct values to reuse.
        group_size: Number of rows each reused value is written to.
        seed: Seed for both the key sampling and the recipient shuffling.
    """
    rng_local = np.random.default_rng(seed)
    # pick seed values from existing to keep format consistency
    keys = df[col].sample(n_keys, random_state=seed).values
    # Shuffle a private copy of the labels: shuffling the array returned by
    # to_numpy() directly could reorder the DataFrame's own index buffer
    # (or fail if pandas hands back a read-only view).
    candidate_idx = df.index.to_numpy().copy()
    rng_local.shuffle(candidate_idx)
    ptr = 0
    for k in keys:
        # Groups are consecutive, non-overlapping slices of the pool; once
        # the pool is exhausted it is reshuffled, so later groups may
        # overlap earlier ones.
        if ptr + group_size > len(candidate_idx):
            rng_local.shuffle(candidate_idx)
            ptr = 0
        group_idx = candidate_idx[ptr:ptr + group_size]
        ptr += group_size
        df.loc[group_idx, col] = k

def make_rings(df, n_rings, ring_size, seed):
    """Stamp ``n_rings`` groups of rows with a common phone, IP and company.

    For each ring a fresh phone number and public IPv4 address are generated
    with the module-level ``fake``; every member row gets those two values
    and has its company set to ``RING_COMPANY``. Members are taken as
    consecutive slices of a shuffled copy of the index labels, and the pool
    is reshuffled when it cannot supply a full ring. ``df`` is modified in
    place; nothing is returned.
    """
    shuffler = np.random.default_rng(seed)
    pool = df.index.to_numpy(copy=True)
    shuffler.shuffle(pool)
    cursor = 0
    for _ in range(n_rings):
        # Not enough labels left for a whole ring: start over on a new order.
        if cursor + ring_size > len(pool):
            shuffler.shuffle(pool)
            cursor = 0
        members = pool[cursor:cursor + ring_size]
        cursor += ring_size
        # One set of shared values per ring (phone generated before ip).
        ring_values = {
            "phone": fake.phone_number(),
            "ip": fake.ipv4_public(),
            "company": RING_COMPANY,
        }
        for member in members:
            for column, value in ring_values.items():
                df.at[member, column] = value

# ---- Background overlaps (phones, IPs) ----------------------
assign_shared_values(
    meta_df, "phone", n_keys=N_PHONE_KEYS, group_size=PHONE_GROUP_SIZE, seed=101
)
assign_shared_values(
    meta_df, "ip", n_keys=N_IP_KEYS, group_size=IP_GROUP_SIZE, seed=202
)

# ---- Multiple fraud rings (both phone+ip) -------------------
# Applied after the background overlaps, so ring values overwrite any
# background value already placed on the same row.
make_rings(meta_df, n_rings=N_RINGS, ring_size=RING_SIZE, seed=303)

# ---- Quick sanity prints ------------------------------------
# How many rows carry each phone / IP value, most frequent first.
phone_counts = meta_df["phone"].value_counts()
ip_counts = meta_df["ip"].value_counts()
print("Phones used by >=2 people:", phone_counts.ge(2).sum())
print("IPs used by >=2 people:", ip_counts.ge(2).sum())
print("Top 5 shared phones:\n", phone_counts.head(5))
print("Top 5 shared IPs:\n", ip_counts.head(5))

# Save the enriched user metadata without the index column.
meta_df.to_csv("../data/processed/users_metadata.csv", index=False)
print("Saved: users_metadata.csv, rows:", meta_df.shape[0])

Phones used by >=2 people: 23
IPs used by >=2 people: 18
Top 5 shared phones:
 phone
(410)581-1747             7
762-423-0005x4934         7
919.902.2773              7
001-278-951-6696x34319    7
8895694926                7
Name: count, dtype: int64
Top 5 shared IPs:
 ip
71.244.144.50      7
213.32.239.48      7
144.204.74.36      7
51.218.138.227     7
202.229.209.156    7
Name: count, dtype: int64
Saved: users_metadata.csv, rows: 1999
