In [2]:
import pandas as pd

# Enrich a reproducible sample of raw transactions with sender and receiver
# user metadata, report how many distinct senders share a phone number or an
# IP address, and save the enriched table as a CSV.

# Metadata fields that receive a "<role>_" prefix after each merge.
METADATA_FIELDS = ("name", "email", "phone", "ip", "company")


def attach_user_metadata(transactions, users, account_col, role):
    """Left-join user metadata onto transactions for one side of a transfer.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Frame containing the column named by ``account_col``.
    users : pandas.DataFrame
        Metadata frame with a ``user_id`` column to join on.
    account_col : str
        Column of ``transactions`` matched against ``users["user_id"]``.
    role : str
        Prefix (e.g. ``"sender"``) applied to the columns listed in
        ``METADATA_FIELDS``.

    Returns
    -------
    pandas.DataFrame
        ``transactions`` with the metadata columns attached and renamed to
        ``"<role>_<field>"``; the helper ``user_id`` column is dropped.
        Rows without a matching user keep NaN in the metadata columns.

    Raises
    ------
    pandas.errors.MergeError
        If ``user_id`` is not unique in ``users``. Without this check a
        duplicated user would silently duplicate every transaction that
        references it.
    """
    renamed = {field: f"{role}_{field}" for field in METADATA_FIELDS}
    return (
        transactions
        .merge(
            users,
            left_on=account_col,
            right_on="user_id",
            how="left",
            validate="many_to_one",  # each account id must map to at most one user row
        )
        .rename(columns=renamed)
        .drop(columns=["user_id"])
    )


# Load raw sample (fixed random_state so the same 3000 rows are drawn each run)
df = pd.read_csv("../data/raw/PS_20174392719_1491204439457_log.csv")
df = df.sample(3000, random_state=42)

# Load user metadata
meta = pd.read_csv("../data/processed/users_metadata.csv")

# Merge sender, then receiver (the sender columns are already prefixed, so the
# second merge does not collide with them)
df = attach_user_metadata(df, meta, "nameOrig", "sender")
df = attach_user_metadata(df, meta, "nameDest", "receiver")

# Quick overlap checks on enriched data: number of distinct sender accounts
# per phone / per IP, largest first
phones = df.groupby("sender_phone")["nameOrig"].nunique().sort_values(ascending=False)
ips    = df.groupby("sender_ip")["nameOrig"].nunique().sort_values(ascending=False)
print("Senders per phone >=2:", (phones >= 2).sum(), "Top:\n", phones.head())
print("Senders per ip    >=2:", (ips    >= 2).sum(), "Top:\n", ips.head())

# Save
df.to_csv("../data/processed/enriched_transactions.csv", index=False)
print("Saved: enriched_transactions.csv")

Senders per phone >=2: 21 Top:
 sender_phone
+1-276-325-7114           4
+1-590-586-9987x961       4
(873)330-8776x1949        4
001-779-274-4370x36100    4
762-423-0005x4934         4
Name: nameOrig, dtype: int64
Senders per ip    >=2: 16 Top:
 sender_ip
85.108.225.196    5
137.167.55.87     4
202.30.20.82      4
211.154.39.34     4
213.32.239.48     4
Name: nameOrig, dtype: int64
Saved: enriched_transactions.csv
