In [4]:
import pandas as pd
import numpy as np

# Load the enriched transaction table; every node/relationship export below reads from it.
df = pd.read_csv(filepath_or_buffer="../data/processed/enriched_transactions.csv")

# helpers
def clean_col(s: pd.Series) -> pd.Series:
    """Return *s* as stripped strings with placeholder values turned into ``pd.NA``.

    A cell is treated as missing when it is already null (``None``, ``NaN``,
    ``pd.NA``) or when, after stringifying and stripping, it is the empty
    string or one of the literal tokens ``"nan"`` / ``"None"``.

    Parameters
    ----------
    s : pd.Series
        Column to normalise; any dtype is accepted.

    Returns
    -------
    pd.Series
        Same index as *s*, values are stripped strings or ``pd.NA``.
    """
    # Record real nulls *before* stringifying: pd.NA becomes the text "<NA>",
    # which is not one of the placeholder tokens and would otherwise survive
    # as a bogus value (e.g. when this helper is applied to its own output).
    already_missing = s.isna()
    text = s.astype(str).str.strip()
    placeholder = text.isin(["", "nan", "None"])
    return text.mask(already_missing | placeholder, pd.NA)

# strip all string-like cols
# Trim surrounding whitespace on every object-typed column while keeping
# missing cells missing. A bare astype(str) would rewrite NaN/None as the
# literal text "nan"/"None", and that text would be written out by any export
# that does not separately normalise the column.
for col in df.columns:
    if df[col].dtype == object:
        present = df[col].notna()
        # .where() puts NaN back wherever the original cell was null.
        df[col] = df[col].astype(str).str.strip().where(present)

# Persons
# Stack the sender and receiver sides into one (person_id, name, email) table.
persons = pd.concat([
    df[["nameOrig", "sender_name", "sender_email"]].rename(
        columns={"nameOrig": "person_id", "sender_name": "name", "sender_email": "email"}),
    df[["nameDest", "receiver_name", "receiver_email"]].rename(
        columns={"nameDest": "person_id", "receiver_name": "name", "receiver_email": "email"})
])

# Normalise every exported column, not only the id: a missing name or email
# can show up as "", "nan" or "None", and would otherwise be written to the
# node file as if it were a real value.
persons = persons.assign(
    person_id=clean_col(persons["person_id"]),
    name=clean_col(persons["name"]),
    email=clean_col(persons["email"]),
)

# A node needs an id; name/email may legitimately stay empty.
persons = persons.dropna(subset=["person_id"])
# De-duplicate after normalisation so rows that differed only in how a
# missing value was spelled collapse into one.
persons = persons.drop_duplicates()
persons.to_csv("../data/processed/nodes_person.csv", index=False)

# Phone nodes: sender and receiver phone columns stacked into a single
# "number" column, de-duplicated, with placeholder/missing values removed.
phones = pd.concat(
    [
        df[[source_col]].rename(columns={source_col: "number"})
        for source_col in ("sender_phone", "receiver_phone")
    ]
).drop_duplicates()
phones = phones.assign(number=clean_col(phones["number"]))
phones = phones[phones["number"].notna()]
phones.to_csv("../data/processed/nodes_phone.csv", index=False)

# IP nodes: sender and receiver IP columns stacked into a single "value"
# column, de-duplicated, with placeholder/missing values removed.
ips = pd.concat(
    [
        df[[source_col]].rename(columns={source_col: "value"})
        for source_col in ("sender_ip", "receiver_ip")
    ]
).drop_duplicates()
ips = ips.assign(value=clean_col(ips["value"]))
ips = ips[ips["value"].notna()]
ips.to_csv("../data/processed/nodes_ip.csv", index=False)

# Company nodes: sender and receiver company columns stacked into a single
# "name" column, de-duplicated, with placeholder/missing values removed.
companies = pd.concat(
    [
        df[[source_col]].rename(columns={source_col: "name"})
        for source_col in ("sender_company", "receiver_company")
    ]
).drop_duplicates()
companies = companies.assign(name=clean_col(companies["name"]))
companies = companies[companies["name"].notna()]
companies.to_csv("../data/processed/nodes_company.csv", index=False)

# SENT relationships: one output row per input row, no filtering or
# de-duplication; the two account-id columns are renamed to from/to.
df.loc[:, ["nameOrig", "nameDest", "amount", "type", "isFraud"]].rename(
    columns=dict(nameOrig="from", nameDest="to")
).to_csv(path_or_buf="../data/processed/relationships_sent.csv", index=False)

# Person -> Phone links: distinct (person_id, number) pairs taken from both
# the sender and the receiver side; pairs with either end missing are dropped.
r_phone = pd.concat(
    [
        df[[id_col, phone_col]].rename(columns={id_col: "person_id", phone_col: "number"})
        for id_col, phone_col in (("nameOrig", "sender_phone"), ("nameDest", "receiver_phone"))
    ]
).drop_duplicates()
r_phone = r_phone.assign(
    person_id=clean_col(r_phone["person_id"]),
    number=clean_col(r_phone["number"]),
)
r_phone = r_phone.dropna(subset=["person_id", "number"], how="any")
r_phone.to_csv("../data/processed/relationships_uses_phone.csv", index=False)

# Person -> IP links: distinct (person_id, value) pairs taken from both the
# sender and the receiver side; pairs with either end missing are dropped.
r_ip = pd.concat(
    [
        df[[id_col, ip_col]].rename(columns={id_col: "person_id", ip_col: "value"})
        for id_col, ip_col in (("nameOrig", "sender_ip"), ("nameDest", "receiver_ip"))
    ]
).drop_duplicates()
r_ip = r_ip.assign(
    person_id=clean_col(r_ip["person_id"]),
    value=clean_col(r_ip["value"]),
)
r_ip = r_ip.dropna(subset=["person_id", "value"], how="any")
r_ip.to_csv("../data/processed/relationships_logged_from.csv", index=False)

# Person -> Company links: distinct (person_id, company) pairs taken from both
# the sender and the receiver side; pairs with either end missing are dropped.
r_co = pd.concat(
    [
        df[[id_col, company_col]].rename(columns={id_col: "person_id", company_col: "company"})
        for id_col, company_col in (("nameOrig", "sender_company"), ("nameDest", "receiver_company"))
    ]
).drop_duplicates()
r_co = r_co.assign(
    person_id=clean_col(r_co["person_id"]),
    company=clean_col(r_co["company"]),
)
r_co = r_co.dropna(subset=["person_id", "company"], how="any")
r_co.to_csv("../data/processed/relationships_works_for.csv", index=False)

# Final status line once every export above has been written.
print("All node and relationship files exported (cleaned).")

All node and relationship files exported (cleaned).
