In [2]:
import os
import random
import numpy as np
import torch
from torch_geometric.data import Data
from neo4j import GraphDatabase

# set seed
seed = 2023
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
DATA_PATH = "./data"

URI = "neo4j://localhost"
AUTH = ("neo4j", "password")

In [3]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()

data = Data()
data.x = []
data.y = []

In [4]:
transaction_index_mappings = {}
account_index_mappings = {}
user_index_mappings = {}
country_index_mappings = {}
lob_index_mappings = {}
sector_index_mappings = {}

In [5]:
NODE_TYPE_MAPPINGS = {
    'account': 0,
    'transaction': 1,
    'user': 2,
    'country': 3,
    'lob': 4,
    'sector': 5
}

TXN_TYPE_MAPPINGS = {
    'DEPOSIT-CASH':0,
    'DEPOSIT-CHECK':1,
    'EXCHANGE': 2,
    'MAKE-PAYMENT': 3,
    'MOVE-FUNDS': 4,
    'PAY-CHECK': 5,
    'QUICK-PAYMENT': 6,
    'WITHDRAWAL': 7,
}

In [6]:
def insert_nodes(nodes, index_mappings):
    next_idx = len(data.x)
    for index, t in enumerate(nodes):
        data.y.append(t['label'])
        data.x.append(t['features'])
        index_mappings[t['id']] = next_idx + index

In [7]:
def _fetch_nodes(tx, type, index_mappings):
    raw_nodes = list(tx.run(f"MATCH (n:{type}) RETURN properties(n)"))
    nodes = []
    for node in raw_nodes:
        props = node[0]
        node_id = ''
        if 'id' in props:
            node_id = props['id']
        elif 'name' in props:
            node_id = props['name']
        node_features = []
        node_label = -1
        node_features.append(NODE_TYPE_MAPPINGS[type.lower()])
        if type.lower() == 'transaction':
            node_label = int(props['isFraud'])
            node_features.append(float(props['amount']))
            import time
            from datetime import datetime

            date_time_str = props['ts']
            date_time_obj = datetime.strptime(
                date_time_str, '%Y-%m-%d %H:%M:%S')

            timestamp = int(time.mktime(date_time_obj.timetuple()))

            node_features.append(timestamp)
            node_features.append(TXN_TYPE_MAPPINGS[props['type']])
        else:
            node_features += [-1,-1,-1]

        nodes.append(
            {'id': node_id, 'features': node_features, 'label': node_label})
    insert_nodes(nodes, index_mappings)

def fetch_nodes(tx):
    _fetch_nodes(tx, 'Transaction', transaction_index_mappings)
    _fetch_nodes(tx, 'User', user_index_mappings)
    _fetch_nodes(tx, 'Account', account_index_mappings)
    _fetch_nodes(tx, 'Country', country_index_mappings)
    _fetch_nodes(tx, 'Lob', lob_index_mappings)
    _fetch_nodes(tx, 'Sector', sector_index_mappings)

In [8]:
data.x = []
data.y = []
with driver.session() as session:
    session.execute_read(fetch_nodes)

In [9]:
data.y = torch.from_numpy(np.array(data.y))
data.x = torch.from_numpy(np.array(data.x))

In [10]:
edge_index = {
    'src': [],
    'dst': []
}

In [11]:
def construct_edges(edges, a, b, a_mappings, b_mappings):
    edge_index['src'] += [a_mappings[e[0][a]]for e in edges]
    edge_index['dst'] += [b_mappings[e[0][b]]for e in edges]
    
def fetch_edges(tx):
    belongs_to = list(tx.run(f"MATCH p=()-[r:BELONGS_TO]->() RETURN p"))
    # from_country = list(tx.run("MATCH ()-[r:FROM]->() RETURN r"))
    # lob_in = list(tx.run("MATCH ()-[r:LOB_IN]->() RETURN r"))
    # received_by = list(
    #     tx.run(f"MATCH ()-[r:RECEIVED_BY]->() RETURN properties(r)"))
    # transferred_by = list(
    #     tx.run(f"MATCH ()-[r:TRANSFERRED_BY]->() RETURN properties(r)"))
    # works_in = list(tx.run("MATCH ()-[r:WORKS_IN]->() RETURN r"))
    
    # construct_edges(belongs_to, 'account_id', 'user_id', account_index_mappings, user_index_mappings)
    # construct_edges(from_country, 'account_id', 'country', account_index_mappings, country_index_mappings)
    # construct_edges(lob_in, 'account_id', 'lob_name', account_index_mappings, lob_index_mappings)
    # construct_edges(received_by, 'txn_id', 'account_id', transaction_index_mappings, account_index_mappings)
    # construct_edges(transferred_by, 'txn_id', 'account_id',
    #                 transaction_index_mappings, account_index_mappings)
    # construct_edges(works_in, 'account_id', 'sector_id', account_index_mappings, sector_index_mappings)

In [12]:
with driver.session() as session:
    session.execute_read(fetch_edges)

In [13]:
data.edge_index = torch.stack([torch.from_numpy(np.array(edge_index['src'])), torch.from_numpy(np.array(edge_index['dst']))], dim=0)

In [14]:
import torch_geometric.transforms as T

data = T.ToUndirected()(data)

print(data)

Data(x=[2142726, 4], y=[2142726], edge_index=[2, 8592098])


In [15]:
from torch_geometric.transforms import RandomLinkSplit

transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
)

train_data, val_data, test_data = transform(data)

print("Training data:")
print("==============")
print(train_data)

Training data:
Data(x=[2142726, 4], y=[2142726], edge_index=[2, 4811576], edge_label=[2062104], edge_label_index=[2, 2062104])


In [16]:
!python3 -m pip install pygod   



In [17]:
import gc
torch.cuda.empty_cache()
gc.collect()

3409709

In [21]:
# train a dominant detector
from pygod.models import DOMINANT

model = DOMINANT(num_layers=4, epoch=20, batch_size=128, num_neigh=10)  # hyperparameters can be set here
model.fit(val_data)  # data is a Pytorch Geometric data object

# get outlier scores on the input data
outlier_scores = model.decision_scores_  # raw outlier scores on the input data
print(outlier_scores)

RuntimeError: [enforce fail at alloc_cpu.cpp:75] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 18365098844304 bytes. Error code 12 (Cannot allocate memory)

In [None]:
# predict on the new data in the inductive setting
# raw outlier scores on the input data
outlier_scores = model.decision_function(test_data)
print(outlier_scores)

In [92]:
import torch.nn.functional as F

model = model.to('cuda:0')
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, 10):
    total_loss = total_examples = 0
    optimizer.zero_grad()

    pred = model(train_data)

    ground_truth = train_data["transaction", "received_by", "account"].edge_label
    loss = F.binary_cross_entropy_with_logits(pred, ground_truth)

    loss.backward()
    optimizer.step()
    total_loss += float(loss) * pred.numel()
    total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

Epoch: 001, Loss: 0.5580
Epoch: 002, Loss: 0.3737
Epoch: 003, Loss: 0.2478
Epoch: 004, Loss: 0.1645
Epoch: 005, Loss: 0.1104
Epoch: 006, Loss: 0.0755
Epoch: 007, Loss: 0.0528
Epoch: 008, Loss: 0.0378
Epoch: 009, Loss: 0.0276


In [None]:
driver.close()