In [26]:
import os
import random
import numpy as np
import torch
from torch_geometric.data import HeteroData
from neo4j import GraphDatabase

# set seed
seed = 2023
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
DATA_PATH = "./data"

URI = "neo4j://localhost"
AUTH = ("neo4j", "password")

In [27]:
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()

data = HeteroData()

In [28]:
transaction_index_mappings = {}
account_index_mappings = {}
user_index_mappings = {}
country_index_mappings = {}
lob_index_mappings = {}
sector_index_mappings = {}

lookup_data = {}

In [29]:
def construct_nodes(transactions, users, accounts, countries, lobs, sectors):
    data['transaction'].y = []
    for index, t in enumerate(transactions):
        data['transaction'].y.append(int(t[0]['isFraud']))
        transaction_index_mappings[t[0]['id']] = index
    data['transaction'].node_id = torch.arange(len(transaction_index_mappings.keys())).cuda()
    data['transaction'].y = torch.from_numpy(np.array(data['transaction'].y)).cuda()

    for index, t in enumerate(users):
        user_index_mappings[t[0]['id']] = index
    data['user'].node_id = torch.arange(len(user_index_mappings.keys())).cuda()

    for index, t in enumerate(accounts):
        account_index_mappings[t[0]['id']] = index
    data['account'].node_id = torch.arange(len(account_index_mappings.keys())).cuda()
    
    for index, t in enumerate(countries):
        country_index_mappings[t[0]['name']] = index
    data['country'].node_id = torch.arange(len(country_index_mappings.keys())).cuda()
    
    for index, t in enumerate(lobs):
        lob_index_mappings[t[0]['id']] = index
    data['lob'].node_id = torch.arange(len(lob_index_mappings.keys())).cuda()

    for index, t in enumerate(sectors):
        sector_index_mappings[t[0]['id']] = index
    data['sector'].node_id = torch.arange(len(sector_index_mappings.keys())).cuda()

def fetch_nodes(tx):
    transactions = list(tx.run("MATCH (n:Transaction) RETURN properties(n)"))
    users = list(tx.run("MATCH (n:User) RETURN properties(n)"))
    accounts = list(tx.run("MATCH (n:Account) RETURN properties(n)"))
    countries = list(tx.run("MATCH (n:Country) RETURN properties(n)"))
    lobs = list(tx.run("MATCH (n:Lob) RETURN properties(n)"))
    sectors = list(tx.run("MATCH (n:Sector) RETURN properties(n)"))

    lookup_data['transactions'] = transactions
    lookup_data['users'] = users
    lookup_data['accounts'] = accounts
    lookup_data['countries'] = countries
    lookup_data['lobs'] = lobs
    lookup_data['sectors'] = sectors

    construct_nodes(transactions, users, accounts, countries, lobs, sectors)

In [30]:
with driver.session() as session:
    session.execute_read(fetch_nodes)

In [31]:
def construct_edges(belongs_to, received_by, transferred_by, from_country, lob_in, works_in):
    data['account', 'belongs_to', 'user'].edge_index = torch.from_numpy(np.array([[account_index_mappings[r[0]['account_id']] for r in belongs_to], [user_index_mappings[r[0]['user_id']] for r in belongs_to]])).cuda()
    data['account', 'from', 'country'].edge_index = torch.from_numpy(np.array([[account_index_mappings[r[0]['account_id']] for r in from_country], [country_index_mappings[r[0]['country']] for r in from_country]])).cuda()
    data['account', 'lob_in', 'lob'].edge_index = torch.from_numpy(np.array([[account_index_mappings[r[0]['account_id']] for r in lob_in], [lob_index_mappings[r[0]['lob_name']] for r in lob_in]])).cuda()
    data['transaction', 'received_by', 'account'].edge_index = torch.from_numpy(np.array([[transaction_index_mappings[r[0]['txn_id']] for r in received_by], [account_index_mappings[r[0]['account_id']] for r in received_by]])).cuda()
    data['transaction', 'transferred_by', 'account'].edge_index = torch.from_numpy(np.array([[transaction_index_mappings[r[0]['txn_id']] for r in transferred_by], [account_index_mappings[r[0]['account_id']] for r in transferred_by]])).cuda()
    data['account', 'works_in', 'sector'].edge_index = torch.from_numpy(np.array([[account_index_mappings[r[0]['account_id']] for r in works_in], [sector_index_mappings[r[0]['sector_id']] for r in works_in]])).cuda()
    
def fetch_edges(tx):
    belongs_to = list(tx.run(f"MATCH ()-[r:BELONGS_TO]->() RETURN properties(r)"))
    from_country = list(tx.run("MATCH ()-[r:FROM]->() RETURN r"))
    lob_in = list(tx.run("MATCH ()-[r:LOB_IN]->() RETURN r"))
    received_by = list(tx.run(f"MATCH ()-[r:RECEIVED_BY]->() RETURN properties(r)"))
    transferred_by = list(tx.run(f"MATCH ()-[r:TRANSFERRED_BY]->() RETURN properties(r)"))
    works_in = list(tx.run("MATCH ()-[r:WORKS_IN]->() RETURN r"))
    construct_edges(belongs_to, received_by, transferred_by, from_country, lob_in, works_in)

In [32]:
with driver.session() as session:
    session.execute_read(fetch_edges)

print(data)

HeteroData(
  [1mtransaction[0m={
    y=[1498177],
    node_id=[1498177]
  },
  [1muser[0m={ node_id=[288867] },
  [1maccount[0m={ node_id=[305429] },
  [1mcountry[0m={ node_id=[252] },
  [1mlob[0m={ node_id=[1] },
  [1msector[0m={ node_id=[50000] },
  [1m(account, belongs_to, user)[0m={ edge_index=[2, 305429] },
  [1m(account, from, country)[0m={ edge_index=[2, 75161] },
  [1m(account, lob_in, lob)[0m={ edge_index=[2, 75161] },
  [1m(transaction, received_by, account)[0m={ edge_index=[2, 1282284] },
  [1m(transaction, transferred_by, account)[0m={ edge_index=[2, 1279291] },
  [1m(account, works_in, sector)[0m={ edge_index=[2, 1278723] }
)


In [33]:
import torch_geometric.transforms as T

data = T.ToUndirected()(data)

print(data)

HeteroData(
  [1mtransaction[0m={
    y=[1498177],
    node_id=[1498177]
  },
  [1muser[0m={ node_id=[288867] },
  [1maccount[0m={ node_id=[305429] },
  [1mcountry[0m={ node_id=[252] },
  [1mlob[0m={ node_id=[1] },
  [1msector[0m={ node_id=[50000] },
  [1m(account, belongs_to, user)[0m={ edge_index=[2, 305429] },
  [1m(account, from, country)[0m={ edge_index=[2, 75161] },
  [1m(account, lob_in, lob)[0m={ edge_index=[2, 75161] },
  [1m(transaction, received_by, account)[0m={ edge_index=[2, 1282284] },
  [1m(transaction, transferred_by, account)[0m={ edge_index=[2, 1279291] },
  [1m(account, works_in, sector)[0m={ edge_index=[2, 1278723] },
  [1m(user, rev_belongs_to, account)[0m={ edge_index=[2, 305429] },
  [1m(country, rev_from, account)[0m={ edge_index=[2, 75161] },
  [1m(lob, rev_lob_in, account)[0m={ edge_index=[2, 75161] },
  [1m(account, rev_received_by, transaction)[0m={ edge_index=[2, 1282284] },
  [1m(account, rev_transferred_by, transaction)[

In [48]:
data = data.to('cuda:0')
del data["transaction", "rev_received_by", "account"]

In [49]:
data

HeteroData(
  [1mtransaction[0m={
    y=[1498177],
    node_id=[1498177]
  },
  [1muser[0m={ node_id=[288867] },
  [1maccount[0m={ node_id=[305429] },
  [1mcountry[0m={ node_id=[252] },
  [1mlob[0m={ node_id=[1] },
  [1msector[0m={ node_id=[50000] },
  [1m(account, belongs_to, user)[0m={ edge_index=[2, 305429] },
  [1m(account, from, country)[0m={ edge_index=[2, 75161] },
  [1m(account, lob_in, lob)[0m={ edge_index=[2, 75161] },
  [1m(transaction, received_by, account)[0m={ edge_index=[2, 1282284] },
  [1m(transaction, transferred_by, account)[0m={ edge_index=[2, 1279291] },
  [1m(account, works_in, sector)[0m={ edge_index=[2, 1278723] },
  [1m(user, rev_belongs_to, account)[0m={ edge_index=[2, 305429] },
  [1m(country, rev_from, account)[0m={ edge_index=[2, 75161] },
  [1m(lob, rev_lob_in, account)[0m={ edge_index=[2, 75161] },
  [1m(account, rev_received_by, transaction)[0m={ edge_index=[2, 1282284] },
  [1m(account, rev_transferred_by, transaction)[

In [63]:
from torch_geometric.transforms import RandomLinkSplit

transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
    edge_types=("transaction", "received_by", "account"),
    rev_edge_types=("account", "rev_received_by", "transaction"), 
)

train_data, val_data, test_data = transform(data)

print("Training data:")
print("==============")
print(train_data)

Training data:
HeteroData(
  [1mtransaction[0m={
    y=[1498177],
    node_id=[1498177]
  },
  [1muser[0m={ node_id=[288867] },
  [1maccount[0m={ node_id=[305429] },
  [1mcountry[0m={ node_id=[252] },
  [1mlob[0m={ node_id=[1] },
  [1msector[0m={ node_id=[50000] },
  [1m(account, belongs_to, user)[0m={ edge_index=[2, 305429] },
  [1m(account, from, country)[0m={ edge_index=[2, 75161] },
  [1m(account, lob_in, lob)[0m={ edge_index=[2, 75161] },
  [1m(transaction, received_by, account)[0m={
    edge_index=[2, 718080],
    edge_label=[307748],
    edge_label_index=[2, 307748]
  },
  [1m(transaction, transferred_by, account)[0m={ edge_index=[2, 1279291] },
  [1m(account, works_in, sector)[0m={ edge_index=[2, 1278723] },
  [1m(user, rev_belongs_to, account)[0m={ edge_index=[2, 305429] },
  [1m(country, rev_from, account)[0m={ edge_index=[2, 75161] },
  [1m(lob, rev_lob_in, account)[0m={ edge_index=[2, 75161] },
  [1m(account, rev_received_by, transaction)[0m=

In [55]:
train_data = train_data.to('cuda:0')
test_data = test_data.to('cuda:0')
val_data = val_data.to('cuda:0')

In [88]:
data.metadata()

(['transaction', 'user', 'account', 'country', 'lob', 'sector'],
 [('account', 'belongs_to', 'user'),
  ('account', 'from', 'country'),
  ('account', 'lob_in', 'lob'),
  ('transaction', 'received_by', 'account'),
  ('transaction', 'transferred_by', 'account'),
  ('account', 'works_in', 'sector'),
  ('user', 'rev_belongs_to', 'account'),
  ('country', 'rev_from', 'account'),
  ('lob', 'rev_lob_in', 'account'),
  ('account', 'rev_received_by', 'transaction'),
  ('account', 'rev_transferred_by', 'transaction'),
  ('sector', 'rev_works_in', 'account')])

In [89]:
from torch import Tensor
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F

class GNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()

        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x

# Our final classifier applies the dot-product between source and destination
# node embeddings to derive edge-level predictions:
class Classifier(torch.nn.Module):
    def forward(self, x_account: Tensor, x_transaction: Tensor, edge_label_index: Tensor) -> Tensor:
        # Convert node embeddings to edge-level representations:
        edge_feat_account = x_account[edge_label_index[1]]
        edge_feat_transaction = x_transaction[edge_label_index[0]]

        # Apply dot-product to get a prediction per supervision edge:
        return (edge_feat_account * edge_feat_transaction).sum(dim=-1)


class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        # Since the dataset does not come with rich features, we also learn two
        # embedding matrices for accounts and transactions:
        # self.transaction_lin = torch.nn.Linear(20, hidden_channels)
        self.account_emb = torch.nn.Embedding(data["account"].num_nodes, hidden_channels)
        self.transaction_emb = torch.nn.Embedding(data["transaction"].num_nodes, hidden_channels)

        # Instantiate homogeneous GNN:
        self.gnn = GNN(hidden_channels)

        # Convert GNN model into a heterogeneous variant:
        self.gnn = to_hetero(self.gnn, metadata=(
            ['transaction', 'account'],
            [('transaction', 'received_by', 'account'),('account', 'rev_received_by', 'transaction')]
            )
        )

        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        x_dict = {
          "account": self.account_emb(data["account"].node_id),
          "transaction": self.transaction_emb(data["transaction"].node_id)# + self.transaction_lin(data["transaction"].x),
        }

        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        pred = self.classifier(
            x_dict["account"],
            x_dict["transaction"],
            data["transaction", "received_by", "account"].edge_label_index,
        )

        return pred

        
model = Model(hidden_channels=64)

print(model)

Model(
  (account_emb): Embedding(305429, 64)
  (transaction_emb): Embedding(1498177, 64)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (transaction__received_by__account): SAGEConv(64, 64, aggr=mean)
      (account__rev_received_by__transaction): SAGEConv(64, 64, aggr=mean)
    )
    (conv2): ModuleDict(
      (transaction__received_by__account): SAGEConv(64, 64, aggr=mean)
      (account__rev_received_by__transaction): SAGEConv(64, 64, aggr=mean)
    )
  )
  (classifier): Classifier()
)


In [91]:
import gc
torch.cuda.empty_cache()
gc.collect()

17181

In [92]:
import torch.nn.functional as F

model = model.to('cuda:0')
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, 10):
    total_loss = total_examples = 0
    optimizer.zero_grad()

    pred = model(train_data)

    ground_truth = train_data["transaction", "received_by", "account"].edge_label
    loss = F.binary_cross_entropy_with_logits(pred, ground_truth)

    loss.backward()
    optimizer.step()
    total_loss += float(loss) * pred.numel()
    total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

Epoch: 001, Loss: 0.5580
Epoch: 002, Loss: 0.3737
Epoch: 003, Loss: 0.2478
Epoch: 004, Loss: 0.1645
Epoch: 005, Loss: 0.1104
Epoch: 006, Loss: 0.0755
Epoch: 007, Loss: 0.0528
Epoch: 008, Loss: 0.0378
Epoch: 009, Loss: 0.0276


In [None]:
driver.close()