<div class="header">
  <img src="img/kg_logo_white_side.png" alt="logo" style="width: 300px;"/>
  <h1>Transaction Monitoring with Graph Neural Networks</h1>
</div>

The Elliptic Data Set maps Bitcoin transactions to real entities belonging to licit categories (exchanges, wallet providers, miners, licit services, etc.) versus illicit ones (scams, malware, terrorist organizations, ransomware, Ponzi schemes, etc.). The task on the dataset is to classify the illicit and licit nodes in the graph.
We will use Graph Neural Networks (GraphSAGE) to perform the node classification task. This demo shows the end-to-end pipeline that can be done directly on the Katana Graph platform. 

-------

## Setup

In [None]:
import os                                                                        
import time                                                                      
import json
import uuid
import pandas as pd
import numpy
import argparse
from timeit import default_timer as timer
import warnings
warnings.filterwarnings('ignore')
from katana import remote
from katana.remote import import_data
from katana.remote.import_data import Operation
import dask.dataframe as dd
# Tell the Katana client which server endpoint to talk to.
os.environ.update({"KATANA_SERVER_ADDRESS": "localhost:8080"})

print("--")

## Dask Dataframe Import

Use the Python DataFrame interface to prepare the data and generate the graph.

In [None]:
# The three raw Elliptic CSVs share one bucket prefix and differ only in the
# trailing part of the file name.
tx_classes, tx_edges, tx_features = (
    f"gs://katana-demo-datasets/fsi/solution_raw_data/elliptic/elliptic_txs_{part}.csv"
    for part in ("classes", "edgelist", "features")
)

print("--")

In [None]:
# Column names for the header-less features CSV: the transaction id and
# timestamp first, then the local features (2..94), then the aggregated
# features (95..166).
local_feats_name = [f"local_feat_{idx}" for idx in range(2, 95)]
agg_feats_name = [f"agg_feat_{idx}" for idx in range(95, 167)]
feat_col_names = ["txId", "timestamp", *local_feats_name, *agg_feats_name]

print("--")

In [None]:
# Property name -> property type, used when the node dataframe is imported.
# Every local and aggregated feature column is declared as a float.
local_cols = {f"local_feat_{idx}": "float" for idx in range(2, 95)}
agg_cols = {f"agg_feat_{idx}": "float" for idx in range(95, 167)}
feat_types = {
    "class": "string",
    "timestamp": "string",
    "target": "float",
    "node_type": "string",
    **local_cols,
    **agg_cols,
}

print("--")

In [None]:
%%time
# The `class` column mixes the text label "unknown" with the numeric labels
# "1" and "2", and the label mapping applied later uses string keys. Pinning
# the dtype keeps every partition's values as strings instead of relying on
# Dask's sample-based dtype inference.
classes = dd.read_csv(tx_classes, dtype={"class": str})
edges = dd.read_csv(tx_edges)
# The features file has no header row, so the column names are supplied.
features = dd.read_csv(tx_features, header=None, names=feat_col_names)

print("--")

In [None]:
# Derive two columns from the raw `class` label:
#   target    - numeric label (-1.0 for unknown, 1.0 for class '1', 0.0 for class '2')
#   node_type - node label separating classified from unclassified transactions
classes = classes.assign(
    target=classes['class'].map({'unknown': -1.0, '1': 1.0, '2': 0.0}),
    node_type=classes['class'].map(
        {
            'unknown': 'Unclassified_Txn',
            '1': 'Classified_Txn',
            '2': 'Classified_Txn',
        }
    ),
)

print("--")

Some of the Elliptic transactions are classified as either licit (0) or illicit (1); the rest are unlabeled. We can filter for only the classified transactions when training.

In [None]:
classes.head()

In [None]:
edges.head()

In [None]:
# Join the per-transaction features with their class labels. The key is
# spelled out so the join cannot silently widen to every shared column if
# either frame later gains another column with a matching name.
nodes = features.merge(classes, on="txId")

print("--")

In [None]:
nodes.head()

There is a 4-node cluster running on GCP, which is why we select 4 partitions. As the data scales, we can increase both the number of machines in the cluster and the number of partitions in the graph.

In [None]:
# One partition per machine of the 4-node cluster described in the
# accompanying notes; keep this value in sync with the cluster size.
NUM_PARTITIONS = 4

graph = remote.Client(disable_version_check=False).create_graph(
    num_partitions=NUM_PARTITIONS
)

print("graph id:", graph.graph_id)


In [4]:
# Obtain a Katana Graph client handle.

from katana import remote
from katana.remote import import_data

# Partition count constant (not consumed within this cell).
NUM_PARTITIONS = 8

# Names used to look up the target database and graph.
DB_NAME, GRAPH_NAME = "my_db", "my_graph"

my_client = remote.Client()

print(my_client)


VERBOSE: ../../../../external/katana/libgalois/include/katana/AsyncPool.h:152: 3148: async pool tids: 140215910070016, 140215907968768, 140215905867520
VERBOSE: ../../../../external/katana/libgalois/include/katana/AsyncPool.h:152: 3148: async pool tids: 140215903766272, 140215899567872
VERBOSE: ../../../../external/katana/libgalois/include/katana/AsyncPool.h:152: 3148: async pool tids: 140215895369472, 140215886976768
VERBOSE: ../../../../external/katana/libgalois/include/katana/AsyncPool.h:152: 3148: async pool tids: 140215878584064, 140215861802752
VERBOSE: ../../../../external/katana/libgalois/include/katana/AsyncPool.h:152: 3148: async pool tids: 140215845021440, 140215842920192


<katana_enterprise.remote.sync_wrappers.Client object at 0x7f86d4f0f3d0>


In [5]:
#  CONNECT TO GRAPH

# Scan the client's graphs for the one named GRAPH_NAME. Only the id is
# recorded during the scan; the handle is fetched once afterwards, which keeps
# the "last match wins" result without one lookup per match.
matching_graph_id = None
for l_graph in my_client.graphs():
    if l_graph.name == GRAPH_NAME:
        matching_graph_id = l_graph.graph_id

# Fail loudly instead of printing an undefined (or stale, from an earlier
# cell run) `my_graph` when no graph carries the requested name.
if matching_graph_id is None:
    raise LookupError(f"No graph named {GRAPH_NAME!r} was found")

my_graph = my_client.get_database(name=DB_NAME).get_graph_by_id(id=matching_graph_id)

print(my_graph)

<_Graph my_graph, J9qMFV5XdTMTuq5vbVgZwWnNrfH77rRFM8NMHbPAEmKC, 1>


In [None]:
%%time
reverse_edges = True

# (source column, destination column, edge type) for each direction to load.
# The reversed direction is only added when `reverse_edges` is set.
edge_directions = [("txId1", "txId2", "tx_flow")]
if reverse_edges:
    edge_directions.append(("txId2", "txId1", "rev_tx_flow"))

with import_data.DataFrameImporter(graph) as df_importer:
    # Nodes: one per transaction, labelled by `node_type`, carrying the
    # properties declared in `feat_types`.
    df_importer.nodes_dataframe(
        nodes,
        id_column="txId",
        id_space="transaction",
        property_columns=feat_types,
        label_column="node_type",
    )

    # Edges: the same edge list is registered once per direction.
    for src_col, dst_col, edge_type in edge_directions:
        df_importer.edges_dataframe(
            edges,
            source_id_space="transaction",
            destination_id_space="transaction",
            source_column=src_col,
            destination_column=dst_col,
            type=edge_type,
        )

print("--")
    

In [None]:
# Report the graph size; each count is fetched right before it is printed.
for label, count_fn in (("nodes", graph.num_nodes), ("edges", graph.num_edges)):
    print(f"Number of {label}: {count_fn():,}")

print("--")

## Visualization

View the `tx_flow` relationships between transactions (the query below returns at most 500 matches).

In [None]:
%%time
# Match forward transaction-flow edges (capped at 500 rows) and render them.
tx_flow_query = """
MATCH (tx1)-[:tx_flow]->(tx2)
RETURN tx1, tx2
LIMIT 500
"""
graph.query(tx_flow_query, contextualize=True).visualize()

## Feature Initialization

To initialize features for the GNN, we will combine all node properties into a feature vector and save as a new feature on the graph. Katana Graph supports saving binary feature vectors as individual properties on the graph. In this case, we save 3 different feature vectors: 

1. ```local_feats``` - raw features provided for each transaction
2. ```agg_feats```  - aggregated features from each node's neighborhood
3. ```h_init``` - both ```local_feats``` + ```agg_feats``` combined into one feature. This will be the starting point for our GNN

In [None]:
%%time
def run_feature_init(g, katana_ai_dir="/home/gsteck_katanagraph_com/solutions/fsi/src"):
    """Build the GNN input feature vectors and the train/test split on ``g``.

    The local and aggregated node properties are read from the graph, joined
    along the last axis, and stored as three features: ``h_init`` (both
    groups combined), ``local_feats`` and ``agg_feats``. A split mask is then
    created with proportions ``[0.8, 0.15, 0.05]`` and the graph is written.

    Args:
        g: Graph object handed to the ``katana_ai`` helpers.
        katana_ai_dir: Directory containing the ``katana_ai`` module. Defaults
            to the location this notebook has always used.
    """
    import sys
    import numpy as np

    # Avoid piling up duplicate entries when the cell is re-run.
    if katana_ai_dir not in sys.path:
        sys.path.append(katana_ai_dir)
    from katana_ai import get_node_property_list, train_test_split_mask, save_features_to_graph

    # extract features
    local_feats = get_node_property_list(g, property_list=local_feats_name)
    agg_feats = get_node_property_list(g, property_list=agg_feats_name)
    feat_vec = np.concatenate([local_feats, agg_feats], axis=-1)

    # save new features vector to graph
    g = save_features_to_graph(g, feat_vec, feature_name="h_init")
    g = save_features_to_graph(g, local_feats, feature_name="local_feats")
    g = save_features_to_graph(g, agg_feats, feature_name="agg_feats")

    # create train/test split mask
    g = train_test_split_mask(g, train_test_validation_split=[0.8, 0.15, 0.05])
    g.write()

graph.run(lambda g: run_feature_init(g))

print("--")

In [6]:

def run_feature_init(g):
    """Check that the ``katana_ai`` helpers can be imported from ``/tmp``.

    Nothing is done with ``g``; the function only extends ``sys.path`` and
    performs the import.
    """
    import os
    import sys

    module_dir = os.path.join("/tmp")
    sys.path.append(module_dir)

    from katana_ai import (
        get_node_property_list,
        visualize_embeddings,
        train_test_split_mask,
        save_features_to_graph,
    )

my_graph.run(lambda worker_graph: run_feature_init(worker_graph))

print("--")


          0/? [?op/s]


Host 0 errors:
Traceback (most recent call last):
  File "/opt/miniconda/lib/python3.8/site-packages/katana_enterprise/worker/worker.py", line 85, in execute
    value = function(graph)
  File "/tmp/ipykernel_3148/3080334400.py", line 7, in <lambda>
  File "/tmp/ipykernel_3148/3080334400.py", line 5, in run_feature_init
ModuleNotFoundError: No module named 'katana_ai'


ModuleNotFoundError: No module named 'katana_ai'

## Analyze Features

Generate run ID for Tensorboard

In [None]:
# Random 32-character lowercase hex identifier used to name this run.
run_id = str(uuid.uuid4()).replace("-", "")

Analyze the initial feature vectors in TensorBoard on the ```Projector``` tab, where they can be visualized with t-SNE or UMAP plots.

In [None]:
def analyze_features(g, katana_ai_dir="/home/gsteck_katanagraph_com/solutions/fsi/src"):
    """Write the initial feature vectors to TensorBoard.

    ``visualize_embeddings`` is called once for each of ``h_init``,
    ``local_feats`` and ``agg_feats``, always with ``target`` as the target
    property, filtered to ``Classified_Txn`` nodes, with a sample size of 2000.
    Output goes to ``/tmp/tensorboard/elliptic-embed-init-<run_id>``.

    Args:
        g: Graph object handed to ``visualize_embeddings``.
        katana_ai_dir: Directory containing the ``katana_ai`` module. Defaults
            to the location this notebook has always used.
    """
    import sys

    # Avoid piling up duplicate entries when the cell is re-run.
    if katana_ai_dir not in sys.path:
        sys.path.append(katana_ai_dir)
    from katana_ai import visualize_embeddings
    from torch.utils.tensorboard import SummaryWriter

    writer = SummaryWriter(f"/tmp/tensorboard/elliptic-embed-init-{run_id}")
    try:
        # analyze features in tensorboard
        for feature_name in ("h_init", "local_feats", "agg_feats"):
            visualize_embeddings(
                g,
                writer,
                feature_name=feature_name,
                target_name="target",
                filter_node_type="Classified_Txn",
                sample_size=2000,
            )
    finally:
        # Close the writer so pending events are flushed to disk even when a
        # visualization call fails.
        writer.close()

graph.run(lambda g: analyze_features(g))
print("See results at https://demo-finance-tensorboard.katanagraph.com/")
print(f"Run ID: elliptic-embed-init-{run_id}")

## GNN Training

In [None]:
# Settings consumed by the GNN training function, kept in one mapping and
# exposed as attributes through an argparse namespace.
_gnn_settings = {
    # graph properties to read
    "feat_name": "h_init",
    "label_name": "target",
    "label_dtype": numpy.float32,
    "split_name": "train_test_val_mask",
    # execution and output locations
    "distributed_execution": True,
    "tensorboard_dir": f"/tmp/tensorboard/elliptic-remote-{run_id}",
    "model_dir": "/tmp/models",
    "katana_ai_dir": "/home/gsteck_katanagraph_com/solutions/fsi/src",
    # nodes to predict on
    "pred_node_label": "Classified_Txn",
    "pred_node_label_prop": "node_type",
    # loss weighting and model shape
    "pos_weight": 8,
    "in_dim": 165,
    "hidden_dim": 256,
    "train_fan_in": "100,100,100,100",
    "test_fan_in": "100,100,100,100",
    "num_layers": 4,
    "out_dim": 1,
    # optimisation schedule
    "minibatch_size": 1024,
    "max_minibatches": 20,
    "lr": 0.001,
    "dropout": 0.2,
    "num_epochs": 100,
}
args = argparse.Namespace(**_gnn_settings)

In [None]:
def run_gnn(graph, args):
    """Train a GraphSAGE node classifier on ``graph`` using the settings in ``args``.

    The function builds train and test subgraph samplers and data loaders,
    defines the ``DistSAGE`` model, optionally wraps it for distributed
    training, and hands everything to ``katana_ai.train_model``.

    Args:
        graph: Graph handle passed to ``get_split`` and to the samplers.
        args: Namespace of settings. Attributes read here: ``katana_ai_dir``,
            ``distributed_execution``, ``tensorboard_dir``, ``split_name``,
            ``pred_node_label``, ``pred_node_label_prop``, ``train_fan_in``,
            ``test_fan_in`` (optional; falls back to ``train_fan_in``),
            ``load_edge_types`` (optional; only forwarded to the test sampler
            when present), ``max_minibatches``, ``feat_name``, ``label_name``,
            ``label_dtype``, ``minibatch_size``, ``in_dim``, ``hidden_dim``,
            ``out_dim``, ``num_layers``, ``dropout``, ``lr`` and
            ``pos_weight``. The whole namespace is also forwarded to
            ``train_model``.
    """
    import torch
    import katana
    from katana_enterprise.distributed.pytorch import init_workers
    from katana_enterprise.ai.data import PyGNodeSubgraphSampler, SampledSubgraphConfig
    from katana_enterprise.ai.data import NodeDataLoader
    from torch_geometric.nn import SAGEConv
    from torch.nn.parallel import DistributedDataParallel as torch_DDP
    from torch.utils.tensorboard import SummaryWriter
    import sys, os
    sys.path.append(os.path.join(args.katana_ai_dir))
    from katana_ai import get_split, train_model
    os.environ['MODIN_ENGINE']='python'
    #katana.distributed.initialize()

    katana.set_active_threads(32)
    exec_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # model definition
    class DistSAGE(torch.nn.Module):
        """Stack of ``SAGEConv`` layers with batch norm, ReLU and dropout between them.

        The stack always has an input layer and an output layer, plus
        ``num_layers - 2`` hidden layers in between (none when
        ``num_layers`` is 2 or less).
        """

        def __init__(self, in_dim, hidden_dim, out_dim, num_layers,
                     dropout):
            super().__init__()

            self.convs = torch.nn.ModuleList()
            self.convs.append(SAGEConv(in_dim, hidden_dim))
            self.bns = torch.nn.ModuleList()
            self.bns.append(torch.nn.BatchNorm1d(hidden_dim))
            for _ in range(num_layers - 2):
                self.convs.append(SAGEConv(hidden_dim, hidden_dim))
                self.bns.append(torch.nn.BatchNorm1d(hidden_dim))
            # The output layer has no batch norm after it.
            self.convs.append(SAGEConv(hidden_dim, out_dim))
            self.activation = torch.nn.functional.relu
            self.dropout = torch.nn.Dropout(dropout)

        def reset_parameters(self):
            """Reset the parameters of every convolution and batch-norm layer."""
            for conv in self.convs:
                conv.reset_parameters()
            for bn in self.bns:
                bn.reset_parameters()

        def forward(self, data):
            """Return ``(output, embed)`` for one sampled batch.

            ``output`` is the result of the last convolution; ``embed`` is the
            activation produced by the last layer before it.
            """
            # unpack data loader
            x, edges = data.x, data.adjs
            # Bound up front so the return below never depends on the loop
            # having taken the non-final branch.
            embed = x
            last_layer = len(self.convs) - 1

            for i, conv in enumerate(self.convs):
                # for multilayer, set x_target for each layer: the first
                # dest_count[i] rows of x are passed as the target features
                x_target = x[:data.dest_count[i]]
                x = conv((x, x_target), edges[i])
                if i != last_layer:
                    x = self.bns[i](x)
                    x = self.activation(x)
                    x = self.dropout(x)
                    embed = x
            return x, embed

    def _build_sampler(fan_in_spec, **extra_config):
        """Create a subgraph sampler for a comma-separated per-layer fan-in string."""
        return PyGNodeSubgraphSampler(
            graph,
            SampledSubgraphConfig(
                layer_fan=[int(fan_in) for fan_in in fan_in_spec.split(',')],
                max_minibatches=args.max_minibatches,
                batch_props_to_pull=args.max_minibatches,
                feat_prop_name=args.feat_name,
                label_prop_name=args.label_name,
                label_dtype=args.label_dtype,
                multilayer_export=True,
                **extra_config,
            ),
        )

    # initialize torch mpi process
    if args.distributed_execution:
        init_workers()

    # tensorboard writer
    writer = SummaryWriter(args.tensorboard_dir)

    try:
        # split train / test node idx
        train_nodes = get_split(graph, 0, split_name=args.split_name, node_label=args.pred_node_label, node_label_prop=args.pred_node_label_prop)
        test_nodes = get_split(graph, 1, split_name=args.split_name, node_label=args.pred_node_label, node_label_prop=args.pred_node_label_prop)

        # initialize the multiminibatch sampler
        train_sampler = _build_sampler(args.train_fan_in)

        # Sampler used for evaluation. It takes its per-hop fan-in from
        # ``test_fan_in``; namespaces that predate that setting fall back to
        # the training fan-in, which is what was used before.
        test_sampler_extras = {}
        if hasattr(args, "load_edge_types"):
            # Only forwarded when configured: reading the attribute
            # unconditionally raised AttributeError for namespaces without it.
            test_sampler_extras["pull_edge_types"] = args.load_edge_types
        test_sampler = _build_sampler(
            getattr(args, "test_fan_in", args.train_fan_in),
            **test_sampler_extras,
        )

        # shuffle seeds between epochs + balance seed nodes across hosts
        train_dataloader = NodeDataLoader(
            train_sampler,
            local_batch_size=args.minibatch_size,
            node_ids=train_nodes,
            shuffle=True,
            drop_last=True,
            balance_seeds=True)
        test_dataloader = NodeDataLoader(
            test_sampler,
            local_batch_size=args.minibatch_size,
            node_ids=test_nodes,
            balance_seeds=True)

        # model initialization
        model = DistSAGE(
            in_dim=args.in_dim,
            hidden_dim=args.hidden_dim,
            out_dim=args.out_dim,
            num_layers=args.num_layers,
            dropout=args.dropout
        ).to(exec_device)

        if args.distributed_execution:
            model = torch_DDP(model)

        # optimizer and loss fn
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        # The class weight is created on the same device as the model so the
        # loss does not mix CPU and CUDA tensors.
        loss_function = torch.nn.BCEWithLogitsLoss(
            pos_weight=torch.tensor([args.pos_weight], dtype=torch.float32, device=exec_device)
        )

        # train model
        train_model(model, loss_function, optimizer, writer, train_dataloader, test_dataloader, args)
    finally:
        # Flush and release the TensorBoard writer even if training fails.
        writer.close()

    # save model (currently disabled)
    #ts = time.time()
    #torch.save(model.state_dict(), os.path.join(args.model_dir, 'graph_sage.'+str(ts)+'.pth'))
