# Example notebook for GNN

## Notebook configuration

In [None]:
# Load some libraries
import os 
import sys
import pandas as pd
import numpy as np

# Set up system path, and import our custom modules
# helpers: for cell timer
# pipeline: all data preprocessing
# model: for GNN model & trainer
sys.path.append(os.path.abspath(os.path.join("..", "..", "src")))
from helpers import add_cell_timer
from pipeline import BaseModelPipeline
from pipeline.gnn_pipeline import GNNModelPipeline
from pipeline.catboost_pipeline import CatBoostPipeline
import model
add_cell_timer()

data_file = "../../data/subset_transactions2.csv"

In [2]:
# ### For reloading updated modules
# import pipeline.gnn_pipeline
# import importlib
# importlib.reload(pipeline.gnn_pipeline)
# from pipeline.gnn_pipeline import GNNModelPipeline

# ### For reloading updated modules
# importlib.reload(pipeline)
# from pipeline import ModelPipeline

## Load and preprocess data

In [3]:
pl = GNNModelPipeline(data_file)

In [4]:
pl.rename_columns()
pl.drop_duplicates()
pl.check_for_null()
pl.extract_currency_features()
pl.extract_time_features()
pl.create_unique_ids()
pl.extract_additional_time_features()
pl.cyclical_encoding()
pl.apply_one_hot_encoding()



## Split data into train/val/test, and continue with split-specific feature engineering
There are some features that, if engineered or standardized using the whole dataset, could result in data leakage between our train/val/test sets. Therefore, we must split the data prior to these calculations. 

In [5]:
import inspect
# Check default split mehod for GNN
print(inspect.getsource(pl.split_train_test_val))

# Temporal split for edges
pl.split_train_test_val()

    def split_train_test_val(self, X_cols=None, y_col="is_laundering", test_size=0.15, val_size=0.15, split_type="temporal_agg"):
            return super().split_train_test_val(X_cols, y_col, test_size, val_size, split_type)



### Create node features
Node features are specific to accounts, and include graph based features like pagerank and degree centrality, as well as some aggregate statistics such as net flow (total amount sent-total amount received for a specific account). 

In [6]:
# Compute node features split-specifically
pl.compute_split_specific_node_features()

# Scale only relevant node features (others like pagerank left raw)
pl.scale_node_data_frames()

print(pl.train_nodes.columns) # print node features to peek

✅ Computed node features for train with 107090 nodes.
✅ Computed node features for val with 107355 nodes.
✅ Computed node features for test with 107583 nodes.
Index(['node_id', 'degree_centrality', 'pagerank', 'net_flow', 'avg_txn_out',
       'avg_txn_in', 'std_txn_out', 'std_txn_in', 'num_unique_out_partners',
       'num_unique_in_partners'],
      dtype='object')


### Create graph objects (GNN specific processes)
The `split_train_test_val_graph()` function uses the data split above, and creates PyG-style Data objects. PyG-style Data objects have features like:

- x: node (account) features (without column for "node_id", mind you--so these must be properly sorted and align with our unique edge indexers)
- edge_index: a [2, num_transactions] tensor containing the accounts involved in each transaction
- edge_attr: the edge (transaction) features, listed above, including things like amount, temporal features, and payment type
- y: our labels -- 'is_laundering' column, associated with each transaction

Another feature of our `split_train_test_val_graph` function is reordering columns such that we have 'edge_id' as the first column -- this is important for how our model works, since we use edge_id to determine which transactions to evaluate during model training, but then drop the column before passing the transactions into the model. 

In [7]:
print(pl.X_cols)
edge_feats = ['edge_id'] + list(set(pl.X_cols) - set(['timestamp_int','hour_of_day','is_weekend','edge_id','sent_amount','received_amount']))
print(edge_feats)

['day_cos', 'day_sin', 'edge_id', 'hour_of_day', 'is_weekend', 'log_exchange_rate', 'payment_type_ACH', 'payment_type_Bitcoin', 'payment_type_Cash', 'payment_type_Cheque', 'payment_type_Credit Card', 'payment_type_Reinvestment', 'payment_type_Wire', 'received_amount', 'received_currency_Australian Dollar', 'received_currency_Bitcoin', 'received_currency_Brazil Real', 'received_currency_Canadian Dollar', 'received_currency_Euro', 'received_currency_Mexican Peso', 'received_currency_Ruble', 'received_currency_Rupee', 'received_currency_Saudi Riyal', 'received_currency_Shekel', 'received_currency_Swiss Franc', 'received_currency_UK Pound', 'received_currency_US Dollar', 'received_currency_Yen', 'received_currency_Yuan', 'sent_amount', 'sent_amount_usd', 'sent_currency_Australian Dollar', 'sent_currency_Bitcoin', 'sent_currency_Brazil Real', 'sent_currency_Canadian Dollar', 'sent_currency_Euro', 'sent_currency_Mexican Peso', 'sent_currency_Ruble', 'sent_currency_Rupee', 'sent_currency_Saud

In [8]:
# Convert into PyG-style Data objects
pl.split_train_test_val_graph(edge_features=edge_feats)
print(pl.edge_features)

['edge_id', 'sent_currency_Euro', 'sent_currency_Mexican Peso', 'log_exchange_rate', 'payment_type_Reinvestment', 'received_currency_Saudi Riyal', 'received_currency_Yen', 'time_of_day_cos', 'sent_currency_Australian Dollar', 'received_currency_Yuan', 'sent_currency_Canadian Dollar', 'sent_currency_UK Pound', 'payment_type_Cheque', 'sent_currency_Rupee', 'received_currency_Shekel', 'day_sin', 'sent_currency_Brazil Real', 'sent_currency_Bitcoin', 'received_currency_Mexican Peso', 'sent_currency_Yuan', 'timestamp_scaled', 'received_currency_UK Pound', 'time_of_day_sin', 'received_currency_Euro', 'sent_currency_Saudi Riyal', 'received_currency_US Dollar', 'payment_type_Cash', 'payment_type_Credit Card', 'received_currency_Bitcoin', 'time_diff_to', 'payment_type_ACH', 'payment_type_Wire', 'time_diff_from', 'received_currency_Ruble', 'sent_currency_Ruble', 'sent_currency_Swiss Franc', 'received_currency_Swiss Franc', 'sent_currency_Shekel', 'sent_currency_US Dollar', 'turnaround_time', 'rec

In [9]:
pl.scale_edge_features(edge_features_to_scale=['sent_amount_usd','timestamp_scaled','time_diff_from','time_diff_to','turnaround_time'])
print(pl.scaled_edge_features)

['sent_amount_usd', 'timestamp_scaled', 'time_diff_from', 'time_diff_to', 'turnaround_time']


In [10]:
# Prepare data loaders for training
pl.get_data_loaders()

In [11]:
print(inspect.getsource(pl.initialize_training))

    def initialize_training(
        self,
        threshold: float=0.5,
        epochs: int=50,
        patience: int=10,
    ) -> None:
        """Setup the model pipeline for training: metrics, model,
        optimizer, scheduler, and criterion
        """
        self.threshold = threshold
        self.epochs = epochs
        self.patience = patience
        
        

        # Since `initialize_training` is run after preprocessing is
        # done, we can define the node and edge features here. This
        # does assume that column ordering between data frames and
        # tensors is preserved, and it removes node and edge id
        # TODO: ran into issue with this line bc node_id has already been dropped
        if 'node_id' in self.nodes.columns:
            self.node_feature_labels = self.nodes.drop(columns="node_id").columns
        else:
            self.node_feature_labels = self.nodes.columns
        self.edge_feature_labels = self.df[self.edge_features].drop(columns="

In [12]:
pl.initialize_training(epochs=1)



In [13]:
print(pl.model)

GINe(
  (node_emb): Linear(in_features=9, out_features=100, bias=True)
  (edge_emb): Linear(in_features=47, out_features=100, bias=True)
  (convs): ModuleList(
    (0-1): 2 x GINEConv(nn=Sequential(
      (0): Linear(in_features=100, out_features=100, bias=True)
      (1): ReLU()
      (2): Linear(in_features=100, out_features=100, bias=True)
    ))
  )
  (emlps): ModuleList(
    (0-1): 2 x Sequential(
      (0): Linear(in_features=300, out_features=100, bias=True)
      (1): ReLU()
      (2): Linear(in_features=100, out_features=100, bias=True)
    )
  )
  (batch_norms): ModuleList(
    (0-1): 2 x BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (mlp): Sequential(
    (0): Linear(in_features=300, out_features=50, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.10527690625126304, inplace=False)
    (3): Linear(in_features=50, out_features=25, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.10527690625126304, inplace=False)
    (6): Linear(in_featur

In [14]:
pl.trainer.train()

Epoch 1 Training:   0%|          | 0/107 [00:00<?, ?it/s]


ImportError: 'NeighborSampler' requires either 'pyg-lib' or 'torch-sparse'