# Example notebook for GNN

## Notebook configuration

In [None]:
# Load some libraries
import os 
import sys
import pandas as pd
import numpy as np

# Set up system path, and import our custom modules
# helpers: for cell timer
# pipeline: all data preprocessing
# model: for GNN model & trainer
sys.path.append(os.path.abspath(os.path.join("..", "..", "src")))
from helpers import add_cell_timer
from pipeline import ModelPipeline
import model
add_cell_timer()

data_file = "../../data/subset_transactions2.csv"

## Load and preprocess data

In [3]:
pl = ModelPipeline(data_file)

In [None]:
pl.rename_columns()
pl.drop_duplicates()
pl.check_for_null()
pl.extract_currency_features()
pl.extract_time_features()
pl.create_unique_ids()
pl.extract_additional_time_features()
pl.cyclical_encoding()
pl.apply_one_hot_encoding()



## Split data into train/val/test, and continue with split-specific feature engineering
There are some features that, if engineered or standardized using the whole dataset, could result in data leakage between our train/val/test sets. Therefore, we must split the data prior to these calculations. 

In [None]:
# Temporal split for edges
pl.split_train_test_val(split_type="temporal_agg")

✅ Computed node features for train with 107090 nodes.
✅ Computed node features for val with 107355 nodes.
✅ Computed node features for test with 107583 nodes.


### Create node features
Node features are specific to accounts, and include graph based features like pagerank and degree centrality, as well as some aggregate statistics such as net flow (total amount sent-total amount received for a specific account). 

In [None]:
# Compute node features split-specifically
pl.compute_split_specific_node_features()

# Scale only relevant node features (others like pagerank left raw)
_,_,_, = pl.scale_node_data_frames()

print(pl.df.train_nodes.columns) # print node features to peek

### Create graph objects (GNN specific processes)
The `split_train_test_val_graph()` function uses the data split above, and creates PyG-style Data objects. PyG-style Data objects have features like:

- x: node (account) features (without column for "node_id", mind you--so these must be properly sorted and align with our unique edge indexers)
- edge_index: a [2, num_transactions] tensor containing the accounts involved in each transaction
- edge_attr: the edge (transaction) features, listed above, including things like amount, temporal features, and payment type
- y: our labels -- 'is_laundering' column, associated with each transaction

Another feature of our `split_train_test_val_graph` function is reordering columns such that we have 'edge_id' as the first column -- this is important for how our model works, since we use edge_id to determine which transactions to evaluate during model training, but then drop the column before passing the transactions into the model. 

In [None]:
# Convert into PyG-style Data objects
_,_,_,_,_,_,_,_,= pl.split_train_test_val_graph()

(tensor([     0,      1,      2,  ..., 875627, 875628, 875629],
        dtype=torch.int32),
 tensor([ 875630,  875631,  875632,  ..., 1063262, 1063263, 1063264],
        dtype=torch.int32),
 tensor([1063265, 1063266, 1063267,  ..., 1250898, 1250899, 1250900],
        dtype=torch.int32),
 Data(x=[107090, 9], edge_index=[2, 875630], edge_attr=[875630, 50], y=[875630]),
 Data(x=[107355, 9], edge_index=[2, 1063265], edge_attr=[1063265, 50], y=[1063265]),
 Data(x=[107583, 9], edge_index=[2, 1250901], edge_attr=[1250901, 50], y=[1250901]),
 tensor([[    0,     1,     2,  ..., 14601, 14601, 14601],
         [    0,     1,     2,  ..., 14601, 59038, 45303]]),
 tensor([0, 0, 0,  ..., 0, 1, 1]))

In [8]:
pl.edge_features

['edge_id',
 'day_cos',
 'day_sin',
 'hour_of_day',
 'is_weekend',
 'log_exchange_rate',
 'payment_type_ACH',
 'payment_type_Bitcoin',
 'payment_type_Cash',
 'payment_type_Cheque',
 'payment_type_Credit Card',
 'payment_type_Reinvestment',
 'payment_type_Wire',
 'received_amount',
 'received_currency_Australian Dollar',
 'received_currency_Bitcoin',
 'received_currency_Brazil Real',
 'received_currency_Canadian Dollar',
 'received_currency_Euro',
 'received_currency_Mexican Peso',
 'received_currency_Ruble',
 'received_currency_Rupee',
 'received_currency_Saudi Riyal',
 'received_currency_Shekel',
 'received_currency_Swiss Franc',
 'received_currency_UK Pound',
 'received_currency_US Dollar',
 'received_currency_Yen',
 'received_currency_Yuan',
 'sent_amount',
 'sent_amount_usd',
 'sent_currency_Australian Dollar',
 'sent_currency_Bitcoin',
 'sent_currency_Brazil Real',
 'sent_currency_Canadian Dollar',
 'sent_currency_Euro',
 'sent_currency_Mexican Peso',
 'sent_currency_Ruble',
 'sen

In [9]:
pl.scale_edge_features(edge_features_to_scale=['sent_amount_usd','timestamp_scaled'])

{'sent_amount_usd': StandardScaler(), 'timestamp_scaled': StandardScaler()}

In [10]:
pl.scaled_edge_features

['sent_amount_usd', 'timestamp_scaled']

In [11]:
# Prepare data loaders for training
train_loader, val_loader, test_loader, *_ = pl.get_data_loaders()

In [17]:
import model
mymodel = model.GINe(n_node_feats=pl.train_data.x.shape[0], n_edge_feats=pl.train_data.edge_attr.shape[0]-1)
trainer = model.GNNTrainer(
    model=mymodel,
    train_loader=pl.train_loader,
    val_loader=pl.val_loader,
    test_loader=pl.test_loader,
    train_indices=pl.train_indices,
    val_indices=pl.val_indices,
    test_indices=pl.test_indices,
    df=pl.df
)
print(mymodel)

GINe(
  (node_emb): Linear(in_features=107090, out_features=100, bias=True)
  (edge_emb): Linear(in_features=875629, out_features=100, bias=True)
  (convs): ModuleList(
    (0-1): 2 x GINEConv(nn=Sequential(
      (0): Linear(in_features=100, out_features=100, bias=True)
      (1): ReLU()
      (2): Linear(in_features=100, out_features=100, bias=True)
    ))
  )
  (emlps): ModuleList(
    (0-1): 2 x Sequential(
      (0): Linear(in_features=300, out_features=100, bias=True)
      (1): ReLU()
      (2): Linear(in_features=100, out_features=100, bias=True)
    )
  )
  (batch_norms): ModuleList(
    (0-1): 2 x BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (mlp): Sequential(
    (0): Linear(in_features=300, out_features=50, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.10527690625126304, inplace=False)
    (3): Linear(in_features=50, out_features=25, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.10527690625126304, inplace=False)
    (6): Linear(



In [16]:
trainer.train(epochs=1)

Epoch 1 Training:   0%|          | 0/107 [00:00<?, ?it/s]


ImportError: 'NeighborSampler' requires either 'pyg-lib' or 'torch-sparse'