# Example notebook for GNN

## Notebook configuration

### For VSCode

In [1]:
# Load some libraries
# import os
# import sys
# import pandas as pd
# import numpy as np

# # Set up system path, and import our custom modules
# # helpers: for cell timer
# # pipeline: all data preprocessing
# # model: for GNN model & trainer
# sys.path.append(os.path.abspath(os.path.join("..", "..", "src")))
# from helpers import add_cell_timer
# from pipeline import ModelPipeline
# from pipeline.gnn_pipeline import GNNPipeline
# from pipeline.catboost_pipeline import CatBoostPipeline
# import model
# add_cell_timer()

# data_file = "../../data/subset_transactions2.csv"

### For colab

In [2]:
import random
import networkx as nx
import pandas as pd
import numpy as np
import ipywidgets as widgets
import os
import sys
import matplotlib.pyplot as plt
import warnings
from tabulate import tabulate
from tqdm import trange
from IPython import get_ipython
from IPython.display import display
from time import monotonic
from pprint import pprint
from google.colab import drive
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss, Sequential, Linear, ReLU
!pip install torch==2.5.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install torch-scatter torch-sparse pyg-lib torch-geometric \
  -f https://data.pyg.org/whl/torch-2.5.1+cu118.html

# !pip install torch==2.5.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
# !pip install torch-scatter torch-sparse pyg-lib torch-geometric -f https://data.pyg.org/whl/torch-2.5.1+cpu.html

from torch_geometric.nn import GINEConv
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader, LinkNeighborLoader
!pip install torchmetrics
from torchmetrics.classification import BinaryAccuracy, BinaryPrecision, BinaryRecall, BinaryF1Score, BinaryAveragePrecision
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

content_base = "/content/drive"
drive.mount(content_base)

# Project data
data_dir = os.path.join(content_base, "My Drive/Capstone/data")
data_file = os.path.join(data_dir, "subset_transactions2.csv")

Looking in indexes: https://download.pytorch.org/whl/cu118
Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu118.html
Collecting torch-scatter
  Using cached https://data.pyg.org/whl/torch-2.5.0%2Bcu118/torch_scatter-2.1.2%2Bpt25cu118-cp311-cp311-linux_x86_64.whl (10.3 MB)
Collecting torch-sparse
  Using cached https://data.pyg.org/whl/torch-2.5.0%2Bcu118/torch_sparse-0.6.18%2Bpt25cu118-cp311-cp311-linux_x86_64.whl (5.0 MB)
Collecting pyg-lib
  Using cached https://data.pyg.org/whl/torch-2.5.0%2Bcu118/pyg_lib-0.4.0%2Bpt25cu118-cp311-cp311-linux_x86_64.whl (2.6 MB)
Collecting torch-geometric
  Using cached torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
Using cached torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
Installing collected packages: torch-scatter, pyg-lib, torch-sparse, torch-geometric
Successfully installed pyg-lib-0.4.0+pt25cu118 torch-geometric-2.6.1 torch-scatter-2.1.2+pt25cu118 torch-sparse-0.6.18+pt25cu118
Collecting torchmetrics
  Downloading torchmetric

In [3]:
# Google Colaboratory executes in an environment with a file system
# that has a Linux topography, but where the user should work under
# the `/content` directory
COLAB_ROOT = "/content"

REPO_URL = "https://github.com/engie4800/dsi-capstone-spring-2025-TD-anti-money-laundering.git"
REPO_ROOT = os.path.join(COLAB_ROOT, REPO_URL.split("/")[-1].split(".")[0])
REPO_BRANCH = "sophie"

# Clones the repository at `/content/dsi-capstone-spring-2025-TD-anti-money-laundering`
if not os.path.exists(REPO_ROOT):
  os.chdir(COLAB_ROOT)
  !git clone {REPO_URL}

# Pulls the latest code from the provided branch and adds the
# analysis pipeline source code to the Python system path
os.chdir(REPO_ROOT)
!git pull
!git checkout {REPO_BRANCH}
sys.path.append(os.path.join(REPO_ROOT, "Code/src"))
os.chdir(COLAB_ROOT)

# Set up system path, and import our custom modules
# helpers: for cell timer
# pipeline: all data preprocessing
# model: for GNN model & trainer
sys.path.append(os.path.abspath(os.path.join("..", "..", "src")))
from helpers import add_cell_timer
from pipeline import ModelPipeline
from pipeline.gnn_pipeline import GNNPipeline
from pipeline.catboost_pipeline import CatBoostPipeline
import model
add_cell_timer()

Cloning into 'dsi-capstone-spring-2025-TD-anti-money-laundering'...
remote: Enumerating objects: 1124, done.[K
remote: Counting objects: 100% (366/366), done.[K
remote: Compressing objects: 100% (191/191), done.[K
remote: Total 1124 (delta 261), reused 222 (delta 175), pack-reused 758 (from 1)[K
Receiving objects: 100% (1124/1124), 71.33 MiB | 19.88 MiB/s, done.
Resolving deltas: 100% (622/622), done.
Updating files: 100% (99/99), done.
Already up to date.
Branch 'sophie' set up to track remote branch 'sophie' from 'origin'.
Switched to a new branch 'sophie'


## Load and preprocess data

In [4]:
# Project data
data_dir = os.path.join(content_base, "My Drive/Capstone/data")
data_file = os.path.join(data_dir, "subset_transactions2.csv")

In [5]:
pl = GNNPipeline(data_file)

In [6]:
pl.rename_columns()
pl.drop_duplicates()
pl.check_for_null()
pl.extract_currency_features()
pl.extract_time_features()
pl.create_unique_ids()
pl.extract_additional_time_features()
pl.cyclical_encoding()
pl.apply_one_hot_encoding()

## Split data into train/val/test, and continue with split-specific feature engineering
There are some features that, if engineered or standardized using the whole dataset, could result in data leakage between our train/val/test sets. Therefore, we must split the data prior to these calculations.

In [7]:
import inspect
# Check default split mehod for GNN
print(inspect.getsource(pl.split_train_test_val))

# Temporal split for edges
pl.split_train_test_val()

    def split_train_test_val(self, X_cols=None, y_col="is_laundering", test_size=0.15, val_size=0.15, split_type="temporal_agg"):
            return super().split_train_test_val(X_cols, y_col, test_size, val_size, split_type)



### Create node features
Node features are specific to accounts, and include graph based features like pagerank and degree centrality, as well as some aggregate statistics such as net flow (total amount sent-total amount received for a specific account).

In [8]:
# Compute node features split-specifically
pl.compute_split_specific_node_features()

# Scale only relevant node features (others like pagerank left raw)
pl.scale_node_data_frames()

print(pl.train_nodes.columns) # print node features to peek

✅ Computed node features for train with 107090 nodes.
✅ Computed node features for val with 107355 nodes.
✅ Computed node features for test with 107583 nodes.
Index(['node_id', 'degree_centrality', 'pagerank', 'net_flow', 'avg_txn_out',
       'avg_txn_in', 'std_txn_out', 'std_txn_in', 'num_unique_out_partners',
       'num_unique_in_partners'],
      dtype='object')


### Create graph objects (GNN specific processes)
The `split_train_test_val_graph()` function uses the data split above, and creates PyG-style Data objects. PyG-style Data objects have features like:

- x: node (account) features (without column for "node_id", mind you--so these must be properly sorted and align with our unique edge indexers)
- edge_index: a [2, num_transactions] tensor containing the accounts involved in each transaction
- edge_attr: the edge (transaction) features, listed above, including things like amount, temporal features, and payment type
- y: our labels -- 'is_laundering' column, associated with each transaction

Another feature of our `split_train_test_val_graph` function is reordering columns such that we have 'edge_id' as the first column -- this is important for how our model works, since we use edge_id to determine which transactions to evaluate during model training, but then drop the column before passing the transactions into the model.

In [9]:
print(pl.X_cols)
edge_feats = ['edge_id'] + list(set(pl.X_cols) - set(['timestamp_int','hour_of_day','is_weekend','edge_id','sent_amount','received_amount']))
print(edge_feats)

['day_cos', 'day_sin', 'edge_id', 'hour_of_day', 'is_weekend', 'log_exchange_rate', 'payment_type_ACH', 'payment_type_Bitcoin', 'payment_type_Cash', 'payment_type_Cheque', 'payment_type_Credit Card', 'payment_type_Reinvestment', 'payment_type_Wire', 'received_amount', 'received_currency_Australian Dollar', 'received_currency_Bitcoin', 'received_currency_Brazil Real', 'received_currency_Canadian Dollar', 'received_currency_Euro', 'received_currency_Mexican Peso', 'received_currency_Ruble', 'received_currency_Rupee', 'received_currency_Saudi Riyal', 'received_currency_Shekel', 'received_currency_Swiss Franc', 'received_currency_UK Pound', 'received_currency_US Dollar', 'received_currency_Yen', 'received_currency_Yuan', 'sent_amount', 'sent_amount_usd', 'sent_currency_Australian Dollar', 'sent_currency_Bitcoin', 'sent_currency_Brazil Real', 'sent_currency_Canadian Dollar', 'sent_currency_Euro', 'sent_currency_Mexican Peso', 'sent_currency_Ruble', 'sent_currency_Rupee', 'sent_currency_Saud

In [10]:
# Convert into PyG-style Data objects
pl.split_train_test_val_graph(edge_features=edge_feats)
print(pl.edge_features)

['edge_id', 'received_currency_Rupee', 'received_currency_Bitcoin', 'time_diff_from', 'payment_type_Bitcoin', 'timestamp_scaled', 'day_cos', 'sent_currency_Euro', 'sent_currency_Shekel', 'payment_type_Credit Card', 'sent_currency_Saudi Riyal', 'received_currency_Brazil Real', 'sent_currency_Bitcoin', 'sent_currency_Brazil Real', 'time_of_day_cos', 'payment_type_Cheque', 'sent_currency_Canadian Dollar', 'turnaround_time', 'sent_currency_Yuan', 'sent_currency_Yen', 'received_currency_Canadian Dollar', 'payment_type_Cash', 'received_currency_Euro', 'sent_currency_UK Pound', 'received_currency_Yen', 'sent_currency_Mexican Peso', 'sent_currency_Australian Dollar', 'payment_type_Reinvestment', 'sent_currency_Ruble', 'received_currency_Yuan', 'received_currency_US Dollar', 'received_currency_Ruble', 'sent_currency_US Dollar', 'received_currency_Mexican Peso', 'payment_type_Wire', 'received_currency_Swiss Franc', 'day_sin', 'received_currency_UK Pound', 'received_currency_Australian Dollar', '

In [11]:
pl.scale_edge_features(edge_features_to_scale=['sent_amount_usd','timestamp_scaled','time_diff_from','time_diff_to','turnaround_time'])
print(pl.scaled_edge_features)

['sent_amount_usd', 'timestamp_scaled', 'time_diff_from', 'time_diff_to', 'turnaround_time']


In [12]:
# Prepare data loaders for training
pl.get_data_loaders()

In [13]:
print(inspect.getsource(pl.initialize_training))

    def initialize_training(
        self,
        threshold: float=0.5,
        epochs: int=50,
        patience: int=10,
    ) -> None:
        """Setup the model pipeline for training: metrics, model,
        optimizer, scheduler, and criterion
        """
        self.threshold = threshold
        self.epochs = epochs
        self.patience = patience
        
        

        # Since `initialize_training` is run after preprocessing is
        # done, we can define the node and edge features here. This
        # does assume that column ordering between data frames and
        # tensors is preserved, and it removes node and edge id
        # TODO: ran into issue with this line bc node_id has already been dropped
        if 'node_id' in self.nodes.columns:
            self.node_feature_labels = self.nodes.drop(columns="node_id").columns
        else:
            self.node_feature_labels = self.nodes.columns
        self.edge_feature_labels = self.df[self.edge_features].drop(columns="

In [14]:
pl.initialize_training(epochs=1)

In [15]:
print(pl.model)

GINe(
  (node_emb): Linear(in_features=9, out_features=100, bias=True)
  (edge_emb): Linear(in_features=47, out_features=100, bias=True)
  (convs): ModuleList(
    (0-1): 2 x GINEConv(nn=Sequential(
      (0): Linear(in_features=100, out_features=100, bias=True)
      (1): ReLU()
      (2): Linear(in_features=100, out_features=100, bias=True)
    ))
  )
  (emlps): ModuleList(
    (0-1): 2 x Sequential(
      (0): Linear(in_features=300, out_features=100, bias=True)
      (1): ReLU()
      (2): Linear(in_features=100, out_features=100, bias=True)
    )
  )
  (batch_norms): ModuleList(
    (0-1): 2 x BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (mlp): Sequential(
    (0): Linear(in_features=300, out_features=50, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.10527690625126304, inplace=False)
    (3): Linear(in_features=50, out_features=25, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.10527690625126304, inplace=False)
    (6): Linear(in_featur

In [16]:
pl.trainer.train()

Epoch 1 Training: 100%|██████████| 107/107 [00:30<00:00,  3.56it/s]


In [17]:
pl.device

'cuda'