# Practice GNN

## Notebook configuration

In [1]:
import random
import networkx as nx
import pandas as pd
import numpy as np
import ipywidgets as widgets
import os
import sys
import matplotlib.pyplot as plt
import warnings
from tabulate import tabulate
from tqdm import trange
from IPython import get_ipython
from IPython.display import display
from time import monotonic
from pprint import pprint
from google.colab import drive

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss, Sequential, Linear, ReLU
!pip install torch-geometric
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, EdgeConv, GINEConv
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
!pip install torchmetrics
from torchmetrics.classification import Recall, Accuracy, AUROC, Precision

content_base = "/content/drive"
drive.mount(content_base)

# Project data
data_dir = os.path.join(content_base, "My Drive/Capstone/data")
data_file = os.path.join(data_dir, "HI-XS.csv")

# # Project Source Code
# src_path = os.path.join(content_base, "My Drive/Capstone/src")
# sys.path.append(src_path)
# from helpers import add_cell_timer
# from pipeline import ModelPipeline

# add_cell_timer()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Google Colaboratory executes in an environment with a file system
# that has a Linux topography, but where the user should work under
# the `/content` directory
COLAB_ROOT = "/content"

REPO_URL = "https://github.com/engie4800/dsi-capstone-spring-2025-TD-anti-money-laundering.git"
REPO_ROOT = os.path.join(COLAB_ROOT, REPO_URL.split("/")[-1].split(".")[0])
REPO_BRANCH = "sophie"

# Clones the repository at `/content/dsi-capstone-spring-2025-TD-anti-money-laundering`
if not os.path.exists(REPO_ROOT):
  os.chdir(COLAB_ROOT)
  !git clone {REPO_URL}

# Pulls the latest code from the provided branch and adds the
# analysis pipeline source code to the Python system path
os.chdir(REPO_ROOT)
!git pull
!git checkout {REPO_BRANCH}
sys.path.append(os.path.join(REPO_ROOT, "Code/src"))
os.chdir(COLAB_ROOT)

remote: Enumerating objects: 6, done.[K
remote: Counting objects:  16% (1/6)[Kremote: Counting objects:  33% (2/6)[Kremote: Counting objects:  50% (3/6)[Kremote: Counting objects:  66% (4/6)[Kremote: Counting objects:  83% (5/6)[Kremote: Counting objects: 100% (6/6)[Kremote: Counting objects: 100% (6/6), done.[K
remote: Total 6 (delta 4), reused 6 (delta 4), pack-reused 0 (from 0)[K
Unpacking objects:  16% (1/6)Unpacking objects:  33% (2/6)Unpacking objects:  50% (3/6)Unpacking objects:  66% (4/6)Unpacking objects:  83% (5/6)Unpacking objects: 100% (6/6)Unpacking objects: 100% (6/6), 503 bytes | 251.00 KiB/s, done.
From https://github.com/engie4800/dsi-capstone-spring-2025-TD-anti-money-laundering
   d16c5af..e2157df  sophie     -> origin/sophie
Updating d16c5af..e2157df
Fast-forward
 Code/src/pipeline/__init__.py | 10 [32m+++++[m[31m-----[m
 1 file changed, 5 insertions(+), 5 deletions(-)
Already on 'sophie'
Your branch is up to date with 'origin/sophie'.


In [3]:
from helpers import add_cell_timer
from pipeline import ModelPipeline
add_cell_timer()

## Data preprocessing

In [4]:
# Initialize pipeline with dataset
pl = ModelPipeline(data_file)
pl.run_preprocessing()


⏱️ Execution time: 41.26s


In [5]:
# X_cols = ['from_bank', 'to_bank', 'received_amount', 'received_currency',
#        'sent_amount', 'sent_currency', 'payment_type',
#        'from_account_id', 'to_account_id', 'from_account_idx',
#        'to_account_idx', 'sent_amount_usd', 'received_amount_usd',
#        'hour_of_day', 'day_of_week', 'seconds_since_midnight', 'timestamp_int',
#        'timestamp_scaled', 'day_sin', 'day_cos', 'time_of_day_sin',
#        'time_of_day_cos', 'is_weekend']
X_cols = ['from_account_idx', 'to_account_idx','from_bank','received_amount', 'sent_amount',
          'sent_currency', 'payment_type','day_of_week','timestamp_int']
y_col = 'is_laundering'
X_train, X_val, X_test, y_train, y_val, y_test = pl.split_train_test_val(X_cols, y_col, test_size=0.15, val_size=0.15)


⏱️ Execution time: 0.33s


In [8]:
edge_features = ['received_amount', 'sent_amount', 'sent_currency',
                 'payment_type','day_of_week','timestamp_int']
                 node_features = ['from_bank']
train_data, val_data, test_data = pl.generate_tensors(edge_features, node_features)


⏱️ Execution time: 0.06s


## GNNs

Cannot use GCN or GAT!

* The Graph Convolutional Network (GCN), implemented with `GCNConv`, only aggregates features from neighboring nodes and does not use edge attributes in its message passing.
* Graph Attention Networks (GAT), implemented with `GATConv`, allows edge attention weights, which can indirectly incorporate edge attributes. Problem: If all nodes have the same feature vector (e.g., initialized to 1), then the computed attention scores will be the same for all edges. We'd need to modify GAT to use edge features meaningfully in the attention computation.

`GINeConv`
* Directly includes edge attributes in message passing using an MLP-based edge transformation.

`EdgeConv`
* dynamically computes edge embeddings and updates node features based on edges



### GINeConv

Modified GINConv that includes edge features in message passing.
Update rule is:
h (l+1) = h (l) + sum (MLP(h(l + e


In [9]:
# If on GPU, do as below
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda

⏱️ Execution time: 0.0s


In [10]:
class EdgeGINE(nn.Module):
    def __init__(self, n_node_feats, n_edge_feats, n_hidden=64):
        super(EdgeGINE, self).__init__()

        self.n_hidden = n_hidden
        self.n_node_feats = n_node_feats
        self.n_edge_feats = n_edge_feats

        # MLP to embed node and edges
        self.node_emb = nn.Linear(self.n_node_feats, self.n_hidden)
        self.edge_emb = nn.Linear(self.n_edge_feats, self.n_hidden)

        # MLP that processes edge features, passed into GINEConv
        nn_edge = Sequential(Linear(self.n_hidden, self.n_hidden), ReLU(), Linear(self.n_hidden, self.n_hidden))

        # Two GINEConv layers using nn_edge when it needs to process edge attributes
        self.gine1 = GINEConv(nn_edge, edge_dim=self.n_hidden, train_eps=True)
        self.gine2 = GINEConv(nn_edge, edge_dim=self.n_hidden, train_eps=True)

        # MLP for edge classification
        self.mlp = nn.Sequential(
            nn.Linear(3 * self.n_hidden, 128), # src, dest, edge
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, x, edge_index, edge_attr):
        """
        x: Node features (or placeholder embeddings if None)
        edge_index: Edge list (2, n_edges)
        edge_attr: Edge features (n_edges, self.n_edge_feats)
        """
        if x is None:  # If no node features, use trainable embeddings
            x = torch.ones((edge_index.max().item() + 1, 1), device=device)

        x = self.node_emb(x) # MLP
        edge_attr = self.edge_emb(edge_attr) # MLP
        x, edge_attr, edge_index = x.to(device), edge_attr.to(device), edge_index.to(device)

        # Pass nodes and edges through GINE layers
        x = self.gine1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.gine2(x, edge_index, edge_attr)

        # Get output for classification
        src, dest = edge_index
        src_embed = x[src]
        dest_embed = x[dest]

        edge_inputs = torch.cat([src_embed, dest_embed, edge_attr], dim=1)
        edge_logits = self.mlp(edge_inputs).squeeze(1)

        return edge_logits



⏱️ Execution time: 0.0s


In [12]:
from torchmetrics import AveragePrecision

# Move data to GPU if using
train_data = train_data.to(device)
# val_data = val_data.to(device)
# test_data = test_data.to(device)

# Create DataLoader (batch size=1 because we have one graph)
# all_loader = DataLoader([all_data], batch_size=1, shuffle=False)
train_loader = DataLoader([train_data], batch_size=1, shuffle=True)
# val_loader = DataLoader([val_data], batch_size=1, shuffle=False)
# test_loader = DataLoader([test_data], batch_size=1, shuffle=False)

# Create metrics
accuracy = Accuracy(task="binary").to(device) # 1/N sum(1(y=yhat))
recall = Recall(task='binary').to(device) # TP / (TP+FN), or use BinaryRecall class?
precision = Precision(task="binary").to(device) # TP / (TP + FP)
auroc = AUROC(task="binary").to(device)
pr_auc = AveragePrecision(task="binary").to(device)

# Initialize model & optimizer
num_edge_features = len(edge_features)  # Your selected transaction features
num_node_features = 1
model = EdgeGINE(num_node_features, num_edge_features).to(device)
optimizer = Adam(model.parameters(), lr=0.01)

# USe weighted BCE loss
criterion = BCEWithLogitsLoss(pos_weight=torch.tensor([3], device=device))


⏱️ Execution time: 0.05s


In [13]:
# Training loop
def train(model, train_loader, optimizer, criterion, epochs=20):
    for epoch in range(epochs):
        model.train()
        # Reset metrics
        running_loss = 0.0
        accuracy.reset(), recall.reset(), precision.reset(), auroc.reset(), pr_auc.reset()

        for batch in train_loader:
            batch = batch.to(device) # Batch to device
            optimizer.zero_grad() # Zero gradients

            # Forward pass
            logits = model(batch.x, batch.edge_index, batch.edge_attr)
            probs = torch.sigmoid(logits)  # Convert logits to probabilities
            preds = (probs > 0.5).long()  # Convert to binary predictions

            # Compute loss
            loss = criterion(logits, batch.y.float())  # BCE expects float labels
            loss.backward()
            optimizer.step()

            # Accumulate loss
            running_loss += loss.item()

            # Update metrics
            accuracy.update(preds, batch.y)
            recall.update(preds, batch.y)
            precision.update(preds, batch.y)
            auroc.update(probs, batch.y)
            pr_auc.update(probs, batch.y)

        if epoch%100 == 0:
          # Compute epoch-level metrics
          epoch_acc = accuracy.compute()
          epoch_recall = recall.compute()
          epoch_precision = precision.compute()
          epoch_auroc = auroc.compute()
          epoch_pr_auc = pr_auc.compute()

          print(f"Epoch {epoch+1}/{epochs} | Loss: {running_loss / len(train_loader):.4f} | "
                f"Acc: {epoch_acc:.4f} | Rec: {epoch_recall:.4f} | Prec: {epoch_precision:.4f} | AUROC: {epoch_auroc:.4f} | PR-AUC: {epoch_pr_auc:.4f} ")

        # Validation loop
        # validate(model, val_loader, criterion)

# Validation loop
def validate(model, val_loader, criterion):
    model.eval()
    val_loss = 0.0

    # Reset metrics
    accuracy.reset(), recall.reset(), precision.reset(), auroc.reset(), pr_auc.reset()

    with torch.no_grad():
        for batch in val_loader:
            batch = batch.to(device)

            # Forward pass
            logits = model(batch.x, batch.edge_index, batch.edge_attr)
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).long()

            # Compute loss
            loss = criterion(logits, batch.y.float())
            val_loss += loss.item()

            # Update metrics
            accuracy.update(preds, batch.y)
            recall.update(preds, batch.y)
            precision.update(preds, batch.y)
            auroc.update(probs, batch.y)
            pr_auc.update(probs, batch.y)

    # Compute validation metrics
    val_acc = accuracy.compute()
    val_recall = recall.compute()
    val_precision = precision.compute()
    val_auroc = auroc.compute()
    val_pr_auc = pr_auc.compute()

    print(f"             Val Loss: {val_loss / len(val_loader):.4f} | "
          f"Acc: {val_acc:.4f} | Rec: {val_recall:.4f} | Prec: {val_precision:.4f} | AUROC: {val_auroc:.4f} | PR-AUC: {val_pr_auc:.4f} ")


⏱️ Execution time: 0.0s


In [14]:
# Run Training
train(model, train_loader, optimizer, criterion, epochs=5000)

Epoch 1/5000 | Loss: 1590260.1250 | Acc: 0.7323 | Rec: 0.2617 | Prec: 0.0010 | AUROC: 0.4972 | PR-AUC: 0.0010 
Epoch 101/5000 | Loss: 0.3158 | Acc: 0.9990 | Rec: 0.0000 | Prec: 0.0000 | AUROC: 0.5001 | PR-AUC: 0.0010 
Epoch 201/5000 | Loss: 0.1673 | Acc: 0.9990 | Rec: 0.0000 | Prec: 0.0000 | AUROC: 0.5001 | PR-AUC: 0.0010 
Epoch 301/5000 | Loss: 0.1069 | Acc: 0.9990 | Rec: 0.0000 | Prec: 0.0000 | AUROC: 0.5001 | PR-AUC: 0.0010 
Epoch 401/5000 | Loss: 0.0767 | Acc: 0.9990 | Rec: 0.0000 | Prec: 0.0000 | AUROC: 0.5001 | PR-AUC: 0.0010 
Epoch 501/5000 | Loss: 0.0595 | Acc: 0.9990 | Rec: 0.0000 | Prec: 0.0000 | AUROC: 0.5001 | PR-AUC: 0.0010 
Epoch 601/5000 | Loss: 0.0489 | Acc: 0.9990 | Rec: 0.0000 | Prec: 0.0000 | AUROC: 0.5001 | PR-AUC: 0.0010 


KeyboardInterrupt: 


⏱️ Execution time: 70.75s


## EdgeConv

In [None]:
# from torch_geometric.nn import EdgeConv
# from torch.nn import Linear

# class EdgeConvGNN(nn.Module):
#     def __init__(self, num_node_features, self.n_edge_feats, self.n_hidden=64):
#         super(EdgeConvGNN, self).__init__()

#         self.edge_conv1 = EdgeConv(Sequential(Linear(2 * num_node_features, self.n_hidden), ReLU()))
#         self.edge_conv2 = EdgeConv(Sequential(Linear(2 * self.n_hidden, self.n_hidden), ReLU()))

#         self.mlp = nn.Sequential(
#             nn.Linear(2 * self.n_hidden + self.n_edge_feats, 128),
#             nn.ReLU(),
#             nn.Linear(128, 64),
#             nn.ReLU(),
#             nn.Linear(64, 1),
#         )

#     def forward(self, x, edge_index, edge_attr):
#         x = selfself.n_hiddennv1(x, edge_index)
#         x = F.relu(x)
#         x = self.edge_conv2(x, edge_index)

#         src, dest = edge_index
#         src_embed = x[src]
#         dest_embed = x[dest]

#         edge_inputs = torch.cat([src_embed, dest_embed, edge_attr], dim=1)
#         edge_logits = self.mlp(edge_inputs).squeeze(1)

#         return edge_logits
