# Practice GNN

## Notebook configuration

In [None]:
import random
import networkx as nx
import pandas as pd
import numpy as np
import ipywidgets as widgets
import os
import sys
import matplotlib.pyplot as plt
import warnings
from tabulate import tabulate
from tqdm import trange
from IPython import get_ipython
from IPython.display import display
from time import monotonic
from pprint import pprint
from google.colab import drive

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss, Sequential, Linear, ReLU
!pip install torch-geometric
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, EdgeConv, GINEConv
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
!pip install torchmetrics
from torchmetrics.classification import Recall, Accuracy, AUROC, Precision

# Mount Google Drive into the Colab file system at `content_base`
# (the recorded cell output confirms the mount point).
content_base = "/content/drive"
drive.mount(content_base)

# Project data
# Transactions CSV that is handed to the modelling pipeline in a later cell.
data_dir = os.path.join(content_base, "My Drive/Capstone/data")
data_file = os.path.join(data_dir, "HI-Small_Trans.csv")

# # Project Source Code
# src_path = os.path.join(content_base, "My Drive/Capstone/src")
# sys.path.append(src_path)
# from helpers import add_cell_timer
# from pipeline import ModelPipeline

# add_cell_timer()

Mounted at /content/drive


In [None]:
# Google Colaboratory executes in an environment with a file system
# that has a Linux layout, but where the user should work under
# the `/content` directory
COLAB_ROOT = "/content"

REPO_URL = "https://github.com/engie4800/dsi-capstone-spring-2025-TD-anti-money-laundering.git"
# Local checkout directory: the last URL segment up to its first "."
# (i.e. the repository name without the ".git" suffix).
REPO_ROOT = os.path.join(COLAB_ROOT, REPO_URL.split("/")[-1].split(".")[0])
REPO_BRANCH = "sophie"

# Clones the repository at `/content/dsi-capstone-spring-2025-TD-anti-money-laundering`
# (skipped when the directory already exists, e.g. when the cell is re-run)
if not os.path.exists(REPO_ROOT):
  os.chdir(COLAB_ROOT)
  !git clone {REPO_URL}

# Pulls the latest code, switches to the provided branch and adds the
# analysis pipeline source code to the Python system path
# NOTE(review): `git pull` runs before `git checkout`, so it updates whichever
# branch is checked out at that moment, not necessarily REPO_BRANCH — confirm
# this ordering is intended.
# NOTE(review): re-running this cell appends the same entry to sys.path again.
os.chdir(REPO_ROOT)
!git pull
!git checkout {REPO_BRANCH}
sys.path.append(os.path.join(REPO_ROOT, "Code/src"))
# Return to the Colab working directory for the rest of the notebook
os.chdir(COLAB_ROOT)

Cloning into 'dsi-capstone-spring-2025-TD-anti-money-laundering'...
remote: Enumerating objects: 431, done.[K
remote: Counting objects: 100% (266/266), done.[K
remote: Compressing objects: 100% (163/163), done.[K
remote: Total 431 (delta 151), reused 177 (delta 94), pack-reused 165 (from 1)[K
Receiving objects: 100% (431/431), 27.67 MiB | 15.09 MiB/s, done.
Resolving deltas: 100% (191/191), done.
Already up to date.
Branch 'sophie' set up to track remote branch 'sophie' from 'origin'.
Switched to a new branch 'sophie'


In [None]:
# Project-local modules; importable because the repository's `Code/src`
# directory was appended to sys.path in the previous cell.
from helpers import add_cell_timer
from pipeline import ModelPipeline
# presumably installs the per-cell timer behind the "Execution time" lines
# shown in the cell outputs — verify in helpers
add_cell_timer()

## Data preprocessing

In [None]:
# Initialize pipeline with dataset
pl = ModelPipeline(data_file)
# presumably loads and cleans the CSV into `pl.df`, which the following
# cells read and modify — verify in pipeline.ModelPipeline
pl.run_preprocessing()


⏱️ Execution time: 98.73s


In [None]:
from sklearn.preprocessing import LabelEncoder

# Distinct bank identifiers seen on each side of a transaction.
from_banks = pl.df["from_bank"].drop_duplicates().reset_index(drop=True)
to_banks = pl.df["to_bank"].drop_duplicates().reset_index(drop=True)

# A single vocabulary over both columns, so a bank receives the same integer
# code whether it appears as sender or as receiver.
all_banks = pd.concat([from_banks, to_banks], ignore_index=True).drop_duplicates()
all_banks = all_banks.reset_index(drop=True)
bank_encoder = LabelEncoder().fit(all_banks)

# Replace the raw identifiers in place with their shared integer codes.
pl.df["from_bank"] = bank_encoder.transform(pl.df["from_bank"])
pl.df["to_bank"] = bank_encoder.transform(pl.df["to_bank"])


⏱️ Execution time: 0.78s


In [None]:
from sklearn.preprocessing import LabelEncoder

def encode_from_to(df, colfrom, colto):
    """Label-encode two columns of ``df`` with one shared vocabulary.

    The encoder is fitted on the distinct values found in either column, so
    the same raw value maps to the same integer code in both. ``df`` is
    modified in place: both columns are overwritten with their codes.

    Args:
        df: DataFrame containing both columns.
        colfrom: Name of the first column to encode.
        colto: Name of the second column to encode.

    Returns:
        A ``(df, encoder)`` tuple: the same DataFrame object that was passed
        in, and the fitted ``LabelEncoder``.
    """
    pair = (colfrom, colto)

    # Distinct values per column first, then de-duplicated across the pair.
    distinct_per_column = [df[name].drop_duplicates() for name in pair]
    vocabulary = pd.concat(distinct_per_column, ignore_index=True).drop_duplicates()

    encoder = LabelEncoder()
    encoder.fit(vocabulary)

    for name in pair:
        df[name] = encoder.transform(df[name])
    return df, encoder

# Encode each sender/receiver column pair with one vocabulary per pair; the
# fitted encoders are discarded.
# NOTE(review): `from_bank`/`to_bank` were already label-encoded by the
# previous cell, so the first call re-encodes integer codes — confirm one of
# the two steps can be dropped.
pl.df, _ = encode_from_to(pl.df, "from_bank", "to_bank")
pl.df, _ = encode_from_to(pl.df, "sent_currency", "received_currency")



⏱️ Execution time: 2.14s


In [None]:
# Label-encode the remaining categorical columns through the pipeline helper.
pl.apply_label_encoding(["payment_type","day_of_week"])
# presumably derives graph-based features using `sent_amount` as the edge
# weight — verify in pipeline.ModelPipeline. This was the slowest recorded
# cell in the notebook (about 215 s).
pl.extract_graph_features(weight_col="sent_amount")


⏱️ Execution time: 214.55s


In [None]:
# X_cols = ['from_bank', 'to_bank', 'received_amount', 'received_currency',
#        'sent_amount', 'sent_currency', 'payment_type',
#        'from_account_id', 'to_account_id', 'from_account_idx',
#        'to_account_idx', 'sent_amount_usd', 'received_amount_usd',
#        'hour_of_day', 'day_of_week', 'seconds_since_midnight', 'timestamp_int',
#        'timestamp_scaled', 'day_sin', 'day_cos', 'time_of_day_sin',
#        'time_of_day_cos', 'is_weekend']

# Columns carried into the splits: the two account-index columns (used later
# as edge endpoints) followed by the transaction attributes.
X_cols = ['from_account_idx', 'to_account_idx','received_amount', 'sent_amount',
          'sent_currency', 'payment_type','day_of_week','timestamp_int']
# Target column; it is consumed downstream by a binary (BCE) loss.
y_col = 'is_laundering'
# 15% test and 15% validation; presumably the remainder is the training
# split — the split strategy itself lives in pipeline.ModelPipeline.
X_train, X_val, X_test, y_train, y_val, y_test = pl.split_train_test_val(X_cols, y_col, test_size=0.15, val_size=0.15)


⏱️ Execution time: 3.3s


In [None]:
# edge_features = ['received_amount', 'received_currency','sent_amount', 'sent_currency',
#                  'payment_type','sent_amount_usd', 'received_amount_usd', 'hour_of_day',
#                  'day_of_week', 'seconds_since_midnight', 'timestamp_int']
# edge_features = ['received_amount', 'received_currency','sent_amount', 'sent_currency',
#                  'payment_type','sent_amount_usd', 'received_amount_usd', 'hour_of_day',
#                  'day_of_week', 'seconds_since_midnight', 'timestamp_int', 'timestamp_scaled',
#                  'day_sin', 'day_cos', 'time_of_day_sin', 'time_of_day_cos', 'is_weekend']
# Transaction attributes attached to every edge: exactly `X_cols` minus the
# two account-index columns that define the edge endpoints.
edge_features = ['received_amount', 'sent_amount', 'sent_currency',
                 'payment_type','day_of_week','timestamp_int']


⏱️ Execution time: 0.0s


In [None]:
def generate_tensors(X, y, edge_features, edges=("from_account_idx", "to_account_idx"), num_nodes=None):
    """Convert tabular transaction data into a PyG ``Data`` graph.

    Every row of ``X`` becomes one directed edge; the graph carries edge
    attributes and edge labels but no node features.

    Args:
        X: DataFrame holding the endpoint columns named in ``edges`` and the
            columns named in ``edge_features``.
        y: Per-row labels (anything exposing ``.values``), aligned with ``X``.
        edge_features: Names of the columns used as edge attributes.
        edges: Names of the (source, destination) node-index columns.
        num_nodes: Number of nodes in the graph. When ``None`` (default) it is
            inferred as the largest node index present plus one, or ``0`` for
            an empty edge set. Pass it explicitly to give several splits the
            same node count.

    Returns:
        A ``Data`` object with ``edge_index`` of shape ``[2, num_edges]``
        (long), ``edge_attr`` of shape ``[num_edges, len(edge_features)]``
        (float), ``y`` of shape ``[num_edges]`` (long) and ``num_nodes``.
    """
    # Lists, so pandas selects several columns instead of looking up a single
    # tuple-valued key; this also keeps the defaults immutable.
    endpoint_cols = list(edges)
    feature_cols = list(edge_features)

    # Edge index (defining graph structure)
    edge_index = torch.tensor(X[endpoint_cols].values.T, dtype=torch.long)  # Shape: [2, num_edges]

    # Edge attributes (transaction-based features)
    edge_attr = torch.tensor(X[feature_cols].values, dtype=torch.float)  # Shape: [num_edges, num_features]

    # Labels for edges (transaction classification: laundering or not)
    edge_labels = torch.tensor(y.values, dtype=torch.long)  # Shape: [num_edges]

    if num_nodes is None:
        # Infer from the highest index in edge_index; `max()` of an empty
        # tensor raises, so an empty edge set yields an empty graph instead.
        num_nodes = int(edge_index.max().item()) + 1 if edge_index.numel() > 0 else 0

    # Create PyG Data object
    return Data(edge_index=edge_index, edge_attr=edge_attr, y=edge_labels, num_nodes=num_nodes)

# Assuming X_train, y_train are preprocessed dataframes
# One graph per split; each call infers its node count from that split's own
# edges.
train_data = generate_tensors(X_train, y_train, edge_features)
val_data = generate_tensors(X_val, y_val, edge_features)
test_data = generate_tensors(X_test, y_test, edge_features)


⏱️ Execution time: 0.28s


In [None]:
# Graph built from the whole `pl.df` rather than from a single split.
all_data = generate_tensors(pl.df[X_cols], pl.df[y_col], edge_features)


⏱️ Execution time: 0.47s


## GNNs

Cannot use GCN or GAT!

* The Graph Convolutional Network (GCN), implemented with `GCNConv`, only aggregates features from neighboring nodes and does not use edge attributes in its message passing.
* Graph Attention Networks (GAT), implemented with `GATConv`, compute an attention weight for every edge, which can indirectly incorporate edge attributes. Problem: if all nodes have the same feature vector (e.g., initialized to 1), the computed attention scores are identical for all edges. We would need to modify GAT so that the edge features enter the attention computation in a meaningful way.

`GINEConv`
* Directly includes edge attributes in message passing using an MLP-based edge transformation.

`EdgeConv`
* Dynamically computes edge embeddings and updates node features based on edges.



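### GINEConv

Modified GINConv that includes edge features in message passing.
The update rule is:

$$h_i^{(l+1)} = \mathrm{MLP}\Big((1 + \epsilon)\, h_i^{(l)} + \sum_{j \in \mathcal{N}(i)} \mathrm{ReLU}\big(h_j^{(l)} + e_{j,i}\big)\Big)$$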


In [None]:
# If on GPU, do as below
# Device string used by the later cells when moving the data, the model and
# the metrics; falls back to the CPU when CUDA is not available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda

⏱️ Execution time: 0.0s


In [None]:
class EdgeGINE(nn.Module):
    """Edge classifier built on two ``GINEConv`` message-passing layers.

    Node and edge features are first projected to ``n_hidden`` dimensions,
    node states are refined by two GINE layers that consume the embedded edge
    features, and every edge is then scored by an MLP over the concatenation
    ``[source state, destination state, embedded edge features]``.

    Args:
        n_node_feats: Width of the node feature vectors (also the width of the
            constant placeholder used when no node features are supplied).
        n_edge_feats: Width of the raw edge feature vectors.
        n_hidden: Hidden width used by the embeddings and the GINE layers.
    """

    def __init__(self, n_node_feats, n_edge_feats, n_hidden=64):
        super().__init__()

        self.n_hidden = n_hidden
        self.n_node_feats = n_node_feats
        self.n_edge_feats = n_edge_feats

        # Linear projections of raw node / edge features into the hidden space
        self.node_emb = nn.Linear(self.n_node_feats, self.n_hidden)
        self.edge_emb = nn.Linear(self.n_edge_feats, self.n_hidden)

        # Each GINEConv layer gets its OWN update MLP. Passing one module
        # instance to both layers would silently tie their weights together.
        self.gine1 = GINEConv(self._make_update_mlp(self.n_hidden), edge_dim=self.n_hidden, train_eps=True)
        self.gine2 = GINEConv(self._make_update_mlp(self.n_hidden), edge_dim=self.n_hidden, train_eps=True)

        # MLP for edge classification
        self.mlp = nn.Sequential(
            nn.Linear(3 * self.n_hidden, 128), # src, dest, edge
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    @staticmethod
    def _make_update_mlp(n_hidden):
        """Return a fresh two-layer MLP to be used as a GINEConv update network."""
        return Sequential(Linear(n_hidden, n_hidden), ReLU(), Linear(n_hidden, n_hidden))

    def forward(self, x, edge_index, edge_attr):
        """Return one raw logit per edge.

        Args:
            x: Node features of shape ``(n_nodes, n_node_feats)``, or ``None``
                to use a constant all-ones placeholder for every node.
            edge_index: Edge list of shape ``(2, n_edges)``.
            edge_attr: Edge features of shape ``(n_edges, n_edge_feats)``.

        Returns:
            Tensor of shape ``(n_edges,)`` holding unnormalised logits
            (apply a sigmoid to obtain probabilities).
        """
        # Follow the device the model's own parameters live on instead of a
        # notebook-level global, and move the inputs BEFORE they reach any
        # layer so a CPU batch cannot hit GPU weights (or vice versa).
        model_device = self.node_emb.weight.device
        edge_index = edge_index.to(model_device)
        edge_attr = edge_attr.to(model_device)

        if x is None:
            # No node features: constant (non-trainable) placeholder whose
            # width matches what `node_emb` expects.
            n_nodes = int(edge_index.max().item()) + 1 if edge_index.numel() > 0 else 0
            x = torch.ones((n_nodes, self.n_node_feats), device=model_device)
        else:
            x = x.to(model_device)

        x = self.node_emb(x)
        edge_attr = self.edge_emb(edge_attr)

        # Pass nodes and edges through GINE layers
        x = self.gine1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.gine2(x, edge_index, edge_attr)

        # Gather the final states of both endpoints of every edge
        src, dest = edge_index
        src_embed = x[src]
        dest_embed = x[dest]

        edge_inputs = torch.cat([src_embed, dest_embed, edge_attr], dim=1)
        edge_logits = self.mlp(edge_inputs).squeeze(1)

        return edge_logits



⏱️ Execution time: 0.0s


In [None]:
from torchmetrics import AveragePrecision

# Move data to GPU if using
train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)
all_data = all_data.to(device)

# Create DataLoader (batch size=1 because we have one graph)
# Each loader wraps a single graph, so one "batch" is an entire split.
all_loader = DataLoader([all_data], batch_size=1, shuffle=False)
train_loader = DataLoader([train_data], batch_size=1, shuffle=True)
val_loader = DataLoader([val_data], batch_size=1, shuffle=False)
test_loader = DataLoader([test_data], batch_size=1, shuffle=False)

# Create metrics
# These module-level metric objects are shared (and reset) by train() and
# validate().
accuracy = Accuracy(task="binary").to(device) # 1/N sum(1(y=yhat))
recall = Recall(task='binary').to(device) # TP / (TP+FN), or use BinaryRecall class?
precision = Precision(task="binary").to(device) # TP / (TP + FP)
auroc = AUROC(task="binary").to(device)
pr_auc = AveragePrecision(task="binary").to(device)

# Initialize model & optimizer
num_edge_features = len(edge_features)  # Your selected transaction features
# The graphs carry no node features, so the model falls back to a constant
# one-wide placeholder per node.
num_node_features = 1
model = EdgeGINE(num_node_features, num_edge_features).to(device)
optimizer = Adam(model.parameters(), lr=0.01)

# Use weighted BCE loss
# pos_weight=3 scales the loss contribution of positive (laundering) labels.
# NOTE(review): the weight is built from an int literal, so it is an integer
# tensor — confirm a float tensor (3.0) is not required.
criterion = BCEWithLogitsLoss(pos_weight=torch.tensor([3], device=device))


⏱️ Execution time: 0.01s


In [None]:
# Training loop
def train(model, train_loader, optimizer, criterion, epochs=20, log_every=100):
    """Train ``model`` and periodically print training metrics.

    Relies on the module-level ``device`` and on the module-level metric
    objects ``accuracy``, ``recall``, ``precision``, ``auroc`` and ``pr_auc``.

    Args:
        model: Model called as ``model(x, edge_index, edge_attr)`` and
            returning one logit per edge.
        train_loader: Iterable of graph batches exposing ``x``,
            ``edge_index``, ``edge_attr`` and ``y``.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(logits, float_labels)``.
        epochs: Number of passes over ``train_loader``.
        log_every: Metrics are computed and printed on every epoch whose
            zero-based index is a multiple of this value.

    Raises:
        ValueError: If ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError("log_every must be a positive integer")

    metrics = (accuracy, recall, precision, auroc, pr_auc)

    for epoch in range(epochs):
        model.train()
        report = epoch % log_every == 0

        # Reset metrics
        running_loss = 0.0
        for metric in metrics:
            metric.reset()

        for batch in train_loader:
            batch = batch.to(device) # Batch to device
            optimizer.zero_grad() # Zero gradients

            # Forward pass
            logits = model(batch.x, batch.edge_index, batch.edge_attr)

            # Compute loss
            loss = criterion(logits, batch.y.float())  # BCE expects float labels
            loss.backward()
            optimizer.step()

            # Accumulate loss
            running_loss += loss.item()

            # Metrics are only read on reporting epochs, so skip the updates
            # otherwise: the score-based metrics would be fed every
            # prediction of the graph for nothing.
            if report:
                # Detached: the metrics must not hold on to the autograd graph.
                probs = torch.sigmoid(logits.detach())  # Convert logits to probabilities
                preds = (probs > 0.5).long()  # Convert to binary predictions

                accuracy.update(preds, batch.y)
                recall.update(preds, batch.y)
                precision.update(preds, batch.y)
                auroc.update(probs, batch.y)
                pr_auc.update(probs, batch.y)

        if report:
            # Compute epoch-level metrics
            epoch_acc = accuracy.compute()
            epoch_recall = recall.compute()
            epoch_precision = precision.compute()
            epoch_auroc = auroc.compute()
            epoch_pr_auc = pr_auc.compute()

            print(f"Epoch {epoch+1}/{epochs} | Loss: {running_loss / len(train_loader):.4f} | "
                  f"Acc: {epoch_acc:.4f} | Rec: {epoch_recall:.4f} | Prec: {epoch_precision:.4f} | AUROC: {epoch_auroc:.4f} | PR-AUC: {epoch_pr_auc:.4f} ")

        # Validation loop
        # validate(model, val_loader, criterion)

# Validation loop
def validate(model, val_loader, criterion):
    """Evaluate ``model`` on ``val_loader`` and print loss and metrics.

    Uses the module-level ``device`` and the module-level metric objects
    ``accuracy``, ``recall``, ``precision``, ``auroc`` and ``pr_auc``, which
    are reset before evaluation. The model is left in eval mode.

    Args:
        model: Model called as ``model(x, edge_index, edge_attr)`` and
            returning one logit per edge.
        val_loader: Iterable of graph batches exposing ``x``, ``edge_index``,
            ``edge_attr`` and ``y``.
        criterion: Loss taking ``(logits, float_labels)``.
    """
    tracked = (accuracy, recall, precision, auroc, pr_auc)

    model.eval()
    for metric in tracked:
        metric.reset()

    total_loss = 0.0
    with torch.no_grad():
        for graph in val_loader:
            graph = graph.to(device)
            targets = graph.y

            # Forward pass and loss on the raw logits
            scores = model(graph.x, graph.edge_index, graph.edge_attr)
            total_loss += criterion(scores, targets.float()).item()

            # Threshold-based metrics take hard predictions, ranking-based
            # metrics take the probabilities.
            likelihood = torch.sigmoid(scores)
            hard_preds = (likelihood > 0.5).long()
            for metric in (accuracy, recall, precision):
                metric.update(hard_preds, targets)
            for metric in (auroc, pr_auc):
                metric.update(likelihood, targets)

    # Compute validation metrics
    val_acc, val_recall, val_precision, val_auroc, val_pr_auc = [
        metric.compute() for metric in tracked
    ]

    print(f"             Val Loss: {total_loss / len(val_loader):.4f} | "
          f"Acc: {val_acc:.4f} | Rec: {val_recall:.4f} | Prec: {val_precision:.4f} | AUROC: {val_auroc:.4f} | PR-AUC: {val_pr_auc:.4f} ")


⏱️ Execution time: 0.0s


In [None]:
# Run Training
# With train()'s default logging interval, metrics are printed on every 100th
# epoch.
# NOTE(review): the recorded run of this cell ended in a CUDA
# OutOfMemoryError (see the output below).
train(model, train_loader, optimizer, criterion, epochs=5000)

OutOfMemoryError: CUDA out of memory. Tried to allocate 868.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 512.12 MiB is free. Process 3056 has 14.24 GiB memory in use. Of the allocated memory 13.59 GiB is allocated by PyTorch, and 541.41 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


⏱️ Execution time: 0.1s


## EdgeConv

In [None]:
# from torch_geometric.nn import EdgeConv
# from torch.nn import Linear

# class EdgeConvGNN(nn.Module):
#     def __init__(self, num_node_features, n_edge_feats, n_hidden=64):
#         super(EdgeConvGNN, self).__init__()
#         self.n_edge_feats = n_edge_feats
#         self.n_hidden = n_hidden

#         self.edge_conv1 = EdgeConv(Sequential(Linear(2 * num_node_features, self.n_hidden), ReLU()))
#         self.edge_conv2 = EdgeConv(Sequential(Linear(2 * self.n_hidden, self.n_hidden), ReLU()))

#         self.mlp = nn.Sequential(
#             nn.Linear(2 * self.n_hidden + self.n_edge_feats, 128),
#             nn.ReLU(),
#             nn.Linear(128, 64),
#             nn.ReLU(),
#             nn.Linear(64, 1),
#         )

#     def forward(self, x, edge_index, edge_attr):
#         x = self.edge_conv1(x, edge_index)
#         x = F.relu(x)
#         x = self.edge_conv2(x, edge_index)

#         src, dest = edge_index
#         src_embed = x[src]
#         dest_embed = x[dest]

#         edge_inputs = torch.cat([src_embed, dest_embed, edge_attr], dim=1)
#         edge_logits = self.mlp(edge_inputs).squeeze(1)

#         return edge_logits
