# Practice GNN

## Notebook configuration

In [None]:
import random
import networkx as nx
import pandas as pd
import numpy as np
import ipywidgets as widgets
import os
import sys
import matplotlib.pyplot as plt
import warnings
from tabulate import tabulate
from tqdm import trange
from IPython import get_ipython
from IPython.display import display
from time import monotonic
from pprint import pprint
from google.colab import drive

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss, Sequential, Linear, ReLU
!pip install torch-geometric
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, EdgeConv, GINEConv
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
!pip install torchmetrics
from torchmetrics.classification import Recall, Accuracy, AUROC, Precision
from sklearn.model_selection import train_test_split

# Silence every Python warning for the rest of the notebook session.
# NOTE(review): this also hides deprecation and numerical warnings from
# torch / torch_geometric — consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

# Mount point for Google Drive inside the Colab VM; `drive.mount` prompts
# for authorization the first time it runs in a session.
content_base = "/content/drive"
drive.mount(content_base)

# Project data: the transactions CSV is read from the mounted Drive
data_dir = os.path.join(content_base, "My Drive/Capstone/data")
data_file = os.path.join(data_dir, "HI-Small_25.csv")

# # Project Source Code
# src_path = os.path.join(content_base, "My Drive/Capstone/src")
# sys.path.append(src_path)
# from helpers import add_cell_timer
# from pipeline import ModelPipeline

# add_cell_timer()

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1
Collecting torchmetrics
  Downloading torchmetrics-1.7.0-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.14.2-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1

In [None]:
# Google Colaboratory executes in an environment with a file system
# that has a Linux topography, but where the user should work under
# the `/content` directory
COLAB_ROOT = "/content"

# REPO_ROOT is derived from the last URL segment with everything after the
# first "." dropped, i.e. the repository name without the ".git" suffix
REPO_URL = "https://github.com/engie4800/dsi-capstone-spring-2025-TD-anti-money-laundering.git"
REPO_ROOT = os.path.join(COLAB_ROOT, REPO_URL.split("/")[-1].split(".")[0])
REPO_BRANCH = "sophie"

# Clones the repository at `/content/dsi-capstone-spring-2025-TD-anti-money-laundering`
# (skipped when the directory already exists, so re-running the cell is safe)
if not os.path.exists(REPO_ROOT):
  os.chdir(COLAB_ROOT)
  !git clone {REPO_URL}

# Updates the working copy, switches to REPO_BRANCH and adds the analysis
# pipeline source code to the Python system path.
# NOTE(review): `git pull` runs *before* `git checkout`, so it updates
# whichever branch is currently checked out (the default branch right after
# a fresh clone, REPO_BRANCH on later runs).
# NOTE(review): each run appends the same entry to `sys.path` again.
os.chdir(REPO_ROOT)
!git pull
!git checkout {REPO_BRANCH}
sys.path.append(os.path.join(REPO_ROOT, "Code/src"))
# Return to the Colab root so later relative paths do not depend on the repo
os.chdir(COLAB_ROOT)

Cloning into 'dsi-capstone-spring-2025-TD-anti-money-laundering'...
remote: Enumerating objects: 520, done.[K
remote: Counting objects: 100% (144/144), done.[K
remote: Compressing objects: 100% (88/88), done.[K
remote: Total 520 (delta 79), reused 80 (delta 48), pack-reused 376 (from 1)[K
Receiving objects: 100% (520/520), 25.99 MiB | 16.30 MiB/s, done.
Resolving deltas: 100% (246/246), done.
Already up to date.
Branch 'sophie' set up to track remote branch 'sophie' from 'origin'.
Switched to a new branch 'sophie'


In [None]:
# Project-local modules; they resolve only after the repository's `Code/src`
# directory has been appended to `sys.path` by the setup cell.
from helpers import add_cell_timer
from pipeline import ModelPipeline
# Presumably registers a per-cell timing hook (the "Execution time" lines in
# later cell outputs) — confirm against `helpers.add_cell_timer`.
add_cell_timer()

## Data preprocessing

Run initial full-dataset preprocessing

In [None]:
# Initialize pipeline with dataset (path to the transactions CSV on Drive)
pl = ModelPipeline(data_file)
# Full-dataset preprocessing; this is the slow step of the notebook
# (about 100 s in the recorded run).
pl.run_preprocessing()


⏱️ Execution time: 99.67s


In [None]:
# Columns handed to the pipeline as model inputs, grouped by what they describe.
# A smaller alternative that was tried:
# X_cols = ['from_account_idx', 'to_account_idx','from_bank','received_amount', 'sent_amount',
#           'sent_currency', 'payment_type','day_of_week','timestamp_int']
X_cols = [
    # banks, raw amounts, currencies and payment type
    'from_bank',
    'to_bank',
    'received_amount',
    'received_currency',
    'sent_amount',
    'sent_currency',
    'payment_type',
    # graph-derived statistics
    'degree_centrality_sent_amount',
    'pagerank_sent_amount',
    'degree_centrality_received_amount',
    'pagerank_received_amount',
    # account indices and USD-normalised amounts
    'from_account_idx',
    'to_account_idx',
    'sent_amount_usd',
    'received_amount_usd',
    # time features
    'hour_of_day',
    'day_of_week',
    'seconds_since_midnight',
    'timestamp_int',
    'timestamp_scaled',
    'day_sin',
    'day_cos',
    'time_of_day_sin',
    'time_of_day_cos',
    'is_weekend',
]

# Binary target column
y_col = 'is_laundering'

# 70 / 15 / 15 split using the pipeline's 'temporal_agg' strategy
splits = pl.split_train_test_val(
    X_cols,
    y_col,
    test_size=0.15,
    val_size=0.15,
    split_type='temporal_agg',
)
X_train, X_val, X_test, y_train, y_val, y_test = splits


⏱️ Execution time: 1.19s


In [None]:
# Only the USD-normalised amounts are scaled by the pipeline.
numerical_feats = ['sent_amount_usd', 'received_amount_usd']
# NOTE(review): the result is unpacked as (train, test, val), whereas
# `split_train_test_val` was unpacked as (train, val, test) — confirm the
# return order of `pl.numerical_scaling` so X_test / X_val are not swapped.
X_train, X_test, X_val = pl.numerical_scaling(numerical_feats)


⏱️ Execution time: 0.06s


In [None]:
# Per-transaction (edge) attributes.
# A smaller alternative that was tried:
# edge_features = ['received_amount', 'sent_amount', 'sent_currency',
#                  'payment_type','day_of_week','timestamp_int']
edge_features = [
    'received_amount',
    'received_currency',
    'sent_amount',
    'sent_currency',
    'payment_type',
    'sent_amount_usd',
    'received_amount_usd',
    'hour_of_day',
    'day_of_week',
    'seconds_since_midnight',
    'timestamp_int',
    'timestamp_scaled',
    'day_sin',
    'day_cos',
    'time_of_day_sin',
    'time_of_day_cos',
    'is_weekend',
]

# Per-account (node) attributes.
node_features = [
    'from_bank',
    'degree_centrality_sent_amount',
    'pagerank_sent_amount',
    'degree_centrality_received_amount',
    'pagerank_received_amount',
]

# One graph object per split, built by the pipeline from the selected columns
graph_splits = pl.generate_tensors(edge_features, node_features)
train_data, val_data, test_data = graph_splits


⏱️ Execution time: 0.36s


## GNNs

Cannot use GCN or GAT!

* The Graph Convolutional Network (GCN), implemented with `GCNConv`, only aggregates features from neighboring nodes and does not use edge attributes in its message passing.
* Graph Attention Networks (GAT), implemented with `GATConv`, allows edge attention weights, which can indirectly incorporate edge attributes. Problem: If all nodes have the same feature vector (e.g., initialized to 1), then the computed attention scores will be the same for all edges. We'd need to modify GAT to use edge features meaningfully in the attention computation.

`GINEConv`
* Directly includes edge attributes in message passing: edge features are added to the neighbor features before aggregation, and an MLP transforms the aggregated result.

`EdgeConv`
* Dynamically computes edge embeddings and updates node features based on edges.



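### GINEConv

A modified GINConv that includes edge features in message passing.
The update rule is:

$$h_i^{(l+1)} = \mathrm{MLP}\Big((1+\epsilon)\,h_i^{(l)} + \sum_{j \in \mathcal{N}(i)} \mathrm{ReLU}\big(h_j^{(l)} + e_{j,i}\big)\Big)$$

where $h_i^{(l)}$ is the embedding of node $i$ at layer $l$, $e_{j,i}$ is the feature vector of the edge from $j$ to $i$, and $\epsilon$ is a (here trainable) scalar.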


In [None]:
# Pick the compute device once: the GPU when CUDA is available, else the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda

⏱️ Execution time: 0.0s


In [None]:
class EdgeGINE(nn.Module):
    """Edge classifier built from two GINEConv layers with edge updates.

    Node and edge features are first projected to ``n_hidden`` dimensions.
    Two rounds follow, each consisting of a GINEConv node update and an MLP
    edge update that consumes ``[x[src], x[dest], edge_attr]``. A final MLP
    maps the same concatenation to one logit per edge.

    Args:
        n_node_feats: Number of columns of the node feature matrix ``x``.
        n_edge_feats: Number of columns of the raw edge feature matrix.
        n_hidden: Width of the hidden node and edge embeddings.
    """

    def __init__(self, n_node_feats, n_edge_feats, n_hidden=64):
        super().__init__()

        self.n_hidden = n_hidden
        self.n_node_feats = n_node_feats
        self.n_edge_feats = n_edge_feats

        # Linear layers that embed raw node and edge features
        self.node_emb = nn.Linear(self.n_node_feats, self.n_hidden)  # [num_nodes, n_hidden]
        self.edge_emb = nn.Linear(self.n_edge_feats, self.n_hidden)  # [num_edges, n_hidden]

        # Two GINEConv layers. Each layer gets its OWN update MLP: passing a
        # single Sequential instance to both layers would tie their weights.
        self.gine1 = GINEConv(self._update_mlp(), edge_dim=self.n_hidden, train_eps=True)
        self.gine2 = GINEConv(self._update_mlp(), edge_dim=self.n_hidden, train_eps=True)

        # Edge update MLPs, one per round (input: src, dest, edge embeddings)
        self.emlp1 = Sequential(
            nn.Linear(3 * self.n_hidden, self.n_hidden),
            nn.ReLU(),
            nn.Linear(self.n_hidden, self.n_hidden),
        )
        self.emlp2 = Sequential(
            nn.Linear(3 * self.n_hidden, self.n_hidden),
            nn.ReLU(),
            nn.Linear(self.n_hidden, self.n_hidden),
        )

        # MLP for edge classification
        self.mlp = nn.Sequential(
            nn.Linear(3 * self.n_hidden, 128),  # src, dest, edge
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def _update_mlp(self):
        """Build a fresh two-layer MLP to use as a GINEConv update network."""
        return Sequential(
            Linear(self.n_hidden, self.n_hidden),
            ReLU(),
            Linear(self.n_hidden, self.n_hidden),
        )

    def forward(self, x, edge_index, edge_attr):
        """Return one raw logit per edge.

        Args:
            x: Node features ``(n_nodes, n_node_feats)``, or ``None`` to use a
                constant all-ones placeholder for every node.
            edge_index: Edge list ``(2, n_edges)``.
            edge_attr: Edge features ``(n_edges, n_edge_feats)``.

        Returns:
            Tensor of shape ``(n_edges,)`` with unnormalised scores.
        """
        # Run on the device that holds the parameters, and move the inputs
        # there BEFORE the first layer touches them.
        model_device = self.node_emb.weight.device
        edge_index = edge_index.to(model_device)
        edge_attr = edge_attr.to(model_device)
        src, dest = edge_index

        if x is None:
            # Placeholder features (constant, not trainable); the width must
            # match what `node_emb` expects.
            n_nodes = int(edge_index.max()) + 1
            x = torch.ones((n_nodes, self.n_node_feats), device=model_device)
        else:
            x = x.to(model_device)

        # Initial embeddings for nodes and edges
        x = self.node_emb(x)
        edge_attr = self.edge_emb(edge_attr)

        # Round 1: node update (with residual), then edge update
        x = x + F.relu(self.gine1(x, edge_index, edge_attr))
        edge_attr = edge_attr + self.emlp1(torch.cat([x[src], x[dest], edge_attr], dim=-1)) / 2

        # Round 2: uses the second GINE layer.
        # NOTE(review): unlike round 1 there is no residual connection here;
        # kept as in the original — confirm whether that is intentional.
        x = F.relu(self.gine2(x, edge_index, edge_attr))
        edge_attr = edge_attr + self.emlp2(torch.cat([x[src], x[dest], edge_attr], dim=-1)) / 2

        # Classify every edge from its endpoint and edge embeddings
        edge_inputs = torch.cat([x[src], x[dest], edge_attr], dim=1)
        edge_logits = self.mlp(edge_inputs).squeeze(1)

        return edge_logits



⏱️ Execution time: 0.0s


Create data loaders - split into batches using `LinkNeighborLoader`, incorporating masking in the loading process & batching

LinkNeighborLoader args:

**num_neighbors:** how many neighbors are sampled per node -- only sampling a subgraph around each edge in a batch. it is size [x,y] because we have 2 layers (sample x nodes in layer 1 and y nodes in layer 2).
- Let’s say your batch contains 100 edges, and each edge touches two nodes (source and destination). Then LinkNeighborLoader will:
  - Identify all unique nodes from those 100 edges
  - For each of those nodes:
      - Sample 10 neighbors (for layer 1)
      - Then, for each of those neighbors, sample another 10 neighbors (for layer 2)
  - Build a mini subgraph for this batch using only those sampled nodes and edges
- Imagine you're doing link prediction for a social network:
  - batch_size = 1024 means you're analyzing 1024 friend requests at a time
  - num_neighbors = [10, 10] means for each person in the request, you look at:
    - Their 10 direct friends
    - And 10 friends-of-friends per direct friend

In [None]:
from torch_geometric.loader import LinkNeighborLoader

batch_size = 8192
# One fan-out per message-passing layer; the model stacks two GINEConv
# layers, so two hops are sampled around every seed edge.
num_neighbors = [10, 10]

# Move data to GPU if using
# NOTE(review): keeping the full graphs on the GPU costs memory that
# mini-batching is meant to save; consider leaving them on the CPU and
# moving each sampled batch to `device` inside the training loop instead.
train_data = train_data.to(device)
val_data = val_data.to(device)
#test_data = test_data.to(device)

# Training loader: seed edges default to every edge of `train_data`
tr_loader = LinkNeighborLoader(
    train_data,
    num_neighbors=num_neighbors,
    batch_size=batch_size,
    shuffle=True
)

# `val_inds` is not defined anywhere earlier in this notebook, so default to
# every edge of `val_data`.
# NOTE(review): if `val_data` also carries training edges for message
# passing, restrict this to the indices of the validation-only edges.
val_inds = torch.arange(val_data.edge_index.size(1), device=val_data.edge_index.device)

val_loader = LinkNeighborLoader(
    val_data,
    num_neighbors=num_neighbors,
    edge_label_index=val_data.edge_index[:, val_inds],  # Only evaluate on these edges
    edge_label=val_data.y[val_inds],
    batch_size=batch_size,
    shuffle=False
)

# te_loader = LinkNeighborLoader(
#     test_data,
#     num_neighbors=num_neighbors,
#     edge_label_index=test_data.edge_index[:, te_inds],
#     edge_label=test_data.y[te_inds],
#     batch_size=batch_size,
#     shuffle=False
# )

In [None]:
from torchmetrics import AveragePrecision, F1Score

# Full-graph loader (batch size=1 because we have one graph)
train_loader = DataLoader([train_data], batch_size=1, shuffle=True)
# LinkNeighborLoader takes the graph object itself (not a list of graphs) and
# one fan-out per message-passing layer — two here, matching the two
# GINEConv layers of the model.
# NOTE(review): a fan-out of 100 per hop produces large subgraphs; lower it
# if GPU memory is tight.
val_loader = LinkNeighborLoader(val_data, num_neighbors=[100, 100], batch_size=8192, shuffle=False)
# test_loader = LinkNeighborLoader(test_data, num_neighbors=[100, 100], batch_size=8192, shuffle=False)

# # Create metrics
# accuracy = Accuracy(task="binary").to(device) # 1/N sum(1(y=yhat))
# recall = Recall(task='binary').to(device) # TP / (TP+FN), or use BinaryRecall class?
# precision = Precision(task="binary").to(device) # TP / (TP + FP)
# auroc = AUROC(task="binary").to(device)
# pr_auc = AveragePrecision(task="binary").to(device)
# f1 = F1Score(task="binary").to(device)

# Initialize model & optimizer. Feature widths follow the selected column
# lists so they cannot drift out of sync with the feature selection.
num_edge_features = len(edge_features)  # Your selected transaction features
num_node_features = len(node_features)
model = EdgeGINE(num_node_features, num_edge_features).to(device)
optimizer = Adam(model.parameters(), lr=0.01)

# Use focal loss to focus on rare positives
class FocalLoss(torch.nn.Module):
    """Focal loss on raw logits for binary targets.

    Each sample's binary cross-entropy is scaled by ``alpha * (1 - pt) ** gamma``,
    where ``pt = exp(-bce)`` is the probability assigned to the true class, so
    confidently-correct samples contribute little. ``alpha`` is applied
    uniformly to both classes (it is a global scale, not a class weight).

    Args:
        gamma: Focusing exponent; ``0`` reduces to ``alpha``-scaled BCE.
        alpha: Constant multiplier on every sample's loss.
    """

    def __init__(self, gamma=2.0, alpha=0.25):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, logits, targets):
        """Return the mean focal loss of ``logits`` against 0/1 ``targets``."""
        # Labels often arrive as integer or bool tensors, which
        # `binary_cross_entropy_with_logits` rejects; match the logits' dtype.
        targets = targets.to(logits.dtype)
        bce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
        pt = torch.exp(-bce_loss)  # Probabilities of correct classification
        focal_loss = self.alpha * (1 - pt) ** self.gamma * bce_loss
        return focal_loss.mean()

class HybridLoss(torch.nn.Module):
    """Hybrid Loss that balances BCE (for accuracy) and Focal Loss (for recall).

    The result is ``(1 - focal_weight) * BCE + focal_weight * focal``, where
    both terms are means over all samples and the focal term scales each
    sample's BCE by ``alpha * (1 - exp(-bce)) ** gamma``.

    Args:
        alpha: Constant multiplier on the focal term.
        gamma: Focusing exponent of the focal term.
        focal_weight: Share of the focal term in the total, in ``[0, 1]``.
    """

    def __init__(self, alpha=0.25, gamma=2.0, focal_weight=0.5):
        super().__init__()
        self.bce = torch.nn.BCEWithLogitsLoss()
        self.alpha = alpha
        self.gamma = gamma
        self.focal_weight = focal_weight  # Weighting factor between BCE and Focal Loss

    def forward(self, logits, targets):
        """Return the weighted sum of mean BCE and mean focal loss."""
        targets = targets.float()  # accept integer / bool label tensors

        # BCE term (mean over samples)
        bce_loss = self.bce(logits, targets)

        # Focal term: per-sample BCE, down-weighted where the model is
        # already confident in the correct class
        bce_loss_per_sample = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
        focal_loss = self.alpha * (1 - torch.exp(-bce_loss_per_sample)) ** self.gamma * bce_loss_per_sample
        focal_loss = focal_loss.mean()

        # Combine BCE and Focal Loss
        return (1 - self.focal_weight) * bce_loss + self.focal_weight * focal_loss


#criterion = FocalLoss(gamma=2.0, alpha=0.25)
# `pos_weight` multiplies only the loss of positive (laundering) samples,
# which is what counters the class imbalance. `weight` with a single value
# would rescale every sample equally and leave the balance unchanged.
criterion = BCEWithLogitsLoss(pos_weight=torch.tensor([6.0], device=device))
#criterion = HybridLoss(focal_weight=0.3)  # Adjust weight (0.3-0.6 works well)


⏱️ Execution time: 0.01s


In [None]:
from torchmetrics.functional import accuracy, recall, precision, auroc, average_precision, f1_score

def _binary_edge_metrics(probs, labels, threshold):
    """Return (accuracy, recall, precision, AUROC, PR-AUC, F1) for edge scores."""
    labels = labels.long()
    hard_preds = (probs > threshold).long()
    return (
        accuracy(hard_preds, labels, task="binary"),
        recall(hard_preds, labels, task="binary"),
        precision(hard_preds, labels, task="binary"),
        auroc(probs, labels, task="binary"),
        average_precision(probs, labels, task="binary"),
        f1_score(hard_preds, labels, task="binary"),
    )


def _run_epoch(model, loader, criterion, mask, run_device, optimizer=None):
    """Make one pass over `loader`; update the weights only when `optimizer` is given.

    Returns the example-weighted mean loss plus the concatenated probabilities
    and labels (on the CPU) of every edge that entered the loss.
    """
    # The notebook's import cell only brings in `trange`, so import the
    # progress-bar wrapper here.
    from tqdm import tqdm

    training = optimizer is not None
    model.train(training)

    total_loss, total_examples = 0.0, 0
    all_probs, all_labels = [], []

    with torch.set_grad_enabled(training):
        for batch in tqdm(loader, leave=False):
            batch = batch.to(run_device)
            if training:
                optimizer.zero_grad()

            # assumes `batch.y` holds one label per edge of `batch.edge_index`
            # — TODO confirm for neighbor-sampled batches
            logits = model(batch.x, batch.edge_index, batch.edge_attr)
            labels = batch.y
            if mask is not None:
                logits, labels = logits[mask], labels[mask]

            loss = criterion(logits, labels.float())
            if training:
                # One optimisation step per batch
                loss.backward()
                optimizer.step()

            n_examples = labels.numel()
            total_loss += loss.item() * n_examples
            total_examples += n_examples
            # Keep only detached CPU copies so GPU memory is released
            # between batches.
            all_probs.append(torch.sigmoid(logits.detach()).cpu())
            all_labels.append(labels.detach().cpu())

    if total_examples == 0:
        raise ValueError("loader produced no labelled edges")

    return total_loss / total_examples, torch.cat(all_probs), torch.cat(all_labels)


# Training function: one optimisation pass over `tr_loader` and one
# gradient-free evaluation pass over `val_loader` per epoch
def train(model, optimizer, criterion, tr_loader, val_loader, train_mask=None, val_mask=None, test_mask=None, threshold=0.5, epochs=20, log_every=50):
    """Train `model` for edge classification and periodically print metrics.

    Args:
        model: Module called as ``model(x, edge_index, edge_attr)`` that
            returns one logit per edge.
        optimizer: Optimizer over ``model.parameters()``.
        criterion: Loss called as ``criterion(logits, float_targets)``.
        tr_loader: Iterable of training batches exposing ``x``, ``edge_index``,
            ``edge_attr``, ``y`` and ``to(device)``.
        val_loader: Iterable of validation batches with the same attributes.
        train_mask: Optional index/mask applied to the edges of EVERY training
            batch before the loss. Meant for full-graph loaders; leave ``None``
            for sampled mini-batches.
        val_mask: Same as `train_mask`, for validation batches.
        test_mask: Accepted for backward compatibility; currently unused.
        threshold: Probability cut-off used for the hard-label metrics.
        epochs: Number of passes over `tr_loader`.
        log_every: Print metrics on every `log_every`-th epoch, starting with
            the first.

    Raises:
        ValueError: If `log_every` is smaller than 1, or a loader yields no
            labelled edges.
    """
    if log_every < 1:
        raise ValueError("log_every must be a positive integer")

    # Batches are moved to wherever the model's parameters live
    run_device = next(model.parameters()).device

    for epoch in range(epochs):
        train_loss, train_probs, train_labels = _run_epoch(
            model, tr_loader, criterion, train_mask, run_device, optimizer
        )
        val_loss, val_probs, val_labels = _run_epoch(
            model, val_loader, criterion, val_mask, run_device
        )

        if epoch % log_every == 0:
            # Metrics are only needed for the log line, so compute them lazily
            train_acc, train_rec, train_prec, train_auroc, train_pr_auc, train_f1 = _binary_edge_metrics(
                train_probs, train_labels, threshold
            )
            val_acc, val_rec, val_prec, val_auroc, val_pr_auc, val_f1 = _binary_edge_metrics(
                val_probs, val_labels, threshold
            )
            print(f"Epoch {epoch+1}/{epochs} "
                  f"Train Loss: {train_loss:.4f}  | Val Loss: {val_loss:.4f} |  "
                  f"Train Acc: {train_acc:.4f}  | Val Acc: {val_acc:.4f} |  "
                  f"Train F1: {train_f1:.4f}  | Val F1: {val_f1:.4f} |  "
                  f"Train PR-AUC: {train_pr_auc:.4f}  | Val PR-AUC: {val_pr_auc:.4f} |  "
                  f"Train Prec: {train_prec:.4f}  | Val Prec: {val_prec:.4f} |  "
                  f"Train Rec: {train_rec:.4f}  | Val Rec: {val_rec:.4f} |  ")



⏱️ Execution time: 0.0s


In [None]:
# Run Training
# Arguments follow the order of `train`'s signature: the two loaders first,
# then the three masks. The masks are None because the loaders yield sampled
# mini-batches, for which a full-graph mask does not apply.
# NOTE(review): 25000 epochs was chosen for full-graph training; with
# mini-batch loaders each epoch is a full pass, so a much smaller value is
# probably wanted.
train(model, optimizer, criterion, tr_loader, val_loader, None, None, None, epochs=25000)

OutOfMemoryError: CUDA out of memory. Tried to allocate 264.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 30.12 MiB is free. Process 15620 has 14.71 GiB memory in use. Of the allocated memory 12.75 GiB is allocated by PyTorch, and 1.83 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


⏱️ Execution time: 0.3s


## EdgeConv

In [None]:
# from torch_geometric.nn import EdgeConv
# from torch.nn import Linear

# class EdgeConvGNN(nn.Module):
#     def __init__(self, num_node_features, n_edge_feats, n_hidden=64):
#         super(EdgeConvGNN, self).__init__()
#         self.n_edge_feats = n_edge_feats
#         self.n_hidden = n_hidden

#         self.edge_conv1 = EdgeConv(Sequential(Linear(2 * num_node_features, self.n_hidden), ReLU()))
#         self.edge_conv2 = EdgeConv(Sequential(Linear(2 * self.n_hidden, self.n_hidden), ReLU()))

#         self.mlp = nn.Sequential(
#             nn.Linear(2 * self.n_hidden + self.n_edge_feats, 128),
#             nn.ReLU(),
#             nn.Linear(128, 64),
#             nn.ReLU(),
#             nn.Linear(64, 1),
#         )

#     def forward(self, x, edge_index, edge_attr):
#         x = self.edge_conv1(x, edge_index)
#         x = F.relu(x)
#         x = self.edge_conv2(x, edge_index)

#         src, dest = edge_index
#         src_embed = x[src]
#         dest_embed = x[dest]

#         edge_inputs = torch.cat([src_embed, dest_embed, edge_attr], dim=1)
#         edge_logits = self.mlp(edge_inputs).squeeze(1)

#         return edge_logits
