# Practice GNN

## Notebook configuration

In [1]:
import random
import networkx as nx
import pandas as pd
import numpy as np
import ipywidgets as widgets
import os
import sys
import matplotlib.pyplot as plt
import warnings
from tabulate import tabulate
from tqdm import trange
from IPython import get_ipython
from IPython.display import display
from time import monotonic
from pprint import pprint
from google.colab import drive

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss, Sequential, Linear, ReLU
!pip install torch-geometric
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, EdgeConv, GINEConv
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
!pip install torchmetrics
from torchmetrics.classification import Recall, Accuracy, AUROC, Precision
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

content_base = "/content/drive"
drive.mount(content_base)

# Project data
data_dir = os.path.join(content_base, "My Drive/Capstone/data")
data_file = os.path.join(data_dir, "HI-Small_25.csv")

# # Project Source Code
# src_path = os.path.join(content_base, "My Drive/Capstone/src")
# sys.path.append(src_path)
# from helpers import add_cell_timer
# from pipeline import ModelPipeline

# add_cell_timer()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Google Colaboratory executes in an environment with a file system
# that has a Linux topography, but where the user should work under
# the `/content` directory
COLAB_ROOT = "/content"

REPO_URL = "https://github.com/engie4800/dsi-capstone-spring-2025-TD-anti-money-laundering.git"
REPO_ROOT = os.path.join(COLAB_ROOT, REPO_URL.split("/")[-1].split(".")[0])
REPO_BRANCH = "sophie"

# Clones the repository at `/content/dsi-capstone-spring-2025-TD-anti-money-laundering`
if not os.path.exists(REPO_ROOT):
  os.chdir(COLAB_ROOT)
  !git clone {REPO_URL}

# Pulls the latest code from the provided branch and adds the
# analysis pipeline source code to the Python system path
os.chdir(REPO_ROOT)
!git pull
!git checkout {REPO_BRANCH}
sys.path.append(os.path.join(REPO_ROOT, "Code/src"))
os.chdir(COLAB_ROOT)

Cloning into 'dsi-capstone-spring-2025-TD-anti-money-laundering'...
remote: Enumerating objects: 507, done.[K
remote: Counting objects: 100% (131/131), done.[K
remote: Compressing objects: 100% (80/80), done.[K
remote: Total 507 (delta 70), reused 72 (delta 44), pack-reused 376 (from 1)[K
Receiving objects: 100% (507/507), 25.98 MiB | 16.34 MiB/s, done.
Resolving deltas: 100% (237/237), done.
Already up to date.
Branch 'sophie' set up to track remote branch 'sophie' from 'origin'.
Switched to a new branch 'sophie'


In [3]:
from helpers import add_cell_timer
from pipeline import ModelPipeline
add_cell_timer()

## Data preprocessing

Run initial full-dataset preprocessing

In [4]:
# Initialize pipeline with dataset
pl = ModelPipeline(data_file)
pl.run_preprocessing()


⏱️ Execution time: 94.93s


In [9]:
# X_cols = ['from_bank', 'to_bank', 'received_amount', 'received_currency',
#        'sent_amount', 'sent_currency', 'payment_type',
#        'from_account_id', 'to_account_id', 'from_account_idx',
#        'to_account_idx', 'sent_amount_usd', 'received_amount_usd',
#        'hour_of_day', 'day_of_week', 'seconds_since_midnight', 'timestamp_int',
#        'timestamp_scaled', 'day_sin', 'day_cos', 'time_of_day_sin',
#        'time_of_day_cos', 'is_weekend']
X_cols = ['from_account_idx', 'to_account_idx','from_bank','received_amount', 'sent_amount',
          'sent_currency', 'payment_type','day_of_week','timestamp_int']
y_col = 'is_laundering'
X_train, X_val, X_test, y_train, y_val, y_test = pl.split_train_test_val(X_cols, y_col, test_size=0.15, val_size=0.15, split_type='temporal_agg')

Data split using temporal_agg method.
Remember to mask labels in GNN evaluation.
 - Train: no mask 
 - Val: mask y_lab[:t1] (only evaluate labels y_lab[t1:t2]) 
 - Test: mask y_lab[:t2] (only evaluate labels y_lab[t2:])

⏱️ Execution time: 1.23s


In [11]:
edge_features = ['received_amount', 'sent_amount', 'sent_currency',
                 'payment_type','day_of_week','timestamp_int']
node_features = ['from_bank']
train_data, val_data, test_data = pl.generate_tensors(edge_features, node_features)


⏱️ Execution time: 0.17s


## GNNs

Cannot use GCN or GAT!

* The Graph Convolutional Network (GCN), implemented with `GCNConv`, only aggregates features from neighboring nodes and does not use edge attributes in its message passing.
* Graph Attention Networks (GAT), implemented with `GATConv`, allows edge attention weights, which can indirectly incorporate edge attributes. Problem: If all nodes have the same feature vector (e.g., initialized to 1), then the computed attention scores will be the same for all edges. We'd need to modify GAT to use edge features meaningfully in the attention computation.

`GINeConv`
* Directly includes edge attributes in message passing using an MLP-based edge transformation.

`EdgeConv`
* dynamically computes edge embeddings and updates node features based on edges



### GINeConv

Modified GINConv that includes edge features in message passing.
Update rule is:
h (l+1) = h (l) + sum (MLP(h(l + e


In [22]:
# If on GPU, do as below
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda

⏱️ Execution time: 0.0s


In [23]:
class EdgeGINE(nn.Module):
    def __init__(self, n_node_feats, n_edge_feats, n_hidden=64):
        super(EdgeGINE, self).__init__()

        self.n_hidden = n_hidden
        self.n_node_feats = n_node_feats
        self.n_edge_feats = n_edge_feats

        # MLP to embed node and edges
        self.node_emb = nn.Linear(self.n_node_feats, self.n_hidden)
        self.edge_emb = nn.Linear(self.n_edge_feats, self.n_hidden)

        # MLP that processes edge features, passed into GINEConv
        nn_edge = Sequential(Linear(self.n_hidden, self.n_hidden), ReLU(), Linear(self.n_hidden, self.n_hidden))

        # Two GINEConv layers using nn_edge when it needs to process edge attributes
        self.gine1 = GINEConv(nn_edge, edge_dim=self.n_hidden, train_eps=True)
        self.gine2 = GINEConv(nn_edge, edge_dim=self.n_hidden, train_eps=True)

        # MLP for edge classification
        self.mlp = nn.Sequential(
            nn.Linear(3 * self.n_hidden, 128), # src, dest, edge
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, x, edge_index, edge_attr):
        """
        x: Node features (or placeholder embeddings if None)
        edge_index: Edge list (2, n_edges)
        edge_attr: Edge features (n_edges, self.n_edge_feats)
        """
        if x is None:  # If no node features, use trainable embeddings
            x = torch.ones((edge_index.max().item() + 1, 1), device=device)

        x = self.node_emb(x) # MLP
        edge_attr = self.edge_emb(edge_attr) # MLP
        x, edge_attr, edge_index = x.to(device), edge_attr.to(device), edge_index.to(device)

        # Pass nodes and edges through GINE layers
        x = self.gine1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.gine2(x, edge_index, edge_attr)

        # Get output for classification
        src, dest = edge_index
        src_embed = x[src]
        dest_embed = x[dest]

        edge_inputs = torch.cat([src_embed, dest_embed, edge_attr], dim=1)
        edge_logits = self.mlp(edge_inputs).squeeze(1)

        return edge_logits



⏱️ Execution time: 0.0s


In [49]:
from torchmetrics import AveragePrecision, F1Score

# Move data to GPU if using
# train_data = train_data.to(device)
# val_data = val_data.to(device)
test_data = test_data.to(device)

# Create DataLoader (batch size=1 because we have one graph)
# train_loader = DataLoader([train_data], batch_size=1, shuffle=True)
# val_loader = DataLoader([val_data], batch_size=1, shuffle=False)
test_loader = DataLoader([test_data], batch_size=1, shuffle=False)

# # Create metrics
# accuracy = Accuracy(task="binary").to(device) # 1/N sum(1(y=yhat))
# recall = Recall(task='binary').to(device) # TP / (TP+FN), or use BinaryRecall class?
# precision = Precision(task="binary").to(device) # TP / (TP + FP)
# auroc = AUROC(task="binary").to(device)
# pr_auc = AveragePrecision(task="binary").to(device)
# f1 = F1Score(task="binary").to(device)

# Initialize model & optimizer
num_edge_features = len(edge_features)  # Your selected transaction features
num_node_features = 1
model = EdgeGINE(num_node_features, num_edge_features).to(device)
optimizer = Adam(model.parameters(), lr=0.01)

# Use focal loss to focus on rare positives
class FocalLoss(torch.nn.Module):
    def __init__(self, gamma=2.0, alpha=0.25):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, logits, targets):
        bce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
        pt = torch.exp(-bce_loss)  # Probabilities of correct classification
        focal_loss = self.alpha * (1 - pt) ** self.gamma * bce_loss
        return focal_loss.mean()

class HybridLoss(torch.nn.Module):
    """Hybrid Loss that balances BCE (for accuracy) and Focal Loss (for recall)"""
    def __init__(self, alpha=0.25, gamma=2.0, focal_weight=0.5):
        super().__init__()
        self.bce = torch.nn.BCEWithLogitsLoss()
        self.alpha = alpha
        self.gamma = gamma
        self.focal_weight = focal_weight  # Weighting factor between BCE and Focal Loss

    def forward(self, logits, targets):
        # BCE Loss
        bce_loss = self.bce(logits, targets.float())

        # Focal Loss
        probs = torch.sigmoid(logits)
        bce_loss_per_sample = F.binary_cross_entropy_with_logits(logits, targets.float(), reduction="none")
        focal_loss = self.alpha * (1 - torch.exp(-bce_loss_per_sample)) ** self.gamma * bce_loss_per_sample
        focal_loss = focal_loss.mean()

        # Combine BCE and Focal Loss
        total_loss = (1 - self.focal_weight) * bce_loss + self.focal_weight * focal_loss
        return total_loss


#criterion = FocalLoss(gamma=2.0, alpha=0.25)
#criterion = BCEWithLogitsLoss(weight=torch.tensor([50.0],device=device))
criterion = HybridLoss(focal_weight=0.3)  # Adjust weight (0.3-0.6 works well)


⏱️ Execution time: 0.01s


In [31]:
t1 = int((1-(0.3))*len(pl.df))
t2 = int((1-0.15)*len(pl.df))

# Create label masks for validation and test
train_mask = torch.ones(len(y_test), dtype=torch.bool)  # Use all training labels
val_mask = torch.zeros(len(y_test), dtype=torch.bool)
test_mask = torch.zeros(len(y_test), dtype=torch.bool)

# Unmask only the correct labels
val_mask[t1:t2] = True  # Only evaluate labels y_val[t1:t2]
test_mask[t2:] = True    # Only evaluate labels y_test[t2:]


⏱️ Execution time: 0.0s


In [50]:
from torchmetrics.functional import accuracy, recall, precision, auroc, average_precision, f1_score

# Training function with one forward pass for training and validation
def train(model, optimizer, criterion, data, train_mask, val_mask, test_mask, threshold=0.5, epochs=20):
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        # **Forward pass once on the entire graph**
        logits = model(data.x.to(device), data.edge_index.to(device), data.edge_attr.to(device))
        probs = torch.sigmoid(logits)  # Convert logits to probabilities
        preds = (probs > threshold).long()  # Convert to binary predictions


        # **Training loss (only for training labels)**
        train_loss = criterion(logits[train_mask], data.y[train_mask].float())

        # **Backward pass & optimization**
        train_loss.backward()
        optimizer.step()

        # **Validation loss (computed without gradient updates)**
        with torch.no_grad():
            val_loss = criterion(logits[val_mask], data.y[val_mask].float())

        # **Compute train, val, and test metrics using the same model output**
        def compute_metrics(preds, data, mask):
          acc = accuracy(preds[mask], data.y[mask], task="binary")
          rec = recall(preds[mask], data.y[mask], task="binary")
          prec = precision(preds[mask], data.y[mask], task="binary")
          auroc_score = auroc(probs[mask], data.y[mask], task="binary")
          pr_auc_score = average_precision(probs[mask], data.y[mask], task="binary")
          f1 = f1_score(preds[mask], data.y[mask], task="binary")
          return acc, rec, prec, auroc_score, pr_auc_score, f1

        train_acc, train_rec, train_prec, train_auroc, train_pr_auc, train_f1 = compute_metrics(preds, data, train_mask)
        val_acc, val_rec, val_prec, val_auroc, val_pr_auc, val_f1 = compute_metrics(preds, data, val_mask)
        test_acc, test_rec, test_prec, test_auroc, test_pr_auc, test_f1 = compute_metrics(preds, data, test_mask)

        if epoch%50==0:
              print(f"Epoch {epoch+1}/{epochs} "
                f"Train Loss: {train_loss.item():.4f}  | Val Loss: {val_loss.item():.4f} |  "
                f"Train Acc: {train_acc:.4f}  | Val Acc: {val_acc:.4f}  | Test Acc: {test_acc:.4f} |  "
                f"Train F1: {train_f1:.4f}  | Val F1: {val_f1:.4f}  | Test F1: {test_f1:.4f} |  "
                f"Train PR-AUC: {train_pr_auc:.4f}  | Val PR-AUC: {val_pr_auc:.4f} | Test PR-AUC: {test_pr_auc:.4f} |  "
                f"Train Prec: {train_prec:.4f}  | Val Prec: {val_prec:.4f}  | Test Prec: {test_prec:.4f} |  "
                f"Train Rec: {train_rec:.4f}  | Val Rec: {val_rec:.4f}  | Test Rec: {test_rec:.4f} |  ")



⏱️ Execution time: 0.0s


In [None]:
# Run Training
train(model, optimizer, criterion, test_data, train_mask, val_mask, test_mask, epochs=5000)

Epoch 1/5000 Train Loss: 9659974.0000  | Val Loss: 10246719.0000 |  Train Acc: 0.9595  | Val Acc: 0.9575  | Test Acc: 0.9660 |  Train F1: 0.0028  | Val F1: 0.0015  | Test F1: 0.0080 |  Train PR-AUC: 0.0010  | Val PR-AUC: 0.0013 | Test PR-AUC: 0.0022 |  Train Prec: 0.0014  | Val Prec: 0.0008  | Test Prec: 0.0042 |  Train Rec: 0.0549  | Val Rec: 0.0243  | Test Rec: 0.0653 |  
Epoch 51/5000 Train Loss: 431809.8125  | Val Loss: 549046.7500 |  Train Acc: 0.9990  | Val Acc: 0.9987  | Test Acc: 0.9979 |  Train F1: 0.0000  | Val F1: 0.0000  | Test F1: 0.0000 |  Train PR-AUC: 0.0010  | Val PR-AUC: 0.0013 | Test PR-AUC: 0.0021 |  Train Prec: 0.0000  | Val Prec: 0.0000  | Test Prec: 0.0000 |  Train Rec: 0.0000  | Val Rec: 0.0000  | Test Rec: 0.0000 |  
Epoch 101/5000 Train Loss: 2203918.5000  | Val Loss: 2836762.7500 |  Train Acc: 0.9990  | Val Acc: 0.9987  | Test Acc: 0.9979 |  Train F1: 0.0000  | Val F1: 0.0000  | Test F1: 0.0000 |  Train PR-AUC: 0.0010  | Val PR-AUC: 0.0013 | Test PR-AUC: 0.00

## EdgeConv

In [None]:
# from torch_geometric.nn import EdgeConv
# from torch.nn import Linear

# class EdgeConvGNN(nn.Module):
#     def __init__(self, num_node_features, self.n_edge_feats, self.n_hidden=64):
#         super(EdgeConvGNN, self).__init__()

#         self.edge_conv1 = EdgeConv(Sequential(Linear(2 * num_node_features, self.n_hidden), ReLU()))
#         self.edge_conv2 = EdgeConv(Sequential(Linear(2 * self.n_hidden, self.n_hidden), ReLU()))

#         self.mlp = nn.Sequential(
#             nn.Linear(2 * self.n_hidden + self.n_edge_feats, 128),
#             nn.ReLU(),
#             nn.Linear(128, 64),
#             nn.ReLU(),
#             nn.Linear(64, 1),
#         )

#     def forward(self, x, edge_index, edge_attr):
#         x = selfself.n_hiddennv1(x, edge_index)
#         x = F.relu(x)
#         x = self.edge_conv2(x, edge_index)

#         src, dest = edge_index
#         src_embed = x[src]
#         dest_embed = x[dest]

#         edge_inputs = torch.cat([src_embed, dest_embed, edge_attr], dim=1)
#         edge_logits = self.mlp(edge_inputs).squeeze(1)

#         return edge_logits
