# Practice GNN

## Notebook configuration

In [1]:
import random
import networkx as nx
import pandas as pd
import numpy as np
import ipywidgets as widgets
import os
import sys
import matplotlib.pyplot as plt
import warnings
from tabulate import tabulate
from tqdm import trange
from IPython import get_ipython
from IPython.display import display
from time import monotonic
from pprint import pprint
from google.colab import drive
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss, Sequential, Linear, ReLU
!pip install torch==2.5.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install torch-scatter torch-sparse pyg-lib torch-geometric \
  -f https://data.pyg.org/whl/torch-2.5.1+cu118.html
from torch_geometric.nn import GINEConv
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader, LinkNeighborLoader
!pip install torchmetrics
from torchmetrics.classification import BinaryAccuracy, BinaryPrecision, BinaryRecall, BinaryF1Score, BinaryAveragePrecision
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

content_base = "/content/drive"
drive.mount(content_base)

# Project data
data_dir = os.path.join(content_base, "My Drive/Capstone/data")
data_file = os.path.join(data_dir, "subset_transactions2.csv")

Looking in indexes: https://download.pytorch.org/whl/cu118
Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu118.html
Collecting torchmetrics
  Downloading torchmetrics-1.7.0-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.14.2-py3-none-any.whl.metadata (5.6 kB)
Downloading torchmetrics-1.7.0-py3-none-any.whl (960 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m960.9/960.9 kB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.14.2-py3-none-any.whl (28 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.14.2 torchmetrics-1.7.0
Mounted at /content/drive


In [2]:
# Google Colaboratory executes in an environment with a file system
# that has a Linux topography, but where the user should work under
# the `/content` directory
COLAB_ROOT = "/content"

REPO_URL = "https://github.com/engie4800/dsi-capstone-spring-2025-TD-anti-money-laundering.git"
REPO_ROOT = os.path.join(COLAB_ROOT, REPO_URL.split("/")[-1].split(".")[0])
REPO_BRANCH = "sophie"

# Clones the repository at `/content/dsi-capstone-spring-2025-TD-anti-money-laundering`
if not os.path.exists(REPO_ROOT):
  os.chdir(COLAB_ROOT)
  !git clone {REPO_URL}

# Pulls the latest code from the provided branch and adds the
# analysis pipeline source code to the Python system path
os.chdir(REPO_ROOT)
!git pull
!git checkout {REPO_BRANCH}
sys.path.append(os.path.join(REPO_ROOT, "Code/src"))
os.chdir(COLAB_ROOT)

Cloning into 'dsi-capstone-spring-2025-TD-anti-money-laundering'...
remote: Enumerating objects: 622, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (59/59), done.[K
remote: Total 622 (delta 47), reused 35 (delta 33), pack-reused 529 (from 2)[K
Receiving objects: 100% (622/622), 26.07 MiB | 16.01 MiB/s, done.
Resolving deltas: 100% (311/311), done.
Already up to date.
Branch 'sophie' set up to track remote branch 'sophie' from 'origin'.
Switched to a new branch 'sophie'


In [3]:
from helpers import add_cell_timer
from pipeline import ModelPipeline
add_cell_timer()

## Data preprocessing

Run initial full-dataset preprocessing

In [4]:
pl = ModelPipeline(data_file)
pl.run_preprocessing()


⏱️ Execution time: 25.78s


In [5]:
node_features = [
    # TODO
    # A list of tuples with this structure >>>
    # (column to include, treatment/method, column rename)

    ('from_bank', 'first', None),
]

pl.extract_nodes(node_features, add_graph_features=False)


⏱️ Execution time: 0.01s


In [6]:
X_cols = ['from_bank', 'to_bank', 'received_amount', 'received_currency',
       'sent_amount', 'sent_currency', 'payment_type', 'from_account_idx',
       'to_account_idx', 'sent_amount_usd', 'received_amount_usd',
       'hour_of_day', 'day_of_week', 'seconds_since_midnight', 'timestamp_int',
       'timestamp_scaled', 'day_sin', 'day_cos', 'time_of_day_sin',
       'time_of_day_cos', 'is_weekend']
y_col = 'is_laundering'
X_train, X_val, X_test, y_train, y_val, y_test = pl.split_train_test_val(X_cols, y_col, test_size=0.15, val_size=0.15, split_type='temporal_agg')


⏱️ Execution time: 1.65s


In [7]:
numerical_feats = ['sent_amount_usd', 'received_amount_usd', 'timestamp_scaled']
X_train, X_test, X_val = pl.numerical_scaling(numerical_feats)


⏱️ Execution time: 0.12s


In [8]:
edge_features = ['received_amount', 'received_currency','sent_amount',
                 'sent_currency', 'payment_type', 'sent_amount_usd',
                 'hour_of_day', 'day_of_week', 'seconds_since_midnight',
                 'timestamp_scaled']
node_features = ['from_bank'] #,'degree_centrality_sent_amount','pagerank_sent_amount', 'degree_centrality_received_amount', 'pagerank_received_amount']
train_data, val_data, test_data = pl.generate_tensors(edge_features,node_features)


⏱️ Execution time: 0.2s


## GNNs

Cannot use GCN or GAT!

* The Graph Convolutional Network (GCN), implemented with `GCNConv`, only aggregates features from neighboring nodes and does not use edge attributes in its message passing.
* Graph Attention Networks (GAT), implemented with `GATConv`, allows edge attention weights, which can indirectly incorporate edge attributes. Problem: If all nodes have the same feature vector (e.g., initialized to 1), then the computed attention scores will be the same for all edges. We'd need to modify GAT to use edge features meaningfully in the attention computation.

`GINeConv`
* Directly includes edge attributes in message passing using an MLP-based edge transformation.

`EdgeConv`
* dynamically computes edge embeddings and updates node features based on edges

**We'll be using GINeConv moving forward.**

In [9]:
# If on GPU, do as below
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda

⏱️ Execution time: 0.0s


### Model

In [10]:
class EdgeGINE(nn.Module):
    def __init__(self, n_node_feats, n_edge_feats, n_hidden=64):
        super(EdgeGINE, self).__init__()

        self.n_hidden = n_hidden
        self.n_node_feats = n_node_feats
        self.n_edge_feats = n_edge_feats

        # Linear to embed node and edges
        self.node_emb = nn.Linear(self.n_node_feats, self.n_hidden) # [num_nodes, n_hidden]
        self.edge_emb = nn.Linear(self.n_edge_feats, self.n_hidden) # [num_edges, n_hidden]

        # MLP that processes edge features, passed into GINEConv
        nn_edge = Sequential(Linear(self.n_hidden, self.n_hidden), ReLU(), Linear(self.n_hidden, self.n_hidden))

        # Two GINEConv layers using nn_edge when it needs to process edge attributes
        self.gine1 = GINEConv(nn_edge, edge_dim=self.n_hidden, train_eps=True)
        self.gine2 = GINEConv(nn_edge, edge_dim=self.n_hidden, train_eps=True)

        # Edge updates MLPs
        self.emlp1 = Sequential(
                nn.Linear(3 * self.n_hidden, self.n_hidden),
                nn.ReLU(),
                nn.Linear(self.n_hidden, self.n_hidden),
            )
        self.emlp2 = Sequential(
                nn.Linear(3 * self.n_hidden, self.n_hidden),
                nn.ReLU(),
                nn.Linear(self.n_hidden, self.n_hidden),
            )

        # MLP for edge classification
        self.mlp = nn.Sequential(
            nn.Linear(3 * self.n_hidden, 128), # src, dest, edge
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, x, edge_index, edge_attr):
        """
        x: Node features (or placeholder embeddings if None)
        edge_index: Edge list (2, n_edges)
        edge_attr: Edge features (n_edges, self.n_edge_feats)
        """
        src, dest = edge_index
        if x is None:  # If no node features, use trainable embeddings
            x = torch.ones((edge_index.max().item() + 1, 1), device=device)

        # Create some initial embeddings for nodes and edges
        x = self.node_emb(x) # MLP
        edge_attr = self.edge_emb(edge_attr) # MLP
        x, edge_attr, edge_index = x.to(device), edge_attr.to(device), edge_index.to(device)

        # Pass nodes and edges through GINE layer1
        x = x + F.relu(self.gine1(x, edge_index, edge_attr))

        # Update edges with MLP1
        edge_attr = edge_attr + self.emlp1(torch.cat([x[src], x[dest], edge_attr], dim=-1)) / 2

        # Pass nodes and edges through GINE layer2
        x = F.relu(self.gine1(x, edge_index, edge_attr))

        # Update edges with MLP2
        edge_attr = edge_attr + self.emlp2(torch.cat([x[src], x[dest], edge_attr], dim=-1)) / 2

        # Get output for classification
        src_embed, dest_embed = x[src], x[dest]
        edge_inputs = torch.cat([src_embed, dest_embed, edge_attr], dim=1)
        edge_logits = self.mlp(edge_inputs).squeeze(1)

        return edge_logits



⏱️ Execution time: 0.0s


### Create data loaders with `LinkNeighborLoader`
Goal: Create data loaders - split into batches using `LinkNeighborLoader`, incorporating masking in the loading process & batching

**LinkNeighborLoader:**

**num_neighbors:** how many neighbors are sampled per node -- only sampling a subgraph around each edge in a batch. it is size [x,y] because we have 2 layers (sample x nodes in layer 1 and y nodes in layer 2).
- Let’s say your batch contains 100 edges, and each edge touches two nodes (source and destination). Then LinkNeighborLoader will:
  - Identify all unique nodes from those 100 edges
  - For each of those nodes:
      - Sample 10 neighbors (for layer 1)
      - Then, for each of those neighbors, sample another 10 neighbors (for layer 2)
  - Build a mini subgraph for this batch using only those sampled nodes and edges
- Imagine you're doing link prediction for a social network:
  - batch_size = 1024 means you're analyzing 1024 friend requests at a time
  - num_neighbors = [10, 10] means for each person in the request, you look at:
    - Their 10 direct friends
    - And 10 friends-of-friends per direct friend

In [11]:
# Move data to GPU if using
tr_data = train_data.to(device)
val_data = val_data.to(device)
te_data = test_data.to(device)


⏱️ Execution time: 0.3s


In [21]:
batch_size=8192
num_neighbors=[100,100]

t1 = int(len(pl.df) * 0.7)
t2 = int(len(pl.df) * 0.85)

val_inds = torch.tensor(np.arange(t1, t2), device=device)
te_inds = torch.tensor(np.arange(t2, len(pl.df)), device=device)

tr_loader = LinkNeighborLoader(tr_data, edge_label_index=tr_data.edge_index, edge_label=tr_data.y,
                               num_neighbors=num_neighbors, batch_size=batch_size, shuffle=True)
val_loader = LinkNeighborLoader(val_data,num_neighbors=num_neighbors, edge_label_index=val_data.edge_index[:, val_inds],
                                edge_label=val_data.y[val_inds], batch_size=batch_size, shuffle=False)
te_loader =  LinkNeighborLoader(te_data,num_neighbors=num_neighbors, edge_label_index=te_data.edge_index[:, te_inds],
                        edge_label=te_data.y[te_inds], batch_size=batch_size, shuffle=False)


⏱️ Execution time: 0.06s


In [18]:
# Initialize model & optimizer
num_edge_features = len(edge_features)  # Your selected transaction features
num_node_features = 1
model = EdgeGINE(num_node_features, num_edge_features).to(device)
optimizer = Adam(model.parameters(), lr=0.01)
criterion = BCEWithLogitsLoss(weight=torch.tensor([6.0], device=device))

print(sum(p.numel() for p in model.parameters() if p.requires_grad))
model

83523


EdgeGINE(
  (node_emb): Linear(in_features=1, out_features=64, bias=True)
  (edge_emb): Linear(in_features=10, out_features=64, bias=True)
  (gine1): GINEConv(nn=Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
  ))
  (gine2): GINEConv(nn=Sequential(
    (0): Linear(in_features=64, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
  ))
  (emlp1): Sequential(
    (0): Linear(in_features=192, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
  )
  (emlp2): Sequential(
    (0): Linear(in_features=192, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
  )
  (mlp): Sequential(
    (0): Linear(in_features=192, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (


⏱️ Execution time: 0.02s


### Train model

In [19]:
def train(model, optimizer, criterion, tr_loader, val_loader, threshold=0.5, epochs=20):

    # Metrics
    acc_fn = BinaryAccuracy(threshold=threshold).to(device)
    prec_fn = BinaryPrecision(threshold=threshold).to(device)
    rec_fn = BinaryRecall(threshold=threshold).to(device)
    f1_fn = BinaryF1Score(threshold=threshold).to(device)
    pr_auc_fn = BinaryAveragePrecision().to(device)

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        train_preds, train_targets, train_probs = [], [], []

        for batch in tqdm(tr_loader, desc=f"Epoch {epoch+1} Training"):
            batch = batch.to(device)
            optimizer.zero_grad()

            logits = model(batch.x, batch.edge_index, batch.edge_attr)
            logits = logits[batch.edge_label_index[0]]
            loss = criterion(logits, batch.edge_label.float())
            loss.backward()
            optimizer.step()

            probs = torch.sigmoid(logits)
            preds = (probs > threshold).long()

            train_loss += loss.item() * logits.size(0)
            train_preds.append(preds)
            train_targets.append(batch.edge_label)
            train_probs.append(probs)

        # Concatenate all training results
        train_preds = torch.cat(train_preds)
        train_targets = torch.cat(train_targets)
        train_probs = torch.cat(train_probs)
        train_loss /= len(train_targets)

        # Compute training metrics
        train_acc = acc_fn(train_preds, train_targets)
        train_prec = prec_fn(train_preds, train_targets)
        train_rec = rec_fn(train_preds, train_targets)
        train_f1 = f1_fn(train_preds, train_targets)
        train_pr_auc = pr_auc_fn(train_probs, train_targets)

        # === Validation ===
        model.eval()
        val_loss = 0
        val_preds, val_targets, val_probs = [], [], []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} Validation"):
                batch = batch.to(device)
                logits = model(batch.x, batch.edge_index, batch.edge_attr)
                loss = criterion(logits, batch.edge_label.float())

                probs = torch.sigmoid(logits)
                preds = (probs > threshold).long()

                val_loss += loss.item() * logits.size(0)
                val_preds.append(preds)
                val_targets.append(batch.edge_label)
                val_probs.append(probs)

        val_preds = torch.cat(val_preds)
        val_targets = torch.cat(val_targets)
        val_probs = torch.cat(val_probs)
        val_loss /= len(val_targets)

        val_acc = acc_fn(val_preds, val_targets)
        val_prec = prec_fn(val_preds, val_targets)
        val_rec = rec_fn(val_preds, val_targets)
        val_f1 = f1_fn(val_preds, val_targets)
        val_pr_auc = pr_auc_fn(val_probs, val_targets)

        # Print every epoch
        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
        print(f"Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")
        print(f"Train F1: {train_f1:.4f} | Val F1: {val_f1:.4f}")
        print(f"Train PR-AUC: {train_pr_auc:.4f} | Val PR-AUC: {val_pr_auc:.4f}")
        print(f"Train Prec: {train_prec:.4f} | Val Prec: {val_prec:.4f}")
        print(f"Train Rec: {train_rec:.4f} | Val Rec: {val_rec:.4f}")
        print("-" * 80)


⏱️ Execution time: 0.0s


In [22]:
# Run Training
train(model, optimizer, criterion, tr_loader, val_loader, threshold=0.5, epochs=20)

Epoch 1 Training:   0%|          | 0/107 [00:00<?, ?it/s]


ValueError: Target size (torch.Size([8192])) must be the same as input size (torch.Size([312253]))


⏱️ Execution time: 0.11s


# Alternative loss functions

In [None]:
# # Use focal loss to focus on rare positives
# class FocalLoss(torch.nn.Module):
#     def __init__(self, gamma=2.0, alpha=0.25):
#         super().__init__()
#         self.gamma = gamma
#         self.alpha = alpha

#     def forward(self, logits, targets):
#         bce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
#         pt = torch.exp(-bce_loss)  # Probabilities of correct classification
#         focal_loss = self.alpha * (1 - pt) ** self.gamma * bce_loss
#         return focal_loss.mean()

# class HybridLoss(torch.nn.Module):
#     """Hybrid Loss that balances BCE (for accuracy) and Focal Loss (for recall)"""
#     def __init__(self, alpha=0.25, gamma=2.0, focal_weight=0.5):
#         super().__init__()
#         self.bce = torch.nn.BCEWithLogitsLoss()
#         self.alpha = alpha
#         self.gamma = gamma
#         self.focal_weight = focal_weight  # Weighting factor between BCE and Focal Loss

#     def forward(self, logits, targets):
#         # BCE Loss
#         bce_loss = self.bce(logits, targets.float())

#         # Focal Loss
#         probs = torch.sigmoid(logits)
#         bce_loss_per_sample = F.binary_cross_entropy_with_logits(logits, targets.float(), reduction="none")
#         focal_loss = self.alpha * (1 - torch.exp(-bce_loss_per_sample)) ** self.gamma * bce_loss_per_sample
#         focal_loss = focal_loss.mean()

#         # Combine BCE and Focal Loss
#         total_loss = (1 - self.focal_weight) * bce_loss + self.focal_weight * focal_loss
#         return total_loss

# criterion = FocalLoss(gamma=2.0, alpha=0.25)
#criterion = HybridLoss(focal_weight=0.3)  # Adjust weight (0.3-0.6 works well)