# ANALYSIS OF GRAPH CLUSTERING WITH METIS TO ISOLATE LAUNDERING SUBGRAPHS

In [1]:
# setup
# Project helpers. The star imports stay first so that the explicit imports below
# win on any name clash, exactly as before.
from modules.data_loader import *
from modules.feature_engineering import *
from modules.visualizer import *

# Standard library
import gc

# Third-party
import dgl
import networkx as nx
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, confusion_matrix
from torch.utils.data import DataLoader

In [2]:
# # Run this cell if you don't have the data downloaded yet. Note: this is a few GB of data.
# import kagglehub
# path = kagglehub.dataset_download("ealtman2019/ibm-transactions-for-anti-money-laundering-aml")
# print("Path to dataset files:", path)

## 1. Create our dataframes from raw data

In [2]:
# Name of the dataset variant to load; it is passed to the loader as `dataset_size`.
dataset_name = "HI-Small"

print(f"Loading {dataset_name}...\n")
# `load_transactions` comes from the star-imported project modules
# (presumably modules.data_loader -- confirm).
trans_df = load_transactions(dataset_size=dataset_name)
# The account and pattern tables are not loaded in this notebook; uncomment if needed.
# accounts_df = load_accounts(dataset_size=dataset_name)
# patterns_df = load_patterns(dataset_size=dataset_name)
print(trans_df.head())


Loading HI-Small...


Loading transactions from: /home/linch/.cache/kagglehub/datasets/ealtman2019/ibm-transactions-for-anti-money-laundering-aml/versions/8/HI-Small_Trans.csv
File size: 453.6 MB

Loaded 5,078,345 transactions
Date range: 2022-09-01 00:00:00 to 2022-09-18 16:18:00
Laundering transactions: 5,177 (0.102%)
   transaction_id           timestamp from_bank from_account to_bank  \
0               0 2022-09-01 00:20:00       010    8000EBD30     010   
1               1 2022-09-01 00:20:00     03208    8000F4580     001   
2               2 2022-09-01 00:00:00     03209    8000F4670   03209   
3               3 2022-09-01 00:02:00       012    8000F5030     012   
4               4 2022-09-01 00:06:00       010    8000F5200     010   

  to_account  amount_received receiving_currency  amount_paid  \
0  8000EBD30          3697.34          US Dollar      3697.34   
1  8000F5340             0.01          US Dollar         0.01   
2  8000F4670         14675.57          US Dollar  

In [3]:
# Convert all currencies to USD for normalization.
# The printed column list shows that this step adds no new columns.

trans_df = convert_currency_to_USD(trans_df)
print(trans_df.columns.tolist())

['transaction_id', 'timestamp', 'from_bank', 'from_account', 'to_bank', 'to_account', 'amount_received', 'receiving_currency', 'amount_paid', 'payment_currency', 'payment_format', 'is_laundering']


In [4]:
# Compute sinusoidal temporal encodings and a normalized unix timestamp.
# Per the printed column list, this adds `hour_sin`, `hour_cos` and `time_normalized`.

trans_df = temporal_encoding(trans_df)
print(trans_df.columns.tolist())

['transaction_id', 'timestamp', 'from_bank', 'from_account', 'to_bank', 'to_account', 'amount_received', 'receiving_currency', 'amount_paid', 'payment_currency', 'payment_format', 'is_laundering', 'hour_sin', 'hour_cos', 'time_normalized']


In [5]:
# Give each currency and each payment method a unique integer ID.
# Per the printed column list, this adds `payment_currency_id`,
# `receiving_currency_id` and `payment_format_id`.

trans_df = encode_currency_ids(trans_df)
trans_df = encode_payment_format_ids(trans_df)
print(trans_df.columns.tolist())

['transaction_id', 'timestamp', 'from_bank', 'from_account', 'to_bank', 'to_account', 'amount_received', 'receiving_currency', 'amount_paid', 'payment_currency', 'payment_format', 'is_laundering', 'hour_sin', 'hour_cos', 'time_normalized', 'payment_currency_id', 'receiving_currency_id', 'payment_format_id']


In [6]:
# Give each account a unique integer ID.
# Per the printed column list, this adds `from_account_id` and `to_account_id`.
# The helper also returns two lookup objects; presumably account -> id and
# id -> account mappings, judging by their names -- confirm in the helper.

trans_df, account_to_id, id_to_account = encode_account_ids(trans_df)
print(trans_df.columns.tolist())

['transaction_id', 'timestamp', 'from_bank', 'from_account', 'to_bank', 'to_account', 'amount_received', 'receiving_currency', 'amount_paid', 'payment_currency', 'payment_format', 'is_laundering', 'hour_sin', 'hour_cos', 'time_normalized', 'payment_currency_id', 'receiving_currency_id', 'payment_format_id', 'from_account_id', 'to_account_id']


In [7]:
# Normalize the amount columns using the project helper.
# NOTE(review): this runs BEFORE the temporal train/test split below. If the helper
# fits its statistics on the whole frame, test-period data influences the training
# features -- confirm, and consider fitting on the training split only.
trans_df = normalize_amounts(trans_df)

## 2. Temporal train/test split and account statistics

In [8]:
# Split the data temporally: the first 80% goes to training, the last 20% to testing.

train_df, test_df = temporal_train_test_split(trans_df, train_ratio=0.8)

# The full frame is not used after the split: drop the reference and run a collection
# so its memory can be reclaimed. `gc` is imported with the other imports at the top
# of the notebook. The trailing `;` keeps the collected-object count out of the output.
# Note: because `trans_df` is deleted, re-running this cell requires re-running the
# cells above it first.
del trans_df
gc.collect();


TEMPORAL TRAIN/TEST SPLIT

Train Set:
  Date range: 2022-09-01 00:00:00 to 2022-09-08 16:12:00
  Transactions: 4,062,676
  Laundering: 3,380 (0.083%)

Test Set:
  Date range: 2022-09-08 16:12:00 to 2022-09-18 16:18:00
  Transactions: 1,015,669
  Laundering: 1,797 (0.177%)


0

In [9]:
# Sanity check: list the columns of the training split and show its first row.
print(train_df.columns.tolist())
print(train_df.head(1))

['transaction_id', 'timestamp', 'from_bank', 'from_account', 'to_bank', 'to_account', 'amount_received', 'receiving_currency', 'amount_paid', 'payment_currency', 'payment_format', 'is_laundering', 'hour_sin', 'hour_cos', 'time_normalized', 'payment_currency_id', 'receiving_currency_id', 'payment_format_id', 'from_account_id', 'to_account_id']
   transaction_id  timestamp from_bank from_account to_bank to_account  \
0          316720 2022-09-01      0121    8123FB9B0    0121  8123FB9B0   

   amount_received receiving_currency  amount_paid payment_currency  \
0        -1.529313        Saudi Riyal    -1.529313      Saudi Riyal   

  payment_format  is_laundering  hour_sin  hour_cos  time_normalized  \
0   Reinvestment              0       0.0       1.0         -1.37763   

  payment_currency_id receiving_currency_id payment_format_id  \
0                   8                     8                 5   

   from_account_id  to_account_id  
0           458857         458857  


# Construct Graph

In [10]:
def build_graph_from_df(df):
    """Build a DGL graph with accounts as nodes and transactions as directed edges.

    Args:
        df: DataFrame with one row per transaction. Required columns:
            `from_account_id`, `to_account_id`, `transaction_id`,
            the numeric columns `amount_received`, `amount_paid`, `hour_sin`,
            `hour_cos`, `time_normalized`, and the integer-coded
            `payment_currency_id`, `receiving_currency_id`, `payment_format_id`.
            If an `is_laundering` column is present it is stored as the edge label.

    Returns:
        A `dgl.DGLGraph` whose edge `i` corresponds to row `i` of `df`, with
          - edata `transaction_id` (int64), `numericals` (float32, one column per
            numeric feature), `payment_currency`, `receiving_currency`,
            `payment_format` (int64 ids for embedding lookup) and, when available,
            `label` (int64);
          - ndata `node_feats` (float32, 2 columns: normalized log in-degree and
            normalized log out-degree).
    """

    def _int_column(name):
        # Convert through NumPy instead of handing the Series to torch.tensor:
        # torch would otherwise treat the Series as a generic Python sequence, which
        # is slow for millions of rows and depends on the Series index labels.
        # The explicit int64 also guarantees the dtype nn.Embedding expects.
        return torch.tensor(df[name].to_numpy(dtype=np.int64))

    source_accounts = _int_column('from_account_id')
    dest_accounts = _int_column('to_account_id')

    # Edge features below rely on edge ids following the order of the input pairs.
    graph = dgl.graph((source_accounts, dest_accounts))
    print(f"Number of nodes: {graph.num_nodes()}")
    print(f"Number of edges: {graph.num_edges()}")

    # ----- EDGE FEATS -----

    # Bookkeeping: lets predictions be joined back to the original transactions.
    graph.edata['transaction_id'] = _int_column('transaction_id')

    # Numericals: included directly.
    numericals = ['amount_received', 'amount_paid', 'hour_sin', 'hour_cos', 'time_normalized']
    graph.edata['numericals'] = torch.tensor(df[numericals].to_numpy(), dtype=torch.float32)

    # Categoricals: stored as integer ids and turned into learned embeddings by the model.
    # This makes them transductive, which is acceptable because currencies and payment
    # types don't change often.
    graph.edata['payment_currency'] = _int_column('payment_currency_id')
    graph.edata['receiving_currency'] = _int_column('receiving_currency_id')
    graph.edata['payment_format'] = _int_column('payment_format_id')

    # Ground-truth label (1 = laundering), stored so training/evaluation code can read
    # it from the graph. Skipped when the frame carries no labels.
    if 'is_laundering' in df.columns:
        graph.edata['label'] = _int_column('is_laundering')

    # ----- NODE FEATS -----

    # For each account we compute in- and out-degree, log-transform them, then centre
    # and scale by the graph's mean log-degree so the model is not sensitive to graph
    # size: the training graph holds 80% of the edges, so its raw degree counts are
    # naturally higher than those of the test graph.
    log_indegrees = torch.log(graph.in_degrees().float() + 1)
    log_outdegrees = torch.log(graph.out_degrees().float() + 1)

    # Guard the divisor: the mean is 0 when no node has any edge in that direction,
    # which would otherwise turn every feature into NaN (0 / 0).
    eps = torch.finfo(torch.float32).eps
    avg_log_indegree = log_indegrees.mean()
    avg_log_outdegree = log_outdegrees.mean()

    normalized_indegree = (log_indegrees - avg_log_indegree) / avg_log_indegree.clamp_min(eps)
    normalized_outdegree = (log_outdegrees - avg_log_outdegree) / avg_log_outdegree.clamp_min(eps)

    graph.ndata['node_feats'] = torch.stack([normalized_indegree, normalized_outdegree], dim=1)

    return graph

In [11]:
# Build the DGL graph for the training period; the helper prints its node and edge counts.
training_graph = build_graph_from_df(train_df)

Number of nodes: 515080
Number of edges: 4062676


In [None]:
# # EDGE FEATS

# # bookkeeping
# training_graph.edata['transaction_id'] = torch.tensor(train_df['transaction_id'])

# # numericals: we can include them directly
# numericals = ['amount_received', 'amount_paid', 'hour_sin', 'hour_cos', 'time_normalized']
# edge_numerical = torch.tensor(train_df[numericals].values, dtype=torch.float32)
# training_graph.edata['numericals'] = edge_numerical

# # categoricals: we use learned embeddings. This makes them transductive, but that is ok because currencies and payment types don't change often

# payment_currency = torch.tensor(train_df['payment_currency_id'])
# receiving_currency = torch.tensor(train_df['receiving_currency_id'])
# payment_format = torch.tensor(train_df['payment_format_id'])
# training_graph.edata['payment_currency'] = payment_currency
# training_graph.edata['receiving_currency'] = receiving_currency
# training_graph.edata['payment_format'] = payment_format


In [None]:
# # NODE FEATS

# # For each account, we compute their in and outdegrees
# # We zero-center and log transform this value and normalize by the graph's average transformed in and outdegrees so that our model is not sensitive to the graph size. Our training graph has 80% of the edges, so naturally the raw degree counts will be higher than the testing graph.
# indegrees = training_graph.in_degrees().float()
# outdegrees = training_graph.out_degrees().float()

# log_indegrees = torch.log(indegrees + 1)
# log_outdegrees = torch.log(outdegrees + 1)

# avg_log_indegree = log_indegrees.mean()
# avg_log_outdegree = log_outdegrees.mean()

# # zc and normalize
# normalized_indegree = (log_indegrees - avg_log_indegree) / avg_log_indegree
# normalized_outdegree = (log_outdegrees - avg_log_outdegree) / avg_log_outdegree

# node_features = torch.stack([normalized_indegree, normalized_outdegree], dim=1)
# training_graph.ndata['node_feats'] = node_features

In [None]:
# Check graph sanity

# print("\n===== Node Features (normalized degrees) =====")
# print("\n----- Node Degrees (first 10 nodes) -----")
# indegrees = training_graph.in_degrees().float()
# outdegrees = training_graph.out_degrees().float()

# for i in range(5):
#     norm_in = training_graph.ndata['node_feats'][i, 0].item()
#     norm_out = training_graph.ndata['node_feats'][i, 1].item()
#     print(f"Node {i}:")
#     print(f"  In-degree:  {indegrees[i].item():.0f}  → normalized: {norm_in:+.4f}")
#     print(f"  Out-degree: {outdegrees[i].item():.0f} → normalized: {norm_out:+.4f}")

# print("\n===== Edge Features =====")
# print(f"Numerical features shape: {training_graph.edata['numericals'].shape}")
# print(f"Categorical feature 1 shape: {training_graph.edata['payment_currency'].shape}")
# print(f"Categorical feature 2 shape: {training_graph.edata['receiving_currency'].shape}")
# print(f"Categorical feature 3 shape: {training_graph.edata['payment_format'].shape}")

# print("\n===== Sample Edge Features =====")
# for i in range(5):
#     src, dst = training_graph.edges()
#     print(f"Edge {i}: {src[i].item()} -> {dst[i].item()}")
#     print(f"  Numerical: {training_graph.edata['numericals'][i].tolist()}")
#     print(f"  payment currency: {training_graph.edata['payment_currency'][i].item()}, receiving currency: {training_graph.edata['receiving_currency'][i].item()} , payment format: {training_graph.edata['payment_format'][i].item()}")


===== Node Features (normalized degrees) =====

----- Node Degrees (first 10 nodes) -----
Node 0:
  In-degree:  545  → normalized: +3.1889
  Out-degree: 132597 → normalized: +7.6085
Node 1:
  In-degree:  328  → normalized: +2.8523
  Out-degree: 80883 → normalized: +7.2477
Node 2:
  In-degree:  55  → normalized: +1.6754
  Out-degree: 14711 → normalized: +6.0038
Node 3:
  In-degree:  53  → normalized: +1.6512
  Out-degree: 10810 → normalized: +5.7789
Node 4:
  In-degree:  61  → normalized: +1.7430
  Out-degree: 13641 → normalized: +5.9487

===== Edge Features =====
Numerical features shape: torch.Size([4062676, 5])
Categorical feature 1 shape: torch.Size([4062676])
Categorical feature 2 shape: torch.Size([4062676])
Categorical feature 3 shape: torch.Size([4062676])

===== Sample Edge Features =====
Edge 0: 458857 -> 458857
  Numerical: [-1.5293132066726685, -1.529313087463379, 0.0, 1.0, -1.377630352973938]
  payment currency: 8, receiving currency: 8 , payment format: 5
Edge 1: 236631 -

# Define model architecture

In [None]:
# since we have both categorical and numerical edge features, we need a class to process and combine them

class EdgeEmbedding(nn.Module):
    """Turn the raw attributes of each edge into a single dense feature vector.

    Categorical attributes (the two currencies and the payment method) are looked up
    in learned embedding tables; the numerical attributes are passed through as-is.
    Both currency columns share one embedding table.

    Attributes:
        currencies_embed: embedding table shared by paying and receiving currency.
        payment_embed: embedding table for the payment method.
        out_dim: width of the vector returned by `forward`.
    """

    def __init__(
            self,
            num_currencies=15,
            num_payment_methods=7,
            currencies_embed_dim=8,
            payment_embed_dim=4,
            num_numericals=5):
        super().__init__()
        self.currencies_embed = nn.Embedding(num_currencies, currencies_embed_dim)
        self.payment_embed = nn.Embedding(num_payment_methods, payment_embed_dim)

        # Two currency vectors + one payment-method vector + the raw numericals.
        self.out_dim = 2 * currencies_embed_dim + payment_embed_dim + num_numericals

    def forward(self, payment_curr, receiving_curr, payment_method, numericals):
        """Return the per-edge features, concatenated along dim 1.

        Column order: paying-currency embedding, receiving-currency embedding,
        payment-method embedding, numericals.
        """
        pieces = (
            self.currencies_embed(payment_curr),
            self.currencies_embed(receiving_curr),
            self.payment_embed(payment_method),
            numericals,
        )
        return torch.cat(pieces, dim=1)

# this returns our GNN-ready edge features

In [None]:
# message passing layers

class GCNLayer(nn.Module):
    """One graph-convolution layer with symmetric degree normalization.

    For every destination node v the layer computes

        h_v = ReLU( in_deg(v)^-1/2 * sum_{u -> v} out_deg(u)^-1/2 * W h_u )

    and optionally adds a residual connection from the layer input.

    Args:
        input_dim: size of the incoming node features.
        output_dim: size of the produced node features.
        residual: if True, add the layer input to the output. When the two sizes
            differ the input is first projected with a linear layer.
    """

    def __init__(self, input_dim, output_dim, residual=True):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)
        self.residual = residual

        if residual and input_dim != output_dim:
            self.res_linear = nn.Linear(input_dim, output_dim)  # project residual if dims differ
        elif residual:
            self.res_linear = nn.Identity()

    def message_func(self, edges):
        """Message sent along each edge: the transformed source feature scaled by the source norm."""
        return {'m': edges.src['Wh'] * edges.src['norm']}

    def reduce_func(self, nodes):
        """Sum all incoming messages of each node."""
        return {'h': torch.sum(nodes.mailbox['m'], dim=1)}

    def forward(self, graph, node_feats):
        """Run one round of message passing.

        Args:
            graph: DGL graph to propagate over.
            node_feats: tensor with one row per node and `input_dim` columns.

        Returns:
            Tensor with one row per node and `output_dim` columns.
        """
        # local_scope keeps the temporary 'Wh' / 'norm' / 'h' fields off the caller's graph.
        with graph.local_scope():
            h_in = node_feats

            graph.ndata['Wh'] = self.linear(node_feats)  # W is the linear transform, h the node feats

            # The graph is directed, so the two sides need different degrees: a sender
            # is scaled by how many edges it sends (out-degree), a receiver by how many
            # it gets (in-degree). Using the in-degree on the sender side would leave
            # hub senders with thousands of outgoing edges almost un-normalized.
            # Degrees are clamped to 1 so isolated nodes do not produce inf.
            src_norm = torch.pow(graph.out_degrees().float().clamp(min=1), -0.5).unsqueeze(1)
            dst_norm = torch.pow(graph.in_degrees().float().clamp(min=1), -0.5).unsqueeze(1)
            graph.ndata['norm'] = src_norm  # read by message_func via edges.src

            # NOTE(review): these are Python UDFs; DGL's built-in message/reduce
            # functions would likely be faster on a graph of this size -- consider switching.
            graph.update_all(self.message_func, self.reduce_func)

            h_out = graph.ndata['h'] * dst_norm  # then normalize by destination degree

            h_out = F.relu(h_out)
            if self.residual:
                return h_out + self.res_linear(h_in)
            return h_out



In [None]:
# Full classifier neural network

class GCNEdgeClassifier(nn.Module):
    """Edge classifier built from a GCN node encoder, an edge-feature embedder and an MLP head.

    Node embeddings are produced by a stack of `GCNLayer`s. For every edge, the
    embeddings of its two endpoints are concatenated with the edge's own feature
    vector (from `EdgeEmbedding`) and fed to an MLP that outputs class logits.

    Args:
        node_in_feats: number of input node features (log-transformed, normalized
            in-degree and out-degree by default).
        hidden_dim: width of the node embeddings and of the first MLP layer.
        num_gcn_layers: number of message-passing layers.
        num_currencies, num_payment_methods, currencies_embed_dim,
        payment_embed_dim, num_numericals: forwarded to `EdgeEmbedding`.
        num_classes: number of output classes.
        dropout: dropout probability used between GCN layers and inside the MLP.
        use_batch_norm: whether to apply batch normalization after each GCN layer
            and after each hidden MLP layer.
    """

    def __init__(self,
                node_in_feats=2,
                hidden_dim=64,
                num_gcn_layers=3,
                num_currencies=15,
                num_payment_methods=7,
                currencies_embed_dim=8,
                payment_embed_dim=4,
                num_numericals=5,
                num_classes=2,
                dropout=0.2,
                use_batch_norm=False):
        super().__init__()

        self.num_gcn_layers = num_gcn_layers
        self.hidden_dim = hidden_dim
        self.dropout = dropout
        self.use_batch_norm = use_batch_norm

        # --- node encoder ---
        # The first layer maps the raw node features to hidden_dim and has no residual;
        # every following layer keeps the width and adds a residual connection.
        input_layer = GCNLayer(node_in_feats, hidden_dim, residual=False)
        hidden_layers = [
            GCNLayer(hidden_dim, hidden_dim, residual=True)
            for _ in range(num_gcn_layers - 1)
        ]
        self.gcn_layers = nn.ModuleList([input_layer, *hidden_layers])

        if use_batch_norm:
            # One normalization module per GCN layer.
            self.batch_norms = nn.ModuleList(
                [nn.BatchNorm1d(hidden_dim) for _ in range(num_gcn_layers)]
            )

        # --- edge feature processor ---
        self.edge_embed = EdgeEmbedding(
            num_currencies=num_currencies,
            num_payment_methods=num_payment_methods,
            currencies_embed_dim=currencies_embed_dim,
            payment_embed_dim=payment_embed_dim,
            num_numericals=num_numericals,
        )

        # --- classification head ---
        # An edge is represented by [source embedding | destination embedding | edge features].
        edge_repr_dim = self.edge_embed.out_dim + 2 * hidden_dim
        half_dim = hidden_dim // 2

        def maybe_norm(width):
            # Batch norm when enabled, otherwise a pass-through that keeps layer indices stable.
            if use_batch_norm:
                return nn.BatchNorm1d(width)
            return nn.Identity()

        self.edge_classifier = nn.Sequential(
            nn.Linear(edge_repr_dim, hidden_dim),
            maybe_norm(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(hidden_dim, half_dim),
            maybe_norm(half_dim),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(half_dim, num_classes)
        )

    def forward(self, graph, node_features, payment_curr, receiving_curr, payment_method, edge_numericals):
        """Return one row of class logits per edge of `graph`.

        Args:
            graph: DGL graph to run message passing on.
            node_features: input features, one row per node.
            payment_curr, receiving_curr, payment_method: integer ids, one per edge.
            edge_numericals: numerical edge features, one row per edge.
        """
        # 1) Message passing to obtain node embeddings.
        final_idx = self.num_gcn_layers - 1
        embeddings = node_features
        for layer_idx, layer in enumerate(self.gcn_layers):
            embeddings = layer(graph, embeddings)
            if self.use_batch_norm:
                embeddings = self.batch_norms[layer_idx](embeddings)
            # Dropout sits between layers only; the last layer's output is left untouched.
            if layer_idx < final_idx:
                embeddings = F.dropout(embeddings, p=self.dropout, training=self.training)

        # 2) Gather the endpoint embeddings of every edge.
        src_ids, dst_ids = graph.edges()

        # 3) Embed the edge attributes and assemble the full edge representation.
        edge_repr = torch.cat(
            [
                embeddings[src_ids],
                embeddings[dst_ids],
                self.edge_embed(payment_curr, receiving_curr, payment_method, edge_numericals),
            ],
            dim=1,
        )

        # 4) Classify.
        return self.edge_classifier(edge_repr)

In [None]:
# Define focal CE loss since we have extreme class imbalance

def compute_class_weights(labels, num_classes=None):
    """Compute inverse-frequency ("balanced") class weights.

    The weight of class c is `total / (num_classes * count_c)`, so rare classes get
    large weights. The result is meant to be passed to `FocalLoss` as `alpha`.

    Args:
        labels: integer class labels (any shape; flattened internally).
        num_classes: optional minimum number of classes. Pass it when the label
            sample might not contain every class (e.g. a batch without positives), so
            the returned vector still has one entry per class. When omitted, the
            number of classes is inferred as `max(labels) + 1`.

    Returns:
        1-D float tensor with one weight per class. Classes that do not occur in
        `labels` get weight 0 instead of the `inf` a plain division would give.
    """
    labels = torch.as_tensor(labels).reshape(-1).long()
    class_counts = torch.bincount(labels, minlength=num_classes or 0).float()
    total = labels.numel()
    n_classes = class_counts.numel()

    weights = torch.zeros_like(class_counts)
    present = class_counts > 0  # only divide where the count is non-zero
    weights[present] = total / (n_classes * class_counts[present])
    return weights  # to pass into focal loss as alpha

class FocalLoss(nn.Module):
    """Multi-class focal loss: `alpha_t * (1 - p_t) ** gamma * CE`.

    Down-weights examples the model already classifies confidently so training
    focuses on the hard (here: rare, positive) ones.

    Args:
        alpha: optional per-class weights, a 1-D tensor or sequence with one entry
            per class (e.g. the output of `compute_class_weights`). None disables
            class weighting.
        gamma: focusing parameter; 0 reduces the loss to (weighted) cross-entropy.
        reduction: 'mean' (default) or 'sum'; any other value returns the
            unreduced per-example loss.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        if alpha is not None:
            alpha = torch.as_tensor(alpha)
        # Registered as a buffer so `.to(device)` / `.cuda()` on the loss moves the
        # weights too. Non-persistent so the module's state_dict stays unchanged.
        self.register_buffer('alpha', alpha, persistent=False)
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, targets):
        """Compute the loss.

        Args:
            logits: tensor with one row of unnormalized class scores per example.
            targets: integer class index per example.
        """
        ce_loss = F.cross_entropy(logits, targets, reduction='none')
        # CE = -log(p_t), so the probability of the true class is exp(-CE). This avoids
        # a separate softmax + one-hot pass.
        p_t = torch.exp(-ce_loss)
        focal_weight = (1 - p_t) ** self.gamma  # easy examples: p_t near 1, weight near 0

        focal_loss = focal_weight * ce_loss

        if self.alpha is not None:
            # Follow the logits' device/dtype even if the loss module itself was never moved.
            alpha = self.alpha.to(device=logits.device, dtype=logits.dtype)
            focal_loss = alpha[targets] * focal_loss  # [N]

        if self.reduction == 'mean':
            return focal_loss.mean()  # default
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

In [None]:
def evaluate(model, g, split_mask=None, labels=None):
    """Evaluate an edge classifier on a graph.

    The graph is expected to carry the fields written by `build_graph_from_df`:
    `ndata['node_feats']` and `edata['payment_currency']`,
    `edata['receiving_currency']`, `edata['payment_format']`, `edata['numericals']`.

    Args:
        model: edge classifier whose forward takes
            (graph, node_features, payment_curr, receiving_curr, payment_method, edge_numericals)
            and returns one row of class logits per edge.
        g: DGL graph to evaluate on.
        split_mask: Boolean mask for edges to evaluate (None = all edges).
        labels: optional ground-truth class per edge (for ALL edges; the mask is
            applied afterwards). Defaults to `g.edata['label']`.

    Returns:
        (metrics, preds, probs) where `metrics` is a dict with `accuracy`, per-class
        `precision` / `recall` / `f1` / `support` arrays (always one entry per class),
        `auc` and `confusion_matrix`; `preds` are the predicted classes and `probs`
        the predicted probability of class 1.

    Note:
        The model is switched to eval mode and left there; call `model.train()`
        before resuming training.
    """
    model.eval()

    with torch.no_grad():
        logits = model(
            g,
            g.ndata['node_feats'],
            g.edata['payment_currency'],
            g.edata['receiving_currency'],
            g.edata['payment_format'],
            g.edata['numericals'],
        )

        if labels is None:
            labels = g.edata['label']
        labels = torch.as_tensor(labels).to(logits.device)

        if split_mask is not None:
            logits = logits[split_mask]
            labels = labels[split_mask]

        preds = logits.argmax(dim=1)
        probs = torch.softmax(logits, dim=1)[:, 1]  # Probability of positive class

        # Metrics
        accuracy = (preds == labels).float().mean().item()

        y_true = labels.cpu().numpy()
        y_pred = preds.cpu().numpy()
        # Pin the class list so per-class arrays and the confusion matrix keep a fixed
        # shape even when a class is missing from this split.
        class_ids = list(range(logits.shape[1]))

        # Per-class metrics
        precision, recall, f1, support = precision_recall_fscore_support(
            y_true,
            y_pred,
            labels=class_ids,
            average=None,
            zero_division=0
        )

        # AUC is undefined when only one class is present (sklearn raises ValueError).
        # Report NaN rather than 0.0, which would read as "perfectly wrong".
        try:
            auc = roc_auc_score(y_true, probs.cpu().numpy())
        except ValueError:
            auc = float('nan')

        # Confusion matrix
        cm = confusion_matrix(y_true, y_pred, labels=class_ids)

        metrics = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'support': support,
            'auc': auc,
            'confusion_matrix': cm
        }

        return metrics, preds, probs

# Training

In [2]:
# Setup model and loss
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): only the device is selected here; the model and the loss are not
# instantiated in this cell yet.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")


Using device: cuda


In [None]:
# Training loop

