In [None]:

!pip uninstall torch torch-scatter torch-sparse torch-geometric torch-cluster torchvision --y
!pip install torch==2.2.0 torchvision==0.17.0 --index-url https://download.pytorch.org/whl/cu118

import torch
TORCH_VERSION = "2.2.0"
CUDA_VERSION = "118"

!pip install torch-scatter -f https://data.pyg.org/whl/torch-{TORCH_VERSION}+cu{CUDA_VERSION}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{TORCH_VERSION}+cu{CUDA_VERSION}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-{TORCH_VERSION}+cu{CUDA_VERSION}.html
!pip install pyg-lib -f https://data.pyg.org/whl/torch-{TORCH_VERSION}+cu{CUDA_VERSION}.html
!pip install torch-geometric==2.5.0 -f https://data.pyg.org/whl/torch-{TORCH_VERSION}+cu{CUDA_VERSION}.html

!pip uninstall numpy --yes
!pip install numpy==1.26.4

In [1]:
import os
!pip install kaggle
os.makedirs('/root/.kaggle', exist_ok=True)



In [2]:
import json

kaggle_credentials = {"username":"qmulberry","key":"KGAT_0b8dba1122eb1899edcec446e08f9011"}
with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump(kaggle_credentials, f)

!chmod 600 /root/.kaggle/kaggle.json

print("Kaggle API key configured successfully!")

Kaggle API key configured successfully!


In [3]:
# Create the dataset directory if it doesn't exist
import os
os.makedirs('dataset', exist_ok=True)

# Download and unzip the dataset into the 'dataset' directory
!kaggle datasets download -d ealtman2019/ibm-transactions-for-anti-money-laundering-aml -p dataset --unzip

print("Dataset downloaded and unzipped successfully into 'dataset' directory!")

Dataset URL: https://www.kaggle.com/datasets/ealtman2019/ibm-transactions-for-anti-money-laundering-aml
License(s): Community Data License Agreement - Sharing - Version 1.0
Downloading ibm-transactions-for-anti-money-laundering-aml.zip to dataset
 99% 7.54G/7.61G [00:18<00:00, 476MB/s]
100% 7.61G/7.61G [00:18<00:00, 439MB/s]
Dataset downloaded and unzipped successfully into 'dataset' directory!


In [4]:
import torch

if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

print(device)

cuda


In [5]:
import pandas as pd
from pathlib import Path

# get data (need git LFS)
node_path = Path("dataset") / "HI-Small_accounts.csv"
edge_path = Path("dataset") / "HI-Small_Trans.csv"
node_data = pd.read_csv(node_path)
edge_data = pd.read_csv(edge_path)

In [6]:
import numpy as np

# use indices from accounts dataset for node indices
accounts = node_data.reset_index()[['Account Number', 'index']]
num_nodes = accounts.shape[0]
compact = {accounts['Account Number'][i]: accounts['index'][i] for i in range(num_nodes)}
to_node = np.vectorize(lambda x: compact[x])

In [7]:
from torch_geometric.data import Data
import torch

# create adjacency list in COO format
source = to_node(edge_data['Account'])
target = to_node(edge_data['Account.1'])
edge_index = torch.from_numpy(np.vstack([source, target])).to(device)

num_edges = edge_index.shape[1]

g = Data(edge_index=edge_index, num_nodes=num_nodes)

In [18]:
from torch_geometric.utils import degree
import torch.nn.functional as F
from sklearn.preprocessing import OneHotEncoder

# create bank frequency column
freq = node_data['Bank ID'].value_counts()
id_freq = np.vectorize(lambda x: freq[x])
node_data['Bank Frequency'] = id_freq(node_data['Bank ID'])
node_data['Bank Frequency'] = pd.cut(node_data['Bank Frequency'], bins=[0, 2, 10, 100, 4000], labels=[0, 1, 2, 3])

# use one hot encoding for categorical variables
bank_enc = OneHotEncoder(sparse_output=False)
bank_frequency = bank_enc.fit_transform(node_data['Bank Frequency'].to_numpy().reshape(-1, 1))

paid_enc = OneHotEncoder(sparse_output=False)
currency_sent = paid_enc.fit_transform(edge_data['Payment Currency'].to_numpy().reshape(-1, 1))

received_enc = OneHotEncoder(sparse_output=False)
currency_received = received_enc.fit_transform(edge_data['Receiving Currency'].to_numpy().reshape(-1, 1))

format_enc = OneHotEncoder(sparse_output=False)
pay_format = format_enc.fit_transform(edge_data['Payment Format'].to_numpy().reshape(-1, 1))

# create numerical variables
time_trans = pd.to_datetime(edge_data['Timestamp']).astype('int64') / 1e9
amount_sent = edge_data['Amount Paid'].to_numpy()
amount_received = edge_data['Amount Received'].to_numpy()

# combine all edge features into one tensor with dtype = float32
edge_features = torch.from_numpy(np.column_stack([time_trans, amount_received,
                                                  currency_received, pay_format])).float().to(device)
edge_dim = edge_features.shape[1]

# edge label with 0 = not laundering, 1 = is laundering
label = torch.from_numpy(edge_data['Is Laundering'].to_numpy()).long().to(device)

g.x = torch.from_numpy(bank_frequency).float().to(device)
g.edge_attr = F.normalize(edge_features)
g.edge_label = label

In [19]:
# chronological 60/20/20 split

train_end = int(0.6 * num_edges)
val_end = int(0.8 * num_edges)

train_idx = torch.zeros(num_edges, dtype=torch.bool)
train_idx[:train_end] = True

val_idx = torch.zeros(num_edges, dtype=torch.bool)
val_idx[train_end:val_end] = True

test_idx = torch.zeros(num_edges, dtype=torch.bool)
test_idx[val_end:] = True

#g.train_mask = train_idx
#g.val_mask = val_idx
#g.test_mask = test_idx

In [31]:
# hyperparameters

epochs = 200

pos_weight = 10

embedding_dim = 64

hidden = 64

learn_rate = 0.005

dropout = 0.4

num_neighbors = [10, 5, 5, 2]

batch_size = 10000

In [32]:
from torch_geometric.nn import GINEConv
import torch.nn as nn

# node embedding model
class GNN(nn.Module):
    def __init__(self, in_channels, hidden_channels, embedding_dim, edge_feat_dim, dropout):
        super().__init__()
        self.in_channels = in_channels
        self.hidden = hidden_channels
        self.dim = embedding_dim
        self.edge_dim = edge_feat_dim
        self.drop = dropout

        # linear embeddings
        self.node_embedding = nn.Linear(self.in_channels, self.dim)
        self.edge_embedding = nn.Linear(self.edge_dim, self.dim)

        # convolution layers for node embedding
        self.conv1 = GINEConv(nn.Sequential(
                    nn.Linear(self.dim, self.dim),
                    nn.ReLU(),
                    nn.Linear(self.dim, self.dim)
                    ), edge_dim=self.dim)
        self.conv2 = GINEConv(nn.Sequential(
                    nn.Linear(self.dim, self.dim),
                    nn.ReLU(),
                    nn.Linear(self.dim, self.dim)
                    ), edge_dim=self.dim)

        # mlp for edge embedding
        self.mlp_edge = nn.Sequential(
            nn.Linear(self.dim*3, self.dim),
            nn.ReLU(),
            nn.Linear(self.dim, self.dim),
        )

        # mlp for edge classifier
        self.mlp_classifier = nn.Sequential(
            nn.Linear(self.dim*3, self.hidden),
            nn.ReLU(),
            nn.Dropout(self.drop),
            nn.Linear(self.hidden, self.hidden // 2),
            nn.ReLU(),
            nn.Dropout(self.drop),
            nn.Linear(self.hidden // 2, 2)
        )

    def forward(self, x, edge_index, edge_attr, edge_label_index=None, edge_label_attr=None):
        # 1. Message Passing to update node embeddings
        x = self.node_embedding(x)
        edge_attr_mp = self.edge_embedding(edge_attr)

        # Layer 1
        u, v = edge_index
        x = 0.5 * (x + F.relu(self.conv1(x, edge_index, edge_attr_mp)))
        edge_attr_mp = edge_attr_mp + 0.5*self.mlp_edge(torch.cat([x[u], x[v], edge_attr_mp], dim=-1))

        # Layer 2
        x = 0.5 * (x + F.relu(self.conv2(x, edge_index, edge_attr_mp)))
        edge_attr_mp = edge_attr_mp + 0.5*self.mlp_edge(torch.cat([x[u], x[v], edge_attr_mp], dim=-1))

        # 2. Prediction
        if edge_label_index is not None:
            # Use specific target edges
            src, dst = edge_label_index
            x_src = x[src]
            x_dst = x[dst]

            # Handle edge features for target edges
            if edge_label_attr is not None:
                # Re-calculate edge embedding updates for the target edges
                # This ensures they benefit from the trained MLPs even if not in MP graph
                h_edge = self.edge_embedding(edge_label_attr)

                # Layer 1 update (simulated)
                h_edge = h_edge + 0.5 * self.mlp_edge(torch.cat([x_src, x_dst, h_edge], dim=-1))
                # Layer 2 update (simulated)
                h_edge = h_edge + 0.5 * self.mlp_edge(torch.cat([x_src, x_dst, h_edge], dim=-1))
            else:
                # Fallback if no attributes provided (should not happen with correct loop)
                h_edge = torch.zeros((src.size(0), self.dim), device=x.device)

            # Combine for classification
            # Original logic: x[edge_pairs].reshape(-1, 2*self.dim).relu()
            x_pair = torch.cat([x_src, x_dst], dim=-1).relu()
            return self.mlp_classifier(torch.cat([x_pair, h_edge], dim=1))

        else:
            # Fallback to computing for all edges in edge_index (old behavior)
            edge_pairs = torch.transpose(edge_index, 0, 1)
            x_pair = x[edge_pairs].reshape(-1, 2*self.dim).relu()
            return self.mlp_classifier(torch.cat([x_pair, edge_attr_mp], dim=1))

In [33]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, precision_recall_curve, accuracy_score, precision_score, recall_score, f1_score

# Helper to get edge attrs
def get_target_edge_attrs(batch, original_idx_mask):
    # batch.input_id gives global indices into the original graph (g)
    # because LinkNeighborLoader was initialized with 'g'.
    if hasattr(batch, 'input_id'):
        return g.edge_attr[batch.input_id]
    return None

# evaluate performance
def evaluate(loader, split_idx):
    gnn.eval()
    all_preds = []
    all_labels = []
    all_probs = []

    with torch.no_grad():
        for batch in loader:
            batch.to(device)
            target_attr = get_target_edge_attrs(batch, split_idx)

            logits = gnn(batch.x, batch.edge_index, batch.edge_attr,
                         edge_label_index=batch.edge_label_index,
                         edge_label_attr=target_attr)

            preds = logits.argmax(dim=-1)

            all_preds.append(preds.cpu())
            all_labels.append(batch.edge_label.cpu())
            all_probs.append(logits[:, 1].cpu())

    y = torch.cat(all_labels)
    preds = torch.cat(all_preds)
    probs = torch.cat(all_probs)

    accuracy = accuracy_score(y, preds)
    precision = precision_score(y, preds, zero_division=0.0)
    recall = recall_score(y, preds, zero_division=0.0)
    f1 = f1_score(y, preds, zero_division=0.0)
    return [accuracy, precision, recall, f1], y, preds, probs

def plots(y, preds, probs):
    cm = confusion_matrix(y, preds)

    # plot confusion matrix for default threshold
    plt.figure(1)
    ConfusionMatrixDisplay(cm).plot()
    plt.title('Confusion Matrix')
    plt.show()
    # plt.savefig('Confusion Matrix')

    # plot loss curve
    plt.figure(2)
    plt.plot(loss_values)
    plt.xticks(range(0,epochs))
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Validation Loss')
    plt.show()
    # plt.savefig('Loss Curve')

    # plot precision-recall curve
    plt.figure(3)
    precision, recall, thresholds = precision_recall_curve(y, probs)
    plt.plot(recall, precision, label=f'Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.show()
    # plt.savefig('Precision-Recall Curve')

In [34]:
from torch_geometric.loader import LinkNeighborLoader

train_loader = LinkNeighborLoader(g, num_neighbors=num_neighbors,
                                  edge_label_index=edge_index[:, train_idx],
                                  edge_label=g.edge_label[train_idx],
                                  batch_size=batch_size,
                                  shuffle=True)

val_loader = LinkNeighborLoader(g, num_neighbors=num_neighbors,
                                edge_label_index=edge_index[:, val_idx],
                                edge_label=g.edge_label[val_idx],
                                batch_size=batch_size,
                                shuffle=True)

test_loader = LinkNeighborLoader(g, num_neighbors=num_neighbors,
                                 edge_label_index=edge_index[:, test_idx],
                                 edge_label=g.edge_label[test_idx],
                                 batch_size=batch_size,
                                 shuffle=True)

In [35]:
import torch.optim as optim

# instantiate model
gnn = GNN(4, hidden, embedding_dim, edge_dim, dropout).to(device)
weight = torch.tensor([1, pos_weight]).float().to(device)
optimizer = optim.Adam(gnn.parameters(), lr=learn_rate)
criterion = nn.CrossEntropyLoss(weight=weight).to(device)

# training loop
loss_values = []
for epoch in range(epochs):
    total_loss = 0
    batches = 0
    for batch in train_loader:
        batch.to(device)
        gnn.train()
        optimizer.zero_grad()

        # Fetch raw edge attributes for the target edges
        target_attr = get_target_edge_attrs(batch, train_idx)

        # Compute logits ONLY for the target edges using edge_label_index
        logits = gnn(batch.x, batch.edge_index, batch.edge_attr,
                     edge_label_index=batch.edge_label_index,
                     edge_label_attr=target_attr)

        loss = criterion(logits, batch.edge_label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        batches += 1

    avg_loss = total_loss / batches
    loss_values.append(avg_loss)

    # Use the updated evaluate function
    val_metrics, _, _, _ = evaluate(val_loader, val_idx)
    print(f'Epoch {epoch+1:02d} | loss {avg_loss:.4f} | accuracy {val_metrics[0]:.4f} | precision {val_metrics[1]:.4f}| recall {val_metrics[2]:.4f}| f1 {val_metrics[3]:.4f}')

# evaluate metrics on test data
test_metrics, y_test, preds_test, probs_test = evaluate(test_loader, test_idx)
print('\n')
print(f'Test Accuracy: {test_metrics[0]:.4f}')
print(f'Test Precision: {test_metrics[1]:.4f}')
print(f'Test Recall: {test_metrics[2]:.4f}')
print(f'Test f1: {test_metrics[3]:.4f}')

plots(y_test, preds_test, probs_test)
torch.save(gnn, 'model.pth')

Epoch 01 | loss 0.0581 | accuracy 0.9990 | precision 0.0000| recall 0.0000| f1 0.0000
Epoch 02 | loss 0.0476 | accuracy 0.9990 | precision 0.0000| recall 0.0000| f1 0.0000
Epoch 03 | loss 0.0460 | accuracy 0.9990 | precision 0.0000| recall 0.0000| f1 0.0000
Epoch 04 | loss 0.0457 | accuracy 0.9989 | precision 0.1806| recall 0.0245| f1 0.0432
Epoch 05 | loss 0.0444 | accuracy 0.9990 | precision 0.5000| recall 0.0066| f1 0.0130
Epoch 06 | loss 0.0440 | accuracy 0.9990 | precision 0.0000| recall 0.0000| f1 0.0000
Epoch 07 | loss 0.0436 | accuracy 0.9990 | precision 0.0000| recall 0.0000| f1 0.0000
Epoch 08 | loss 0.0430 | accuracy 0.9990 | precision 0.0000| recall 0.0000| f1 0.0000
Epoch 09 | loss 0.0428 | accuracy 0.9990 | precision 0.0000| recall 0.0000| f1 0.0000
Epoch 10 | loss 0.0429 | accuracy 0.9990 | precision 0.0000| recall 0.0000| f1 0.0000
Epoch 11 | loss 0.0426 | accuracy 0.9990 | precision 0.0000| recall 0.0000| f1 0.0000
Epoch 12 | loss 0.0419 | accuracy 0.9990 | precision 0

KeyboardInterrupt: 

In [None]:
torch.save(gnn, 'model.pth')