# Setup


In [None]:
import os
os.environ["DGLBACKEND"] = "pytorch"


##--Modelling--##
from sklearn.preprocessing import LabelEncoder
import dgl
import dgl.data

import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.dataloading import NeighborSampler, DataLoader

import pandas as pd
import numpy as np


# 0️⃣ Utilities Functions

In [183]:
def label_encoder(df:pd.DataFrame, column_1:str, column_2:str, label_col_name:str):
    '''Generate the labels for the specify column for the dataframe'''
    a = set(df[column_1].unique())
    b = set(df[column_2].unique())
    label_table = a.union(b)
    label_table = pd.DataFrame(label_table,columns=[label_col_name])
    label_encoder = LabelEncoder()
    label_table['label'] = label_encoder.fit_transform(label_table[label_col_name])
    _mapping = dict(zip(label_table[label_col_name], label_table['label']))
    df[column_1] = df[column_1].map(_mapping)
    df[column_2] = df[column_2].map(_mapping)
    return df, label_table

def get_node_label(df:pd.DataFrame):
    '''Return label 1 if both the payer and receiver account are involved in a laundering transaction'''
    laundering_records = df[df['is_laundering']==1]
    from_accounts = set(laundering_records['from_account'])
    to_accounts = set(laundering_records['to_account'])
    laundering_accounts = pd.DataFrame()
    laundering_accounts['account'] = list(from_accounts | to_accounts) 
    laundering_accounts['is_laundering'] = 1
    return laundering_accounts
    


# 1️⃣ Graph Construction

The graph will use all unique account numbers (Both the paying end and receving end) as node, with the transaction between the account as the edges of the graph.

The following pre-processing steps has been prior the construction of the graph:
1. Create account IDs for each account by prefix each account number with the bank code
2. Create labels fro account (now encode with bank code), currency and payment format

In [184]:
trans_df = pd.read_parquet("../00_data/trans_df.parquet")
DTYPE = np.float32
print("Raw Sample Data:")
display(trans_df.head(1))
tmp = trans_df['is_laundering'].value_counts()
print(f"Size of the Dataset: {trans_df.shape}")
print(f"Number of non-laundering samples: {tmp[0]}( {round(tmp[0]/tmp.sum()*100, 3)}% ) \nNumber of laundering samples: {tmp[1]} ( {round(tmp[1]/tmp.sum()*100, 3)}% )")


##Create unique ID account prefix with bank
trans_df['from_account'] = trans_df['from_bank'] + "_" + trans_df['from_account']
trans_df['to_account'] = trans_df['to_bank'] + "_" + trans_df['to_account']

##--Label encoding for account number--##
trans_df, account_label_table = label_encoder(trans_df, "from_account", "to_account", 'account')
trans_df, currency_label_table = label_encoder(trans_df, "payment_currency", "receiving_currency", 'currency')

##--Label encoding for payment_format--##
payment_format_label_encoder = LabelEncoder()
trans_df['payment_format'] = payment_format_label_encoder.fit_transform( trans_df['payment_format'] )
print("/nSample  Data after Transformation")

##--PreProcesss Timestamp with min max normalization--##
trans_df['timestamp'] = pd.to_datetime(trans_df['timestamp'])
trans_df['timestamp'] = trans_df['timestamp'].apply(lambda x: x.value)
trans_df['timestamp'] = (trans_df['timestamp']-trans_df['timestamp'].min())/(trans_df['timestamp'].max()-trans_df['timestamp'].min())


display(trans_df.head())


Raw Sample Data:


Unnamed: 0,transaction_id,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering
0,0,2022-09-01 00:20:00,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0


Size of the Dataset: (5078345, 12)
Number of non-laundering samples: 5073168( 99.898% ) 
Number of laundering samples: 5177 ( 0.102% )
/nSample  Data after Transformation


Unnamed: 0,transaction_id,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering
0,0,0.000786,10,132152,10,132152,3697.34,12,3697.34,12,5,0
1,1,0.000786,3208,417156,1,39897,0.01,12,0.01,12,3,0
2,2,0.0,3209,417458,3209,417458,14675.57,12,14675.57,12,5,0
3,3,7.9e-05,12,192342,12,192342,2806.97,12,2806.97,12,5,0
4,4,0.000236,10,132160,10,132160,36682.97,12,36682.97,12,5,0


## 1.1 Node Features



In [185]:
account_amt_paid_agg_df  = pd.pivot_table(trans_df, 
               index='from_account',
               columns = 'payment_currency',
               aggfunc={
                   'amount_paid':np.mean
               }
               ).reset_index()
account_amt_paid_agg_df.columns = ['account'] + ["_".join(map(str, col)).strip("_") for col in account_amt_paid_agg_df.columns.to_flat_index()[1:]]
account_amt_paid_agg_df.fillna(0, inplace=True)

account_amt_received_agg_df  = pd.pivot_table(trans_df, 
               index='to_account',
               columns = 'receiving_currency',
               aggfunc={
                   'amount_received':np.mean
               }
               ).reset_index()
account_amt_received_agg_df.columns = ['account'] + ["_".join(map(str, col)).strip("_") for col in account_amt_received_agg_df.columns.to_flat_index()[1:]]
account_amt_received_agg_df.fillna(0, inplace=True)

node_features = pd.merge(account_amt_paid_agg_df,account_amt_received_agg_df, on='account', how="outer")
node_features.fillna(0, inplace=True)
node_labels = get_node_label(trans_df)
node_label_df = pd.merge(account_label_table, node_labels.rename({"account":"label"}, axis=1), on='label', how="left")
node_label_df.fillna(0, inplace=True)
node_label_df.sort_values(by='label', inplace=True)

node_features.head()


  account_amt_paid_agg_df  = pd.pivot_table(trans_df,
  account_amt_paid_agg_df  = pd.pivot_table(trans_df,
  account_amt_received_agg_df  = pd.pivot_table(trans_df,
  account_amt_received_agg_df  = pd.pivot_table(trans_df,


Unnamed: 0,account,amount_paid_0,amount_paid_1,amount_paid_3,amount_paid_4,amount_paid_5,amount_paid_6,amount_paid_7,amount_paid_11,amount_paid_12,...,amount_received_6,amount_received_7,amount_received_11,amount_received_12,amount_received_13,amount_received_14,amount_received_2,amount_received_8,amount_received_9,amount_received_10
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,146954.3,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1590448.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,79150.11,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3038217.0,0.0,0.0,0.0,0.0,0.0


## 1.2 Edge Features

In [186]:
edge_index = torch.stack([torch.from_numpy(trans_df['from_account'].values), torch.from_numpy(trans_df['to_account'].values)], dim=0)
print(edge_index)


edge_features = trans_df[['timestamp','amount_received','receiving_currency','amount_paid','payment_currency','payment_format']]
edge_features.head()

tensor([[132152, 417156, 417458,  ..., 237173, 363030, 237441],
        [132152,  39897, 417458,  ..., 363031, 363031, 363031]])


Unnamed: 0,timestamp,amount_received,receiving_currency,amount_paid,payment_currency,payment_format
0,0.000786,3697.34,12,3697.34,12,5
1,0.000786,0.01,12,0.01,12,3
2,0.0,14675.57,12,14675.57,12,5
3,7.9e-05,2806.97,12,2806.97,12,5
4,0.000236,36682.97,12,36682.97,12,5


## 1.3 Graph Representation

In [187]:
g = dgl.graph(
    (edge_index[0], # From Accounts
    edge_index[1]), # To Accounts
    num_nodes=account_label_table.shape[0]
)

print("Num of nodes: ", g.num_nodes())
print("Num of edges: ",g.num_edges())


##--Assign Node Features & Label--##
g.ndata["X"] = torch.from_numpy(node_features.to_numpy(dtype='float32'))
g.ndata['y'] = torch.from_numpy(
    node_label_df['is_laundering'].to_numpy() ).to(torch.int64)       

##--Assign Edge Features--##
g.edata['a'] = torch.from_numpy(edge_features.to_numpy(dtype='float32'))

##--Save Graph--##
dgl.save_graphs("trasaction_graph.dgl", g)

##--Load Graphs--##
(g,), _ = dgl.load_graphs("trasaction_graph.dgl")


Num of nodes:  515088
Num of edges:  5078345


# 2️⃣ Node Classification

## 2.1 Train/Test Split

In [188]:
y_np = g.ndata['y'].cpu().numpy()
idx = np.arange(g.num_nodes())

# --- stratified split (needs sklearn). If you don't want sklearn, use random split instead.
from sklearn.model_selection import train_test_split
idx_tr, idx_te = train_test_split(idx, test_size=0.15, stratify=y_np, random_state=42)
idx_tr, idx_va = train_test_split(idx_tr, test_size=0.1765, stratify=y_np[idx_tr], random_state=42)  # ~15% val

train_mask = torch.zeros(g.num_nodes(), dtype=torch.bool); train_mask[idx_tr] = True
val_mask   = torch.zeros(g.num_nodes(), dtype=torch.bool); val_mask[idx_va] = True
test_mask  = torch.zeros(g.num_nodes(), dtype=torch.bool); test_mask[idx_te] = True

g.ndata['train_mask'] = train_mask
g.ndata['val_mask']   = val_mask
g.ndata['test_mask']  = test_mask


In [189]:
sampler = NeighborSampler([15, 10])  # 2-layer sampling

train_nids = torch.nonzero(g.ndata['train_mask']).squeeze(1)
val_nids   = torch.nonzero(g.ndata['val_mask']).squeeze(1)
test_nids  = torch.nonzero(g.ndata['test_mask']).squeeze(1)

train_loader = DataLoader(
    g, train_nids, sampler,
    batch_size=2048, shuffle=True, drop_last=False, num_workers=0
)
val_loader = DataLoader(
    g, val_nids, sampler,
    batch_size=4096, shuffle=False, drop_last=False, num_workers=0
)
test_loader = DataLoader(
    g, test_nids, sampler,
    batch_size=4096, shuffle=False, drop_last=False, num_workers=0
)

## 2.2 Modelling

In [190]:
import torch.nn as nn
import torch.nn.functional as F
import dgl.nn as dglnn

class SAGE(nn.Module):
    def __init__(self, in_feats, hidden, num_classes, dropout=0.2):
        super().__init__()
        self.conv1 = dglnn.SAGEConv(in_feats, hidden, 'mean')
        self.conv2 = dglnn.SAGEConv(hidden, num_classes, 'mean')
        self.dropout = nn.Dropout(dropout)

    def forward(self, blocks, x):
        h = self.conv1(blocks[0], x)
        h = F.relu(h)
        h = self.dropout(h)
        h = self.conv2(blocks[1], h)
        return h  # logits for output nodes of the last block


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
in_feats = g.ndata['X'].shape[1]
num_classes = int(g.ndata['y'].max().item() + 1)

model = SAGE(in_feats, hidden=64, num_classes=num_classes).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)


y = g.ndata['y']
counts = torch.bincount(y, minlength=2).float()
weights = (counts.sum() / counts.clamp(min=1))
weights = (weights / weights.sum())

weights = torch.tensor([0.3, 0.7])  # Define own weights to handle classs imbalance
loss_fn = nn.CrossEntropyLoss(weight=weights)


def run_epoch(loader, train=True):
    model.train() if train else model.eval()
    total_loss, total_correct, total_count = 0.0, 0, 0
    for input_nodes, output_nodes, blocks in loader:
        blocks = [b.to(device) for b in blocks]
        x = blocks[0].srcdata['X'].to(device)
        y = g.ndata['y'][output_nodes].to(device)

        logits = model(blocks, x)
        loss = loss_fn(logits, y)

        if train:
            opt.zero_grad()
            loss.backward()
            opt.step()

        total_loss += loss.item() * y.shape[0]
        pred = logits.argmax(1)
        total_correct += (pred == y).sum().item()
        total_count += y.shape[0]
    return total_loss / max(1,total_count), total_correct / max(1,total_count)

for epoch in range(1, 51):
    tr_loss, tr_acc = run_epoch(train_loader, True)
    va_loss, va_acc = run_epoch(val_loader, False)
    print(f"Epoch {epoch:02d} | train {tr_loss:.4f}/{tr_acc:.4f} | val {va_loss:.4f}/{va_acc:.4f}")

te_loss, te_acc = run_epoch(test_loader, False)
print(f"Test  {te_loss:.4f}/{te_acc:.4f}")




Epoch 01 | train 1772089.8842/0.9442 | val 604105.7134/0.9861
Epoch 02 | train 1034269.1757/0.9591 | val 512874.0233/0.9863
Epoch 03 | train 1085615.1183/0.9570 | val 650353.5461/0.9851
Epoch 04 | train 737672.0914/0.9567 | val 478497.9124/0.9843
Epoch 05 | train 816832.8606/0.9562 | val 415804.6212/0.9860
Epoch 06 | train 702357.1430/0.9558 | val 351962.9201/0.9837
Epoch 07 | train 470553.7070/0.9563 | val 479339.6071/0.9833
Epoch 08 | train 568403.6127/0.9581 | val 413587.4052/0.9837
Epoch 09 | train 512042.3323/0.9559 | val 443254.1584/0.9849
Epoch 10 | train 368580.2256/0.9556 | val 386412.5908/0.9832
Epoch 11 | train 405352.7556/0.9586 | val 418065.4694/0.9839
Epoch 12 | train 357704.8465/0.9515 | val 466965.7150/0.9838
Epoch 13 | train 365804.5013/0.9547 | val 453898.4394/0.9841
Epoch 14 | train 216655.3312/0.9538 | val 389599.5178/0.9809
Epoch 15 | train 503452.9789/0.9578 | val 316942.8899/0.9726
Epoch 16 | train 476095.2961/0.9320 | val 405720.2159/0.9548
Epoch 17 | train 3198