In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import torch
import dgl
# import dgl.graphbolt as gb
from dataclasses import dataclass
from functools import partial
# from tqdm import tqdm
from IPython.display import display
from typing import Dict, Tuple, Optional, List
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from eda_src.data_loader import load_transactions, load_accounts, load_patterns

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Which IBM AML dataset variant to load.
dataset_name = "HI-Small"

print(f"Loading {dataset_name}...\n")
trans_df = load_transactions(dataset_size=dataset_name)
accounts_df = load_accounts(dataset_size=dataset_name)
patterns_df = load_patterns(dataset_size=dataset_name)

# Peek at the first row of each frame to confirm the column layout.
display(
    trans_df.head(1),
    accounts_df.head(1),
    patterns_df.head(1),
)

Loading HI-Small...


Loading transactions from: /home/bernard/.cache/kagglehub/datasets/ealtman2019/ibm-transactions-for-anti-money-laundering-aml/versions/8/HI-Small_Trans.csv
File size: 453.6 MB

Loaded 5,078,345 transactions
Date range: 2022-09-01 00:00:00 to 2022-09-18 16:18:00
Laundering transactions: 5,177 (0.102%)

Loading accounts from: /home/bernard/.cache/kagglehub/datasets/ealtman2019/ibm-transactions-for-anti-money-laundering-aml/versions/8/HI-Small_accounts.csv

Loaded 518,581 accounts from 30470 banks

Loading patterns from: /home/bernard/.cache/kagglehub/datasets/ealtman2019/ibm-transactions-for-anti-money-laundering-aml/versions/8/HI-Small_Patterns.txt

Loaded patterns


Unnamed: 0,transaction_id,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering
0,0,2022-09-01 00:20:00,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0


Unnamed: 0,bank_name,bank_id,account_id,entity_id,entity_name
0,Portugal Bank #4507,331579,80B779D80,8.0062e+244,Sole Proprietorship #50438


Unnamed: 0,pattern_id,pattern_type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering
0,1,FAN-OUT,2022-09-01 00:06:00,21174,800737690,12,80011F990,2848.96,Euro,2848.96,Euro,ACH,1


In [4]:
# Additional transformations - Transaction-level
trans_df['timestamp'] = trans_df['timestamp'] - trans_df['timestamp'].min()
# trans_df['from_account'] =  trans_df['from_account'].apply(lambda x: int(x,16))
# trans_df['to_account'] =  trans_df['to_account'].apply(lambda x: int(x,16))
trans_df['from_bank_account_id'] = trans_df['from_bank'] + '_' + trans_df['from_account']
trans_df['to_bank_account_id'] = trans_df['to_bank'] + '_' + trans_df['to_account']

# Additional transformations - Account-level
accounts_df['bank_account_id'] = accounts_df['bank_id'] + '_' + accounts_df['account_id']
accounts_df[['entity_type', 'entity_number']] = accounts_df['entity_name'].str.split(' #', expand=True)
accounts_df = accounts_df[['bank_account_id','entity_type']].drop_duplicates()

In [11]:
# Scratch check: one-hot encode the account entity types and inspect the shape.
entity_type_columns = accounts_df[['entity_type']]
test = pd.get_dummies(entity_type_columns, dtype=float).to_numpy()
test.shape

(518581, 6)

### DGL Pipeline

In [5]:
@dataclass
class GraphData:
    """Container for processed AML graph data ready for GNN training.

    Instances are produced by ``DataLoader.build_graph``.

    Attributes:
        graph: DGL graph whose nodes are accounts and whose edges are
            transactions; carries ``ndata['feat']``, ``edata['feat']``,
            ``edata['label']`` and the three edge masks.
        labels: Per-edge integer labels taken from ``is_laundering``.
        train_mask: Boolean per-edge mask selecting training edges.
        val_mask: Boolean per-edge mask selecting validation edges.
        test_mask: Boolean per-edge mask selecting test edges.
        num_classes: Number of label classes (``build_graph`` sets this to 2).
        node_features_dim: Number of columns in ``graph.ndata['feat']``.
        edge_features_dim: Number of columns in ``graph.edata['feat']``.
        account_mapping: Maps each ``"<bank>_<account>"`` ID to its node index.
    """
    graph: dgl.DGLGraph
    labels: torch.Tensor
    train_mask: torch.Tensor
    val_mask: torch.Tensor
    test_mask: torch.Tensor
    num_classes: int
    node_features_dim: int
    edge_features_dim: int
    account_mapping: Dict[str, int]

In [6]:
class DataLoader:
    """
    Data loader for Anti-Money Laundering graph construction with DGL.

    Builds a homogeneous directed multigraph in which every account is a node
    and every transaction is an edge, attaches node/edge features, edge labels
    and train/val/test edge masks, and can create mini-batch edge samplers.
    Supports GIN, PNA, and GAT models with temporal and edge features.
    """

    # Split names accepted by get_subgraph_sampler().
    _MODES = ('train', 'val', 'test')

    def __init__(
        self,
        transactions_df: pd.DataFrame,
        accounts_df: pd.DataFrame,
        add_self_loops: bool = False
    ):
        """
        Initialize AML data loader.

        Both frames are deep-copied, so later changes to the caller's frames
        do not affect this loader (and vice versa).

        Args:
            transactions_df: Transaction data from load_transactions(); must
                already contain 'from_bank_account_id' / 'to_bank_account_id'.
            accounts_df: Account metadata with 'bank_account_id' and
                'entity_type' columns.
            add_self_loops: Whether to add self-loops to graph
        """
        self.transactions = transactions_df.copy(deep=True)
        self.accounts = accounts_df.copy(deep=True)
        self.add_self_loops = add_self_loops

        print("Initializing AML Data Loader...")
        print(f"Transactions: {len(self.transactions):,}")
        print(f"Accounts: {len(self.accounts):,}")

    def _create_account_mapping(self) -> Dict[str, int]:
        """
        Create mapping from account IDs to node indices.

        Indices follow first appearance order: all senders first, then any
        receivers that never appear as a sender.
        """
        # Get unique accounts from both source and target
        all_accounts = pd.concat([
            self.transactions['from_bank_account_id'],
            self.transactions['to_bank_account_id']
        ]).unique()

        account_mapping = {acc: idx for idx, acc in enumerate(all_accounts)}
        print(f"Created account mapping: {len(account_mapping):,} unique accounts")
        return account_mapping

    def _encode_edge_features(self) -> np.ndarray:
        """
        Encode edge features from transaction data.

        Returns:
            float32 array of shape (n_edges, edge_features_dim), one row per
            transaction in the order of ``self.transactions``.
        """
        txns = self.transactions
        features_list = []

        # 1. Amount features (log transform to reduce skew).
        amount_received = txns['amount_received'].to_numpy(dtype=np.float64).reshape(-1, 1)
        amount_paid = txns['amount_paid'].to_numpy(dtype=np.float64).reshape(-1, 1)

        # log10(1 + x) instead of log10(x): a zero amount would otherwise become
        # -inf and poison every downstream computation. Negative amounts are
        # clipped to zero for the same reason.
        log_amount_received = np.log10(np.clip(amount_received, 0.0, None) + 1.0)
        log_amount_paid = np.log10(np.clip(amount_paid, 0.0, None) + 1.0)
        features_list.extend([log_amount_received, log_amount_paid])

        # 2. Amount ratio and difference (standardized); the epsilon avoids
        #    division by zero for zero-valued payments.
        amount_ratio = amount_received / (amount_paid + 1e-8)
        amount_diff = amount_received - amount_paid
        features_list.append(StandardScaler().fit_transform(amount_ratio))
        features_list.append(StandardScaler().fit_transform(amount_diff))

        # 3. Categorical features (one-hot). float32 halves the memory of this
        #    wide block; the graph stores float32 tensors anyway.
        one_hot_array = pd.get_dummies(
            txns[['payment_format', 'receiving_currency', 'payment_currency']],
            dtype=np.float32
        ).to_numpy()
        features_list.append(one_hot_array)

        # 4. Currency matching flags
        currency_match = (
            txns['receiving_currency'] == txns['payment_currency']
        ).to_numpy(dtype=np.float32).reshape(-1, 1)
        features_list.append(currency_match)

        # 5. Bank relationship features
        same_bank = (
            txns['from_bank'] == txns['to_bank']
        ).to_numpy(dtype=np.float32).reshape(-1, 1)
        features_list.append(same_bank)

        # Concatenate all features (each block cast to float32 first so the
        # stacked matrix is not silently upcast to float64).
        edge_features = np.hstack(
            [block.astype(np.float32, copy=False) for block in features_list]
        )

        print(f"Edge features shape: {edge_features.shape}")
        return edge_features

    def _create_node_features(
        self,
        account_mapping: Dict[str, int]
    ) -> np.ndarray:
        """
        Create node features from account metadata.

        Each node gets a (multi-)hot encoding of the entity types registered
        for its account ID. Accounts without metadata get an all-zero row.

        Args:
            account_mapping: Mapping from account IDs to node indices

        Returns:
            Array of shape (n_nodes, node_features_dim); row ``i`` belongs to
            node index ``i``.
        """
        n_nodes = len(account_mapping)
        nodes_df = pd.DataFrame({
            'bank_account_id': list(account_mapping.keys()),
            'node_id': list(account_mapping.values()),
        })

        account_types = self.accounts[['bank_account_id', 'entity_type']].drop_duplicates()
        merged = nodes_df.merge(account_types, on='bank_account_id', how='left')

        one_hot = pd.get_dummies(merged[['entity_type']], dtype=float)
        if one_hot.shape[1] == 0:
            # No entity type known for any node.
            node_features = np.zeros((n_nodes, 0), dtype=float)
        else:
            # An account ID listed with several entity types produces several
            # merged rows; collapse them to exactly one row per node (max ==
            # logical OR of the one-hot flags) so the row count always equals
            # the node count and row order equals node index order.
            node_features = (
                one_hot.groupby(merged['node_id']).max()
                .reindex(range(n_nodes), fill_value=0.0)
                .to_numpy()
            )

        # Add degree centrality features (?)
        print(f"Node features shape: {node_features.shape}")
        return node_features

    def build_graph(
        self,
        train_ratio: float = 0.7,
        val_ratio: float = 0.15,
        test_ratio: float = 0.15,
        seed: int = 42
    ) -> "GraphData":
        """
        Build DGL homogeneous graph for AML detection.

        Args:
            train_ratio: Proportion of transaction edges for training
            val_ratio: Proportion of transaction edges for validation
            test_ratio: Proportion of edges for testing. Used for reporting
                only: the test split always receives whatever remains after
                the train and validation splits.
            seed: Random seed for reproducibility (seeds the global NumPy and
                torch generators)

        Returns:
            GraphData object containing graph and associated data. When
            self-loops are enabled, the self-loop edges follow the transaction
            edges, carry all-zero features and label 0, and belong to none of
            the three masks.

        Raises:
            ValueError: If a ratio is negative or train_ratio + val_ratio
                exceeds 1.
        """
        if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
            raise ValueError(
                "train_ratio and val_ratio must be non-negative and sum to at most 1"
            )

        np.random.seed(seed)
        torch.manual_seed(seed)

        print("\n" + "="*60)
        print("Building DGL Graph for AML Detection")
        print("="*60)

        # 1. Create account mapping
        account_mapping = self._create_account_mapping()

        # 2. Create edge list (vectorised lookup instead of a per-row Python loop)
        src_nodes = torch.as_tensor(
            self.transactions['from_bank_account_id'].map(account_mapping).to_numpy(dtype=np.int64)
        )
        dst_nodes = torch.as_tensor(
            self.transactions['to_bank_account_id'].map(account_mapping).to_numpy(dtype=np.int64)
        )
        n_transaction_edges = len(self.transactions)

        # 3. Create edge features
        edge_features = torch.as_tensor(self._encode_edge_features(), dtype=torch.float32)

        # 4. Create node features
        node_features = torch.as_tensor(
            self._create_node_features(account_mapping), dtype=torch.float32
        )

        # 5. Build DGL graph
        graph = dgl.graph((src_nodes, dst_nodes), num_nodes=len(account_mapping))

        # 5a. Degree features, taken from the transaction edges only
        degree_features = torch.stack(
            [graph.in_degrees().float(), graph.out_degrees().float()], dim=1
        )

        # 7. Edge labels (laundering indicator)
        edge_labels = torch.as_tensor(
            self.transactions['is_laundering'].to_numpy(), dtype=torch.long
        )

        # Add self-loops if requested. This happens before any edge data is
        # attached, and the per-edge tensors are padded afterwards: the
        # self-loop edges are appended after the transaction edges, so the
        # first n_transaction_edges rows still line up with the transactions.
        if self.add_self_loops:
            graph = dgl.add_self_loop(graph)
            print("Added self-loops to graph")

        n_edges = graph.num_edges()
        n_extra_edges = n_edges - n_transaction_edges
        if n_extra_edges > 0:
            edge_features = torch.cat(
                [edge_features, edge_features.new_zeros((n_extra_edges, edge_features.shape[1]))],
                dim=0
            )
            edge_labels = torch.cat([edge_labels, edge_labels.new_zeros(n_extra_edges)], dim=0)

        # 6. Add features and labels to graph
        graph.ndata['feat'] = torch.cat([node_features, degree_features], dim=1)
        graph.edata['feat'] = edge_features
        graph.edata['label'] = edge_labels

        # 8. Create train/val/test masks; only real transactions are split,
        #    synthetic self-loop edges stay outside every mask.
        indices = np.random.permutation(n_transaction_edges)

        train_size = int(train_ratio * n_transaction_edges)
        val_size = int(val_ratio * n_transaction_edges)

        train_idx = indices[:train_size]
        val_idx = indices[train_size:train_size + val_size]
        test_idx = indices[train_size + val_size:]

        train_mask = torch.zeros(n_edges, dtype=torch.bool)
        val_mask = torch.zeros(n_edges, dtype=torch.bool)
        test_mask = torch.zeros(n_edges, dtype=torch.bool)

        train_mask[train_idx] = True
        val_mask[val_idx] = True
        test_mask[test_idx] = True

        graph.edata['train_mask'] = train_mask
        graph.edata['val_mask'] = val_mask
        graph.edata['test_mask'] = test_mask

        print("\n" + "-"*60)
        print("Graph Statistics:")
        print("-"*60)
        print(f"Nodes: {graph.num_nodes():,}")
        print(f"Edges: {graph.num_edges():,}")
        print(f"Node feature dim: {graph.ndata['feat'].shape[1]}")
        print(f"Edge feature dim: {graph.edata['feat'].shape[1]}")
        print(f"Average degree: {graph.num_edges() / graph.num_nodes():.2f}")
        print(f"\nTrain edges: {train_mask.sum().item():,} ({train_ratio*100:.1f}%)")
        print(f"Val edges: {val_mask.sum().item():,} ({val_ratio*100:.1f}%)")
        print(f"Test edges: {test_mask.sum().item():,} ({test_ratio*100:.1f}%)")
        print(f"\nLaundering edges: {edge_labels.sum().item():,} ({edge_labels.float().mean()*100:.3f}%)")
        print("="*60 + "\n")

        return GraphData(
            graph=graph,
            labels=edge_labels,
            train_mask=train_mask,
            val_mask=val_mask,
            test_mask=test_mask,
            num_classes=2,
            node_features_dim=graph.ndata['feat'].shape[1],
            edge_features_dim=graph.edata['feat'].shape[1],
            account_mapping=account_mapping
        )

    def get_subgraph_sampler(
        self,
        graph_data: "GraphData",
        batch_size: int = 1024,
        num_neighbors: Optional[List[int]] = None,
        mode: str = 'train',
        num_workers: int = 2
    ) -> "dgl.dataloading.DataLoader":
        """
        Create a neighborhood sampler for mini-batch training.

        Args:
            graph_data: Processed graph data
            batch_size: Number of edges per batch
            num_neighbors: Number of neighbors to sample per layer, one entry
                per GNN layer. Defaults to [10, 5].
            mode: One of 'train', 'val', or 'test'
            num_workers: Number of worker processes used by the DGL DataLoader

        Returns:
            DGL DataLoader for mini-batch training

        Raises:
            ValueError: If ``mode`` is not 'train', 'val' or 'test'.
        """
        # Reject unknown modes instead of silently treating them as 'test'.
        if mode not in self._MODES:
            raise ValueError(f"mode must be one of {self._MODES}, got {mode!r}")

        mask = getattr(graph_data, f"{mode}_mask")
        edge_ids = torch.where(mask)[0]

        # Honour the requested per-layer fan-outs; a full-neighbour sampler
        # would ignore them and expand every neighbourhood completely.
        fanouts = [10, 5] if num_neighbors is None else list(num_neighbors)
        sampler = dgl.dataloading.NeighborSampler(fanouts)

        # For edge classification, wrap the node sampler so that batches are
        # drawn over edge IDs.
        edge_sampler = dgl.dataloading.as_edge_prediction_sampler(
            sampler,
            negative_sampler=None  # No negative sampling for supervised classification
        )

        dataloader = dgl.dataloading.DataLoader(
            graph_data.graph,
            edge_ids,
            edge_sampler,
            batch_size=batch_size,
            shuffle=(mode == 'train'),
            drop_last=False,
            num_workers=num_workers
        )

        return dataloader

In [7]:
# Wrap the prepared frames in the loader, then assemble the DGL graph with the
# default split ratios and seed.
loader = DataLoader(transactions_df=trans_df, accounts_df=accounts_df)
graph_data = loader.build_graph()

Initializing AML Data Loader...
Transactions: 5,078,345
Accounts: 518,581

Building DGL Graph for AML Detection
Created account mapping: 515,088 unique accounts


: 