# Practice GNN

## Notebook config

In [1]:
import random
import networkx as nx
import pandas as pd
import numpy as np
import ipywidgets as widgets
import os
import sys
import matplotlib.pyplot as plt
import warnings
from tabulate import tabulate
from tqdm import trange
from IPython import get_ipython
from IPython.display import display
from time import monotonic
from pprint import pprint

# GNNs 
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, SAGEConv, GATConv, EdgeConv
from torch_geometric.data import Data

# Project Source Code
cwd = os.getcwd()
content_base = os.path.dirname(os.path.dirname(cwd))
data_dir = os.path.join(content_base, "data")
src_path = os.path.join(content_base, "src")
sys.path.append(src_path)
from helpers import add_cell_timer
from pipeline import ModelPipeline

print("CWD: ", cwd)
print("content_base: ", content_base)
print("data_dir: ", data_dir)
print("src_path ", src_path)

add_cell_timer()

data_file = os.path.join(data_dir, "HI-XS.csv") # random strat 10% of data

CWD:  c:\repos\dsi-capstone-spring-2025-TD-anti-money-laundering\Code\notebooks\Sophie
content_base:  c:\repos\dsi-capstone-spring-2025-TD-anti-money-laundering\Code
data_dir:  c:\repos\dsi-capstone-spring-2025-TD-anti-money-laundering\Code\data
src_path  c:\repos\dsi-capstone-spring-2025-TD-anti-money-laundering\Code\src


In [2]:
# Initialize pipeline with dataset
pl = ModelPipeline(data_file)
pl.run_preprocessing()


⏱️ Execution time: 10.22s


In [3]:
pl.preprocessed

{'renamed': True,
 'duplicates_removed': True,
 'unique_ids_created': True,
 'currency_normalized': True,
 'time_features_extracted': True,
 'cyclical_encoded': True,
 'weekend_encoded': True,
 'features_encoded': False,
 'neighbor_context_computed': False,
 'normalized': False}


⏱️ Execution time: 0.02s


In [4]:
pl.df.columns

Index(['from_bank', 'to_bank', 'received_amount', 'received_currency',
       'sent_amount', 'sent_currency', 'payment_type', 'is_laundering',
       'from_account_id', 'to_account_id', 'from_account_idx',
       'to_account_idx', 'sent_amount_usd', 'received_amount_usd',
       'hour_of_day', 'day_of_week', 'seconds_since_midnight', 'timestamp_int',
       'timestamp_scaled', 'day_sin', 'day_cos', 'time_of_day_sin',
       'time_of_day_cos', 'is_weekend'],
      dtype='object')


⏱️ Execution time: 0.02s


In [5]:
pl.apply_label_encoding(["received_currency", "sent_currency", "payment_type"])
pl.extract_graph_features(weight_col= 'sent_amount')


⏱️ Execution time: 18.95s


In [6]:
X_cols = ['from_bank', 'to_bank', 'received_amount', 'received_currency',
       'sent_amount', 'sent_currency', 'payment_type',
       'from_account_id', 'to_account_id', 'from_account_idx',
       'to_account_idx', 'sent_amount_usd', 'received_amount_usd',
       'hour_of_day', 'day_of_week', 'seconds_since_midnight', 'timestamp_int',
       'timestamp_scaled', 'day_sin', 'day_cos', 'time_of_day_sin',
       'time_of_day_cos', 'is_weekend']
y_col = 'is_laundering'
X_train, X_val, X_test, y_train, y_val, y_test = pl.split_train_test_val(X_cols, y_col, test_size=0.15, val_size=0.15)


⏱️ Execution time: 0.48s


In [7]:
edge_features = ['received_amount', 'received_currency','sent_amount', 'sent_currency', 
                 'payment_type','sent_amount_usd', 'received_amount_usd', 'hour_of_day', 
                 'day_of_week', 'seconds_since_midnight', 'timestamp_int', 'timestamp_scaled', 
                 'day_sin', 'day_cos', 'time_of_day_sin', 'time_of_day_cos', 'is_weekend']
pl.generate_tensors(edge_features)

(Data(x=[355483, 17], edge_index=[2, 355483], y=[355483]),
 Data(x=[76175, 17], edge_index=[2, 76175], y=[76175]),
 Data(x=[76176, 17], edge_index=[2, 76176], y=[76176]))


⏱️ Execution time: 0.09s


In [None]:
pl.apply_label_encoding()

## Simple GNN

In [8]:
def generate_tensors(X, y, edge_features, edges = ["from_account_idx", "to_account_idx"]):
    """Convert data to PyTorch tensor format for GNNs"""

    # Edge index (defining graph structure)
    edge_index = torch.tensor(X[edges].values.T, dtype=torch.long)  # Shape: [2, num_edges]

    # Edge attributes (transaction-based features)
    edge_attr = torch.tensor(X[edge_features].values, dtype=torch.float)  # Shape: [num_edges, num_features]

    # Labels for edges (transaction classification: laundering or not)
    edge_labels = torch.tensor(y.values, dtype=torch.long)  # Shape: [num_edges]

    # Create PyG Data object
    data = Data(edge_index=edge_index, edge_attr=edge_attr, y=edge_labels)
    
    return data

# Assuming X_train, y_train are preprocessed dataframes
train_data = generate_tensors(X_train, y_train, edge_features=["sent_amount", "received_amount", "day_of_week"])
val_data = generate_tensors(X_val, y_val, edge_features=["sent_amount", "received_amount", "day_of_week"])
test_data = generate_tensors(X_test, y_test, edge_features=["sent_amount", "received_amount", "day_of_week"])



⏱️ Execution time: 0.05s


In [None]:
from torch_geometric.nn import SAGEConv

# Node-level GNN for embedding learning
class GNN(torch.nn.Module):
    def __init__(self, hidden_dim):
        super(GNN, self).__init__()
        
        self.conv1 = SAGEConv(hidden_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index).relu()
        return x

# Edge-level classifier using dot product similarity
class Classifier(torch.nn.Module):
    def forward(self, node_embeddings, edge_index):
        src, dest = edge_index  # Extract node indices for each edge
        
        # Compute dot product between source and destination node embeddings
        return (node_embeddings[src] * node_embeddings[dest]).sum(dim=-1)

# Full Model Combining GNN and Edge Classification
class EdgeGNN(torch.nn.Module):
    def __init__(self, in_edge_dim, hidden_dim):
        super(EdgeGNN, self).__init__()
        
        # Node embeddings initialized randomly
        self.node_emb = torch.nn.Embedding(num_embeddings=data.num_nodes, embedding_dim=hidden_dim)
        
        # GNN for learning node representations
        self.gnn = GNN(hidden_dim)
        
        # Edge Classifier
        self.classifier = Classifier()

    def forward(self, data):
        # Initialize node embeddings
        x = self.node_emb.weight  # Learnable node embeddings
        
        # Update node embeddings using GNN
        x = self.gnn(x, data.edge_index)
        
        # Compute edge-level predictions
        return self.classifier(x, data.edge_index)

# Model Initialization
model = EdgeGNN(in_edge_dim=data.edge_attr.shape[1], hidden_dim=32)

# Forward Pass (Example)
out = model(data)
print("Edge Predictions Shape:", out.shape)  # Should be [num_edges]




RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [355483, 3] but got: [355483, 32].


⏱️ Execution time: 1.8s


# Hetero graph

A heterogeneous graph with two types of nodes—**transactions** and **accounts**—can be structured such that **edges represent relationships** between them (e.g., an account initiates a transaction, a transaction is received by an account). Given this setup, we can develop a **Graph Neural Network (GNN)-based framework** to classify transaction nodes and identify suspicious activity.

---

### **Proposed Framework for Transaction Node Classification**
#### **1. Graph Construction**
- **Nodes**
  - **Transaction Nodes (T)**: Represent individual financial transactions.
  - **Account Nodes (A)**: Represent entities that send or receive transactions.
- **Edges**
  - **(A → T)**: Account initiates a transaction.
  - **(T → A)**: Transaction is received by an account.
  - **(A ↔ A)**: Accounts that frequently transact with each other (optional, for richer context).
  - **(T ↔ T)**: Similar transactions (e.g., same amount, time window, or recipient).

#### **2. Node and Edge Feature Engineering**
- **Transaction Node Features**
  - Amount, time, location, type of transaction
  - Statistical features (e.g., frequency of transactions, deviation from mean transaction amount)
  - Relationship-based features (e.g., how often a sender has sent transactions of similar size)
- **Account Node Features**
  - Account type (individual, corporate, etc.)
  - Transaction history (e.g., total volume, count of unique counterparties)
  - Risk score (if available from previous AML checks)
- **Edge Features**
  - Time difference between linked transactions
  - Relationship strength (e.g., frequency of interaction)

#### **3. Model Architecture**
- **Heterogeneous Graph Neural Network (HGNN)**
  - **Relational Graph Convolutional Network (R-GCN)** to handle different node and edge types.
  - **Heterogeneous Graph Transformer (HGT)** for learning cross-type interactions.
  - **Graph Attention Network (GAT)** for capturing important transaction flows.

- **Hierarchical Message Passing**
  - **Step 1:** Transactions receive signals from linked accounts to encode account behavior.
  - **Step 2:** Transactions aggregate information from similar transactions to detect patterns.
  - **Step 3:** Accounts update their embeddings based on transaction patterns.
  - **Step 4:** Final transaction embeddings are passed through a classifier.

#### **4. Training and Classification**
- **Supervised Learning**
  - Labels: Suspicious vs. non-suspicious transactions.
  - Loss function: Binary cross-entropy for classification.
- **Semi-Supervised Learning**
  - Use a small set of labeled suspicious transactions and propagate risk signals through the graph.
- **Self-Supervised Learning**
  - Contrastive learning to learn transaction representations based on normal vs. anomalous transaction patterns.

#### **5. Post-processing and Risk Scoring**
- Once transaction nodes are classified, assign a **risk score** based on:
  - Model confidence in classification
  - Proximity to high-risk accounts
  - Transaction anomaly score

---

### **Advantages of This Framework**
✅ **Captures contextual dependencies**: Uses account behavior and transaction relationships.  
✅ **Flexible to new patterns**: Can adapt to changes in laundering methods.  
✅ **Graph augmentation**: Can incorporate external data like blacklists or known fraud networks.

Would you like help implementing any specific part of this in Python? 🚀

In [22]:
from sklearn.preprocessing import LabelEncoder
bank_encoder = LabelEncoder()
from_banks = pl.df["from_bank"].drop_duplicates().reset_index(drop=True)
to_banks = pl.df["to_bank"].drop_duplicates().reset_index(drop=True)
all_banks = pd.concat([from_banks, to_banks]).drop_duplicates().reset_index(drop=True)
bank_encoder.fit(all_banks)
pl.df["from_bank"] = bank_encoder.transform(pl.df["from_bank"]) # Use same encoder
pl.df["to_bank"] = bank_encoder.transform(pl.df["to_bank"])  # Use same encoder


⏱️ Execution time: 0.11s


In [None]:
import torch
from torch_geometric.data import HeteroData
from torch_geometric.utils import from_networkx
import networkx as nx

def construct_heterogeneous_graph(df):
    """
    Constructs a heterogeneous graph from the preprocessed dataframe.
    """

    # Initialize the heterogeneous data object
    data = HeteroData()

    # Extract unique accounts and transactions
    transaction_nodes = df.index  # Assuming each row is a unique transaction
    
    # Create mappings for indexing
    transaction_mapping = {idx: idx for idx in transaction_nodes}
    df["transaction_idx"] = df.index.map(transaction_mapping)

    # Account nodes
    data["account"].x = torch.tensor(df.groupby("from_account_id").mean()[["degree_centrality", "pagerank"]].values, dtype=torch.float)

    # Transaction nodes
    transaction_features = ["sent_amount_usd", "received_amount_usd", "hour_of_day", "is_weekend"]
    data["transaction"].x = torch.tensor(df[transaction_features].values, dtype=torch.float)

    # Edges from account → transaction (initiates)
    edge_index_initiates = torch.tensor([df["from_account_idx"].values, df["transaction_idx"].values], dtype=torch.long)
    data["account", "initiates", "transaction"].edge_index = edge_index_initiates

    # Edges from transaction → account (received_by)
    edge_index_received_by = torch.tensor([df["transaction_idx"].values, df["to_account_idx"].values], dtype=torch.long)
    data["transaction", "received_by", "account"].edge_index = edge_index_received_by

    # Labels for transaction classification (1 = laundering, 0 = normal)
    data["transaction"].y = torch.tensor(df["is_laundering"].values, dtype=torch.long)

    return data

# Construct the heterogeneous graph
hetero_graph = construct_heterogeneous_graph(ModelPipeline("your_dataset.csv").df)
print(hetero_graph)


# GINe

In [None]:
import torch.nn as nn
from torch_geometric.nn import GINEConv, BatchNorm, Linear, GATConv, PNAConv, RGCNConv
import torch.nn.functional as F
import torch
import logging

class GINe(torch.nn.Module):
    def __init__(self, num_features, num_gnn_layers, n_classes=2, 
                n_hidden=100, edge_updates=False, residual=True, 
                edge_dim=None, dropout=0.0, final_dropout=0.5):
        super().__init__()
        self.n_hidden = n_hidden
        self.num_gnn_layers = num_gnn_layers
        self.edge_updates = edge_updates
        self.final_dropout = final_dropout

        self.node_emb = nn.Linear(num_features, n_hidden)
        self.edge_emb = nn.Linear(edge_dim, n_hidden)

        self.convs = nn.ModuleList()
        self.emlps = nn.ModuleList()
        self.batch_norms = nn.ModuleList()
        for _ in range(self.num_gnn_layers):
            conv = GINEConv(nn.Sequential(
                nn.Linear(self.n_hidden, self.n_hidden), 
                nn.ReLU(), 
                nn.Linear(self.n_hidden, self.n_hidden)
                ), edge_dim=self.n_hidden)
            if self.edge_updates: self.emlps.append(nn.Sequential(
                nn.Linear(3 * self.n_hidden, self.n_hidden),
                nn.ReLU(),
                nn.Linear(self.n_hidden, self.n_hidden),
            ))
            self.convs.append(conv)
            self.batch_norms.append(BatchNorm(n_hidden))

        self.mlp = nn.Sequential(Linear(n_hidden*3, 50), nn.ReLU(), nn.Dropout(self.final_dropout),Linear(50, 25), nn.ReLU(), nn.Dropout(self.final_dropout),
                              Linear(25, n_classes))

    def forward(self, x, edge_index, edge_attr):
        src, dst = edge_index

        x = self.node_emb(x)
        edge_attr = self.edge_emb(edge_attr)

        for i in range(self.num_gnn_layers):
            x = (x + F.relu(self.batch_norms[i](self.convs[i](x, edge_index, edge_attr)))) / 2
            if self.edge_updates: 
                edge_attr = edge_attr + self.emlps[i](torch.cat([x[src], x[dst], edge_attr], dim=-1)) / 2

        x = x[edge_index.T].reshape(-1, 2 * self.n_hidden).relu()
        x = torch.cat((x, edge_attr.view(-1, edge_attr.shape[1])), 1)
        out = x
        
        return self.mlp(out)