In [1]:
!pip install numpy pandas matplotlib torch torchmetrics



In [2]:
# Minimal PyTorch CUDA availability check - fail fast if CUDA is unavailable
try:
    import torch
except Exception as e:
    raise RuntimeError(f"PyTorch is not installed or failed to import: {e}")

# Ensure CUDA is available - if not, raise an error because we rely on GPU
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available. Please ensure an NVIDIA GPU, drivers, and CUDA are installed and that the CUDA toolkit is compatible with your PyTorch build.")

# Report devices
print('PyTorch version:', torch.__version__)
cnt = torch.cuda.device_count()
print('CUDA device count:', cnt)
for i in range(cnt):
    try:
        name = torch.cuda.get_device_name(i)
    except Exception:
        name = f'unknown-device-{i}'
    print(f'Device {i}:', name)

# Quick allocation test to confirm functional GPU access
try:
    _ = torch.zeros(1, device='cuda')
    print('Successfully allocated a tensor on CUDA.')
except Exception as e:
    raise RuntimeError('CUDA appears available but tensor allocation failed: ' + str(e))

# may need to use different dgl if not on linux or using different version of CUDA
!pip install  dgl -f https://data.dgl.ai/wheels/torch-2.1/cu121/repo.html

RuntimeError: CUDA is not available. Please ensure an NVIDIA GPU, drivers, and CUDA are installed and that the CUDA toolkit is compatible with your PyTorch build.

In [3]:
# alternate dependencies for running on mac without CUDA (only run if previous block fails)

!pip install pydantic
!pip install PyYAML
!pip install numpy==1.26.4
!pip install torch==2.1.1 torchdata==0.7.1
!pip install dgl -f https://data.dgl.ai/wheels/repo.html

import torch
import dgl

Looking in links: https://data.dgl.ai/wheels/repo.html


In [4]:
import pandas as pd
from pathlib import Path

# Relative location of the HI-Small transactions CSV: dataset/HI-Small_Trans.csv
DATA_PATH = Path("dataset", "HI-Small_Trans.csv")

# Read the whole CSV into a single in-memory DataFrame.
small_trans = pd.read_csv(filepath_or_buffer=DATA_PATH)

# One-line summary (row count + column names); the trailing expression renders
# the first rows as the cell's rich output.
print(f"Loaded {len(small_trans)} rows; columns: {list(small_trans.columns)}")
small_trans.head()

Loaded 5078345 rows; columns: ['Timestamp', 'From Bank', 'Account', 'To Bank', 'Account.1', 'Amount Received', 'Receiving Currency', 'Amount Paid', 'Payment Currency', 'Payment Format', 'Is Laundering']


Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0
2,2022/09/01 00:00,3209,8000F4670,3209,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,0
3,2022/09/01 00:02,12,8000F5030,12,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,0
4,2022/09/01 00:06,10,8000F5200,10,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,0


In [5]:
# Basic analysis: currencies, banks, and other summaries
# This cell is robust to different column names: it searches for currency-like,
# bank-like, amount-like and party-like columns by case-insensitive keyword match.

MISSING_REPORT_ROWS = 15   # number of columns listed in the missing-value report
TOP_PARTICIPANTS = 5       # number of most frequent values listed per party-like column
SAMPLE_SEED = 42           # fixed seed so the random sample is reproducible across runs


def _columns_matching(frame, keywords):
    """Return the columns of `frame` whose lower-cased name contains any of `keywords`."""
    return [c for c in frame.columns if any(k in c.lower() for k in keywords)]


# Show shape and a small sample
rows, cols = small_trans.shape
print(f"Data shape: {rows} rows x {cols} columns")
print()
print("Sample rows:")
display(small_trans.head())

# Missing values by column (largest counts first, at most MISSING_REPORT_ROWS columns)
missing_by_col = small_trans.isnull().sum().sort_values(ascending=False).head(MISSING_REPORT_ROWS)
print("Top missing values by column:")
print(missing_by_col.to_string())
print()

# Find likely currency column(s); only the first match is summarised
currency_candidates = _columns_matching(small_trans, ('currency', 'ccy', 'curr'))
if currency_candidates:
    cur_col = currency_candidates[0]
    num_currencies = small_trans[cur_col].nunique(dropna=True)
    top_currencies = small_trans[cur_col].value_counts().head(10)
    print(f"Found currency column: '{cur_col}' — {num_currencies} unique values")
    print("Top currencies (by count):")
    print(top_currencies.to_string())
else:
    cur_col = None
    print("No currency-like column found.")
    print("Columns:", list(small_trans.columns))

print()
# Find likely bank-related columns
bank_candidates = _columns_matching(
    small_trans,
    ('bank', 'institution', 'bic', 'iban', 'bankid', 'bank_id', 'bankname', 'bank_name'),
)
if bank_candidates:
    # Count unique bank identifiers across candidate columns.
    # Values are compared as strings, so the same identifier appearing in
    # several columns is counted once.
    unique_banks = set()
    for c in bank_candidates:
        unique_banks.update(small_trans[c].dropna().astype(str).unique())
    num_unique_banks = len(unique_banks)
    print(f"Found bank-like columns: {bank_candidates} — approx. {num_unique_banks} unique bank identifiers (aggregated)")
else:
    num_unique_banks = None
    print("No bank-like columns found.")

print()
# Other basic summaries: amount column candidates and top senders/receivers if available
amt_candidates = _columns_matching(small_trans, ('amount', 'amt', 'value'))
if amt_candidates:
    amt_col = amt_candidates[0]
    print(f"Found amount column: {amt_col} — summary:")
    print(small_trans[amt_col].describe())
else:
    # Defined in both branches (like cur_col) so later code can test it safely.
    amt_col = None
    print("No amount-like column found.")

# If sender/receiver columns exist, show the most frequent values in each
party_candidates = _columns_matching(
    small_trans,
    ('sender', 'receiver', 'originator', 'beneficiary', 'from_', 'to_', 'account'),
)
if party_candidates:
    print()
    print("Top participants in party-like columns:")
    for c in party_candidates:
        print(f"Column: {c}")
        print(small_trans[c].value_counts().head(TOP_PARTICIPANTS).to_string())

# Final small sample (seeded so re-running the notebook shows the same rows)
display(small_trans.sample(min(5, len(small_trans)), random_state=SAMPLE_SEED))

Data shape: 5078345 rows x 11 columns

Sample rows:


Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0
2,2022/09/01 00:00,3209,8000F4670,3209,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,0
3,2022/09/01 00:02,12,8000F5030,12,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,0
4,2022/09/01 00:06,10,8000F5200,10,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,0


Top missing values by column:
Timestamp             0
From Bank             0
Account               0
To Bank               0
Account.1             0
Amount Received       0
Receiving Currency    0
Amount Paid           0
Payment Currency      0
Payment Format        0
Is Laundering         0

Found currency column: 'Receiving Currency' — 15 unique values
Top currencies (by count):
Receiving Currency
US Dollar      1879341
Euro           1172017
Swiss Franc     237884
Yuan            206551
Shekel          194988
Rupee           192065
UK Pound        181255
Ruble           157361
Yen             156319
Bitcoin         148151

Found bank-like columns: ['From Bank', 'To Bank'] — approx. 30470 unique bank identifiers (aggregated)

Found amount column: Amount Received — summary:
count    5.078345e+06
mean     5.988726e+06
std      1.037183e+09
min      1.000000e-06
25%      1.833700e+02
50%      1.411010e+03
75%      1.234627e+04
max      1.046302e+12
Name: Amount Received, dtype: float64

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
2674717,2022/09/05 19:16,1669,80106A0C0,410,802A30AD0,73.27,Euro,73.27,Euro,Cash,0
1308823,2022/09/02 05:38,70,1004286A8,233603,810CBA6F0,120.17,Euro,120.17,Euro,Credit Card,0
2396701,2022/09/05 04:46,116,80DDA8990,238190,80E5140D0,11.85,Swiss Franc,11.85,Swiss Franc,Cheque,0
2574110,2022/09/05 14:17,2843,800DE07F0,4523,801DB38E0,6693.54,US Dollar,6693.54,US Dollar,Credit Card,0
2442315,2022/09/05 07:05,3335,80B78FF90,11657,810972AF0,7701.22,Euro,7701.22,Euro,Cash,0


In [6]:
# Edge endpoints for the transaction graph, one entry per CSV row.
# Account identifiers are parsed as base-16 integers so they can serve as node IDs.
def _parse_hex_account(account_id):
    """Convert one hexadecimal account identifier to an int."""
    return int(account_id, base=16)


source = small_trans['Account'].apply(_parse_hex_account)
target = small_trans['Account.1'].apply(_parse_hex_account)

In [7]:
# Build a directed DGL graph with one edge per (source, target) pair.
edge_endpoints = (source, target)
g_accounts = dgl.graph(edge_endpoints)

# Compact the graph so isolated nodes are removed.
g = dgl.compact_graphs(g_accounts)

In [None]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# extract individual edge features
# `time` is the absolute transaction time in epoch seconds (float64); this assumes
# pd.to_datetime yields nanosecond-resolution values, hence the division by 1e9.
time = pd.to_datetime(small_trans['Timestamp']).astype('int64') / 1e9
# Epoch seconds (~1.66e9) cannot be represented exactly in float32: neighbouring
# float32 values are 128 s apart at that magnitude, so minute-resolution timestamps
# would collapse together once the features are cast with .float(). Seconds elapsed
# since the earliest transaction stay small enough to be exact in float32.
time_since_start = (time - time.min()).to_numpy()
amount_paid = small_trans['Amount Paid'].to_numpy()
amount_received = small_trans['Amount Received'].to_numpy()


def _one_hot(column):
    """Fit a dense one-hot encoder on one categorical column of `small_trans`.

    Returns the fitted encoder and the encoded matrix. The matrix is produced
    directly as float32 (0/1 values are exact) to avoid a float64 intermediate
    that would be cast down anyway.
    """
    encoder = OneHotEncoder(sparse_output=False, dtype=np.float32)
    encoded = encoder.fit_transform(small_trans[column].to_numpy().reshape(-1, 1))
    return encoder, encoded


# use one-hot encoding for categorical variables
paid_enc, paid_currency = _one_hot('Payment Currency')
received_enc, received_currency = _one_hot('Receiving Currency')
format_enc, pay_format = _one_hot('Payment Format')

# combine edge features into single float32 tensor
# column layout: [seconds since first transaction, amount paid, amount received,
#                 payment-currency one-hot, receiving-currency one-hot, payment-format one-hot]
numeric_features = np.column_stack([time_since_start, amount_paid, amount_received])
edge_features = torch.cat((torch.from_numpy(numeric_features).float(),
                           torch.from_numpy(paid_currency),
                           torch.from_numpy(received_currency),
                           torch.from_numpy(pay_format)), 1).float()

# create edge labels
fraud_label = torch.tensor(small_trans['Is Laundering'].to_numpy())

# Features are built in CSV row order; they only line up with the graph's edges
# if there is exactly one row per edge.
assert edge_features.shape[0] == g.num_edges(), "edge feature rows do not match number of graph edges"
assert fraud_label.shape[0] == g.num_edges(), "edge labels do not match number of graph edges"

# attach features and labels to graph
g.edata['features'] = edge_features
g.edata['label'] = fraud_label

In [8]:
# use chronological 60/20/20 train/val/test split based on documentation
#
# The CSV rows are not sorted by timestamp, so slicing edge IDs in row order is
# not a chronological split. Instead, order the edge IDs by transaction time
# first and then cut that ordering at 60% / 80%.
TRAIN_FRAC, VAL_FRAC = 0.6, 0.2

# Row i of `small_trans` is treated as edge i: the graph was built from the rows
# in order. reset_index makes the index equal to row position, so after sorting
# the index holds edge IDs in time order. mergesort is stable, so transactions
# with the same timestamp keep their original relative order.
edge_times = pd.to_datetime(small_trans['Timestamp']).reset_index(drop=True)
chronological_eids = torch.tensor(
    edge_times.sort_values(kind='mergesort').index.to_numpy()
).to(dtype=g.idtype, device=g.device)

num_edges = g.num_edges()
assert len(chronological_eids) == num_edges, "expected exactly one transaction row per graph edge"

# Same segment sizes as an index-order 60/20/20 split: the first two lengths are
# floored and the test split takes the remainder.
train_end = int(num_edges * TRAIN_FRAC)
val_end = train_end + int(num_edges * VAL_FRAC)

train_edges = chronological_eids[:train_end]
val_edges = chronological_eids[train_end:val_end]
test_edges = chronological_eids[val_end:]

train = dgl.edge_subgraph(g, train_edges)
val = dgl.edge_subgraph(g, val_edges)
test = dgl.edge_subgraph(g, test_edges)