In [2]:
#
# This script implements the Graph Convolutional Network (GCN) model described in the paper:
# "Anti-Money Laundering in Bitcoin: Experimenting with Graph Convolutional Networks for Financial Forensics"
#
# This implementation uses the GCN as an end-to-end classifier, leveraging both the graph
# structure and the node features to predict illicit vs. licit transactions. It specifically
# implements the Skip-GCN architecture, which the paper found to be more performant.
#


import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score
import time

# Single seed shared by every random number generator seeded below.
SEED = 42

import os, random

# Seed Python's `random`, NumPy, and PyTorch (CPU generator and all CUDA devices).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True   # ask cuDNN to use deterministic algorithms
torch.backends.cudnn.benchmark = False      # disable cuDNN auto-tuning (slower, but reproducible)

# NOTE(review): the interpreter presumably reads PYTHONHASHSEED only at start-up, so this
# assignment likely affects child processes rather than the current one - confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)

# --- 1. Data Loading and Preprocessing ---
def load_data():
    """
    Load the Elliptic dataset CSVs from ``data/`` and merge features with classes.

    The features file has no header row: column 0 is the transaction id,
    column 1 the timestep, and every remaining column is a numeric feature.
    The raw class labels ('1', '2', 'unknown') are mapped to readable names;
    transactions without a usable label are treated as 'unknown' so that
    downstream label encoding never sees NaN.

    Returns:
        A ``(data_df, edgelist_df)`` tuple. ``data_df`` holds one row per
        transaction (``txId``, ``timestep``, ``feature_*`` columns, ``class``)
        sorted by timestep; ``edgelist_df`` is the edge list as read from disk.
        ``(None, None)`` is returned when any of the three files is missing.
    """
    print("Loading data...")
    try:
        features_df = pd.read_csv('data/elliptic_txs_features.csv', header=None)
        edgelist_df = pd.read_csv('data/elliptic_txs_edgelist.csv')
        classes_df = pd.read_csv('data/elliptic_txs_classes.csv')
    except FileNotFoundError as e:
        # The files are read from the 'data/' sub-directory, so say so.
        print(f"Error: {e}. Please make sure the dataset files are in the 'data/' directory.")
        return None, None

    # Name the feature columns for clarity (derive the count from the file
    # instead of hard-coding it, so a trimmed feature file still loads).
    n_feature_cols = features_df.shape[1] - 2
    features_df.columns = ['txId', 'timestep'] + [f'feature_{i}' for i in range(n_feature_cols)]

    # Map class labels to meaningful names. Cast to str first: if the class
    # column contains no 'unknown' rows pandas parses it as integers and a
    # string-keyed mapping would silently turn every label into NaN.
    class_names = {'1': 'illicit', '2': 'licit', 'unknown': 'unknown'}
    classes_df['class'] = classes_df['class'].astype(str).str.strip().map(class_names)

    # Merge features and classes; the left join keeps every transaction.
    data_df = pd.merge(features_df, classes_df, on='txId', how='left')

    # A transaction with no (recognised) label is simply unlabelled.
    data_df['class'] = data_df['class'].fillna('unknown')

    # Sort by timestep for temporal split
    data_df = data_df.sort_values('timestep').reset_index(drop=True)

    print("Data loaded and merged successfully.")
    print(f"Total transactions: {len(data_df)}")
    print("Class distribution:")
    print(data_df['class'].value_counts())

    return data_df, edgelist_df

# --- 2. GCN Model Definition ---
class SkipGCN(nn.Module):
    """
    Two-layer graph convolutional classifier with an input-to-output skip path.

    Besides the usual two propagation steps, the raw node features are
    projected linearly (without bias) straight onto the class logits and
    added to the graph-convolution output.
    """
    def __init__(self, n_features, n_hidden, n_classes, dropout_rate=0.5):
        super().__init__()
        # Layers are created in the order gc1, gc2, skip_layer so that seeded
        # weight initialisation draws random numbers in the same sequence.
        self.gc1 = nn.Linear(in_features=n_features, out_features=n_hidden)
        self.gc2 = nn.Linear(in_features=n_hidden, out_features=n_classes)
        self.skip_layer = nn.Linear(in_features=n_features, out_features=n_classes, bias=False)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x, adj):
        """
        Compute per-node class log-probabilities.

        x:   node feature matrix.
        adj: normalized adjacency matrix, passed to ``torch.sparse.mm``.
        """
        # Layer 1: propagate features over the graph, transform, activate.
        neighbourhood_features = torch.sparse.mm(adj, x)
        hidden = self.dropout(F.relu(self.gc1(neighbourhood_features)))

        # Layer 2: propagate the hidden representation and map it to logits.
        graph_logits = self.gc2(torch.sparse.mm(adj, hidden))

        # Skip path: logits computed directly from the untouched input features.
        combined_logits = graph_logits + self.skip_layer(x)

        return F.log_softmax(combined_logits, dim=1)

def normalize_adjacency_matrix_sparse(adj_sparse, n_nodes):
    """
    Computes the symmetrically normalized adjacency matrix for sparse tensors.
    A_hat = D^{-1/2} * (A + I) * D^{-1/2}

    Args:
        adj_sparse: sparse COO adjacency matrix of shape (n_nodes, n_nodes).
            It does not need to be coalesced; duplicate entries are summed.
        n_nodes: number of nodes (rows/columns of ``adj_sparse``).

    Returns:
        A coalesced sparse COO tensor on the same device as ``adj_sparse``.
        Floating-point inputs keep their dtype; other dtypes are converted to
        the default floating-point dtype.
    """
    print("Normalizing sparse adjacency matrix...")

    # Build every helper tensor on the input's device/dtype so the function
    # also works for CUDA or float64 adjacency matrices.
    device = adj_sparse.device
    if adj_sparse.dtype.is_floating_point:
        dtype = adj_sparse.dtype
    else:
        # Inverse square roots are not representable in integer dtypes.
        dtype = torch.get_default_dtype()
        adj_sparse = adj_sparse.to(dtype)

    # Add self-loops: A + I
    loop_indices = torch.arange(n_nodes, device=device).unsqueeze(0).repeat(2, 1)
    loop_values = torch.ones(n_nodes, dtype=dtype, device=device)
    self_loops = torch.sparse_coo_tensor(loop_indices, loop_values, (n_nodes, n_nodes))
    adj_with_selfloops = (adj_sparse + self_loops).coalesce()

    # Degree of every node = row sum of (A + I)
    row_sum = torch.sparse.sum(adj_with_selfloops, dim=1).to_dense()

    # D^{-1/2}; non-positive degrees would give inf/NaN, so map them to 0.
    d_inv_sqrt = torch.where(
        row_sum > 0,
        row_sum.clamp(min=torch.finfo(dtype).tiny).pow(-0.5),
        torch.zeros_like(row_sum),
    )

    # Multiplying by a diagonal matrix on both sides only rescales each stored
    # entry (i, j) by d_i * d_j, so scale the values directly instead of
    # running two sparse-sparse matrix products.
    indices = adj_with_selfloops.indices()
    values = adj_with_selfloops.values()
    scaled_values = d_inv_sqrt[indices[0]] * values * d_inv_sqrt[indices[1]]

    normalized = torch.sparse_coo_tensor(indices, scaled_values, (n_nodes, n_nodes))
    return normalized.coalesce()

# --- 3. Model Training and Evaluation ---
def _build_binary_adjacency(edgelist_df, txid_to_idx, n_nodes):
    """Build a symmetric, unweighted sparse adjacency matrix from the edge list."""
    # Vectorised txId -> row-index lookup; ids absent from the node table become NaN.
    src = edgelist_df['txId1'].map(txid_to_idx)
    tgt = edgelist_df['txId2'].map(txid_to_idx)
    known = (src.notna() & tgt.notna()).to_numpy()
    src = src.to_numpy()[known].astype(np.int64)
    tgt = tgt.to_numpy()[known].astype(np.int64)

    if src.size == 0:
        print("Created empty adjacency matrix")
        return torch.sparse_coo_tensor(
            torch.empty((2, 0), dtype=torch.long), torch.empty(0), (n_nodes, n_nodes)
        )

    # Store each edge in both directions (make symmetric), then drop duplicate
    # (row, col) pairs so repeated or reciprocal edges keep weight 1 instead of
    # being summed when the sparse tensor is coalesced.
    pairs = np.stack([np.concatenate([src, tgt]), np.concatenate([tgt, src])])
    pairs = np.unique(pairs, axis=1)

    edge_indices = torch.tensor(pairs, dtype=torch.long)
    edge_values = torch.ones(edge_indices.shape[1])
    print(f"Created sparse adjacency matrix with {edge_indices.shape[1]} edges")
    return torch.sparse_coo_tensor(edge_indices, edge_values, (n_nodes, n_nodes))


def run_gcn_experiment(data_df, edgelist_df, *, n_epochs=200, n_hidden=100,
                       learning_rate=0.001, weight_decay=5e-4,
                       last_train_timestep=34):
    """
    Sets up the data, trains the Skip-GCN model, and evaluates its performance.

    Args:
        data_df: node table whose first two columns are ``txId`` and
            ``timestep``, whose last column is ``class`` ('licit', 'illicit'
            or 'unknown') and whose remaining columns are numeric features.
        edgelist_df: edge table with ``txId1`` and ``txId2`` columns. Edges
            that reference a transaction missing from ``data_df`` are ignored.
        n_epochs: number of full-batch training epochs.
        n_hidden: width of the hidden graph-convolution layer.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay (L2 penalty).
        last_train_timestep: transactions with ``timestep`` up to and
            including this value are used for training, later ones for testing.

    Raises:
        ValueError: if the training split contains no labelled transaction.

    The function prints training progress and a test-set classification
    report; it returns nothing.
    """
    print("\n--- Starting Skip-GCN Classification Experiment ---")

    # --- Data Preparation ---
    # Map txId to an index for matrix construction
    txid_to_idx = {txid: i for i, txid in enumerate(data_df['txId'])}
    n_nodes = len(data_df)

    # Scale features (skip txId/timestep in front and the class column at the end)
    features = data_df.iloc[:, 2:-1].values
    features = StandardScaler().fit_transform(features)

    # Create sparse adjacency matrix using COO format
    print("Creating sparse adjacency matrix...")
    print(f"Processing {len(edgelist_df)} edges...")
    adj_sparse = _build_binary_adjacency(edgelist_df, txid_to_idx, n_nodes)
    adj_normalized = normalize_adjacency_matrix_sparse(adj_sparse, n_nodes)

    # Prepare labels and temporal masks. Missing or unrecognised classes are
    # encoded as 'unknown' so they are masked out instead of producing NaN.
    labels_map = {'licit': 0, 'illicit': 1, 'unknown': 2}
    label_codes = data_df['class'].map(labels_map).fillna(labels_map['unknown']).astype('int64')
    labels = torch.tensor(label_codes.to_numpy(), dtype=torch.long)
    features_tensor = torch.tensor(features, dtype=torch.float32)

    # Temporal split: train on the early timesteps, test on the later ones
    timesteps = data_df['timestep']
    in_train = torch.tensor((timesteps <= last_train_timestep).to_numpy(), dtype=torch.bool)
    in_test = torch.tensor((timesteps > last_train_timestep).to_numpy(), dtype=torch.bool)

    # Filter out 'unknown' classes from training and testing for loss/accuracy calculation
    is_labelled = labels != labels_map['unknown']
    train_mask = in_train & is_labelled
    test_mask = in_test & is_labelled

    n_train = int(train_mask.sum().item())
    if n_train == 0:
        raise ValueError(
            f"No labelled transactions at or before timestep {last_train_timestep}; cannot train."
        )

    # --- Model Setup ---
    n_features = features_tensor.shape[1]
    n_classes = 2  # Licit vs Illicit

    model = SkipGCN(n_features=n_features, n_hidden=n_hidden, n_classes=n_classes)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Weighted loss for imbalanced classes (licit vs illicit)
    # The paper mentions a 0.3/0.7 weight ratio
    loss_weights = torch.FloatTensor([0.3, 0.7])
    criterion = nn.NLLLoss(weight=loss_weights)

    # --- Training Loop ---
    print("Training Skip-GCN model...")
    start_time = time.time()
    for epoch in range(n_epochs):  # The paper trained for 1000 epochs; the default of 200 is good for demonstration
        model.train()
        optimizer.zero_grad()

        # Full-batch forward pass over the whole graph; only labelled
        # training nodes contribute to the loss.
        output = model(features_tensor, adj_normalized)
        loss = criterion(output[train_mask], labels[train_mask])

        loss.backward()
        optimizer.step()

        if (epoch + 1) % 20 == 0:
            # Calculate accuracy on the training set for monitoring
            model.eval()
            with torch.no_grad():
                pred = model(features_tensor, adj_normalized).argmax(dim=1)
                correct = pred[train_mask].eq(labels[train_mask]).sum().item()
                acc = correct / n_train
                print(f"Epoch {epoch+1:03d} | Loss: {loss.item():.4f} | Train Acc: {acc:.4f}")

    print(f"GCN training finished in {time.time() - start_time:.2f} seconds.")

    # --- Evaluation ---
    print("\nEvaluating model performance on the test set...")
    if not test_mask.any():
        print("No labelled transactions in the test split; skipping evaluation.")
        return

    model.eval()
    with torch.no_grad():
        output = model(features_tensor, adj_normalized)
        preds = output.argmax(dim=1)

        test_labels = labels[test_mask].numpy()
        test_preds = preds[test_mask].numpy()

        print("\nClassification Report (Test Set):")
        # 0 is licit, 1 is illicit. Passing `labels` explicitly keeps the
        # report aligned with `target_names` even if one class is absent
        # from the test split.
        print(classification_report(
            test_labels,
            test_preds,
            labels=[0, 1],
            target_names=['Licit', 'Illicit'],
            zero_division=0,
        ))

        illicit_f1 = f1_score(test_labels, test_preds, pos_label=1)
        print(f"F1 Score for Illicit class: {illicit_f1:.4f}")


# --- Main Execution ---
if __name__ == "__main__":
    # load_data() returns (None, None) when a dataset CSV cannot be found;
    # in that case it has already printed the error, so just skip the run.
    data_df, edgelist_df = load_data()
    if data_df is not None:
        run_gcn_experiment(data_df, edgelist_df)


Loading data...
Data loaded and merged successfully.
Total transactions: 203769
Class distribution:
class
unknown    157205
licit       42019
illicit      4545
Name: count, dtype: int64

--- Starting Skip-GCN Classification Experiment ---
Creating sparse adjacency matrix...
Processing 234355 edges...
Created sparse adjacency matrix with 468710 edges
Normalizing sparse adjacency matrix...
Training Skip-GCN model...
Epoch 020 | Loss: 0.3159 | Train Acc: 0.9042
Epoch 040 | Loss: 0.2518 | Train Acc: 0.9189
Epoch 060 | Loss: 0.2189 | Train Acc: 0.9302
Epoch 080 | Loss: 0.1971 | Train Acc: 0.9413
Epoch 100 | Loss: 0.1805 | Train Acc: 0.9473
Epoch 120 | Loss: 0.1675 | Train Acc: 0.9518
Epoch 140 | Loss: 0.1580 | Train Acc: 0.9550
Epoch 160 | Loss: 0.1524 | Train Acc: 0.9573
Epoch 180 | Loss: 0.1443 | Train Acc: 0.9594
Epoch 200 | Loss: 0.1407 | Train Acc: 0.9615
GCN training finished in 70.12 seconds.

Evaluating model performance on the test set...

Classification Report (Test Set):
        