In [1]:
from pathlib import Path
import sys

ROOT = Path.cwd().parent
sys.path.append(str(ROOT))

import os, glob, re
import pandas as pd
import numpy as np

from tqdm import tqdm
from pathlib import Path
from torch_geometric.data import Data
from code_lib.graph_builder import build_emergence_graphs_for_time_range
from code_lib.utils import load_parts

In [2]:
DATA_DIR = "../elliptic_dataset"
WALLETS_FEATURES = "wallets_features.csv"
WALLETS_CLASSES = "wallets_classes.csv"
EDGES_PREFIX = "AddrTxAddr_edgelist_part_"

In [3]:
nodes = pd.read_csv(os.path.join(DATA_DIR, WALLETS_FEATURES))
node_labels = pd.read_csv(os.path.join(DATA_DIR, WALLETS_CLASSES))
edges_with_edge_labels = load_parts(DATA_DIR, EDGES_PREFIX)
nodes_with_labels = nodes.merge(node_labels, on='address', how='left')

### Training a baseline

Let's train a simple baseline on a bianry graph with walk length 2 and looking 3 time steps ahead.
Let's say we train n time steps 1-37 and evaluate on 40-46 (37 + time horizon, otherwise it's cheating)

In [4]:
graphs = build_emergence_graphs_for_time_range(
    edges_with_labels_df=edges_with_edge_labels,
    nodes_with_classes_df=nodes_with_labels,
    first_time_step=1,
    last_time_step=49,
    max_walk_length=2,
    time_horizon=3,
    use_distance_labels=False,
    keep_class_labels_as_features=False,
    ignore_illict=True,
    ignore_previously_transacting_with_illicit=True
)

Total unique addresses across all time: 822942
Total time steps: 49
Generating 46 graphs (time steps 1 to 46)...

t=1: nodes= 34853, edges=  66836, labels={np.int64(0): 34853}
t=2: nodes= 59236, edges= 199129, labels={np.int64(0): 59236}
t=3: nodes= 78510, edges= 264124, labels={np.int64(0): 78489, np.int64(1): 21}
t=4: nodes= 98707, edges= 331393, labels={np.int64(0): 98668, np.int64(1): 39}
t=5: nodes=120865, edges= 399829, labels={np.int64(0): 119639, np.int64(1): 1226}
t=6: nodes=131985, edges= 436559, labels={np.int64(0): 130744, np.int64(1): 1241}
t=7: nodes=152051, edges= 492636, labels={np.int64(0): 147918, np.int64(1): 4133}
t=8: nodes=176366, edges= 578493, labels={np.int64(0): 171122, np.int64(1): 5244}
t=9: nodes=194983, edges= 638467, labels={np.int64(0): 190355, np.int64(1): 4628}
t=10: nodes=220639, edges= 701970, labels={np.int64(0): 216981, np.int64(1): 3658}
t=11: nodes=239172, edges= 763390, labels={np.int64(0): 238511, np.int64(1): 661}
t=12: nodes=248071, edges= 78

In [18]:
TRAIN_START = 1
TRAIN_END = 37
EVAL_START = 40
EVAL_END = 46

train_graphs = [graphs[19], graphs[22], graphs[29], graphs[30]]
test_graphs = [graphs[33], graphs[34], graphs[38]] # indexed from 0, so -1

In [19]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import numpy as np

In [20]:
class GCN(torch.nn.Module):
    def __init__(self, num_features, hidden_dim, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.classifier = torch.nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x, edge_index):
        # First GCN layer
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        
        # Second GCN layer
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        
        # Classifier
        x = self.classifier(x)
        return x

In [21]:
DEVICE = "mps"
HIDDEN_DIM = 64
NUM_EPOCHS = 10
LEARNING_RATE = 0.01

num_features = train_graphs[0].x.shape[1]
num_classes = 2

model = GCN(num_features, HIDDEN_DIM, num_classes).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=5e-4)

In [22]:
def train_epoch(model, graphs, optimizer):
    model.train()
    total_loss = 0
    total_correct = 0
    total_nodes = 0
    
    for graph in graphs:
        graph = graph.to(DEVICE)
        
        optimizer.zero_grad()
        out = model(graph.x, graph.edge_index)
        loss = F.cross_entropy(out, graph.y)
        loss.backward()
        optimizer.step()
        
        # Calculate accuracy
        pred = out.argmax(dim=1)
        correct = (pred == graph.y).sum().item()
        
        total_loss += loss.item() * graph.num_nodes
        total_correct += correct
        total_nodes += graph.num_nodes
    
    avg_loss = total_loss / total_nodes
    avg_acc = total_correct / total_nodes
    return avg_loss, avg_acc

In [23]:
def evaluate(model, graphs):
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []
    
    with torch.no_grad():
        for graph in graphs:
            graph = graph.to(DEVICE)
            out = model(graph.x, graph.edge_index)
            probs = F.softmax(out, dim=1)
            preds = out.argmax(dim=1)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(graph.y.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
    
    return np.array(all_preds), np.array(all_labels), np.array(all_probs)

In [24]:
train_loop = tqdm(range(1, NUM_EPOCHS + 1))
for epoch in train_loop:
    train_loss, train_acc = train_epoch(model, train_graphs, optimizer)

    text = f"Epoch {epoch:3d} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}"
    
    if epoch % 10 == 0 or epoch == 1:
        # Evaluate on eval set periodically
        eval_preds, eval_labels, eval_probs = evaluate(model, test_graphs)
        eval_acc = (eval_preds == eval_labels).mean()
        
        text = f"Epoch {epoch:3d} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Eval Acc: {eval_acc:.4f}"

    train_loop.set_description(text)

Epoch  10 | Train Loss: 4.9451 | Train Acc: 0.8655 | Eval Acc: 0.9075: 100%|██████████| 10/10 [02:22<00:00, 14.21s/it]   


In [25]:
# print("\n--- Training Set ---")
# train_preds, train_labels, train_probs = evaluate(model, train_graphs)
# print(classification_report(train_labels, train_preds, target_names=['No Emergence', 'Emergence']))
# if len(np.unique(train_labels)) == 2:
#     auc = roc_auc_score(train_labels, train_probs[:, 1])
#     print(f"ROC-AUC: {auc:.4f}")

print("\n--- Evaluation Set ---")
eval_preds, eval_labels, eval_probs = evaluate(model, test_graphs)
print(classification_report(eval_labels, eval_preds, target_names=['No Emergence', 'Emergence']))
if len(np.unique(eval_labels)) == 2:
    auc = roc_auc_score(eval_labels, eval_probs[:, 1])
    print(f"ROC-AUC: {auc:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(eval_labels, eval_preds))

print("\n--- Per-Timestep Evaluation ---")
for i, graph in enumerate(test_graphs):
    graph = graph.to(DEVICE)
    model.eval()
    with torch.no_grad():
        out = model(graph.x, graph.edge_index)
        preds = out.argmax(dim=1).cpu().numpy()
        labels = graph.y.cpu().numpy()
        
        acc = (preds == labels).mean()
        pos_count = (labels == 1).sum()
        pos_correct = ((preds == 1) & (labels == 1)).sum()
        
        print(f"t={EVAL_START + i}: Acc={acc:.4f} | Positives: {pos_correct}/{pos_count} | Total: {len(labels)}")


--- Evaluation Set ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

No Emergence       0.91      1.00      0.95   1546413
   Emergence       0.00      0.00      0.00    157638

    accuracy                           0.91   1704051
   macro avg       0.45      0.50      0.48   1704051
weighted avg       0.82      0.91      0.86   1704051

ROC-AUC: 0.5000

Confusion Matrix:
[[1546413       0]
 [ 157638       0]]

--- Per-Timestep Evaluation ---
t=40: Acc=0.8931 | Positives: 0/56751 | Total: 530840
t=41: Acc=0.9139 | Positives: 0/47552 | Total: 552376
t=42: Acc=0.9141 | Positives: 0/53335 | Total: 620835


So yeah the signal is too sprase and model predicts that no nodes will have illicit emergence. Maybe a long walk length distance reward would yield better gradients, but then we need to speed up computation to even build such a dataset.