<a href="https://colab.research.google.com/github/dvoils/neural-network-experiments/blob/main/elliptic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The Eliptic Dataset

The [Elliptic Dataset](https://www.kaggle.com/datasets/ellipticco/elliptic-data-set) is a graph of over 200,000 Bitcoin transactions labeled as licit, illicit, or unknown. Each node has 166 features and belongs to a time step. Edges represent fund flows. It supports temporal, graph-based machine learning for detecting financial crime, with real-world labels from exchanges, scams, ransomware, and more.

# Download Dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# Unzip the Dataset to local

In [3]:
import zipfile

# Path to your zip file in Google Drive
zip_path = "/content/drive/MyDrive/elliptic/archive.zip"

# Extract it into current directory
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall("/content")


# Load the CSVs

In [4]:
import pandas as pd
import os

extract_path = "/content/elliptic_bitcoin_dataset"
print("Extracted files:", os.listdir(extract_path))

features_df = pd.read_csv(f"{extract_path}/elliptic_txs_features.csv", header=None)
classes_df = pd.read_csv(f"{extract_path}/elliptic_txs_classes.csv")
edgelist_df = pd.read_csv(f"{extract_path}/elliptic_txs_edgelist.csv")

print("Features shape:", features_df.shape)
print("Classes shape:", classes_df.shape)
print("Edgelist shape:", edgelist_df.shape)


Extracted files: ['elliptic_txs_edgelist.csv', 'elliptic_txs_features.csv', 'elliptic_txs_classes.csv']
Features shape: (203769, 167)
Classes shape: (203769, 2)
Edgelist shape: (234355, 2)


# Preprocess CSVs into PyTorch Geometric Format

Extract the 166 numerical features for each transaction (excluding the tx ID), then converts them into a PyTorch float tensor. It creates the node feature matrix `x`, used by GNN models to learn from transaction behavior patterns.

In [5]:
import torch

# First column is tx ID (string), rest are features
tx_ids = features_df[0].values
features = features_df.iloc[:, 1:].values

# Convert to tensor
x = torch.tensor(features, dtype=torch.float)
print("Node features tensor shape:", x.shape)


Node features tensor shape: torch.Size([203769, 166])


# Map tx_ids to node indices

This creates a dictionary mapping each transaction ID to its row index in the feature matrix. It enables quick lookup of node positions when assigning labels or building graph edges from transaction IDs found in other files.

In [6]:
# Create a mapping from tx ID to index
tx_id_to_idx = {tx_id: idx for idx, tx_id in enumerate(tx_ids)}

# Build the edge index from the edgelist

This maps transaction IDs in the edge list to integer indices using the feature matrix. It filters out invalid edges, then creates a PyTorch tensor edge_index representing graph connectivity for use in graph neural networks.

In [7]:
# Map transaction IDs in edge list to integer indices
src = edgelist_df['txId1'].map(tx_id_to_idx)
dst = edgelist_df['txId2'].map(tx_id_to_idx)

# Drop rows with missing mappings (if any txId isn't in features_df)
mask = src.notnull() & dst.notnull()
src = src[mask].astype(int)
dst = dst[mask].astype(int)

# Build edge index
edge_index = torch.tensor([src.values, dst.values], dtype=torch.long)
print("Edge index shape:", edge_index.shape)


Edge index shape: torch.Size([2, 234355])


  edge_index = torch.tensor([src.values, dst.values], dtype=torch.long)


# Assign Labels

Convert label strings into integers so that the model can understand them.

Unknown transactions lack ground truth but can be useful for testing. While not suitable for training or evaluation metrics, they’re valuable for simulating how the model performs on unlabeled blockchain data.

In [8]:
# Apply unknown mapping
label_map = {'unknown': -1, '1': 0, '2': 1}

# Apply mapping
classes_df['label'] = classes_df['class'].map(label_map)

# Check mapping worked
print(classes_df['label'].value_counts(dropna=False))


label
-1    157205
 1     42019
 0      4545
Name: count, dtype: int64


# Create Label Tensor

This builds a label tensor y matching node indices. It sets -1 for unknowns and assigns labels (0=licit, 1=illicit) where available. It ensures labels align with feature rows and supports later filtering for training or evaluation.

In [9]:
import torch
import pandas as pd

# Rebuild mapping from txId to index in features
tx_ids = features_df[0].values
tx_id_to_idx = {tx_id: idx for idx, tx_id in enumerate(tx_ids)}

# Initialize labels tensor
y = torch.full((len(tx_ids),), -1, dtype=torch.long)

# Assign labels where known
for row in classes_df.itertuples(index=False):
    tx_id = row.txId
    label = row.label
    if tx_id in tx_id_to_idx and not pd.isna(label) and label != -1:
        y[tx_id_to_idx[tx_id]] = int(label)

print("Label vector shape:", y.shape)
print("Labeled nodes:", (y != -1).sum().item())


Label vector shape: torch.Size([203769])
Labeled nodes: 46564


# Install PyTorch Geometric

In [10]:
# Then reinstall with the correct CUDA version for Colab (PyTorch 2.0 + CUDA 11.8)
!pip install -q torch-scatter torch-sparse torch-geometric \
  -f https://data.pyg.org/whl/torch-2.0.0+cu118.html


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25h

# Build the training subgraph

Use transactions from time steps 1–30 for training. Map tx IDs to node indices, filter edges accordingly, and extract features and labels for training. The graph is relabeled to ensure proper indexing for GNN training.



In [11]:
# Load time step information from features_df
time_info = features_df.iloc[:, 0:2]  # tx_id and time_step
time_info.columns = ['tx_id', 'time_step']

# Define time ranges
train_range = range(1, 31)   # time steps 1–30
test_range = range(31, 50)   # time steps 31–49

# Extract tx_ids by time range
train_ids = time_info[time_info['time_step'].isin(train_range)]['tx_id'].values
test_ids = time_info[time_info['time_step'].isin(test_range)]['tx_id'].values

# Convert tx_ids to node indices (filter missing)
train_idx = [tx_id_to_idx[tx_id] for tx_id in train_ids if tx_id in tx_id_to_idx]
test_idx = [tx_id_to_idx[tx_id] for tx_id in test_ids if tx_id in tx_id_to_idx]

# Extract subgraph for training
from torch_geometric.utils import subgraph

train_idx_tensor = torch.tensor(train_idx, dtype=torch.long)
train_edge_index, _ = subgraph(train_idx_tensor, edge_index, relabel_nodes=True)

# Map original indices to new (relabeling)
old_to_new = {old.item(): new for new, old in enumerate(train_idx_tensor)}

# Update labels and features for training
x_train = x[train_idx_tensor]
y_train = y[train_idx_tensor]

# Build train_data object
from torch_geometric.data import Data

train_data = Data(x=x_train, edge_index=train_edge_index, y=y_train)

print(f"Train subgraph: {train_data}")
print(f"# of training nodes: {train_data.num_nodes}")
print(f"# of edges: {train_data.num_edges}")




Train subgraph: Data(x=[123287, 166], edge_index=[2, 142784], y=[123287])
# of training nodes: 123287
# of edges: 142784


In [12]:
print(classes_df.head())
print(classes_df.columns)
print(classes_df['class'].unique())


        txId    class  label
0  230425980  unknown     -1
1    5530458  unknown     -1
2  232022460  unknown     -1
3  232438397        2      1
4  230460314  unknown     -1
Index(['txId', 'class', 'label'], dtype='object')
['unknown' '2' '1']


In [13]:
import torch
import torch_geometric
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())


Torch: 2.6.0+cu124
CUDA available: True


# Create Data Object

Create a PyTorch Geometric Data object representing the full graph. It includes node features x, edge connections edge_index, and node labels y. This gives GNN models access to structured transaction data during training and inference.



In [14]:
from torch_geometric.data import Data
data = Data(x=x, edge_index=edge_index, y=y)


# The VGAE Encoder

The `GCNEncoder` class defines the encoder used in our Variational Graph Autoencoder (VGAE). It leverages **Graph Convolutional Networks (GCNs)** to extract meaningful latent representations from node features and the graph structure.

The encoder includes three GCN layers:

1. `conv1`: Expands input features to 2× the latent size with ReLU activation  
2. `conv_mu`: Outputs the **mean** vector of the latent distribution  
3. `conv_logvar`: Outputs the **log-variance** vector

This design follows the standard VAE framework, where each input is encoded as a Gaussian distribution:

$$
q(z \mid x) = \mathcal{N}(\mu, \sigma^2)
$$

The VGAE then samples from this distribution and decodes the latent embeddings to reconstruct the graph structure. This is a probabilistic approach that builds more robust and generalizable representations.

In [15]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, VGAE

class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv_mu = GCNConv(2 * out_channels, out_channels)
        self.conv_logvar = GCNConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        return self.conv_mu(x, edge_index), self.conv_logvar(x, edge_index)


# Set up the model and device

In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

model = VGAE(GCNEncoder(in_channels=train_data.num_features, out_channels=32)).to(device)
train_data = train_data.to(device)


Using device: cuda


#  Training loop

In [17]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index)
    loss = model.recon_loss(z, train_data.edge_index)
    loss += (1 / train_data.num_nodes) * model.kl_loss()
    loss.backward()
    optimizer.step()
    return loss.item()

for epoch in range(1, 101):  # Try 20 if you're tight on time/memory
    loss = train()
    if epoch % 10 == 0:
        print(f"Epoch {epoch:03d}, Loss: {loss:.4f}")


Epoch 010, Loss: 5.1321
Epoch 020, Loss: 2.2253
Epoch 030, Loss: 1.5108
Epoch 040, Loss: 1.1906
Epoch 050, Loss: 1.0492
Epoch 060, Loss: 0.9819
Epoch 070, Loss: 0.9506
Epoch 080, Loss: 0.9301
Epoch 090, Loss: 0.9184
Epoch 100, Loss: 0.9044


#  Prepare Test Nodes

In [18]:
import numpy as np

# Convert to tensor
test_idx_tensor = torch.tensor(test_idx, dtype=torch.long)

# Get features and labels for test nodes
x_test = x[test_idx_tensor]
y_test = y[test_idx_tensor]

# Mask: only nodes with known labels (licit = 0, illicit = 1)
test_mask = (y_test == 0) | (y_test == 1)
x_test_labeled = x_test[test_mask]
y_test_labeled = y_test[test_mask]


# Embed the test nodes using trained encoder

In [19]:
with torch.no_grad():
    z_train = model.encode(train_data.x, train_data.edge_index)


In [20]:
print("x_train.shape:", x_train.shape)
print("train_edge_index.max():", train_edge_index.max().item())
print("train_edge_index.min():", train_edge_index.min().item())


x_train.shape: torch.Size([123287, 166])
train_edge_index.max(): 123286
train_edge_index.min(): 0


In [21]:
# Create self-loop edge index for test encoding
num_test_nodes = x_test_labeled.shape[0]
edge_index_test = torch.arange(0, num_test_nodes, dtype=torch.long).repeat(2, 1)

# Encode test nodes without full graph
with torch.no_grad():
    z_test_mu, _ = model.encoder(x_test_labeled.to(device), edge_index_test.to(device))

# Score based on distance from training centroid
from sklearn.metrics.pairwise import euclidean_distances
licit_mask_train = (train_data.y == 0)
z_train_licit = z_train[licit_mask_train]
centroid = z_train_licit.mean(dim=0, keepdim=True)

distances = euclidean_distances(z_test_mu.cpu(), centroid.cpu())
anomaly_scores = torch.tensor(distances).squeeze()

# Evaluate
from sklearn.metrics import roc_auc_score

y_true = y_test_labeled.cpu().numpy()
y_scores = anomaly_scores.cpu().numpy()

roc_auc = roc_auc_score(y_true, y_scores)
print(f"🚀 ROC AUC on unseen future data: {roc_auc:.4f}")


🚀 ROC AUC on unseen future data: 0.7962
