# EDA of the elliptic dataset

In [1]:
from pathlib import Path
import pandas as pd

In [2]:
data_path = Path("../data/01_raw/elliptic_btc/BTC/")
labels = pd.read_csv(data_path / "elliptic_txs_classes.csv")
labels["class"] = labels["class"].map({"unknown": 0, "0": 1, "2": 2})
edges = pd.read_csv(data_path / "elliptic_txs_edgelist.csv")
features = pd.read_csv(data_path / "elliptic_txs_features.csv", header=None)
features.columns = ["txId", *[f"F{i}" for i in features.columns[1:]]]

In [3]:
print(
    f"""Shapes
\tFeatures : {features.shape[0]:8,} (rows)  {features.shape[1]:4,} (cols)
\tClasses  : {labels.shape[0]:8,} (rows)  {labels.shape[1]:4,} (cols)
\tEdgelist : {edges.shape[0]:8,} (rows)  {edges.shape[1]:4,} (cols)
"""
)

Shapes
	Features :  203,769 (rows)   167 (cols)
	Classes  :  203,769 (rows)     2 (cols)
	Edgelist :  234,355 (rows)     2 (cols)



In [4]:
labels.head()

Unnamed: 0,txId,class
0,230425980,0.0
1,5530458,0.0
2,232022460,0.0
3,232438397,2.0
4,230460314,0.0


In [5]:
labels["class"].value_counts()

class
0.0    157205
2.0     42019
Name: count, dtype: int64

In [6]:
edges.head()

Unnamed: 0,txId1,txId2
0,230425980,5530458
1,232022460,232438397
2,230460314,230459870
3,230333930,230595899
4,232013274,232029206


In [7]:
features.head()

Unnamed: 0,txId,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F157,F158,F159,F160,F161,F162,F163,F164,F165,F166
0,230425980,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162097,...,-0.562153,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117


In [8]:
features.describe()

Unnamed: 0,txId,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F157,F158,F159,F160,F161,F162,F163,F164,F165,F166
count,203769.0,203769.0,203769.0,203769.0,203769.0,203769.0,203769.0,203769.0,203769.0,203769.0,...,203769.0,203769.0,203769.0,203769.0,203769.0,203769.0,203769.0,203769.0,203769.0,203769.0
mean,171131000.0,23.843961,2.2316810000000003e-17,-7.531922000000001e-18,2.2316810000000003e-17,6.213836000000001e-17,7.570279e-17,5.80237e-17,3.068561e-17,3.5706890000000004e-17,...,-2.9569770000000003e-17,1.657023e-16,-7.225066000000001e-17,7.47613e-17,-4.658634e-17,2.231681e-18,5.0212820000000004e-17,1.562177e-17,4.072817e-17,5.913954000000001e-17
std,110465500.0,15.17217,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,...,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002
min,1076.0,1.0,-0.1729826,-0.2105526,-1.756361,-0.1219696,-0.06372457,-0.113002,-0.06158379,-0.1636459,...,-0.5770994,-0.6262286,-0.9790738,-0.978556,-0.2160569,-0.1259391,-0.1311553,-0.2698175,-1.760926,-1.760984
25%,84334520.0,9.0,-0.1725317,-0.1803266,-1.201369,-0.1219696,-0.04387455,-0.113002,-0.06158379,-0.1635168,...,-0.5696264,-0.5946915,-0.9790738,-0.978556,-0.09888874,-0.08749016,-0.1311553,-0.1405971,-0.1206134,-0.1197925
50%,162437500.0,23.0,-0.1692045,-0.1328975,0.4636092,-0.1219696,-0.04387455,-0.113002,-0.06158379,-0.162044,...,-0.4799511,-0.4559278,0.2411283,0.2414064,0.0182794,-0.08749016,-0.1311553,-0.09752359,-0.1206134,-0.1197925
75%,245479800.0,38.0,-0.1318553,-0.05524241,1.018602,-0.1219696,-0.04387455,-0.113002,-0.06158379,-0.1355932,...,0.1552495,0.1212026,1.305594,1.398764,0.0182794,-0.08749016,-0.08467423,-0.09752359,0.1520067,0.119971
max,403244600.0,49.0,71.68197,73.59505,2.68358,49.0276,260.0907,54.56518,113.4409,73.35457,...,7.862953,7.914041,1.46133,1.461369,117.0692,251.849,238.7835,105.734,1.5197,1.521399


# Preprocessing the features

In [9]:
from functools import partial, reduce
from typing import Callable

import numpy as np
import torch
import torch.nn.functional as F
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv


def apply_functions(df: pd.DataFrame, functions: list[Callable]) -> pd.DataFrame:
    """applies a list of functions to a dataframe"""
    return reduce(lambda df, func: func(df), functions, df)

In [10]:
transaction_mapper = {tx_id: idx for idx, tx_id in enumerate(features["txId"].unique())}

def filter_edges(edges: pd.DataFrame, transaction_mapper: dict) -> pd.DataFrame:
    """removes all edges that do not have features in the features dataframe"""

    edge_ids = set(transaction_mapper.keys())
    left_condition = edges.txId1.isin(edge_ids)
    right_condition = edges.txId2.isin(edge_ids)
    return edges[left_condition & right_condition]


def add_edge_ids(edges: pd.DataFrame, transaction_mapper: dict) -> pd.DataFrame:
    """adds edge ids to the edges dataframe"""
    edges["left_id"] = edges.txId1.map(transaction_mapper)
    edges["right_id"] = edges.txId2.map(transaction_mapper)

    return edges


def convert_to_tensor(edges: pd.DataFrame) -> torch.Tensor:
    """converts the edges dataframe to a tensor"""
    return torch.tensor(edges[["left_id", "right_id"]].values.T, dtype=torch.long)


edge_index = apply_functions(
    df=edges,
    functions=[
        partial(filter_edges, transaction_mapper=transaction_mapper),
        partial(add_edge_ids, transaction_mapper=transaction_mapper),
        convert_to_tensor,
    ]
)

edge_index

tensor([[     0,      2,      4,  ..., 201921, 201480, 201954],
        [     1,      3,      5,  ..., 202042, 201368, 201756]])

In [11]:
def get_feature_tensor(features: pd.DataFrame) -> torch.Tensor:
    """converts the features dataframe to a tensor"""
    features = features.drop(columns=["txId"])
    return torch.tensor(features.values, dtype=torch.float)


feature_tensor = get_feature_tensor(features)
print(feature_tensor.shape)
feature_tensor

torch.Size([203769, 166])


tensor([[ 1.0000e+00, -1.7147e-01, -1.8467e-01,  ..., -9.7524e-02,
         -1.2061e-01, -1.1979e-01],
        [ 1.0000e+00, -1.7148e-01, -1.8467e-01,  ..., -9.7524e-02,
         -1.2061e-01, -1.1979e-01],
        [ 1.0000e+00, -1.7211e-01, -1.8467e-01,  ..., -1.8367e-01,
         -1.2061e-01, -1.1979e-01],
        ...,
        [ 4.9000e+01, -1.7201e-01, -7.8182e-02,  ..., -9.7524e-02,
         -1.2061e-01, -1.1979e-01],
        [ 4.9000e+01, -1.7284e-01, -1.7662e-01,  ..., -1.4060e-01,
          1.5197e+00,  1.5214e+00],
        [ 4.9000e+01, -1.2037e-02, -1.3228e-01,  ..., -1.4060e-01,
          1.5197e+00,  1.5214e+00]])

In [12]:
def get_label_tensor(labels: pd.DataFrame) -> torch.Tensor:
    """converts the labels dataframe to a tensor"""
    label_encoder = LabelEncoder()
    class_labels = label_encoder.fit_transform(labels["class"])
    return torch.tensor(class_labels, dtype=torch.long)

node_labels = get_label_tensor(labels)
print(node_labels.shape)
node_labels

torch.Size([203769])


tensor([0, 0, 0,  ..., 2, 0, 0])

In [13]:
def combine_data(edge_index: torch.Tensor, feature_tensor: torch.Tensor, node_labels: torch.Tensor) -> Data:
    """combines the edge_index, feature_tensor, and node_labels into a single Data object"""
    return Data(edge_index=edge_index, x=feature_tensor, y=node_labels)

data = combine_data(edge_index, feature_tensor, node_labels)

In [14]:
known_mask = (data.y == 1) | (data.y == 2)
unknown_mask = data.y == 0

In [15]:
number_known_nodes = known_mask.sum().item()
permutation = torch.randperm(number_known_nodes)
train_size = int(0.8 * number_known_nodes)
val_size = int(0.1 * number_known_nodes)
test_size = number_known_nodes - train_size - val_size

total = np.sum([train_size, val_size, test_size])
print(
    f"""Number of observations per split
    Training   : {train_size:10,} ({100*train_size/total:0.2f} %)
    Validation : {val_size:10,} ({100*val_size/total:0.2f} %)
    Testing    : {test_size:10,} ({100*test_size/total:0.2f} %)
"""
)

Number of observations per split
    Training   :     37,251 (80.00 %)
    Validation :      4,656 (10.00 %)
    Testing    :      4,657 (10.00 %)



In [16]:
def set_training_masks(data: Data, train_size: int, val_size: int, permutation: torch.Tensor) -> Data:
    """
    sets the training, validation, and testing masks

    Args:
        data (Data): the dataset
        train_size (int): the number of training nodes
        val_size (int): the number of validation nodes
        permutation (torch.Tensor): the permutation of nodes

    Returns:
        Data: the updated dataset with the training, validation, and testing masks set
    """

    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    train_mask[permutation[:train_size]] = True
    val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    val_mask[permutation[train_size:train_size + val_size]] = True
    test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    test_mask[permutation[train_size + val_size:]] = True

    data.train_mask = train_mask
    data.val_mask = val_mask
    data.test_mask = test_mask

    return data

data = set_training_masks(data, train_size, val_size, permutation)

Let's check some statistincs about train, test and validation sets

In [17]:

train_licit, train_illicit = (data.y[data.train_mask] == 1).sum().item(), (data.y[data.train_mask] == 0).sum().item()
val_licit, val_illicit = (data.y[data.val_mask] == 1).sum().item(), (data.y[data.val_mask] == 0).sum().item()
test_licit, test_illicit = (data.y[data.test_mask] == 1).sum().item(), (data.y[data.test_mask] == 0).sum().item()

# Calculate total counts.
train_total = train_licit + train_illicit
val_total = val_licit + val_illicit
test_total = test_licit + test_illicit

# Calculate percentages.
train_licit_pct = (train_licit / train_total) * 100
train_illicit_pct = (train_illicit / train_total) * 100
val_licit_pct = (val_licit / val_total) * 100
val_illicit_pct = (val_illicit / val_total) * 100
test_licit_pct = (test_licit / test_total) * 100
test_illicit_pct = (test_illicit / test_total) * 100

pd.DataFrame({
    'Set': ['Training', 'Validation', 'Testing'],
    'Total Count': [train_total, val_total, test_total],
    'Licit': [train_licit, val_licit, test_licit],
    'Licit (%)': [train_licit_pct, val_licit_pct, test_licit_pct],
    'Illicit': [train_illicit, val_illicit, test_illicit],
    'Illicit (%)': [train_illicit_pct, val_illicit_pct, test_illicit_pct]
})


Unnamed: 0,Set,Total Count,Licit,Licit (%),Illicit,Illicit (%)
0,Training,37036,8358,22.567232,28678,77.432768
1,Validation,4625,1073,23.2,3552,76.8
2,Testing,4633,1057,22.814591,3576,77.185409


In [18]:
class GCNModel(torch.nn.Module):
    """Graph Convolutional Network model"""

    def __init__(self, hidden_dim, num_features, num_classes):
        super().__init__()
        self.conv1 = GCNConv(num_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)


In [27]:
DEVICE_MAPPING = {
    "gpu": torch.cuda,
    "mps": torch.backends.mps,
}

HIDDEN_DIM = 64
def get_torch_device() -> torch.device:
    """gets the best performant device (CPU or MPS)"""
    for device_name, device in DEVICE_MAPPING.items():
        if device.is_available():
            return torch.device(device_name)

    return torch.device("cpu")


device = get_torch_device()
data = data.to(device)
model = GCNModel(hidden_dim=HIDDEN_DIM, num_features=data.x.shape[1], num_classes=2).to(
    device
)

In [61]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
from tqdm import tqdm

def train_gnn(num_epochs: int, model: torch.nn.Module, data: Data, optimizer: torch.optim.Optimizer) -> dict:
    """trains the GNN model"""
    train_losses = []
    train_accuracies = []
    train_precisions = []
    train_recals = []
    train_f1_scores = []

    val_losses = []
    val_accuracies = []
    val_precisions = []
    val_recals = []
    val_f1_scores = []

    for epoch in tqdm(range(1, num_epochs + 1)):
        model.train()
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask].long())
        loss.backward()
        optimizer.step()

        train_loss = loss.item()

        train_predictions = out[data.train_mask].argmax(dim=1).cpu().detach().numpy()
        train_true = data.y[data.train_mask].cpu().detach().numpy()

        train_accuracy = metrics.accuracy_score(train_true, train_predictions)
        train_precision = metrics.precision_score(
            train_true, train_predictions, average="weighted"
        ).item()
        train_recall = metrics.recall_score(
            train_true, train_predictions, average="weighted"
        ).item()
        train_f1_score = metrics.f1_score(
            train_true, train_predictions, average="weighted"
        ).item()

        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)
        train_precisions.append(train_precision)
        train_recals.append(train_recall)
        train_f1_scores.append(train_f1_score)

        model.eval()
        with torch.no_grad():
            out = model(data)
            validation_predictions = out[data.val_mask].argmax(dim=1).cpu().detach().numpy()
            validation_true = data.y[data.val_mask].cpu().detach().numpy()

            val_loss = F.nll_loss(out[data.val_mask], data.y[data.val_mask]).item()
            val_accuracy = metrics.accuracy_score(
                validation_predictions, validation_true
            )
            val_precision = metrics.precision_score(
                validation_predictions, validation_true, average="weighted"
            )
            val_recall = metrics.recall_score(
                validation_predictions, validation_true, average="weighted"
            )
            val_f1_score = metrics.f1_score(
                validation_predictions, validation_true, average="weighted"
            )

            val_losses.append(val_loss)
            val_accuracies.append(val_accuracy)
            val_precisions.append(val_precision)
            val_recals.append(val_recall)
            val_f1_scores.append(val_f1_score)

        if epoch % 10 == 0:
            print(
                f"Epoch: {epoch},\n\tTrain Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Train Precision: {train_precision:.4f}, Train Recall: {train_recall:.4f}, Train F1 Score: {train_f1_score:.4f}\n\tVal Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val Precision: {val_precision:.4f}, Val Recall: {val_recall:.4f}, Val F1 Score: {val_f1_score:.4f}"
            )

    return {
        "model": model,
        "train": {
            "losses": train_losses,
            "accuracies": train_accuracies,
            "precisions": train_precisions,
            "recals": train_recals,
            "f1_scores": train_f1_scores
        },
        "val": {
            "losses": val_losses,
            "accuracies": val_accuracies,
            "precisions": val_precisions,
            "recals": val_recals,
            "f1_scores": val_f1_scores,
        }
    }

In [62]:
results = train_gnn(num_epochs=100, model=model, data=data, optimizer=torch.optim.Adam(model.parameters(), lr=0.03))

 11%|█         | 11/100 [00:01<00:11,  7.99it/s]

Epoch: 10,
	Train Loss: 0.3554, Train Accuracy: 0.8725, Train Precision: 0.8626, Train Recall: 0.8725, Train F1 Score: 0.8653
	Val Loss: 0.3508, Val Accuracy: 0.8709, Val Precision: 0.8905, Val Recall: 0.8709, Val F1 Score: 0.8786


 21%|██        | 21/100 [00:02<00:09,  8.05it/s]

Epoch: 20,
	Train Loss: 0.3254, Train Accuracy: 0.8714, Train Precision: 0.8621, Train Recall: 0.8714, Train F1 Score: 0.8607
	Val Loss: 0.3281, Val Accuracy: 0.8716, Val Precision: 0.8987, Val Recall: 0.8716, Val F1 Score: 0.8812


 31%|███       | 31/100 [00:03<00:08,  8.36it/s]

Epoch: 30,
	Train Loss: 0.3048, Train Accuracy: 0.8790, Train Precision: 0.8699, Train Recall: 0.8790, Train F1 Score: 0.8704
	Val Loss: 0.3125, Val Accuracy: 0.8703, Val Precision: 0.8971, Val Recall: 0.8703, Val F1 Score: 0.8799


 41%|████      | 41/100 [00:05<00:07,  8.32it/s]

Epoch: 40,
	Train Loss: 0.2914, Train Accuracy: 0.8828, Train Precision: 0.8742, Train Recall: 0.8828, Train F1 Score: 0.8744
	Val Loss: 0.3059, Val Accuracy: 0.8733, Val Precision: 0.8990, Val Recall: 0.8733, Val F1 Score: 0.8825


 51%|█████     | 51/100 [00:06<00:05,  8.41it/s]

Epoch: 50,
	Train Loss: 0.2874, Train Accuracy: 0.8821, Train Precision: 0.8737, Train Recall: 0.8821, Train F1 Score: 0.8733
	Val Loss: 0.3010, Val Accuracy: 0.8759, Val Precision: 0.9009, Val Recall: 0.8759, Val F1 Score: 0.8848


 61%|██████    | 61/100 [00:07<00:04,  8.28it/s]

Epoch: 60,
	Train Loss: 0.2804, Train Accuracy: 0.8829, Train Precision: 0.8744, Train Recall: 0.8829, Train F1 Score: 0.8745
	Val Loss: 0.2992, Val Accuracy: 0.8750, Val Precision: 0.9004, Val Recall: 0.8750, Val F1 Score: 0.8841


 71%|███████   | 71/100 [00:08<00:03,  7.95it/s]

Epoch: 70,
	Train Loss: 0.2769, Train Accuracy: 0.8856, Train Precision: 0.8772, Train Recall: 0.8856, Train F1 Score: 0.8777
	Val Loss: 0.2953, Val Accuracy: 0.8767, Val Precision: 0.8988, Val Recall: 0.8767, Val F1 Score: 0.8849


 81%|████████  | 81/100 [00:09<00:02,  7.97it/s]

Epoch: 80,
	Train Loss: 0.2755, Train Accuracy: 0.8838, Train Precision: 0.8757, Train Recall: 0.8838, Train F1 Score: 0.8750
	Val Loss: 0.2959, Val Accuracy: 0.8776, Val Precision: 0.9018, Val Recall: 0.8776, Val F1 Score: 0.8863


 91%|█████████ | 91/100 [00:11<00:01,  8.18it/s]

Epoch: 90,
	Train Loss: 0.2709, Train Accuracy: 0.8873, Train Precision: 0.8790, Train Recall: 0.8873, Train F1 Score: 0.8794
	Val Loss: 0.2944, Val Accuracy: 0.8774, Val Precision: 0.9022, Val Recall: 0.8774, Val F1 Score: 0.8862


100%|██████████| 100/100 [00:12<00:00,  8.10it/s]

Epoch: 100,
	Train Loss: 0.2695, Train Accuracy: 0.8884, Train Precision: 0.8802, Train Recall: 0.8884, Train F1 Score: 0.8808
	Val Loss: 0.2935, Val Accuracy: 0.8769, Val Precision: 0.9002, Val Recall: 0.8769, Val F1 Score: 0.8854



