In [3]:
from google.colab import drive
drive.mount('/content/drive')

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
!pip install -q dgl

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import numpy as np
import pandas as pd
import networkx as nx
import torch
import torch_geometric
from torch_geometric.data import Data
import pickle
from sklearn import preprocessing
import random

### Load data

Load in edge list (with edge features) and node features.

In [5]:
# Edge list: one row per edge, carrying the edge features
edges = pd.read_csv(
    '/content/drive/MyDrive/Augmented_Elliptic/address_level/edgelist_hop3_input.csv'
)

# Node features: missing values become 0 and rows are keyed by address
node_features = (
    pd.read_csv('/content/drive/MyDrive/Augmented_Elliptic/address_level/node_address_features.csv')
    .fillna(0)
    .set_index('address')
)

### Data pre-processing

First, scale the edge and node features using the `StandardScaler()` from the `sklearn` package. Because we don't use cross-validation to choose the hyperparameters for our models, there is no data leakage associated with scaling based on the whole data set (as opposed to just the training set). 

In [6]:
# Store the node-feature column names for future use: the scaler in a later cell
# returns a bare array, and these names are re-attached to it as column labels
names = node_features.columns

In [7]:
scaler1 = preprocessing.StandardScaler()
scaler2 = preprocessing.StandardScaler()

# Columns of the edge list that are carried through untouched vs. standardised
edge_id_cols = ['node', 'class', 'entity', 'category', 'node_source', 'node_sink']
edge_feature_cols = ['txn_amount', 'fee', 'interactions', 'two_way']

# Scale edge features.
# The scaled frame reuses edges.index (and the original column names) so that the
# column-wise concat below aligns row-for-row even if `edges` does not have a
# default RangeIndex; a mismatched index would silently produce NaN-padded rows.
edge_features_scaled = pd.DataFrame(
    scaler1.fit_transform(edges[edge_feature_cols]),
    index=edges.index,
    columns=edge_feature_cols,
)
edges = pd.concat([edges[edge_id_cols], edge_features_scaled], axis=1)

# Scale node features, keeping the address index and the stored column names
node_features = pd.DataFrame(scaler2.fit_transform(node_features), index=node_features.index, columns=names)


In [8]:
# One label per address: take the largest 'class' value seen on any of the
# address's edges, then fold class 2 into class 1 so the target is binary
labels = (
    edges[['node', 'class']]
    .groupby(['node'])
    .agg({'class': 'max'})
    .replace(to_replace=2, value=1)
)

Next, we need to extract and store the data to build each of the address-level sub-graphs. The code below extracts the label, edges (and edge features) and node features associated with each input address.  

In [9]:
graphs = dict()

# Assign class: one single-row label frame per input address.
# Iterating a GroupBy yields each group once, in sorted key order, which avoids a
# separate get_group() lookup per address.
lab = labels.groupby('node')
graphs['label'] = [group for _, group in lab]

# Assign edge features: the rows of the edge list belonging to each input address
ef = edges.groupby('node')
graphs['edge_features'] = [group for _, group in ef]

# Assign node features
# For every sub-graph collect the distinct addresses appearing as source or sink,
# then pull the matching rows out of the node-feature table.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
node_list = [
    pd.concat([group['node_source'], group['node_sink']]).unique()
    for group in graphs['edge_features']
]
# filter(items=...) keeps only addresses present in node_features, in node_list order
graphs['node_features'] = [node_features.filter(items=x, axis=0) for x in node_list]


Finally, we use the `networkx` and `torch_geometric` packages to construct the graphs with the data listed above. This list of graphs will be used to train our graph-level classification algorithm. 

In [10]:
# Edge features stacked (in this order) into each graph's edge_attr tensor
edge_attr_cols = ['txn_amount', 'fee', 'interactions', 'two_way']

graphs_transf = []

for g in range(len(graphs['label'])):
    # Create graph from edge list and attach edge features
    nx_graph = nx.from_pandas_edgelist(
        graphs['edge_features'][g],
        'node_source',
        'node_sink',
        edge_attr=['node', 'entity', 'category'] + edge_attr_cols,
        create_using=nx.DiGraph,
    )
    # Convert to torch_geometric data object
    data = torch_geometric.utils.from_networkx(nx_graph)

    # Add node features.
    # from_networkx numbers the nodes in nx_graph.nodes order, so the feature rows
    # must be put in exactly that order before being attached; otherwise row i of
    # data.x would describe a different address than node i of edge_index.
    # Addresses with no row in the feature table get zeros (the post-scaling mean)
    # so that data.x always has one row per node.
    node_order = list(nx_graph.nodes)
    node_feature_rows = graphs['node_features'][g].reindex(node_order, fill_value=0.0)
    # Features were standard-scaled, so they must stay floating point
    # (an integer dtype would truncate them towards zero).
    data.x = torch.tensor(node_feature_rows.to_numpy(), dtype=torch.float)

    # Add class (label of the graph). Kept as the single-element pandas Series
    # because a later cell reads both its index (the address) and its value.
    data.y = graphs['label'][g]['class']

    # Add edge features as tensor, one column per entry of edge_attr_cols
    data.edge_attr = torch.stack([getattr(data, col) for col in edge_attr_cols], dim=1)

    # Add data for node id, category and entity. These are constant within a
    # sub-graph, so only the first edge's value is kept.
    data.node = str(data.node[0])
    if isinstance(data.category[0], torch.Tensor):
        # Numeric columns arrive as tensors; store their string representation
        data.category = str(data.category[0].numpy())
        data.entity = str(data.entity[0].numpy())
    else:
        data.category = data.category[0]
        data.entity = data.entity[0]

    # Drop individual edge features as passed to networkx graph
    # (they now live in data.edge_attr)
    for col in edge_attr_cols:
        setattr(data, col, None)

    # Assign to list
    graphs_transf.append(data)


In [11]:
# Sanity check: number of sub-graphs that were built
len(graphs_transf)

102075

In [12]:
# Map each address (as a string) to its graph label; y is a single-element
# pandas Series whose index holds the address
classes = {graph.y.index.map(str)[0]: graph.y.item() for graph in graphs_transf}
classes_1 = dict(filter(lambda i: i[1] == 1, classes.items()))
classes_0 = dict(filter(lambda i: i[1] == 0, classes.items()))

print(f'Number of high risk addresses: {len(classes_1.keys())}')
print(f'Number of safe addresses: {len(classes_0.keys())}')

# Fixed seed so the train/test split is reproducible
random.seed(1993)
classes_all_keys = list(classes.keys()) 

train_share = 0.9
# Sample training and test sets
train_keys = random.sample(classes_all_keys, k = round(train_share*len(classes_all_keys)))
# Everything not drawn for training is test. Membership is checked against a set:
# scanning the train_keys list for every key is quadratic in the number of addresses.
train_key_set = set(train_keys)
test_keys = [key for key in classes_all_keys if key not in train_key_set]

Number of high risk addresses: 10496
Number of safe addresses: 91579


In [13]:
# Restrict the label map to the training addresses, then split it by label value
classes_train = {address: classes[address] for address in train_keys}
classes_train_0 = dict(filter(lambda item: item[1] == 0, classes_train.items()))
classes_train_1 = dict(filter(lambda item: item[1] == 1, classes_train.items()))


In [14]:
# Undersample safe class to match high risk class
# (capped at the number of safe addresses so random.sample cannot raise)
n_safe_keep = min(len(classes_train_1), len(classes_train_0))
sample_safe = random.sample(list(classes_train_0.keys()), k = n_safe_keep)

# Combine training classes
# Set membership keeps this linear; the resulting dict order is unchanged
sample_safe_set = set(sample_safe)
classes_train_trim_0 = {key: value for key, value in classes_train_0.items() if key in sample_safe_set}
classes_train_balanced = {**classes_train_1, **classes_train_trim_0}

# Print test shares
classes_test = {i: classes[i] for i in test_keys}
classes_test_0 = dict(filter(lambda i: i[1] == 0, classes_test.items()))
classes_test_1 = dict(filter(lambda i: i[1] == 1, classes_test.items()))

print(f'Number of high risk test addresses: {len(classes_test_1.keys())}')
print(f'Number of safe test addresses: {len(classes_test_0.keys())}')

print(f'Number of high risk train addresses: {len(classes_train_1.keys())}')
print(f'Number of safe train addresses: {len(classes_train_trim_0.keys())}')

# Construct train and test subsets
# The lookup sets are built once, up front: rebuilding a key list for every graph
# (and scanning it linearly) makes this step quadratic in the number of graphs.
train_node_ids = set(classes_train_balanced)
test_node_ids = set(test_keys)
train_dataset = [graph for graph in graphs_transf if graph.node in train_node_ids]
test_dataset = [graph for graph in graphs_transf if graph.node in test_node_ids]

Number of high risk test addresses: 1058
Number of safe test addresses: 9149
Number of high risk train addresses: 9438
Number of safe train addresses: 9438


In [15]:
# Bundle both splits as a two-element list, [train, test], ready to be pickled
dataset = [train_dataset, test_dataset]

In [16]:
# Serialise the [train, test] graph lists to Drive for later use
with open('/content/drive/MyDrive/Augmented_Elliptic/address_level/address_subgraphs.pkl', 'wb') as out_file:
    pickle.dump(dataset, out_file)
  