In [None]:
# !pip install torch_geometric


In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

import torch
import torch_geometric
import torch.nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import Linear, LayerNorm, ReLU, Dropout
from torch_geometric.nn import ChebConv, NNConv, DeepGCNLayer, GATConv, DenseGCNConv, GCNConv, GraphConv, MessagePassing
from torch_geometric.data import Data, DataLoader
from torch_geometric.utils import add_self_loops, degree
import torch_geometric.transforms as T


from sklearn.metrics import roc_auc_score
import scipy.sparse as sp

import warnings
warnings.filterwarnings("ignore")

In [17]:
# Record the PyTorch version this run was executed with.
print(torch.__version__)

1.12.1


In [18]:
# Use the first CUDA GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)


cpu


In [19]:
# Root folder of the dataset CSV files. Cells in this notebook append file names to it by plain
# string concatenation, so the trailing slash is required.
# Google Colab location, kept for reference:
# datasets_folder = '/content/drive/MyDrive/Colab Notebooks/ssn/projekt/datasets/elliptic_augmented_dataset/'
datasets_folder = '../datasets/elliptic_augmented_dataset/'

In [20]:
def split_edges_into_train_and_test(edges_dataframe, node_features_with_class):
    """Split edges into a labeled (train) set and an unlabeled (test) set.

    An edge goes to the train set when both of its endpoints exist in
    ``node_features_with_class`` and neither endpoint has class ``2``.
    It goes to the test set when both endpoints have class ``2``.
    Edges that mix a labeled and an unlabeled endpoint, or that reference a
    node id missing from the node table, are dropped.

    Parameters
    ----------
    edges_dataframe : pandas.DataFrame
        One row per edge; the first two columns are the start and end node ids.
    node_features_with_class : pandas.DataFrame
        Node table whose column ``0`` is the node id and whose LAST column is
        the class. When a node id occurs more than once, its first row is used.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_edges, test_edges)`` with positional column labels and the
        original edge order preserved.
    """
    unlabeled_class = 2
    edges = np.array(edges_dataframe)

    # Nothing to split: return empty frames that still expose columns 0 and 1.
    if len(edges) == 0:
        return pd.DataFrame(columns=[0, 1]), pd.DataFrame(columns=[0, 1])

    # One lookup table (node id -> class) replaces a full table scan per edge endpoint.
    first_rows = node_features_with_class.drop_duplicates(subset=[0], keep='first')
    class_by_node = pd.Series(first_rows.iloc[:, -1].to_numpy(), index=first_rows[0].to_numpy())

    start_nodes = pd.Series(edges[:, 0])
    end_nodes = pd.Series(edges[:, 1])

    # Endpoints that are absent from the node table disqualify the edge entirely.
    start_known = start_nodes.isin(class_by_node.index).to_numpy()
    end_known = end_nodes.isin(class_by_node.index).to_numpy()

    # Missing ids map to NaN, which never compares equal to the unlabeled class.
    start_unlabeled = (start_nodes.map(class_by_node) == unlabeled_class).to_numpy()
    end_unlabeled = (end_nodes.map(class_by_node) == unlabeled_class).to_numpy()

    train_mask = start_known & end_known & ~start_unlabeled & ~end_unlabeled
    test_mask = start_unlabeled & end_unlabeled

    train_edges = pd.DataFrame(edges[train_mask])
    test_edges = pd.DataFrame(edges[test_mask])
    return train_edges, test_edges

In [21]:
# This will read from cached files if they exist. If you change the code, delete the cached files.
def split_edges_into_train_and_test_IO(datasets_folder, node_features_with_class=None):
    """Return ``(train_edges, test_edges)``, using CSV caches in ``datasets_folder`` when present.

    If both ``train_edges.csv`` and ``test_edges.csv`` exist in the folder they
    are loaded and returned. Otherwise ``elliptic_txs_edgelist.csv`` is read,
    split with ``split_edges_into_train_and_test`` and the two results are
    written to those cache files before being returned.

    Parameters
    ----------
    datasets_folder : str
        Folder holding the edge list and the cache files; a trailing path
        separator is optional.
    node_features_with_class : pandas.DataFrame, optional
        Node table passed to ``split_edges_into_train_and_test``. Only needed
        when the caches are missing. When omitted, the notebook-level variable
        ``node_features_dataframe_with_class_without_timestep`` is used, as in
        the original implementation.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train edges (both endpoints labeled) and test edges (both endpoints
        unlabeled), each with integer column labels ``0`` and ``1``.

    Raises
    ------
    NameError
        If the caches are missing, no node table was passed and the fallback
        notebook variable has not been defined yet.
    """
    train_edges_filepath = os.path.join(datasets_folder, 'train_edges.csv')
    test_edges_filepath = os.path.join(datasets_folder, 'test_edges.csv')

    def read_cached(path):
        # Caches written by this function carry the saved index as 'Unnamed: 0' and
        # string column labels; normalise both back to the in-memory layout.
        cached = pd.read_csv(path, sep=',')
        return cached.drop(columns=['Unnamed: 0'], errors='ignore').rename(columns={'0': 0, '1': 1})

    if os.path.isfile(train_edges_filepath) and os.path.isfile(test_edges_filepath):
        return read_cached(train_edges_filepath), read_cached(test_edges_filepath)

    if node_features_with_class is None:
        try:
            node_features_with_class = node_features_dataframe_with_class_without_timestep
        except NameError as error:
            raise NameError(
                "No cached edge split was found and no node table is available: pass "
                "node_features_with_class or define "
                "node_features_dataframe_with_class_without_timestep before calling."
            ) from error

    edges_filepath = os.path.join(datasets_folder, 'elliptic_txs_edgelist.csv')
    edges_dataframe = pd.read_csv(edges_filepath, sep=',')
    train_edges, test_edges = split_edges_into_train_and_test(edges_dataframe, node_features_with_class)

    # Save the split so that the expensive computation does not have to run every time.
    train_edges.to_csv(train_edges_filepath)
    test_edges.to_csv(test_edges_filepath)

    return train_edges, test_edges

In [22]:
# Load (or compute and cache) the labeled/unlabeled edge split.
# NOTE(review): when no cached CSVs exist, this cell needs the node-feature table, which is only
# loaded in later cells; on a fresh kernel it fails with the NameError recorded in this cell's output.
train_edges, test_edges = split_edges_into_train_and_test_IO(datasets_folder)
print(len(train_edges))
print(len(test_edges))
train_edges

NameError: name 'node_features_dataframe_with_class_without_timestep' is not defined

In [None]:
# Read the per-transaction class labels (comma-separated file with a header row).
classes_filepath = f'{datasets_folder}elliptic_txs_classes.csv'
classes_dataframe = pd.read_csv(classes_filepath)
classes_dataframe

In [None]:
# initially 2 is licit, 1 is illicit, suspicious is also illicit, unknown is unknown
# after remapping 0 is illicit, 1 is licit, 2 is unknown
# rows with class 0 and 1 will be used for training
# rows with class 2 will be used for prediction

def remap_label(label):
    """Translate a raw class label into the numeric class used by this notebook.

    ``'1'`` and ``'suspicious'`` become ``0`` (illicit), ``'2'`` becomes ``1``
    (licit) and every other value becomes ``2`` (unknown).
    """
    known_labels = {'1': 0, 'suspicious': 0, '2': 1}
    return known_labels.get(label, 2)

In [None]:
# Build a remapped copy of the class table; the raw `classes_dataframe` is left untouched.
remapped_class_column = classes_dataframe['class'].map(remap_label)
classes_dataframe_after_remap = classes_dataframe.assign(**{'class': remapped_class_column})
classes_dataframe_after_remap

In [None]:
# Load the node features (no header row, so columns are labeled 0, 1, 2, ...).
node_features_filepath = datasets_folder + 'elliptic_txs_features.csv'
node_features_dataframe = pd.read_csv(node_features_filepath, sep=',', header=None)
# node_features_dataframe

# Attach the remapped class as the last column. The assignment aligns on the row index.
# NOTE(review): this assumes the classes file lists transactions in the same row order as the
# features file — confirm, or join on the transaction id instead.
node_features_dataframe_with_class = node_features_dataframe.copy()
node_features_dataframe_with_class['class'] = classes_dataframe_after_remap['class']
# node_features_dataframe_with_class

# Other cells (the edge split and the train/test node split) use
# `node_features_dataframe_with_class_without_timestep`, which was not defined anywhere in the
# notebook and caused a NameError on a fresh kernel. Define it here.
# NOTE(review): assumes column 1 holds the time step, as in the published Elliptic feature file —
# confirm that the augmented dataset keeps that layout.
node_features_dataframe_with_class_without_timestep = node_features_dataframe_with_class.drop(columns=[1])


In [None]:
# Partition the nodes by label: rows whose class is not 2 are used for training,
# rows with class 2 (unknown) are kept aside for prediction.
train_node_features = node_features_dataframe_with_class_without_timestep.loc[
    lambda nodes: nodes['class'] != 2
]
test_node_features = node_features_dataframe_with_class_without_timestep.loc[
    lambda nodes: nodes['class'] == 2
]
# Same tables without the label column.
train_node_features_without_class = train_node_features.drop('class', axis=1)
test_node_features_without_class = test_node_features.drop('class', axis=1)

In [None]:
def create_reindex_dict(node_features_without_class):
    """Map every node id found in column ``0`` to its row position.

    Positions are counted from zero in iteration order; if an id appears more
    than once, the position of its last occurrence is kept.
    """
    node_ids = node_features_without_class[0]
    return {node_id: position for position, node_id in enumerate(node_ids)}

In [None]:
# # now we need to reindex the train nodes and train edges so that node id is equal to row id
def reindex_edges(node_features_without_class, edges):
    """Return a copy of ``edges`` whose node ids are replaced by row positions.

    Columns ``0`` and ``1`` of ``edges`` are translated with the id -> position
    table built from ``node_features_without_class``. The input frame is not
    modified. A ``KeyError`` is raised if an edge references an id that is not
    in the node table.
    """
    position_of = create_reindex_dict(node_features_without_class).__getitem__
    edges_reindexed = edges.copy()
    for endpoint_column in (0, 1):
        edges_reindexed[endpoint_column] = edges[endpoint_column].map(position_of)
    return edges_reindexed


In [None]:
# Rewrite the training edges so that node ids become row positions in the training node table.
train_edges_reindexed = reindex_edges(train_node_features_without_class, train_edges)


In [None]:
# Exploratory check of the same reindexing for the test part: maps the start-node column only and
# displays the result. The dictionary lookup raises KeyError if a test edge references a node id
# that is not present in test_node_features_without_class.
reindex_dictionary = create_reindex_dict(test_node_features_without_class)
# test_edges
test_edges[0].map(lambda node_id: reindex_dictionary[node_id])

In [None]:
# Rewrite the test edges so that node ids become row positions in the test node table.
test_edges_reindexed = reindex_edges(test_node_features_without_class, test_edges)
# now we have test nodes & test edges which can be used to test the trained model

In [None]:
# # now crafting the input data structures to pytorch geometric Data object
# edge_index = torch.tensor(np.array(train_edges_reindexed), dtype=torch.long)
# edge_index

# x = torch.tensor(np.array(train_node_features_without_class), dtype=torch.float)
# train_classes = train_node_features['class']
# y = torch.tensor(np.array(train_classes), dtype=torch.float)

# data = Data(x=x, edge_index=edge_index.t().contiguous(), y=y)
# data.validate(raise_on_error = True)
# split = T.RandomNodeSplit(num_val=0.1, num_test=0)
# data_with_masks = split(data)
# # print(data_with_masks.num_features)
# # data_with_masks
# # data.y




In [None]:
def create_train_data_object(train_edges_reindexed, train_node_features_without_class, train_classes):
    """Assemble the training graph and attach random train/validation masks.

    Node features and labels are converted to float tensors and the edge list
    to a long tensor, which is transposed before being stored as
    ``edge_index``. The resulting ``Data`` object is validated and then passed
    through ``RandomNodeSplit(num_val=0.1, num_test=0)``, whose output is
    returned.
    """
    node_features = torch.tensor(np.asarray(train_node_features_without_class), dtype=torch.float)
    labels = torch.tensor(np.asarray(train_classes), dtype=torch.float)
    edge_pairs = torch.tensor(np.asarray(train_edges_reindexed), dtype=torch.long)

    graph = Data(x=node_features, edge_index=edge_pairs.t().contiguous(), y=labels)
    graph.validate(raise_on_error=True)

    add_masks = T.RandomNodeSplit(num_val=0.1, num_test=0)
    return add_masks(graph)

In [None]:
def create_test_data_object(test_edges_reindexed, test_node_features_without_class):
    """Assemble the unlabeled graph used for prediction.

    Node features become a float tensor and the edge list a long tensor that is
    transposed before being stored as ``edge_index``. The ``Data`` object is
    validated before it is returned; it carries no labels.
    """
    node_features = torch.tensor(np.asarray(test_node_features_without_class), dtype=torch.float)
    edge_pairs = torch.tensor(np.asarray(test_edges_reindexed), dtype=torch.long)

    graph = Data(x=node_features, edge_index=edge_pairs.t().contiguous())
    graph.validate(raise_on_error=True)
    return graph

In [None]:
# Build the training graph; the labels are the 'class' column of the labeled node table.
data_with_masks = create_train_data_object(train_edges_reindexed, train_node_features_without_class, train_node_features['class'])
data_with_masks

In [None]:
# Build the unlabeled graph that the trained model will be applied to.
test_data_object = create_test_data_object(test_edges_reindexed, test_node_features_without_class)
test_data_object

In [None]:
# similarly we craft the test data structure to pytorch geometric Data object


In [None]:
class Net(torch.nn.Module):
    """Two-layer graph attention network that emits one sigmoid score per node.

    The first ``GATConv`` layer uses ``in_head`` attention heads with ``hid``
    channels each; the second layer has a single output channel with
    ``concat=False``. Dropout with p=0.6 is applied to the input and to the
    hidden representation while the module is in training mode.

    Parameters
    ----------
    in_channels : int, optional
        Number of input features per node. When omitted, the value is read from
        the notebook-level ``data_with_masks.num_features`` (original behavior).
    """

    def __init__(self, in_channels=None):
        super().__init__()

        if in_channels is None:
            # Backward-compatible default: size the input layer from the training graph.
            in_channels = data_with_masks.num_features

        self.hid = 8
        self.in_head = 8
        self.out_head = 1
        self.conv1 = GATConv(in_channels, self.hid, heads=self.in_head)
        self.conv2 = GATConv(self.hid * self.in_head, 1, concat=False, heads=self.out_head)

    def forward(self, x, edge_index):
        """Return the sigmoid of the second attention layer's output for every node."""
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid.
        return torch.sigmoid(x)

In [None]:
# Instantiate the network and move both it and the training graph to the selected device.
model = Net().to(device)
data_in_device = data_with_masks.to(device)

In [None]:
# Adam with a small L2 penalty (weight_decay).
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)
# NOTE(review): this scheduler only takes effect if `scheduler.step(metric)` is called during
# training — confirm that the training loop does so.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
# Binary cross-entropy on probabilities; Net.forward already applies a sigmoid.
criterion = torch.nn.BCELoss()

In [None]:
# Full-batch training on the labeled graph. The original cell referenced an undefined name
# `data`; the graph that was moved to the device is `data_in_device`.
# NOTE(review): the ReduceLROnPlateau `scheduler` created in this notebook is never stepped in
# this loop, so the learning rate stays constant — decide which metric it should monitor.
model.train()
for epoch in range(170):
    optimizer.zero_grad()
    out = model(data_in_device.x, data_in_device.edge_index)
    # Flatten the per-node scores to one value per node so they line up with the labels.
    out = out.reshape((data_in_device.x.shape[0]))
    loss = criterion(out[data_in_device.train_mask], data_in_device.y[data_in_device.train_mask])

    if epoch%5 == 0:
        # The metrics are computed from the same forward pass, i.e. with dropout still active.
        train_auc = roc_auc_score(
            data_in_device.y[data_in_device.train_mask].detach().cpu().numpy(),
            out[data_in_device.train_mask].detach().cpu().numpy(),
        )
        val_auc = roc_auc_score(
            data_in_device.y[data_in_device.val_mask].detach().cpu().numpy(),
            out[data_in_device.val_mask].detach().cpu().numpy(),
        )
        print("epoch: {} - loss: {} - train_roc: {} - val_auc: {}".format(epoch, loss.item(), train_auc, val_auc))
    loss.backward()
    optimizer.step()

model.eval()

In [None]:
# now testing the model on unlabeled dataset to see if proportions between predicted classes are as expected.

In [None]:
# Inference on the unlabeled graph: make sure dropout is disabled and skip gradient tracking.
# NOTE(review): `data_in_device` is rebound from the training graph to the test graph here, so
# re-running the training cell afterwards would pick up the test graph.
model.eval()
data_in_device = test_data_object.to(device)
with torch.no_grad():
    output = model(data_in_device.x, data_in_device.edge_index)
output


In [None]:
# The model emits sigmoid scores, and class 1 means licit after the label remapping, so a node is
# counted as licit when its score exceeds 0.5. The previous `count_nonzero()` counted every score
# that was not exactly zero, i.e. practically every node.
amount_of_all_test_samples = output.shape[0]
amount_of_licit_predicted_transactions = (output > 0.5).sum().item()
amount_of_illicit_predicted_transactions = amount_of_all_test_samples - amount_of_licit_predicted_transactions

print(f'{amount_of_illicit_predicted_transactions=}')
print(f'{amount_of_licit_predicted_transactions=}')

# Guard against an empty test set before computing the share of illicit predictions.
if amount_of_all_test_samples > 0:
    percent_of_illicit_predicted_transactions = 100 * amount_of_illicit_predicted_transactions / amount_of_all_test_samples
else:
    percent_of_illicit_predicted_transactions = 0.0
print(f'percent of illicit transactions: {percent_of_illicit_predicted_transactions:.2f}%')

x = ['illicit', 'licit']
y = [amount_of_illicit_predicted_transactions, amount_of_licit_predicted_transactions]
plt.bar(x, y)
plt.title('Predicted classes of unlabeled transactions')
plt.ylabel('number of transactions')
# plt.text(x=0, y= 0,s= 0)
# Write each count slightly above its bar.
plt.text(x='illicit', y= amount_of_illicit_predicted_transactions * 1.1,s= amount_of_illicit_predicted_transactions)
plt.text(x='licit', y= amount_of_licit_predicted_transactions * 1.1,s= amount_of_licit_predicted_transactions)
plt.show()
