In [1]:
import numpy as np
import pandas as pd
import torch
from torch.nn import Linear, LayerNorm, ReLU, Dropout
import torch.nn.functional as F
from torch_geometric.nn import ChebConv, NNConv, DeepGCNLayer, GATConv, DenseGCNConv, GCNConv, GraphConv
from torch_geometric.data import Data, DataLoader

from sklearn.metrics import roc_auc_score
import scipy.sparse as sp

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
df_features = pd.read_csv('elliptic_txs_features.csv', header=None)
df_edges = pd.read_csv("elliptic_txs_edgelist.csv")
df_classes =  pd.read_csv("elliptic_txs_classes.csv")
df_classes['class'] = df_classes['class'].map({'unknown': 2, '1':1, '2':0})

In [3]:
# merging dataframes
df_merge = df_features.merge(df_classes, how='left', right_on="txId", left_on=0)
df_merge = df_merge.sort_values(0).reset_index(drop=True)
classified = df_merge.loc[df_merge['class'].loc[df_merge['class']!=2].index].drop('txId', axis=1)
unclassified = df_merge.loc[df_merge['class'].loc[df_merge['class']==2].index].drop('txId', axis=1)

# storing classified unclassified nodes seperatly for training and testing purpose
classified_edges = df_edges.loc[df_edges['txId1'].isin(classified[0]) & df_edges['txId2'].isin(classified[0])]
unclassifed_edges = df_edges.loc[df_edges['txId1'].isin(unclassified[0]) | df_edges['txId2'].isin(unclassified[0])]
del df_features, df_classes

In [4]:
# all nodes in data
nodes = df_merge[0].values
map_id = {j:i for i,j in enumerate(nodes)} # mapping nodes to indexes

edges = df_edges.copy()
edges.txId1 = edges.txId1.map(map_id)
edges.txId2 = edges.txId2.map(map_id)
edges = edges.astype(int)

edge_index = np.array(edges.values).T

# for undirected graph
# edge_index_ = np.array([edge_index[1,:], edge_index[0, :]])
# edge_index = np.concatenate((edge_index, edge_index_), axis=1)

edge_index = torch.tensor(edge_index, dtype=torch.long).contiguous()
weights = torch.tensor([1]* edge_index.shape[1] , dtype=torch.double)
print(edge_index.shape)

torch.Size([2, 234355])


In [5]:
# maping txIds to corresponding indexes, to pass node features to the model
node_features = df_merge.drop(['txId'], axis=1).copy()
node_features[0] = node_features[0].map(map_id)
classified_idx = node_features['class'].loc[node_features['class']!=2].index
unclassified_idx = node_features['class'].loc[node_features['class']==2].index
# replace unkown class with 0, to avoid having 3 classes, this data/labels never used in training
node_features['class'] = node_features['class'].replace(2, 0) 

In [6]:
labels = node_features['class'].values
node_features = torch.tensor(np.array(node_features.drop([0, 'class', 1], axis=1).values, dtype=np.double), dtype=torch.double)

# converting data to PyGeometric graph data format
data_train = Data(x=node_features, edge_index=edge_index, edge_attr=weights,
                               y=torch.tensor(labels, dtype=torch.double)) #, adj= torch.from_numpy(np.array(adj))

In [7]:
y_train = labels[classified_idx]

# spliting train set and validation set
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid, train_idx, valid_idx = train_test_split(node_features[classified_idx], y_train, classified_idx, test_size=0.15, random_state=42, stratify=y_train)

In [8]:
data_train.y[classified_idx].sum()

tensor(4545., dtype=torch.float64)

In [9]:
# data_train.y = data_train.y.double()
# data_train.x = data_train.x.double()

In [10]:
import gc
gc.collect()

0

In [11]:
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.conv1 = GCNConv(165, 128)
        self.conv2 = GCNConv(128, 128)
        self.conv3 = GCNConv(64, 64)
        self.conv4 = GCNConv(128, 1) 

    def forward(self, data, adj=None):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.1, training=self.training)
        x = self.conv4(x, edge_index)

        return F.sigmoid(x)

In [27]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
model.double()
data_train = data_train.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
criterion = torch.nn.BCELoss()

model.train()
for epoch in range(400):
    optimizer.zero_grad()
    out = model(data_train)
    # data_train.y.unsqueeze(1)
    out = out.reshape((data_train.x.shape[0]))
    loss = criterion(out[train_idx], data_train.y[train_idx])
    auc = roc_auc_score(data_train.y.detach().cpu().numpy()[train_idx], out.detach().cpu().numpy()[train_idx]) #[train_idx]
    loss.backward()
    optimizer.step()
    if epoch%5 == 0:
      print("epoch: {} - loss: {} - roc: {}".format(epoch, loss.item(), auc))

epoch: 0 - loss: 0.41528438755931885 - roc: 0.6832203930991017
epoch: 5 - loss: 0.23637810057996067 - roc: 0.8973980442311795
epoch: 10 - loss: 0.19991107601238273 - roc: 0.9235280889794535
epoch: 15 - loss: 0.183598984105867 - roc: 0.9320894952724381
epoch: 20 - loss: 0.17055794687675216 - roc: 0.9396074787012347
epoch: 25 - loss: 0.15951367284443374 - roc: 0.946206522029992
epoch: 30 - loss: 0.15311132442491787 - roc: 0.9492829459381393
epoch: 35 - loss: 0.14507898766427244 - roc: 0.9546743506246984
epoch: 40 - loss: 0.14042418872692075 - roc: 0.9575324966332759
epoch: 45 - loss: 0.13435745663435955 - roc: 0.9612264782659834
epoch: 50 - loss: 0.13038395652448873 - roc: 0.9632862095826752
epoch: 55 - loss: 0.12557691120510248 - roc: 0.9661682374374168
epoch: 60 - loss: 0.12121576938150143 - roc: 0.9687168326818578
epoch: 65 - loss: 0.11756957478988514 - roc: 0.970183344738153
epoch: 70 - loss: 0.11390004619293095 - roc: 0.9719035334608365
epoch: 75 - loss: 0.10883725057475294 - roc: 0

In [28]:
preds = model(data_train)
preds = preds.detach().cpu().numpy()

In [29]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

# 计算二值化输出，假设使用0.6作为分类阈值
out_labels = preds > 0.6

# 计算训练集的准确率
train_acc = accuracy_score(data_train.y.detach().cpu().numpy()[train_idx], out_labels[train_idx])

# 计算训练集的AUC
train_auc = roc_auc_score(data_train.y.detach().cpu().numpy()[train_idx], preds[train_idx])

# 计算precision, recall, f1-score
precision, recall, f1, _ = precision_recall_fscore_support(data_train.y.detach().cpu().numpy()[train_idx], out_labels[train_idx], average='binary')

print("Train Accuracy: ", train_acc)
print("Train AUC: ", train_auc)
print("Train Precision: ", precision)
print("Train Recall: ", recall)
print("Train F1 Score: ", f1)


Train Accuracy:  0.9803936430935597
Train AUC:  0.9939397006795085
Train Precision:  0.9451975771560427
Train Recall:  0.8483044266114419
Train F1 Score:  0.8941336971350613


In [30]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support

# 预测验证集的AUC
valid_auc = roc_auc_score(data_train.y.detach().cpu().numpy()[valid_idx], preds[valid_idx])

# 根据阈值确定类别标签
out_labels = preds > 0.6
valid_labels = out_labels[valid_idx]

# 计算验证集的准确率
valid_acc = accuracy_score(data_train.y.detach().cpu().numpy()[valid_idx], valid_labels)

# 计算precision, recall, f1-score
precision, recall, f1, _ = precision_recall_fscore_support(data_train.y.detach().cpu().numpy()[valid_idx], valid_labels, average='binary')

print("Validation Accuracy: ", valid_acc)
print("Validation AUC: ", valid_auc)
print("Validation Precision: ", precision)
print("Validation Recall: ", recall)
print("Validation F1 Score: ", f1)


Validation Accuracy:  0.9643521832498211
Validation AUC:  0.9697483812344632
Validation Precision:  0.8441971383147854
Validation Recall:  0.7785923753665689
Validation F1 Score:  0.8100686498855834
