In [2]:
from __future__ import print_function, division
import numpy as np
import pandas as pd

import os

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, f1_score, classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
import pickle
import networkx as nx

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
# import warnings
# warnings.filterwarnings("ignore")
torch.manual_seed(15)

<torch._C.Generator at 0x7fb9d229ca30>

# Import data

In [5]:
def load_pickle(fname):
    with open(fname, 'rb') as f:
        return pickle.load(f)

G = load_pickle('subgraph.pkl')
G_simple = nx.Graph(G)

In [6]:
df_features = pd.read_csv("feature_data.csv")
df_features.dropna(inplace=True)
df_features.head(5)

Unnamed: 0,node,isp,closeness_centrality,betweenness_centrality,eigenvector_centrality,active_days,eccentricity,pagerank_std_last_month,in_out_degree_ratio,weightsin_weightsout_ratio,numin_numout_ratio,centrality_sum,rolling_average_pagerank,cumulative_interaction_count,pagerank_change,pagerank_closeness_interaction
0,0x1f1e784a61a8ca0a90250bcd2170696655b28a21,0,0.132361,0.0002290699,0.002497,107.0,7.0,0.0,0.132389,1.0,1.0,0.135087,0.0,1327.0,0.0,8.164333e-08
1,0x1266f8b9e4dffc9e2f719bf51713f7e714516861,0,0.109312,1.747367e-07,1.8e-05,1.0,8.0,0.0,1.0,1.0,1.0,0.10933,0.0,1328.0,-4.834848e-07,1.457527e-08
2,0xbbfaf27674c2eb5d13edc58a40081248d13dcfeb,1,0.117714,7.21472e-05,7.1e-05,0.0,7.0,0.0,1.0,1.0,0.0,0.117857,5.317118e-07,0.0,7.116401e-07,9.946543e-08
3,0xb50d0c4cb2c29cc232c96a59e9c65eb82914ec75,0,0.110925,0.0001036793,0.002118,89.0,7.0,0.0,0.379518,0.434854,1.0,0.113146,6.202811e-07,1445.0,7.495447e-07,9.791388e-08
4,0x563b377a956c80d77a7c613a9343699ad6123911,0,0.110763,0.005919153,0.006486,346.0,7.0,0.0,0.010035,2.9e-05,1.0,0.123168,4.186281e-07,9354.0,-7.983363e-07,1.538709e-08


In [7]:
def prepare_data(graph, df_features, sample = True, sample_num = 500):
    df_class = df_features[["node", "isp"]]

    # Create a mapping from unique strings to unique integers
    string_to_int = {string: index for index, string in enumerate(set(df_features['node']))}

    # Use the mapping to replace strings with integers in the list
    int_list = [string_to_int[value] for value in df_features['node']]
    df_features['Node'] = int_list

    df_features.drop(columns=['node'], inplace=True)

    # Map values in the 'Node' column to the df_class
    df_class['Node'] = df_class['node'].map(string_to_int)

    # Drop rows where the mapping is NaN (not found in the dictionary)
    df_class = df_class.dropna(subset=['Node'])

    # Optionally, drop the original 'node' column if not needed
    df_class = df_class.drop(columns=['node'])

    adj_m = nx.to_pandas_adjacency(graph)

    # Map values in the 'Node' column to the dictionary
    adj_m['Node'] = adj_m.index.map(string_to_int)
    adj_m.dropna(inplace=True)
    adj_m['Node'] = adj_m['Node'].astype(int)

    # Drop rows where the mapping is NaN (not found in the dictionary)
    adj_m = adj_m.dropna(subset=['Node'])

    # Optionally, drop the original 'fruit' column if not needed
    adj_m.set_index(adj_m['Node'], inplace=True)

    adj_m = adj_m.rename(columns=string_to_int)
    adj_m = adj_m.drop(columns=['Node'])
    edg = list(adj_m.index)

    feature_filtered = df_features[df_features['Node'].isin(edg)]

    if sample:
        feature_filtered = pd.concat([
            feature_filtered[feature_filtered['isp'] == 0].sample(sample_num),
            feature_filtered[feature_filtered['isp'] == 1].sample(sample_num)
        ])

        selected_nodes = feature_filtered['Node'].tolist()
        feature_filtered.drop(columns=['isp'],inplace=True)

        adj_m = adj_m[adj_m.index.isin(selected_nodes)]
        adj_m = adj_m.loc[:, selected_nodes]
        df_class = df_class[df_class['Node'].isin(selected_nodes)]

    classes_ts = df_class.sort_values(by='Node')
    feature_filtered = feature_filtered.sort_values(by='Node')
    adj_mats = adj_m.sort_index(axis=0).sort_index(axis=1)

    adj_np = np.array(adj_mats, dtype=np.float_)

    labels_ts = np.array(classes_ts['isp'] == 0, dtype = np.longlong)
    adj = torch.tensor(adj_np)
    features = torch.tensor(feature_filtered.values)
    lables = torch.tensor(labels_ts, dtype = torch.long)

    return labels_ts, adj, features

In [8]:
labels, adj, features = prepare_data(G_simple, df_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_class['Node'] = df_class['node'].map(string_to_int)


# GCN

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import time
import sys

class GraphConv(nn.Module):
    def __init__(self, in_features, out_features, activation  = 'relu', skip = False, skip_in_features = None):
        super(GraphConv, self).__init__()
        self.W = torch.nn.Parameter(torch.DoubleTensor(in_features, out_features))
        nn.init.xavier_uniform_(self.W)

        self.set_act = False
        if activation == 'relu':
            self.activation = nn.ReLU()
            self.set_act = True
        elif activation == 'softmax':
            self.activation = nn.Softmax(dim = 1)
            self.set_act = True
        else:
            self.set_act = False
            raise ValueError("activations supported are 'relu' and 'softmax'")

        self.skip = skip
        if self.skip:
            if skip_in_features == None:
                raise ValueError("pass input feature size of the skip connection")
            self.W_skip = torch.nn.Parameter(torch.DoubleTensor(skip_in_features, out_features))
            nn.init.xavier_uniform_(self.W)

    def forward(self, A, H_in, H_skip_in = None):
        # A must be an n x n matrix as it is an adjacency matrix
        # H is the input of the node embeddings, shape will n x in_features
        self.A = A
        self.H_in = H_in
        A_ = torch.add(self.A, torch.eye(self.A.shape[0]).double())
        D_ = torch.diag(A_.sum(1))
        # since D_ is a diagonal matrix,
        # its root will be the roots of the diagonal elements on the principle diagonal
        # since A is an adjacency matrix, we are only dealing with positive values
        # all roots will be real
        D_root_inv = torch.inverse(torch.sqrt(D_))
        A_norm = torch.mm(torch.mm(D_root_inv, A_), D_root_inv)
        # shape of A_norm will be n x n

        H_out = torch.mm(torch.mm(A_norm, H_in), self.W)
        # shape of H_out will be n x out_features

        if self.skip:
            H_skip_out = torch.mm(H_skip_in, self.W_skip)
            H_out = torch.add(H_out, H_skip_out)

        if self.set_act:
            H_out = self.activation(H_out)

        return H_out

# GCN 2 layers

In [10]:
class GCN_2layer(nn.Module):
    def __init__(self, in_features, hidden_features, out_features, skip = False):
        super(GCN_2layer, self).__init__()
        self.skip = skip

        self.gcl1 = GraphConv(in_features, hidden_features)

        if self.skip:
            self.gcl_skip = GraphConv(hidden_features, out_features, activation = 'softmax', skip = self.skip,
                                  skip_in_features = in_features)
        else:
            self.gcl2 = GraphConv(hidden_features, out_features, activation = 'softmax')

    def forward(self, A, X):
        out = self.gcl1(A, X)
        if self.skip:
            out = self.gcl_skip(A, out, X)
        else:
            out = self.gcl2(A, out)

        return out

In [11]:
num_features = 15
num_classes = 2
num_ts = 49
epochs = 15
lr = 0.001
max_train_ts = 34
train_ts = np.arange(max_train_ts)

#adj_mats, features_labelled_ts, classes_ts = dataSet

# 0 - illicit, 1 - licit
# labels_ts = []
# for c in classes_ts:
#     labels_ts.append(np.array(c['class'] == '2', dtype = np.long))

gcn = GCN_2layer(num_features, 100, num_classes)
train_loss = nn.CrossEntropyLoss(weight = torch.DoubleTensor([0.7, 0.3]))
optimizer = torch.optim.Adam(gcn.parameters(), lr = lr)

# Training

# for ts in train_ts:
A = torch.tensor(adj)
X = torch.tensor(features)
L = torch.tensor(labels, dtype = torch.long)
for ep in range(epochs):
    t_start = time.time()

    gcn.train()
    optimizer.zero_grad()
    out = gcn(A, X)  

    loss = train_loss(out, L)
    train_pred = out.max(1)[1].type_as(L)
    acc = (train_pred.eq(L).double().sum())/L.shape[0]

    loss.backward()
    optimizer.step()

    sys.stdout.write("\r Epoch %d/%d Timestamp %d/%d training loss: %f training accuracy: %f Time: %s"
                        %(ep, epochs, 1, max_train_ts, loss, acc, time.time() - t_start)
                    )

torch.save(gcn.state_dict(), "modelDir"+ "gcn_weights.pth")

  A = torch.tensor(adj)
  X = torch.tensor(features)


 Epoch 14/15 Timestamp 1/34 training loss: 1.013262 training accuracy: 0.500000 Time: 0.10499405860900879

In [12]:
from sklearn.metrics import f1_score, precision_score, recall_score
test_ts = np.arange(14)
# adj_mats, features_labelled_ts, classes_ts = load_data(df_classes,df_edges,df_features, 35, 49)

# 0 - illicit, 1 - licit
# labels_ts = []
# for c in classes_ts:
#     labels_ts.append(np.array(c['class'] == '2', dtype = np.long))

gcn = GCN_2layer(num_features, 100, num_classes)
gcn.load_state_dict(torch.load("modelDir"+ "gcn_weights.pth"))

# Testing
test_accs = []
test_precisions = []
test_recalls = []
test_f1s = []

# for ts in test_ts:
# A = torch.tensor(adj_mats.values)
# X = torch.tensor(features_labelled_ts.values)
# L = torch.tensor(labels_ts, dtype = torch.long)

gcn.eval()
test_out = gcn(A, X)

test_pred = test_out.max(1)[1].type_as(L)
t_acc = (test_pred.eq(L).double().sum())/L.shape[0]
test_accs.append(t_acc.item())
test_precisions.append(precision_score(L, test_pred))
test_recalls.append(recall_score(L, test_pred))
test_f1s.append(f1_score(L, test_pred))

acc = np.array(test_accs).mean()
prec = np.array(test_precisions).mean()
rec = np.array(test_recalls).mean()
f1 = np.array(test_f1s).mean()

# Calculate ROC AUC
roc_auc = roc_auc_score(L, test_out[:, 1].detach().numpy())  # Use probabilities for ROC AUC, assuming binary classification

# Generate classification report
class_report = classification_report(L, test_pred.detach().numpy())  # Assuming L and test_pred are tensors



print("GCN - averaged accuracy: {}, precision: {}, recall: {}, f1: {}".format(acc, prec, rec, f1))
print("ROC AUC: {:.4f}".format(roc_auc))
print("Classification Report:\n", class_report)

GCN - averaged accuracy: 0.5, precision: 0.5, recall: 1.0, f1: 0.6666666666666666
ROC AUC: 0.5000
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       500
           1       0.50      1.00      0.67       500

    accuracy                           0.50      1000
   macro avg       0.25      0.50      0.33      1000
weighted avg       0.25      0.50      0.33      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
