In [1]:
! pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.4.0-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.4.0


In [2]:
!pip install torch_scatter

Collecting torch_scatter
  Downloading torch_scatter-2.1.2.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torch_scatter
  Building wheel for torch_scatter (setup.py) ... [?25l[?25hdone
  Created wheel for torch_scatter: filename=torch_scatter-2.1.2-cp310-cp310-linux_x86_64.whl size=495089 sha256=1c58faeaf188e42f9df7e5e24f9245314d00afffbe5fc475f7688d3737cef53e
  Stored in directory: /root/.cache/pip/wheels/92/f1/2b/3b46d54b134259f58c8363568569053248040859b1a145b3ce
Successfully built torch_scatter
Installing collected packages: torch_scatter
Successfully installed torch_scatter-2.1.2


In [3]:
import pandas as pd
import numpy as np
import torch
import scipy.sparse as scsp
from sklearn.cluster import KMeans
from torch_geometric.data import Data
import torch.nn.functional as F
import torch.nn as nn
import torch_scatter

from torch_geometric.nn.conv import MessagePassing
import torch_geometric.transforms as T
from torch_geometric.utils import remove_self_loops, add_self_loops, softmax, degree

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,roc_auc_score

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
df = pd.read_csv("/content/drive/My Drive/BT4012 Team 8/Ethereum/feature_data.csv")

In [6]:
import pickle
with open("/content/drive/My Drive/BT4012 Team 8/Ethereum/subgraph.pkl", 'rb') as f:
    subgraph = pickle.load(f)

In [7]:
df_nodes = df['node']
df_class = df['isp']

In [8]:
import networkx as nx
df_edge = nx.to_pandas_edgelist(G = subgraph)


**Data Prep**

In [9]:
import torch
from torch_geometric.data import Data
import pandas as pd
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

# Convert edge index to tensor
# Concatenate 'source' and 'target' columns and find unique elements
unique_nodes = pd.concat([df_edge['source'], df_edge['target']]).unique()

# Create a mapping from nodes to integer indices
node_mapping = {node: i for i, node in enumerate(unique_nodes)}

# Map 'source' and 'target' nodes to their respective integer indices
df_edge['source_mapped'] = df_edge['source'].map(node_mapping)
df_edge['target_mapped'] = df_edge['target'].map(node_mapping)

# Create edge_index tensor for PyTorch Geometric
edge_index = torch.tensor(df_edge[['source_mapped', 'target_mapped']].values.T, dtype=torch.long)

node_features = torch.tensor(df.iloc[:, 2:].to_numpy(), dtype=torch.float)

node_labels = torch.tensor(df.iloc[:, 1].to_numpy(), dtype=torch.long)

# Create a graph data object
data = Data(x=node_features, edge_index=edge_index, y=node_labels)

# Step 2: Define the GNN model
class GCN(torch.nn.Module):
    def __init__(self):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(data.num_node_features, 16)
        num_classes = len(torch.unique(data.y))
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)


In [12]:
import torch
import torch.nn.functional as F
from sklearn.metrics import classification_report

model = GCN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# Creating masks before training
num_nodes = data.num_nodes
indices = torch.randperm(num_nodes)

num_train_nodes = int(num_nodes * 0.8)
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[indices[:num_train_nodes]] = True
test_mask[indices[num_train_nodes:]] = True

data.train_mask = train_mask
data.test_mask = test_mask

best_accuracy = 0
best_epoch = 0

# Training loop
for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # Evaluation step
    model.eval()
    _, pred = model(data).max(dim=1)
    correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
    accuracy = int(correct) / int(data.test_mask.sum())
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_epoch = epoch
    print(f'Epoch {epoch}, Loss: {loss.item()}, Current Best Accuracy: {accuracy}')

print(f'Best Accuracy: {best_accuracy} at Epoch {best_epoch}')

# Final evaluation and classification report
model.eval()
_, pred = model(data).max(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
accuracy = int(correct) / int(data.test_mask.sum())
print(f'Final Accuracy: {accuracy}')

# Print classification report
true_labels = data.y[data.test_mask].cpu().numpy()
predicted_labels = pred[data.test_mask].cpu().numpy()
print(classification_report(true_labels, predicted_labels, zero_division=0))
# Computing ROC AUC score
roc_auc = roc_auc_score(true_labels, predicted_labels, average='weighted')
print(f'ROC AUC: {roc_auc}')

Epoch 0, Loss: 577899.0, Current Best Accuracy: 0.9518072289156626
Epoch 1, Loss: 891554.5625, Current Best Accuracy: 0.9518072289156626
Epoch 2, Loss: 604735.125, Current Best Accuracy: 0.9518072289156626
Epoch 3, Loss: 297753.46875, Current Best Accuracy: 0.9518072289156626
Epoch 4, Loss: 139807.203125, Current Best Accuracy: 0.9533344646190396
Epoch 5, Loss: 208983.578125, Current Best Accuracy: 0.960970643135924
Epoch 6, Loss: 119141.234375, Current Best Accuracy: 0.960970643135924
Epoch 7, Loss: 169265.5625, Current Best Accuracy: 0.960970643135924
Epoch 8, Loss: 109030.2265625, Current Best Accuracy: 0.960970643135924
Epoch 9, Loss: 119491.4765625, Current Best Accuracy: 0.960970643135924
Epoch 10, Loss: 114944.3671875, Current Best Accuracy: 0.960970643135924
Epoch 11, Loss: 133865.25, Current Best Accuracy: 0.960970643135924
Epoch 12, Loss: 113302.5703125, Current Best Accuracy: 0.960970643135924
Epoch 13, Loss: 117253.1015625, Current Best Accuracy: 0.960970643135924
Epoch 14,

In [13]:
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from sklearn.metrics import classification_report, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np

# Step 2: Define the GNN model
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

# Splitting data into train and test sets
train_idx, test_idx = train_test_split(range(len(node_labels)), test_size=0.2, random_state=42)
train_mask = torch.zeros(len(node_labels), dtype=torch.bool)
test_mask = torch.zeros(len(node_labels), dtype=torch.bool)
train_mask[train_idx] = True
test_mask[test_idx] = True

# Creating graph data object for train and test
train_data = Data(x=node_features, edge_index=edge_index, y=node_labels, train_mask=train_mask)
test_data = Data(x=node_features, edge_index=edge_index, y=node_labels, test_mask=test_mask)

# Initialize model and optimizer
model = GCN(input_dim=node_features.size(1), hidden_dim=16, output_dim=len(torch.unique(train_data.y)))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Add a learning rate scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)

# Grid search for class weights
best_accuracy = 0
best_roc_auc = 0
best_weights = None
best_epoch = 0

for weight_0 in [0.1, 0.5, 1.0]:  # Grid search weights for class 0
    for weight_1 in [5.0, 10.0, 15.0, 20.0]:  # Grid search weights for class 1
        class_weights = torch.tensor([weight_0, weight_1], dtype=torch.float)
        criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

        # Initialize early stopping parameters for each set of weights
        best_accuracy_weight = 0
        best_roc_auc_weight = 0
        best_model_weight = None
        prev_loss = float('inf')
        patience = 20
        counter = 0
        best_epoch_weight = 0

        for epoch in range(200):  # Training loop for each set of weights
            model.train()
            optimizer.zero_grad()
            out = model(train_data)
            loss = criterion(out[train_data.train_mask], train_data.y[train_data.train_mask])
            loss.backward()
            optimizer.step()

            # Evaluation step on validation set
            model.eval()
            with torch.no_grad():
                out = model(test_data)
                loss = criterion(out[test_data.test_mask], test_data.y[test_data.test_mask])
                _, pred = out.max(dim=1)
                correct = (pred[test_data.test_mask] == test_data.y[test_data.test_mask]).sum().item()
                accuracy = correct / test_data.test_mask.sum().item()
                roc_auc = roc_auc_score(test_data.y[test_data.test_mask].cpu().numpy(),
                                      pred[test_data.test_mask].cpu().numpy(), average='weighted')
                print(f'Epoch {epoch}, weight 0: {weight_0}, weight 1: {weight_1}, Loss: {loss.item()}, Current Accuracy: {accuracy}, Current roc_auc: {roc_auc }')
                if loss < prev_loss:
                    best_accuracy_weight = accuracy
                    best_model_weight = model.state_dict()
                    prev_loss = loss
                    counter = 0
                    best_epoch_weight = epoch
                else:
                    counter += 1

                if accuracy > best_accuracy and roc_auc  > best_roc_auc :
                    best_accuracy = accuracy
                    best_roc_auc  = roc_auc
                    best_weights = class_weights
                    best_model = best_model_weight
                    best_epoch = best_epoch_weight

                if counter >= patience:
                    break

# Use the best weights found in the grid search for final evaluation
class_weights = best_weights
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
model.load_state_dict(best_model)

# Final evaluation and classification report
model.eval()
with torch.no_grad():
    out = model(test_data)
    loss = criterion(out[test_data.test_mask], test_data.y[test_data.test_mask])
    scheduler.step(loss)  # Update learning rate based on validation loss
    _, pred = out.max(dim=1)
    correct = (pred[test_data.test_mask] == test_data.y[test_data.test_mask]).sum().item()
    accuracy = correct / test_data.test_mask.sum().item()
    roc_auc  = roc_auc_score(test_data.y[test_data.test_mask].cpu().numpy(),
                          pred[test_data.test_mask].cpu().numpy(), average='weighted')

    print(f'Best Accuracy: {best_accuracy}, Best Recall: {best_roc_auc }')
    print(f'Final Accuracy: {accuracy}, Final Recall: {roc_auc }')

    true_labels = test_data.y[test_data.test_mask].cpu().numpy()
    predicted_labels = pred[test_data.test_mask].cpu().numpy()
    print(classification_report(true_labels, predicted_labels, zero_division=0))
    roc_auc = roc_auc_score(true_labels, predicted_labels, average='weighted')
    print(f'ROC AUC: {roc_auc}')

Epoch 0, weight 0: 0.1, weight 1: 5.0, Loss: 302681.09375, Current Accuracy: 0.03902935686407602, Current roc_auc: 0.5
Epoch 1, weight 0: 0.1, weight 1: 5.0, Loss: 293009.625, Current Accuracy: 0.03902935686407602, Current roc_auc: 0.5
Epoch 2, weight 0: 0.1, weight 1: 5.0, Loss: 286485.71875, Current Accuracy: 0.03902935686407602, Current roc_auc: 0.5
Epoch 3, weight 0: 0.1, weight 1: 5.0, Loss: 277251.09375, Current Accuracy: 0.03902935686407602, Current roc_auc: 0.5
Epoch 4, weight 0: 0.1, weight 1: 5.0, Loss: 267472.96875, Current Accuracy: 0.03902935686407602, Current roc_auc: 0.5
Epoch 5, weight 0: 0.1, weight 1: 5.0, Loss: 257428.109375, Current Accuracy: 0.03902935686407602, Current roc_auc: 0.5
Epoch 6, weight 0: 0.1, weight 1: 5.0, Loss: 247968.984375, Current Accuracy: 0.03902935686407602, Current roc_auc: 0.5
Epoch 7, weight 0: 0.1, weight 1: 5.0, Loss: 237834.640625, Current Accuracy: 0.03902935686407602, Current roc_auc: 0.5
Epoch 8, weight 0: 0.1, weight 1: 5.0, Loss: 22

## Use Oversampling

In [16]:
class GCN(torch.nn.Module):
    def __init__(self):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(data.num_node_features, 16)
        num_classes = len(torch.unique(data.y))
        self.conv2 = GCNConv(16, num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [17]:
from collections import Counter
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import classification_report

model = GCN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# Creating masks before training
num_nodes = data.num_nodes
indices = torch.randperm(num_nodes)
num_train_nodes = int(num_nodes * 0.8)

class_counts = Counter(data.y.numpy())
minority_class = min(class_counts, key=class_counts.get)
num_to_oversample = max(class_counts.values()) - class_counts[minority_class]
minority_indices = (data.y == minority_class).nonzero(as_tuple=True)[0]
oversampled_indices = minority_indices[torch.randint(len(minority_indices), (num_to_oversample,))]
oversampled_train_indices = torch.cat((indices[:num_train_nodes], oversampled_indices))

# Update train mask
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[oversampled_train_indices] = True
data.train_mask = train_mask

test_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask[indices[num_train_nodes:]] = True
data.test_mask = test_mask

best_accuracy = 0
best_epoch = 0

# Training loop
for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # Evaluation step
    model.eval()
    _, pred = model(data).max(dim=1)
    correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
    accuracy = int(correct) / int(data.test_mask.sum())
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_epoch = epoch
    print(f'Epoch {epoch}, Loss: {loss.item()}, Current Best Accuracy: {accuracy}')

print(f'Best Accuracy: {best_accuracy} at Epoch {best_epoch}')

# Final evaluation and classification report
model.eval()
_, pred = model(data).max(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
accuracy = int(correct) / int(data.test_mask.sum())
print(f'Final Accuracy: {accuracy}')

# Print classification report
true_labels = data.y[data.test_mask].cpu().numpy()
predicted_labels = pred[data.test_mask].cpu().numpy()
print(classification_report(true_labels, predicted_labels, zero_division=0))
roc_auc = roc_auc_score(true_labels, predicted_labels, average='weighted')
print(f'ROC AUC: {roc_auc}')

Epoch 0, Loss: 42570.55859375, Current Best Accuracy: 0.9579161717291702
Epoch 1, Loss: 41008.50390625, Current Best Accuracy: 0.9579161717291702
Epoch 2, Loss: 38492.22265625, Current Best Accuracy: 0.9579161717291702
Epoch 3, Loss: 34036.55859375, Current Best Accuracy: 0.9579161717291702
Epoch 4, Loss: 25810.302734375, Current Best Accuracy: 0.9579161717291702
Epoch 5, Loss: 22997.57421875, Current Best Accuracy: 0.9579161717291702
Epoch 6, Loss: 20636.3125, Current Best Accuracy: 0.9579161717291702
Epoch 7, Loss: 16818.47265625, Current Best Accuracy: 0.9579161717291702
Epoch 8, Loss: 13572.9755859375, Current Best Accuracy: 0.9579161717291702
Epoch 9, Loss: 11514.13671875, Current Best Accuracy: 0.9579161717291702
Epoch 10, Loss: 11855.560546875, Current Best Accuracy: 0.9579161717291702
Epoch 11, Loss: 10048.625, Current Best Accuracy: 0.9579161717291702
Epoch 12, Loss: 6905.3154296875, Current Best Accuracy: 0.9579161717291702
Epoch 13, Loss: 3223.68359375, Current Best Accuracy

## Use UnderSampling

In [18]:
from collections import Counter
import torch
from torch_geometric.data import Data
from sklearn.metrics import classification_report
from torch.nn.functional import cross_entropy
from torch.optim import Adam

num_nodes = data.num_nodes
class_counts = Counter(data.y.numpy())
minority_class = min(class_counts, key=class_counts.get)
minority_class_size = class_counts[minority_class]
indices = torch.randperm(num_nodes)

# Filter indices for each class and limit the majority class
class_indices = {cls: indices[(data.y == cls).nonzero(as_tuple=True)[0]] for cls in class_counts}
limited_indices = torch.cat([idx[:minority_class_size] if cls != minority_class else idx for cls, idx in class_indices.items()])

# Update train and test masks
num_train_nodes = int(len(limited_indices) * 0.8)
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[limited_indices[:num_train_nodes]] = True
test_mask[limited_indices[num_train_nodes:]] = True

data.train_mask = train_mask
data.test_mask = test_mask

best_accuracy = 0
best_epoch = 0

# Training loop
for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # Evaluation step
    model.eval()
    _, pred = model(data).max(dim=1)
    correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
    accuracy = int(correct) / int(data.test_mask.sum())
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_epoch = epoch
    print(f'Epoch {epoch}, Loss: {loss.item()}, Current Best Accuracy: {accuracy}')

print(f'Best Accuracy: {best_accuracy} at Epoch {best_epoch}')

# Final evaluation and classification report
model.eval()
_, pred = model(data).max(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
accuracy = int(correct) / int(data.test_mask.sum())
print(f'Final Accuracy: {accuracy}')

# Print classification report
true_labels = data.y[data.test_mask].cpu().numpy()
predicted_labels = pred[data.test_mask].cpu().numpy()
print(classification_report(true_labels, predicted_labels, zero_division=0))
roc_auc = roc_auc_score(true_labels, predicted_labels, average='weighted')
print(f'ROC AUC: {roc_auc}')

Epoch 0, Loss: 0.20009735226631165, Current Best Accuracy: 0.9613733905579399
Epoch 1, Loss: 0.19969876110553741, Current Best Accuracy: 0.9613733905579399
Epoch 2, Loss: 0.1990332305431366, Current Best Accuracy: 0.9613733905579399
Epoch 3, Loss: 0.19852955639362335, Current Best Accuracy: 0.9613733905579399
Epoch 4, Loss: 0.19779789447784424, Current Best Accuracy: 0.9613733905579399
Epoch 5, Loss: 0.19668486714363098, Current Best Accuracy: 0.9613733905579399
Epoch 6, Loss: 0.19663038849830627, Current Best Accuracy: 0.9613733905579399
Epoch 7, Loss: 0.195838063955307, Current Best Accuracy: 0.9613733905579399
Epoch 8, Loss: 0.19530686736106873, Current Best Accuracy: 0.9613733905579399
Epoch 9, Loss: 0.19445429742336273, Current Best Accuracy: 0.9613733905579399
Epoch 10, Loss: 0.19424773752689362, Current Best Accuracy: 0.9613733905579399
Epoch 11, Loss: 0.1934739202260971, Current Best Accuracy: 0.9613733905579399
Epoch 12, Loss: 0.19289639592170715, Current Best Accuracy: 0.9613