In [2]:
import dgl
from dgl.data import DGLDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pandas as pd
from sklearn.metrics import roc_curve, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import numpy as np

Using backend: pytorch[09:36:07] /opt/dgl/src/runtime/tensordispatch.cc:
43: TensorDispatcher: dlopen failed: /usr/anaconda3/lib/python3.9/site-packages/dgl/tensoradapter/pytorch/libtensoradapter_pytorch_1.10.1.so: cannot open shared object file: No such file or directory


In [3]:
import inspect
print(inspect.getfile(dgl))
print(inspect.getfile(dgl.nn.GraphConv))

/usr/anaconda3/lib/python3.9/site-packages/dgl/__init__.py
/usr/anaconda3/lib/python3.9/site-packages/dgl/nn/pytorch/conv/graphconv.py


In [4]:
from dgl.nn import GraphConv

class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes, norm='none'):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = torch.sigmoid(h)
        h = self.conv2(g, h)
        return h

# Create the model with given dimensions
# model = GCN(g.ndata['feat'].shape[1], 16, dataset.num_classes)

In [5]:
def train(g, model, epoch):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    best_val_acc = 0
    best_test_acc = 0

    features = g.ndata['feat']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']
    in_feats = features.shape[1]
    n_classes = dataset.num_classes
    n_edges = dataset.graph.number_of_edges()
    print("""----Data statistics------'
      #Edges %d
      #Classes %d
      #Train samples %d - %d spammers
      #Val samples %d - %d spammers
      #Test samples %d - %d spammers""" %
          (n_edges, n_classes,
              train_mask.int().sum().item(), torch.count_nonzero(labels[train_mask]).item(),
              val_mask.int().sum().item(), torch.count_nonzero(labels[val_mask]).item(),
              test_mask.int().sum().item(), torch.count_nonzero(labels[test_mask]).item()))
    
    g = dgl.remove_self_loop(g)
    g = dgl.add_self_loop(g)
    n_edges = g.number_of_edges()
    
    for e in range(epoch):
        # Forward
        logits = model(g, features)

        # Compute prediction
        pred = logits.argmax(1)

        # Compute loss
        # Note that you should only compute the losses of the nodes in the training set.
        loss = F.cross_entropy(logits[train_mask], labels[train_mask])

        # Compute accuracy on training/validation/test
        train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
        val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
        test_acc = (pred[test_mask] == labels[test_mask]).float().mean()

        # Save the best validation accuracy and the corresponding test accuracy.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if e % 4900 == 100:
            print('In epoch {}, loss: {:.3f}, val acc: {:.3f} (best {:.3f}), test acc: {:.3f} (best {:.3f})'.format(
                e, loss, val_acc, best_val_acc, test_acc, best_test_acc))
    
    logits = model(g, features)
    logits = logits[test_mask]
    labels = labels[test_mask]
    y_prob_torch, indices = torch.max(logits, dim=1)
    return labels.cpu().numpy(), indices.cpu().numpy(), test_acc # y_true, y_pred, test_acc

In [6]:
nodes_data = pd.read_csv('/home/chanho/project/Bob/twitter/members.csv')
edges_data = pd.read_csv('/home/chanho/project/Bob/twitter/interactions.csv')
twitter_node_PR = pd.read_csv('/home/chanho/project/Bob/data/twitter/twitter-PR100.txt', names=['index', 'PR'], header=None, index_col='index', sep = ' ')
nodeIndex = pd.read_csv('/home/chanho/project/Bob/twitter/nodeIndex.csv', names=['Original', 'Indexed'], header=None, index_col='Original')

  mask |= (ar1 == a)


In [7]:
nodeIndex.head(4)

Unnamed: 0_level_0,Indexed
Original,Unnamed: 1_level_1
1289,0
6612,1
9087,2
12052,3


In [8]:
nodes_data.assign(PR=np.full(nodes_data.shape[0], 15, dtype=int))

Unnamed: 0,Id,T0,T1,T2,T3,Spammer,PR
0,0,0,0,0,0,0,15
1,1,7,27,90,83,0,15
2,2,24,209,559,506,0,15
3,3,0,0,0,0,0,15
4,4,0,0,0,0,0,15
...,...,...,...,...,...,...,...
175589,175589,0,0,0,1,1,15
175590,175590,0,0,1,11,1,15
175591,175591,0,10,5,4,0,15
175592,175592,11,108,341,267,0,15


In [9]:
N = nodeIndex.index
T = twitter_node_PR.index
for i in N:
    if i in T:
        nodes_data.at[nodeIndex.at[i, 'Indexed'], 'PR'] = int(twitter_node_PR.at[i, 'PR'] * 100)

In [10]:
nodes_data.head(10)

Unnamed: 0,Id,T0,T1,T2,T3,Spammer,PR
0,0,0,0,0,0,0,65.0
1,1,7,27,90,83,0,210.0
2,2,24,209,559,506,0,366.0
3,3,0,0,0,0,0,15.0
4,4,0,0,0,0,0,16.0
5,5,0,0,0,0,0,15.0
6,6,25,2642,45635,155525,0,29512.0
7,7,776,9689,38364,48145,0,5333.0
8,8,0,1,0,1,1,21.0
9,9,0,3,38,80,1,46.0


In [11]:
twitter_node_PR.loc[1289, 'PR']

0.65859

In [12]:
nodes_data['PR']

0          65.0
1         210.0
2         366.0
3          15.0
4          16.0
          ...  
175589     46.0
175590     19.0
175591    259.0
175592    211.0
175593    743.0
Name: PR, Length: 175594, dtype: float64

In [13]:
print(nodes_data.shape)
nodes_data = nodes_data.fillna(0)
print(nodes_data.shape)
nodes_data = nodes_data.astype(int)
nodes_data['PR']

(175594, 7)
(175594, 7)


0          65
1         210
2         366
3          15
4          16
         ... 
175589     46
175590     19
175591    259
175592    211
175593    743
Name: PR, Length: 175594, dtype: int64

In [26]:
nodes_data['PR'].loc[95803]

8911101

In [48]:
class TwitterDataset(DGLDataset):
    directed = False
    
    def __init__(self, directed=False):
        super().__init__(name='twitter')
        self.directed = directed

    def process(self):
        node_features = torch.from_numpy(np.ones(nodes_data.shape, dtype=int))
        #node_features = torch.from_numpy(nodes_data[['PR']].to_numpy())
        #node_features = torch.from_numpy(nodes_data[['T0', 'T1', 'T2', 'T3']].to_numpy())
        node_labels = torch.from_numpy(nodes_data['Spammer'].to_numpy())
        if self.directed:
            edge_features = torch.from_numpy(edges_data['Weight'].to_numpy())
        edges_src = torch.from_numpy(edges_data['Src'].to_numpy())
        edges_dst = torch.from_numpy(edges_data['Dst'].to_numpy())

        self.graph = dgl.graph((edges_src, edges_dst), num_nodes=nodes_data.shape[0])
        self.graph.ndata['feat'] = node_features
        self.graph.ndata['label'] = node_labels
        if self.directed:
            self.graph.edata['weight'] = edge_features
        self.num_classes = 2

        # If your dataset is a node classification dataset, you will need to assign
        # masks indicating whether a node belongs to training, validation, and test set.
        n_nodes = nodes_data.shape[0]
        n_train = int(n_nodes * 0.2)
        n_val = int(n_nodes * 0.2)
        train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        val_mask = torch.zeros(n_nodes, dtype=torch.bool)
        test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        train_mask[:n_train] = True
        val_mask[n_train:n_train + n_val] = True
        test_mask[n_train + n_val:] = True
        self.graph.ndata['train_mask'] = train_mask
        self.graph.ndata['val_mask'] = val_mask
        self.graph.ndata['test_mask'] = test_mask

    def __getitem__(self, i):
        return self.graph

    def __len__(self):
        return 1

In [49]:
#print('Node features')
#print(g.ndata)

In [50]:
#print('Edge features')
#print(g.edata)

In [51]:
classification_report_list = []
accuracy_list = []
conf_mat = []

for i in range(5):
    print(f"\n\n{i + 1} :\n")
    dataset = TwitterDataset(directed=True)
    g = dataset[0]
    g = g.to('cuda')
    model = GCN(g.ndata['feat'].shape[1], 4, dataset.num_classes).to('cuda')
    y_true, y_pred, acc = train(g, model, 5001)
    res = classification_report(y_true, y_pred, target_names=['non-spammer', 'spammer'], output_dict=True)
    classification_report_list.append(res)
    accuracy_list.append(acc)
    conf_mat.append(confusion_matrix(y_true, y_pred).ravel()) # tn, fp, fn, tp

res = dict()
res['non-spammer'] = dict()
res['spammer'] = dict()

res['non-spammer']['precision'] = 0
res['non-spammer']['recall'] = 0
res['non-spammer']['f1-score'] = 0
res['spammer']['precision'] = 0
res['spammer']['recall'] = 0
res['spammer']['f1-score'] = 0

for i in classification_report_list:
    res['non-spammer']['precision'] += i['non-spammer']['precision']
    res['non-spammer']['recall'] += i['non-spammer']['recall']
    res['non-spammer']['f1-score'] += i['non-spammer']['f1-score']
    res['spammer']['precision'] += i['spammer']['precision']
    res['spammer']['recall'] += i['spammer']['recall']
    res['spammer']['f1-score'] += i['spammer']['f1-score']
    
res['non-spammer']['precision'] /= len(classification_report_list)
res['non-spammer']['recall'] /= len(classification_report_list)
res['non-spammer']['f1-score'] /= len(classification_report_list)
res['spammer']['precision'] /= len(classification_report_list)
res['spammer']['recall'] /= len(classification_report_list)
res['spammer']['f1-score'] /= len(classification_report_list)

fpr_list = []

for i in conf_mat:
    fpr = i[1] / (i[0]+ i[1])
    fpr_list.append(fpr)

print("\n<<<Directed>>>\n\n")

print(f"fpr : {sum(fpr_list) / len(fpr_list)}")

print(res)
print(sum(accuracy_list)/len(accuracy_list))



1 :

----Data statistics------'
      #Edges 155831
      #Classes 2
      #Train samples 35118 - 6007 spammers
      #Val samples 35118 - 5962 spammers
      #Test samples 105358 - 17637 spammers
In epoch 100, loss: 0.352, val acc: 0.830 (best 0.830), test acc: 0.833 (best 0.833)
In epoch 5000, loss: 0.096, val acc: 0.966 (best 0.967), test acc: 0.966 (best 0.966)


2 :

----Data statistics------'
      #Edges 155831
      #Classes 2
      #Train samples 35118 - 6007 spammers
      #Val samples 35118 - 5962 spammers
      #Test samples 105358 - 17637 spammers
In epoch 100, loss: 0.331, val acc: 0.830 (best 0.830), test acc: 0.833 (best 0.833)
In epoch 5000, loss: 0.095, val acc: 0.969 (best 0.969), test acc: 0.968 (best 0.968)


3 :

----Data statistics------'
      #Edges 155831
      #Classes 2
      #Train samples 35118 - 6007 spammers
      #Val samples 35118 - 5962 spammers
      #Test samples 105358 - 17637 spammers
In epoch 100, loss: 0.321, val acc: 0.830 (best 0.830), test 

In [None]:
classification_report_list = []
accuracy_list = []
conf_mat = []

for i in range(5):
    print(f"\n\n{i + 1} :\n")
    dataset = TwitterDataset(directed=False)
    g = dataset[0]
    g = g.to('cuda')
    model = GCN(g.ndata['feat'].shape[1], 16, dataset.num_classes).to('cuda')
    y_true, y_pred, acc = train(g, model, 5001)
    res = classification_report(y_true, y_pred, target_names=['non-spammer', 'spammer'], output_dict=True)
    classification_report_list.append(res)
    accuracy_list.append(acc)
    conf_mat.append(confusion_matrix(y_true, y_pred).ravel()) # tn, fp, fn, tp
    #tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

res = dict()
res['non-spammer'] = dict()
res['spammer'] = dict()

res['non-spammer']['precision'] = 0
res['non-spammer']['recall'] = 0
res['non-spammer']['f1-score'] = 0
res['spammer']['precision'] = 0
res['spammer']['recall'] = 0
res['spammer']['f1-score'] = 0

for i in classification_report_list:
    res['non-spammer']['precision'] += i['non-spammer']['precision']
    res['non-spammer']['recall'] += i['non-spammer']['recall']
    res['non-spammer']['f1-score'] += i['non-spammer']['f1-score']
    res['spammer']['precision'] += i['spammer']['precision']
    res['spammer']['recall'] += i['spammer']['recall']
    res['spammer']['f1-score'] += i['spammer']['f1-score']
    
res['non-spammer']['precision'] /= len(classification_report_list)
res['non-spammer']['recall'] /= len(classification_report_list)
res['non-spammer']['f1-score'] /= len(classification_report_list)
res['spammer']['precision'] /= len(classification_report_list)
res['spammer']['recall'] /= len(classification_report_list)
res['spammer']['f1-score'] /= len(classification_report_list)

fpr_list = []

for i in conf_mat:
    fpr = i[1] / (i[0]+ i[1])
    fpr_list.append(fpr)

print("\n<<<Undrected>>>\n\n")

print(f"fpr : {sum(fpr_list) / len(fpr_list)}")

print(res)
print(sum(accuracy_list)/len(accuracy_list))

In [47]:
def evaluate(model, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model
        logits = logits[mask]
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        fpr, tpr, thr = roc_curve(labels.cpu().numpy(), indices.cpu().numpy())
        #tpr = torch.sum((indices == torch.ones(len(indices)).cuda()) == labels).item() * 1.0 / torch.sum(labels == torch.ones(len(labels)).cuda()).item() * 1.0
        #fpr = torch.sum((indices == torch.ones(len(indices)).cuda()) != labels).item() * 1.0 / torch.sum(labels == torch.zeros(len(labels)).cuda()).item() * 1.0
        return correct.item() * 1.0 / len(labels), tpr, fpr, thr

In [48]:
def plot_roc_curve(fper, tper):
    plt.plot(fper, tper, color='red', label='ROC')
    plt.plot([0, 1], [0, 1], color='green', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic Curve')
    plt.legend()
    plt.savefig('./result.png')