In [1]:
import pickle
import pandas as pd

from torch_geometric.data import DataLoader
import torch
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, ChebConv  # noqa

In [2]:
# Load the pre-built dataset from a pickle in the working directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only point this at pickles that come from a trusted source.
with open("./SySeVR_GraphDataset_e3.pkl", "rb") as dataset_file:
    dataset = pickle.load(dataset_file)

In [26]:
# Quick fix: cast every item's node-feature tensor `x` to float32 in place.
# NOTE(review): this only sticks if dataset[idx] hands back the stored object
# (e.g. a plain list) rather than a copy -- confirm for the pickled type.
for idx in range(len(dataset)):
    item = dataset[idx]
    item.x = item.x.float()

In [3]:
# Mini-batches of 16 dataset items, reshuffled on every pass over the loader.
# The same loader is used for both training and evaluation further down.
loader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
)

In [4]:
class Net(torch.nn.Module):
    """Two-layer GCN that emits per-node class log-probabilities.

    Args:
        in_channels: Size of each input node feature vector. When ``None``
            (the default) it is read from ``dataset[0].num_features``, which
            matches the behaviour of the original no-argument ``Net()``.
        hidden_channels: Width of the hidden node representation (default 16).
        out_channels: Number of output classes (default 2).
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=2):
        super().__init__()
        if in_channels is None:
            in_channels = dataset[0].num_features
        # cached=False on purpose: GCNConv's cache stores the adjacency of the
        # first graph it sees and is only valid for a single fixed graph. This
        # model is fed a different mini-batch of graphs on every step, so
        # caching would either raise or reuse a stale adjacency, depending on
        # the torch_geometric version.
        self.conv1 = GCNConv(in_channels, hidden_channels, cached=False, normalize=False)
        self.conv2 = GCNConv(hidden_channels, out_channels, cached=False, normalize=False)
        # ChebConv(..., K=2) was previously tried here as an alternative layer.

    def forward(self, arg_data):
        """Run both convolutions on one batch and return log-softmax scores.

        Args:
            arg_data: Batch object exposing ``x``, ``edge_index`` and
                ``edge_attr``.

        Returns:
            Tensor with one row per node and one column per class, holding
            log-probabilities (log-softmax over dim 1).
        """
        # edge_attr is passed as GCNConv's edge_weight.
        # NOTE(review): assumes edge_attr is one scalar weight per edge -- confirm.
        x, edge_index, edge_weight = arg_data.x, arg_data.edge_index, arg_data.edge_attr
        x = F.relu(self.conv1(x, edge_index, edge_weight))
        # Dropout is only active in training mode (self.training).
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index, edge_weight)
        return F.log_softmax(x, dim=1)

In [5]:
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = Net().to(device)

# Per-layer parameter groups: weight decay is applied to the first
# convolution only, the second one is left without it.
param_groups = [
    {"params": model.conv1.parameters(), "weight_decay": 5e-4},
    {"params": model.conv2.parameters(), "weight_decay": 0},
]
optimizer = torch.optim.Adam(param_groups, lr=0.01)

In [6]:
def train():
    """Run one optimisation pass over every batch in the global ``loader``.

    For each batch the negative log-likelihood is computed on the nodes
    selected by ``train_mask`` only, back-propagated, and followed by one
    optimizer step. A single-line progress counter is written to stdout.
    """
    model.train()
    for batch_no, batch in enumerate(loader, start=1):
        print("\r# train batch={}/{}".format(batch_no, len(loader)), end="")
        optimizer.zero_grad()
        on_device = batch.to(device)
        log_probs = model(on_device)
        train_nodes = on_device.train_mask
        loss = F.nll_loss(log_probs[train_nodes], on_device.y[train_nodes])
        loss.backward()
        optimizer.step()
    # Terminate the carriage-return progress line.
    print()

In [7]:
# @torch.no_grad()
# def test():
#     model.eval()
#     # accs = {"train_mask":[], "val_mask":[], "test_mask":[]}
#     rp = {"train_mask":[], "val_mask":[], "test_mask":[]}
#     np = {"train_mask":[], "val_mask":[], "test_mask":[]}
#     tmp_cnt = 0
#     for data in loader:
#         tmp_cnt += 1
#         print("\r# test batch={}/{}".format(tmp_cnt, len(loader)), end="")
#         tdata = data.to(device)
#         logits = model(tdata)
#         for tag, mask in tdata('train_mask', 'val_mask', 'test_mask'):
#             if not any(mask):
#                 # skip all False mask
#                 continue
#             pred = logits[mask].max(1)[1]
#             # acc = pred.eq(tdata.y[mask]).sum().item() / mask.sum().item()
#             # accs[tag].append(acc)
#             rp[tag].append(pred.eq(tdata.y[mask]).sum().item())
#             np[tag].append(mask.sum().item())
#     print()
#     # return [accs["train_mask"], accs["val_mask"], accs["test_mask"]]
#     return [
#         sum(rp["train_mask"]) / sum(np["train_mask"]),
#         sum(rp["val_mask"]) / sum(np["val_mask"]),
#         sum(rp["test_mask"]) / sum(np["test_mask"])
#     ]

In [8]:
@torch.no_grad()
def test():
    """Collect predictions and labels for each split over the global ``loader``.

    The model is switched to eval mode and run on every batch with gradients
    disabled. For each of ``train_mask``, ``val_mask`` and ``test_mask`` the
    predicted class (index of the largest score) and the true label of the
    selected nodes are appended to per-split lists.

    Returns:
        Tuple ``(pred_sts, y_sts)``: two dicts keyed by ``"train_mask"``,
        ``"val_mask"`` and ``"test_mask"``, each mapping to a flat Python list.
        Within a split the i-th prediction and i-th label refer to the same
        node; the overall order follows the loader's batch order.
    """
    model.eval()
    pred_sts = {"train_mask": [], "val_mask": [], "test_mask": []}
    y_sts = {"train_mask": [], "val_mask": [], "test_mask": []}
    num_batches = len(loader)
    for batch_no, data in enumerate(loader, start=1):
        print("\r# test batch={}/{}".format(batch_no, num_batches), end="")
        tdata = data.to(device)
        logits = model(tdata)
        for tag, mask in tdata('train_mask', 'val_mask', 'test_mask'):
            # Tensor.any() is one reduction; the builtin any(mask) walks the
            # tensor element by element in Python (and syncs per element on GPU).
            if not mask.any():
                # Nothing selected for this split in this batch.
                continue
            pred = logits[mask].max(1)[1]
            pred_sts[tag].extend(pred.tolist())
            y_sts[tag].extend(tdata.y[mask].tolist())
    # Terminate the carriage-return progress line.
    print()
    return pred_sts, y_sts

In [None]:
def _accuracy(predictions, targets):
    """Return the fraction of positions where prediction equals target.

    Returns NaN instead of raising ZeroDivisionError when there is nothing to
    score (for example a split whose mask never selects a node).
    """
    if not predictions:
        return float("nan")
    return sum(p == t for p, t in zip(predictions, targets)) / len(predictions)


# Train for 200 epochs. After every epoch all three splits are evaluated and
# the log line reports: current train accuracy, best validation accuracy so
# far, and the test accuracy measured at the epoch that achieved that best
# validation accuracy (model selection on validation, not on test).
best_val_acc = test_acc = 0
for epoch in range(1, 201):
    train()
    tmp_pred, tmp_y = test()

    train_acc = _accuracy(tmp_pred["train_mask"], tmp_y["train_mask"])
    val_acc = _accuracy(tmp_pred["val_mask"], tmp_y["val_mask"])
    # Keep the current epoch's test accuracy separate so it only replaces the
    # reported value when validation improves (previously `test_acc = test_acc`
    # was a no-op and the current epoch's value was always reported).
    tmp_test_acc = _accuracy(tmp_pred["test_mask"], tmp_y["test_mask"])
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        test_acc = tmp_test_acc
    log = 'Epoch: {:03d}, Train: {:.4f}, Val: {:.4f}, Test: {:.4f}'
    print(log.format(epoch, train_acc, best_val_acc, test_acc))

    # The confusion matrix below always reflects the current epoch's test
    # predictions (rows: actual label, columns: predicted label).
    y_actu = pd.Series(tmp_y["test_mask"], name='Actual')
    y_pred = pd.Series(tmp_pred["test_mask"], name='Predicted')
    df_confusion = pd.crosstab(y_actu, y_pred)
    print("Confusion matrix for test set:\n{}".format(df_confusion))

# train batch=25842/25842
# test batch=25842/25842
Epoch: 001, Train: 0.8836, Val: 0.8815, Test: 0.8797
Confusion matrix for test set:
Predicted      0     1
Actual                
0          69664   208
1           9716  2904
# train batch=25842/25842
# test batch=25842/25842
Epoch: 002, Train: 0.8837, Val: 0.8816, Test: 0.8799
Confusion matrix for test set:
Predicted      0     1
Actual                
0          69748   124
1           9783  2837
# train batch=25842/25842
# test batch=25842/25842
Epoch: 003, Train: 0.8836, Val: 0.8816, Test: 0.8799
Confusion matrix for test set:
Predicted      0     1
Actual                
0          69745   127
1           9783  2837
# train batch=25842/25842
# test batch=25842/25842
Epoch: 004, Train: 0.8835, Val: 0.8816, Test: 0.8797
Confusion matrix for test set:
Predicted      0     1
Actual                
0          69749   123
1           9803  2817
# train batch=25842/25842
# test batch=25842/25842
Epoch: 005, Train: 0.8509, Val: 0.8816, T

In [84]:
# Re-run evaluation to refresh the per-split predictions/labels that the
# confusion-matrix cells below read.
# NOTE(review): the recorded output reports 20235 batches while the training
# log above reports 25842, so this cell was presumably executed against a
# different dataset/kernel state -- re-run the notebook top to bottom to confirm.
tmp_pred, tmp_y = test()

# test batch=20235/20235


In [85]:
from sklearn.metrics import confusion_matrix

In [87]:
# Test-split confusion matrix; arguments are (true labels, predicted labels),
# so rows are the actual class and columns the predicted class.
confusion_matrix(tmp_y["test_mask"], tmp_pred["test_mask"])

array([[55320,    25],
       [ 9502,   236]])

In [89]:
import pandas as pd
# Same confusion table via pandas: the Series names become the axis labels
# (rows = 'Actual', columns = 'Predicted') of the cross-tabulation.
y_actu = pd.Series(data=tmp_y["test_mask"], name='Actual')
y_pred = pd.Series(data=tmp_pred["test_mask"], name='Predicted')
df_confusion = pd.crosstab(index=y_actu, columns=y_pred)

In [91]:
# Emit the test-split confusion table as plain text.
print(df_confusion)

Predicted      0    1
Actual               
0          55320   25
1           9502  236
