In [1]:
import torch
import numpy as np
import pandas as pd
from torch_geometric.data  import Data

In [19]:
import os.path as osp

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

from torch_geometric.utils import negative_sampling
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv
from torch_geometric.utils import train_test_split_edges

In [2]:
# Detect whether a CUDA device is available ...
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# ... but the assignment below overrides that choice, so everything in this
# notebook runs on the CPU. NOTE(review): remove this line to use the GPU.
device = "cpu"

In [3]:
# Load the Cora graph through the Planetoid loader, storing its files under
# ./data/Cora and applying the NormalizeFeatures transform on access.
path = osp.join('.', 'data', 'Cora')
dataset = Planetoid(root=path, name='Cora', transform=T.NormalizeFeatures())
# Keep the first graph of the dataset as the working object.
data = dataset[0]
print(dataset.data)

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])


In [20]:
# Print headline statistics of the dataset, one "label value" pair per line.
dataset_summary = (
    ('Number of Graphs : ', len(dataset)),
    ('Number of Classes : ', dataset.num_classes),
    ('Node Features : ', dataset.num_node_features),
    ('Edge Features : ', dataset.num_edge_features),
)
for label, value in dataset_summary:
    print(label, value)
print(dataset.num_features)

Number of Graphs :  1
Number of Classes :  7
Node Features :  1433
Edge Features :  0
1433


In [8]:
# Shape of the edge index tensor held by the dataset.
dataset.data.edge_index.shape

torch.Size([2, 10556])

In [10]:
# Shape of the node feature matrix held by the dataset.
dataset.data.x.shape

torch.Size([2708, 1433])

In [14]:
# Shape of the feature vector stored for node 0.
dataset.data.x[0].shape

torch.Size([1433])

In [13]:
# Label stored for node 0.
dataset.data.y[0]

tensor(3)

In [21]:
# Peek at the edge index tensor itself (two rows, one column per edge).
dataset.data.edge_index

tensor([[   0,    0,    0,  ..., 2707, 2707, 2707],
        [ 633, 1862, 2582,  ...,  598, 1473, 2706]])

In [18]:
from torch_geometric.nn import SAGEConv

In [22]:
# Fetch the first graph of the dataset again; the classifier cells below read
# its features, edges, labels and masks through the global `data`.
data = dataset[0]

In [25]:
class Net(nn.Module):
    """Single-layer GraphSAGE node classifier.

    One ``SAGEConv`` layer maps node features straight to per-class scores;
    the forward pass returns log-probabilities suitable for ``F.nll_loss``.

    Args:
        in_channels: Size of each node feature vector. When ``None`` (default)
            the notebook-level ``dataset.num_features`` is used.
        out_channels: Number of output classes. When ``None`` (default) the
            notebook-level ``dataset.num_classes`` is used.
        aggr: Aggregation scheme forwarded to ``SAGEConv``.
    """

    def __init__(self, in_channels=None, out_channels=None, aggr="max") -> None:
        super().__init__()

        # Fall back to the notebook globals so the original `Net()` call keeps working.
        if in_channels is None:
            in_channels = dataset.num_features
        if out_channels is None:
            out_channels = dataset.num_classes

        self.conv = SAGEConv(in_channels, out_channels, aggr=aggr)

    def forward(self, x=None, edge_index=None):
        """Return per-node log-probabilities over the classes.

        Args:
            x: Node feature matrix. Defaults to the notebook-level ``data.x``.
            edge_index: Graph connectivity. Defaults to ``data.edge_index``.
        """
        # Defaults are resolved at call time, so `model()` still reads whatever
        # `data` is currently bound in the notebook.
        if x is None:
            x = data.x
        if edge_index is None:
            edge_index = data.edge_index

        scores = self.conv(x, edge_index)
        return F.log_softmax(scores, dim=1)


In [29]:
# Instantiate the classifier and an Adam optimiser (with weight decay) over its parameters.
model = Net()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)

In [36]:
def train():
    """Run one full-batch optimisation step on the nodes selected by ``data.train_mask``.

    Uses the notebook-level ``model``, ``optimizer`` and ``data``; returns nothing.
    """
    model.train()
    optimizer.zero_grad()

    train_nodes = data.train_mask
    log_probs = model()
    loss = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])

    loss.backward()
    optimizer.step()


@torch.no_grad()
def test():
    """Measure classification accuracy on the train, validation and test masks.

    Gradient tracking is disabled for the whole evaluation, since no backward
    pass is needed here.

    Returns:
        list: accuracies (floats), one per mask, in the order the masks are
        yielded by ``data('train_mask', 'val_mask', 'test_mask')``.
    """
    model.eval()
    # One forward pass over the whole graph; each mask then selects its nodes.
    predictions = model().argmax(dim=1)

    accs = []
    for _, mask in data('train_mask', 'val_mask', 'test_mask'):
        correct = predictions[mask].eq(data.y[mask]).sum().item()
        accs.append(correct / mask.sum().item())

    return accs


In [39]:
# Train the classifier and report, every 10th epoch, the best validation
# accuracy so far together with the test accuracy from that same epoch.
# NOTE(review): range(1, 100) runs 99 epochs, so no line is printed for epoch 100.
best_val_acc = test_acc = 0
log = 'Epoch : {}, Val: {}, Test: {}'

for epoch in range(1, 100):
    train()
    _, val_acc, tmp_test_acc = test()

    # Model selection: keep the test score that belongs to the best validation score.
    if val_acc > best_val_acc:
        best_val_acc, test_acc = val_acc, tmp_test_acc

    if epoch % 10 == 0:
        print(log.format(epoch, best_val_acc, test_acc))

Epoch : 10, Val: 0.724, Test: 0.692
Epoch : 20, Val: 0.73, Test: 0.699
Epoch : 30, Val: 0.73, Test: 0.699
Epoch : 40, Val: 0.73, Test: 0.699
Epoch : 50, Val: 0.73, Test: 0.699
Epoch : 60, Val: 0.73, Test: 0.699
Epoch : 70, Val: 0.73, Test: 0.699
Epoch : 80, Val: 0.73, Test: 0.699
Epoch : 90, Val: 0.73, Test: 0.699


In [44]:
# Debugging aid: list the attribute names available on the model object.
dir(model)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_buffers',
 '_call_impl',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_get_backward_hooks',
 '_get_name',
 '_is_full_backward_hook',
 '_load_from_state_dict',
 '_load_state_dict_pre_hooks',
 '_maybe_warn_non_full_backward_hook',
 '_modules',
 '_named_members',
 '_non_persistent_buffers_set',
 '_parameters',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_replicate_for_data_parallel',
 '_save_to_state_dict',
 '_slow_forward',
 '_state_dict_hooks',
 '_version',
 'add_module',
 'apply',


In [5]:
# Use train_test_split_edges to create positive and negative edge sets.
# Start from a fresh copy of the graph so this cell can be re-run and does not
# depend on which `data` object an earlier cell left behind: the split result
# no longer carries `edge_index` (see the printed output), so it cannot be
# split a second time.
data = dataset[0]
# Node labels and node masks are not needed for link prediction.
data.train_mask = data.val_mask = data.test_mask = data.y = None
data = train_test_split_edges(data)
print(data)



Data(x=[2708, 1433], val_pos_edge_index=[2, 263], test_pos_edge_index=[2, 527], train_pos_edge_index=[2, 8976], train_neg_adj_mask=[2708, 2708], val_neg_edge_index=[2, 263], test_neg_edge_index=[2, 527])


In [6]:
class Net(torch.nn.Module):
    """Two-layer GCN encoder with a dot-product decoder for link prediction."""

    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_features, 128)
        self.conv2 = GCNConv(128, 64)

    def encode(self):
        """Embed every node of the notebook-level ``data`` using only the training edges."""
        train_edges = data.train_pos_edge_index
        hidden = self.conv1(data.x, train_edges).relu()  # first convolution + ReLU
        return self.conv2(hidden, train_edges)  # second convolution

    def decode(self, z, pos_edge_index, neg_edge_index):
        """Score candidate edges as the dot product of their endpoint embeddings.

        Positive edges come first in the result, followed by the negative ones.
        """
        src, dst = torch.cat((pos_edge_index, neg_edge_index), dim=-1)
        return (z[src] * z[dst]).sum(dim=-1)

    def decode_all(self, z):
        """Return, as a 2 x E index tensor, every node pair whose dot-product score is positive."""
        scores = z.matmul(z.t())  # N x N matrix of pairwise scores
        return torch.nonzero(scores > 0, as_tuple=False).t()

In [7]:
# Move the link-prediction model and the split graph to the selected device,
# then create the optimiser over the model's parameters.
model = Net().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [8]:

def get_link_labels(pos_edge_index, neg_edge_index):
    """Build binary targets for positive edges followed by negative edges.

    Args:
        pos_edge_index: Tensor whose size along dim 1 is the number of positive edges.
        neg_edge_index: Tensor whose size along dim 1 is the number of negative edges.

    Returns:
        1-D float tensor ``[1, ..., 1, 0, ..., 0]``: as many ones as there are
        positive edges, then as many zeros as there are negative edges. It is
        placed on the same device as ``pos_edge_index``.
    """
    num_pos = pos_edge_index.size(1)
    num_neg = neg_edge_index.size(1)
    # Allocate on the edges' own device rather than the notebook-global `device`,
    # so the labels always live where the logits they are compared with live.
    link_labels = torch.zeros(num_pos + num_neg, dtype=torch.float,
                              device=pos_edge_index.device)
    link_labels[:num_pos] = 1.
    return link_labels


def train():
    """Run one optimisation step of the link predictor.

    Fresh negative edges are sampled on every call, as many as there are
    positive training edges. Uses the notebook-level ``model``, ``optimizer``
    and ``data``.

    Returns:
        0-dim tensor holding the binary cross-entropy loss of this step,
        detached from the autograd graph.
    """
    model.train()

    train_pos = data.train_pos_edge_index
    neg_edge_index = negative_sampling(
        edge_index=train_pos,  # positive edges
        num_nodes=data.num_nodes,  # number of nodes
        num_neg_samples=train_pos.size(1))  # one negative per positive edge

    optimizer.zero_grad()

    z = model.encode()  # node embeddings
    link_logits = model.decode(z, train_pos, neg_edge_index)  # one score per edge

    link_labels = get_link_labels(train_pos, neg_edge_index)
    loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
    loss.backward()
    optimizer.step()

    # Detach so the caller, which only logs the value, does not keep the
    # autograd graph of this step alive.
    return loss.detach()


@torch.no_grad()
def test():
    """Compute ROC-AUC of the link predictor on the validation and test edges.

    Returns:
        list: ``[val_auc, test_auc]``.
    """
    model.eval()

    # encode() reads only the training graph stored on `data`, so the
    # embeddings are identical for both splits: compute them once.
    z = model.encode()

    perfs = []
    for prefix in ["val", "test"]:
        pos_edge_index = data[f'{prefix}_pos_edge_index']
        neg_edge_index = data[f'{prefix}_neg_edge_index']

        link_logits = model.decode(z, pos_edge_index, neg_edge_index)  # scores for this split
        link_probs = link_logits.sigmoid()  # map scores to probabilities

        link_labels = get_link_labels(pos_edge_index, neg_edge_index)  # 1 = positive, 0 = negative

        perfs.append(roc_auc_score(link_labels.cpu(), link_probs.cpu()))  # ROC-AUC for this split
    return perfs


In [9]:

# Train the link predictor for 100 epochs. Every 10th epoch, print the latest
# loss, the best validation score so far and the test score from that epoch.
best_val_perf = test_perf = 0
log = 'Epoch: {:03d}, Loss: {:.4f}, Val: {:.4f}, Test: {:.4f}'

for epoch in range(1, 101):
    train_loss = train()
    val_perf, tmp_test_perf = test()

    # Model selection: keep the test score that belongs to the best validation score.
    if val_perf > best_val_perf:
        best_val_perf, test_perf = val_perf, tmp_test_perf

    if epoch % 10 == 0:
        print(log.format(epoch, train_loss, best_val_perf, test_perf))



Epoch: 010, Loss: 0.6886, Val: 0.7438, Test: 0.7317
Epoch: 020, Loss: 0.6512, Val: 0.7438, Test: 0.7317
Epoch: 030, Loss: 0.5568, Val: 0.7618, Test: 0.7491
Epoch: 040, Loss: 0.5032, Val: 0.8229, Test: 0.8170
Epoch: 050, Loss: 0.4795, Val: 0.8514, Test: 0.8403
Epoch: 060, Loss: 0.4738, Val: 0.8740, Test: 0.8698
Epoch: 070, Loss: 0.4662, Val: 0.8753, Test: 0.8688
Epoch: 080, Loss: 0.4633, Val: 0.8792, Test: 0.8753
Epoch: 090, Loss: 0.4558, Val: 0.8842, Test: 0.8767
Epoch: 100, Loss: 0.4496, Val: 0.8924, Test: 0.8803


In [10]:
# Predict the full edge list from the trained embeddings.
# Set eval mode explicitly instead of relying on the last test() call, and
# disable gradient tracking: this is pure inference.
model.eval()
with torch.no_grad():
    z = model.encode()
    final_edge_index = model.decode_all(z)

In [15]:
# First row of the predicted edge list.
final_edge_index[0]

tensor([   0,    0,    0,  ..., 2707, 2707, 2707])