In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
!pip install dgl



In [None]:
import dgl
from dgl import DGLGraph

# Select PyTorch as DGL's backend before any DGL graph or layer is used in the cells below.
dgl.load_backend('pytorch')

Using backend: pytorch


Load the rest of the necessary libraries.

In [None]:
import numpy as np

In [None]:
from dgl.nn.pytorch import conv as dgl_conv
import torch.nn.functional as F

class GCNModel(nn.Module):
    """Three-layer graph encoder built from DGL ``SAGEConv`` layers.

    Despite the class name, every layer is a ``SAGEConv``; the flavour of
    neighbourhood aggregation is selected by ``aggregator_type``.

    Args:
        in_feats: Width of the input node-feature vectors.
        out_dim: Width of the node embedding returned by ``forward``.
        aggregator_type: Forwarded unchanged to every ``SAGEConv`` layer.
        hidden_dims: Output widths of the first and second layers. The default
            reproduces the sizes that were previously hard-coded.
    """

    def __init__(self, in_feats, out_dim, aggregator_type, hidden_dims=(1632, 502)):
        super(GCNModel, self).__init__()
        # Never populated or read by this class; retained only so code that
        # accesses `model.layers` keeps working. An empty ModuleList holds no
        # parameters, so it does not affect `parameters()` or the state dict.
        self.layers = nn.ModuleList()

        hidden1, hidden2 = hidden_dims

        # in_feats -> hidden1 -> hidden2 -> out_dim
        self.layer1 = dgl_conv.SAGEConv(in_feats, hidden1, aggregator_type=aggregator_type)
        self.layer2 = dgl_conv.SAGEConv(hidden1, hidden2, aggregator_type=aggregator_type)
        self.layer3 = dgl_conv.SAGEConv(hidden2, out_dim, aggregator_type=aggregator_type)

    def forward(self, g, features):
        """Return node embeddings for graph ``g`` given input ``features``.

        ReLU follows the first two layers; the last layer's output is returned
        without an activation.
        """
        h = F.relu(self.layer1(g, features))
        h = F.relu(self.layer2(g, h))
        return self.layer3(g, h)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the edge-list file read later lives under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import networkx as nx

In [None]:
# Parse the tab-separated edge list (third column is a float 'weight') into an
# undirected graph. The context manager closes the file as soon as parsing is
# done; previously the handle was left open for the lifetime of the kernel.
G = nx.Graph()
with open('/content/drive/My Drive/Colab Notebooks/DSLab/yeast.edgelist','r') as raw_edge_list:
    G = nx.parse_edgelist(raw_edge_list, delimiter='\t', create_using=G,nodetype=str, data=(('weight', float),))
# Get graph edges and nodes from networkx graph object
nodes = G.nodes
edges = G.edges

print("Graph's Nodes : {} / Edges : {}".format(len(nodes), len(edges)))

Graph's Nodes : 6526 / Edges : 532180


In [None]:
# Drop self-loop edges. This mutates G in place, so the node/edge counts printed by the
# loading cell above were computed before this removal.
G.remove_edges_from(nx.selfloop_edges(G))

In [None]:
!pip install stellargraph



In [None]:
from stellargraph import StellarGraph
from stellargraph.data import EdgeSplitter

In [None]:
# Split off test examples from the full graph: `graph_test` is the reduced graph,
# `examples_test` the sampled node pairs and `link_labels_test` their 0/1 labels.
# A fixed seed makes the sampled split reproducible across kernel restarts.
es_test = EdgeSplitter(G)
graph_test, examples_test, link_labels_test = es_test.train_test_split(
    p=0.25, method="global", seed=42
)

** Sampled 132623 positive and 132623 negative edges. **


In [None]:
from sklearn.model_selection import train_test_split
# Second split, applied to the already-reduced `graph_test` (with the full graph G
# passed as the second EdgeSplitter argument): yields the graph used for message
# passing during training plus labelled training node pairs.
# Both the edge sampling and the train/validation shuffle are seeded so the split
# is reproducible.
es_train = EdgeSplitter(graph_test, G)
graph_train, examples_train, link_labels_train = es_train.train_test_split(
    p=0.25, method="global", seed=42
)
# Hold out 10% of the training pairs for validation.
examples_train,examples_val, link_labels_train, link_label_val = train_test_split(examples_train,link_labels_train,test_size=0.1, random_state=42)

** Sampled 99468 positive and 99468 negative edges. **


In [None]:
# Build a DGL graph from the training-split networkx graph, then call readonly() on it.
# NOTE(review): the training cell builds its own `dgl_graph_train` from the same
# `graph_train`; within the visible cells `g` is only referenced by the final
# test-evaluation cell.
g = DGLGraph()
g.from_networkx(graph_train)
g.readonly()

## Link prediction

In [None]:
# Adjacency matrix of the full graph G (not the training split). In the visible cells it is
# only used for its row count when sizing the node-feature matrix.
adj = nx.adjacency_matrix(G)

In [None]:
# Featureless setup: each node gets a one-hot row, i.e. an identity matrix with one
# row/column per row of `adj`.
# NOTE(review): this is a dense square array — presumably acceptable at this graph size;
# confirm the memory budget before reusing on larger graphs.
features = np.identity(adj.shape[0])

In [None]:
features

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [None]:
# Input width handed to the first graph layer: the number of columns of `features`.
in_feats = features.shape[1]

In [None]:
# NCE loss
def NCE_loss(pos_score, neg_score, neg_sample_size):
    """Negative-sampling loss, one value per positive example.

    Args:
        pos_score: Raw scores of the positive edges.
        neg_score: Raw scores of the negative edges; reshaped to rows of
            ``neg_sample_size`` entries, so its length must be a multiple of it.
        neg_sample_size: Number of negative scores grouped with each positive.

    Returns:
        ``-logsigmoid(pos) - sum_k logsigmoid(-neg_k)`` with the sum taken over
        each row of grouped negatives.
    """
    positive_term = F.logsigmoid(pos_score)
    # One row of `neg_sample_size` log-probabilities per positive example.
    negative_rows = F.logsigmoid(neg_score.neg()).reshape(-1, neg_sample_size)
    negative_term = negative_rows.sum(dim=1)
    return -positive_term - negative_term

class LinkPrediction(nn.Module):
    """Link-prediction wrapper that trains an encoder with sampled negative edges.

    The wrapped ``gconv_model`` produces node embeddings; ``forward`` samples one
    batch of positive/negative edges, scores both, and returns the mean NCE loss.
    """

    def __init__(self, gconv_model):
        super().__init__()
        self.gconv_model = gconv_model

    def forward(self, g, features, neg_sample_size):
        """Return the scalar mean NCE loss for one sampled edge batch of ``g``."""
        node_emb = self.gconv_model(g, features)
        # A single batch of positive edges and their sampled negatives.
        positive_graph, negative_graph = edge_sampler(
            g, neg_sample_size, return_false_neg=False
        )
        positive_scores = score_func(positive_graph, node_emb)
        negative_scores = score_func(negative_graph, node_emb)
        per_example = NCE_loss(positive_scores, negative_scores, neg_sample_size)
        return per_example.mean()

In [None]:
class LinkPrediction2(nn.Module):
    """Supervised link-prediction wrapper around a node-embedding encoder.

    ``forward`` embeds every node, turns the given labelled node pairs into link
    probabilities via ``calculate_cost``, and returns the binary cross-entropy
    against the 0/1 labels.
    """

    def __init__(self, gconv_model):
        super(LinkPrediction2, self).__init__()
        self.gconv_model = gconv_model

    def forward(self, g, features, node_pairs, labels):
        """Return the scalar BCE loss for ``node_pairs`` with targets ``labels``.

        Args:
            g: Graph passed to the encoder.
            features: Node features passed to the encoder.
            node_pairs: Two-column array of (source, destination) node ids.
            labels: 0/1 target per pair (NumPy array, list or tensor).
        """
        emb = self.gconv_model(g, features)
        preds = calculate_cost(emb, node_pairs, labels)
        # as_tensor accepts arrays, lists and tensors alike, and matching the
        # predictions' dtype/device avoids mismatches when the model is not on
        # the CPU or `labels` is already a tensor.
        targets = torch.as_tensor(labels, dtype=preds.dtype, device=preds.device)
        return F.binary_cross_entropy(preds, targets)

In [None]:
!pip install numpy_indexed



In [None]:
import numpy_indexed as npi

In [None]:

def calculate_cost(emb,node_pairs, labels):
  """Return the predicted link probability for every node pair.

  The score of a pair is the sigmoid of the dot product of the two node
  embeddings. All operations stay on the autograd graph of ``emb``, so a loss
  computed from the result can back-propagate into the encoder. (Re-wrapping
  intermediate results with ``torch.tensor(...)`` copies and detaches them,
  which silently prevents any parameter update.)

  Args:
    emb: Node-embedding tensor, one row per node.
    node_pairs: Two-column array of (source, destination) node labels.
    labels: Unused; kept so existing callers do not need to change.

  Returns:
    1-D tensor of probabilities in (0, 1), one per row of ``node_pairs``.
  """
  src_nodes = node_pairs[:,0]
  dst_nodes = node_pairs[:,1]

  # Map node labels to row positions.
  # NOTE(review): this assumes row i of `emb` corresponds to the i-th entry of
  # `graph_train.nodes`; verify that against how the DGL graph numbers its
  # nodes when it is built from the networkx graph.
  nodes = np.asarray(graph_train.nodes)
  src_nid = npi.indices(nodes,src_nodes)
  dst_nid = npi.indices(nodes,dst_nodes)

  emb_src = emb[src_nid]
  emb_dst = emb[dst_nid]

  # Element-wise (Hadamard) product summed over the embedding axis = dot product.
  scores = torch.sum(emb_src * emb_dst, dim=1)
  return torch.sigmoid(scores)

In [None]:
def edge_sampler(g, neg_sample_size, edges=None, return_false_neg=True):
    """Draw the first batch from a shuffled DGL ``EdgeSampler`` over ``g``.

    Args:
        g: Graph to sample edges from.
        neg_sample_size: Forwarded to the sampler as ``neg_sample_size``.
        edges: Optional seed edges, forwarded as ``seed_edges``.
        return_false_neg: Forwarded to the sampler unchanged.

    Returns:
        Whatever the sampler yields first; the caller in this notebook unpacks
        it as a (positive graph, negative graph) pair.
    """
    # Each batch covers one tenth of the graph's edges (truncated to an int).
    edges_per_batch = int(g.number_of_edges()/10)
    batches = dgl.contrib.sampling.EdgeSampler(
        g,
        batch_size=edges_per_batch,
        seed_edges=edges,
        neg_sample_size=neg_sample_size,
        negative_mode='tail',
        shuffle=True,
        return_false_neg=return_false_neg,
    )
    return next(iter(batches))

In [None]:
def score_func(g, emb):
    """Score every edge of subgraph ``g`` by the dot product of its endpoints.

    Edge endpoints are local to ``g``; they are translated through
    ``g.parent_nid`` before indexing ``emb``. The score is an un-normalised dot
    product (no division by the vector norms).

    Returns:
        1-D tensor with one score per edge, in edge-id order.
    """
    local_src, local_dst = g.all_edges(order='eid')
    # Translate subgraph-local node ids into parent-graph ids, then gather rows.
    head_emb = emb[g.parent_nid[local_src]]
    tail_emb = emb[g.parent_nid[local_dst]]
    return (head_emb * tail_emb).sum(dim=1)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:

def gcn_evaluate(gconv_model, g, features, node_pairs_eval,labels, threshold=0.25):
    """Return classification accuracy of the encoder on labelled node pairs.

    Puts ``gconv_model`` into eval mode (it is left in eval mode afterwards),
    embeds the nodes without tracking gradients, converts pair scores to
    probabilities with ``calculate_cost`` and labels a pair as a link when its
    probability exceeds ``threshold``.

    Args:
        gconv_model: Encoder producing node embeddings.
        g: Graph passed to the encoder.
        features: Node features passed to the encoder.
        node_pairs_eval: Two-column array of node pairs to classify.
        labels: Ground-truth 0/1 label per pair.
        threshold: Probability cut-off for predicting a link. The default keeps
            the previously hard-coded 0.25.
            NOTE(review): 0.5 is the usual cut-off for a sigmoid output; confirm
            that 0.25 is intentional.

    Returns:
        Accuracy as computed by ``sklearn.metrics.accuracy_score``.
    """
    gconv_model.eval()
    with torch.no_grad():
        emb = gconv_model(g, features)
        probs = calculate_cost(emb, node_pairs_eval, labels)

    # Hand scikit-learn a plain CPU integer array rather than a torch tensor.
    predicted = (probs > threshold).long().cpu().numpy()
    return accuracy_score(labels, predicted)

In [None]:
in_feats

6526

In [None]:
# Model hyperparameters
out_dim = 20  # width of the node embedding produced by the last layer
# NOTE(review): `dropout` is defined but not passed to GCNModel below, and GCNModel's
# constructor takes no dropout argument — it has no effect in the visible cells.
dropout = 0.3

# Create the encoder: SAGEConv layers using the 'gcn' aggregator.
gconv_model = GCNModel(in_feats,out_dim=out_dim,aggregator_type='gcn')

The training loop

In [None]:
from torch.autograd import Variable
# Model for link prediction
model = LinkPrediction2(gconv_model)

# Training hyperparameters
weight_decay = 5e-4
n_epochs = 30
lr = 2e-3
neg_sample_size = 100

# Graph used for message passing during training.
dgl_graph_train = DGLGraph()
dgl_graph_train.from_networkx(graph_train)

# use optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

# The feature matrix never changes, so convert it to a tensor once instead of
# twice per epoch.
features_tensor = torch.FloatTensor(features)

dur = []
for epoch in range(n_epochs):
    model.train()
    loss = model(dgl_graph_train, features_tensor, examples_train,link_labels_train)

    # The loss must be used exactly as the model returned it. Wrapping it in a
    # fresh Variable/tensor creates a new leaf with no path back to the model
    # parameters, so backward() would run without error while the weights never
    # change (constant loss of ~0.693 and ~50% accuracy).
    if not loss.requires_grad:
        raise RuntimeError(
            "Loss is detached from the autograd graph, so no parameter can be "
            "updated. The forward pass must build its predictions with "
            "differentiable tensor operations only (no torch.tensor(...) "
            "re-wrapping or NumPy round-trips of intermediate results)."
        )

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    acc = gcn_evaluate(gconv_model, dgl_graph_train, features_tensor, examples_val,link_label_val)
    print("Epoch = {}|Loss = {:.4f}|Accuracy = {:.4f}".format(epoch,loss.item(),acc))



Epoch = 0|Loss = 0.6933|Accuracy = 0.5005
Epoch = 1|Loss = 0.6933|Accuracy = 0.5005


KeyboardInterrupt: ignored

In [None]:
list(model.parameters())[4].shape

torch.Size([502, 20])

In [None]:
link_labels_train

array([1, 1, 0, ..., 0, 0, 1])

In [None]:
# Evaluate the trained encoder on the test edges and print the result as "Test MRR"
# (this cell does not save any embeddings).
# NOTE(review): `LPEvaluate` and `test_eids` are not defined in any visible cell — this
# cell appears to rely on leftover kernel state and would raise NameError on a fresh
# Restart & Run All unless they are defined elsewhere; confirm or remove.
acc = LPEvaluate(gconv_model, g, torch.FloatTensor(features), test_eids, neg_sample_size)
print("Test MRR {:.4f}".format(acc))

Test MRR 0.1409
