### **Team -11, Predicting Protein Interaction using GCNs on Yeast Dataset**

Note: Use GPU to run this code

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import networkx as nx
import os

try:
  import dgl
except ImportError as e:
  os.system('!pip install dgl-cu101')

In [25]:
import dgl
from dgl import DGLGraph
dgl.load_backend('pytorch')
from dgl.nn.pytorch import conv as dgl_conv

Using backend: pytorch
Using backend: pytorch


In [None]:
!pip install stellargraph

In [2]:
from stellargraph import StellarGraph
from stellargraph.data import EdgeSplitter

In [3]:
class GCNModel(nn.Module):
    def __init__(self,in_feats,n_hidden,out_dim,layer_dim,hidden_layer_dim):
        super(GCNModel, self).__init__()
        

        self.gcn_layer_1 = dgl_conv.GraphConv(in_feats, n_hidden)

        self.gcn_layer_2 = dgl_conv.GraphConv(n_hidden, out_dim)

        self.lin_layer_1 = nn.Linear(layer_dim,hidden_layer_dim)

        self.lin_layer_2 = nn.Linear(hidden_layer_dim,2)

    def forward(self, g, features, node_pairs, link_labels):
        x = features
        x = self.gcn_layer_1(g, features)
        x = F.relu(x)

        x = self.gcn_layer_2(g,x)
        x = F.relu(x)

        src = node_pairs[:,0]
        dst = node_pairs[:,1]

        emb_src = x[src]
        emb_dst = x[dst]

        z_tensor = torch.mul(emb_src,emb_dst)
        z_tensor = self.lin_layer_1(z_tensor)
        z_tensor = F.relu(z_tensor)
        z_tensor = self.lin_layer_2(z_tensor)

        return z_tensor
    
    def calculate_loss(self, features, g, node_pairs, link_labels):
      pred = self.forward(features, g, node_pairs, link_labels)
      return F.cross_entropy(pred,link_labels)
    
    def predict(self, features, g, node_pairs, link_labels):
        pred = self.forward(features, g, node_pairs, link_labels)
        loss = F.cross_entropy(pred, link_labels)
        return loss, pred


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
raw_edge_list = open('/content/drive/My Drive/Colab Notebooks/DSLab/yeast.edgelist','r')
G = nx.Graph()
G = nx.parse_edgelist(raw_edge_list, delimiter='\t', create_using=G,nodetype=str, data=(('weight', float),))
# Get graph edges and nodes from networkx graph object
nodes = G.nodes
edges = G.edges

print("Graph's Nodes : {} / Edges : {}".format(len(nodes), len(edges)))

Graph's Nodes : 6526 / Edges : 532180


In [6]:
G.remove_edges_from(nx.selfloop_edges(G))

In [7]:
G = nx.relabel.convert_node_labels_to_integers(G, first_label=0, ordering='default', label_attribute=None)

In [8]:
es_test = EdgeSplitter(G)
graph_test, examples_test, link_labels_test = es_test.train_test_split(p=0.25, method="global")

** Sampled 132623 positive and 132623 negative edges. **


In [9]:
es_train = EdgeSplitter(graph_test, G)
graph_train, examples_train, link_labels_train = es_train.train_test_split(
    p=0.25, method="global")
examples_train,examples_val, link_labels_train, link_label_val = train_test_split(examples_train,link_labels_train,test_size=0.033)

** Sampled 99468 positive and 99468 negative edges. **


In [10]:
train_nodes = list(graph_train.nodes)

In [11]:
adj = nx.adjacency_matrix(graph_train)

In [12]:
#features =np.matrix([[i, i] for i in range(adj.shape[0])])
features = np.identity(adj.shape[0])
#features = Variable(torch.FloatTensor(features), requires_grad=True)
features

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [13]:
in_feats = features.shape[0]
n_hidden = 502   #evaluate for 256, 128, 64, 32
out_dim =  50
weight_decay = 5e-4
n_epochs = 20    #evaluate for 5 to 20
lr = 0.01        #evaluate for  0.001, 0.01, and 0.1
layer_dim = 50      
hidden_layer_dim = 32
batch_size = 128  #evaluate for 64, 128 and 256 

In [14]:
from torch.utils.data import Dataset,DataLoader

In [15]:
class EdgeDataset(Dataset):
    def __init__(self, x, y):
        super(EdgeDataset, self).__init__()
        assert x.shape[0] == y.shape[0] # assuming shape[0] = dataset size
        self.x = x
        self.y = y
    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [17]:
traindata = EdgeDataset(examples_train, link_labels_train)
testdata = EdgeDataset(examples_test,link_labels_test)
edge_loader = DataLoader(traindata, batch_size=batch_size, shuffle=True)

In [18]:
'''def get_edge_batch(loader=edge_loader):
  samples = []
  for i,data in enumerate(loader):
    samples.append(data)
  num_batches = len(samples)
  return num_batches,samples'''

'def get_edge_batch(loader=edge_loader):\n  samples = []\n  for i,data in enumerate(loader):\n    samples.append(data)\n  num_batches = len(samples)\n  return num_batches,samples'

In [21]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [22]:
device = torch.device("cuda" if use_cuda else "cpu")

In [None]:
from torch.autograd import Variable
g = DGLGraph()
g.from_networkx(graph_train)
g.readonly()
model = GCNModel(in_feats,n_hidden, out_dim,layer_dim,hidden_layer_dim)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#num_epochs,data_batches = get_edge_batch()

#iterator = iter(edge_loader)
print("Training for 40 epochs.")


losses = []
train_accs = []
val_losses = []
val_accs = []
train_acc = 0.0
loss = 0.0
for epoch in range(n_epochs):
  print('\n', end='')
  for i,data in enumerate(edge_loader):
    #print("Step {}/{}......".format(i+1,))
    pairs, edges = data
    pairs = Variable(pairs).cuda()
    edges = Variable(edges).cuda()
    
    optimizer.zero_grad()
    loss, pred = model.predict(g.to(device),torch.FloatTensor(features).cuda(), pairs, edges)

    losses.append(loss)
    
    loss.backward()
    optimizer.step()

    pred = F.log_softmax(pred, dim=1)
    pred = pred.detach().to("cpu").numpy()
    pred = np.argmax(pred, axis=1)
    y_pred = []
    y_pred = np.append(y_pred, pred)
    train_acc = accuracy_score(edges.detach().to("cpu").numpy(), y_pred)
    train_accs.append(train_acc)
    print("\repoch = {}/{} ({}/{}) - loss = {:.4f} accuracy = {:.4f}"
    .format(epoch+1,n_epochs,i+1,int(examples_train.shape[0]/batch_size),
            loss,train_acc),end='',flush=True)

  #validation scores
  val_loss, val_pred = model.predict(g.to(device),torch.FloatTensor(features).cuda(), 
                                     torch.tensor(examples_val).cuda(), 
                                     torch.tensor(link_label_val).cuda())
  
  val_pred = F.log_softmax(val_pred, dim=1)
  val_pred = val_pred.detach().to("cpu").numpy()
  val_pred = np.argmax(val_pred, axis=1)
  val_y_pred = []
  val_y_pred = np.append(val_y_pred, val_pred)
  val_acc = accuracy_score(link_label_val, val_y_pred)
  val_accs.append(val_acc)
  print("\repoch = {}/{} - loss = {:.4f} accuracy = {:.4f} val_loss = {:.4f} val_accuracy = {:.4f}"
  .format(epoch+1,n_epochs,loss,train_acc,val_loss,val_acc),end='',flush=True)

Training for 40 epochs.

epoch = 1/40 - loss = 0.4699 accuracy = 0.7826 val_loss = 0.4379 val_accuracy = 0.7921
epoch = 2/40 - loss = 0.3873 accuracy = 0.8522 val_loss = 0.4353 val_accuracy = 0.7904
epoch = 3/40 - loss = 0.4331 accuracy = 0.8087 val_loss = 0.4350 val_accuracy = 0.7939
epoch = 4/40 - loss = 0.4126 accuracy = 0.7826 val_loss = 0.4332 val_accuracy = 0.7938
epoch = 5/40 - loss = 0.4328 accuracy = 0.8174 val_loss = 0.4263 val_accuracy = 0.7935
epoch = 6/40 - loss = 0.3722 accuracy = 0.8435 val_loss = 0.4291 val_accuracy = 0.7956
epoch = 7/40 - loss = 0.4446 accuracy = 0.7826 val_loss = 0.4289 val_accuracy = 0.7930
epoch = 8/40 - loss = 0.4447 accuracy = 0.8174 val_loss = 0.4276 val_accuracy = 0.7916
epoch = 9/40 - loss = 0.4362 accuracy = 0.7913 val_loss = 0.4279 val_accuracy = 0.7947
epoch = 10/40 - loss = 0.4452 accuracy = 0.7652 val_loss = 0.4300 val_accuracy = 0.7925
epoch = 11/40 - loss = 0.4791 accuracy = 0.7739 val_loss = 0.4299 val_accuracy = 0.7933
epoch = 12/40 - 

In [21]:
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [27]:
g_test = DGLGraph()
g_test.from_networkx(graph_test)
g.readonly()
test_losses = []
test_preds = []
for i,Data in enumerate(test_loader):
  node_pairs, labels_test = Data
  with torch.no_grad():
    test_loss, tp = model.predict(g_test,torch.FloatTensor(features), torch.tensor(node_pairs), torch.tensor(labels_test))
    test_losses.append(test_loss)
    test_preds = np.append(test_preds,tp)


  if __name__ == '__main__':
