### **Team -11, Predicting Protein Interaction using GCNs on Yeast Dataset**

**Note:** This notebook uses CUDA v.10.1. If your PC is not supported, run this notebook on Colab with GPU settings turned on.

In [7]:
!pip install stellargraph

Collecting stellargraph
[?25l  Downloading https://files.pythonhosted.org/packages/74/78/16b23ef04cf6fb24a7dea9fd0e03c8308a56681cc5efe29f16186210ba04/stellargraph-1.2.1-py3-none-any.whl (435kB)
[K     |████████████████████████████████| 440kB 1.7MB/s 
Installing collected packages: stellargraph
Successfully installed stellargraph-1.2.1


In [1]:
from stellargraph.data import EdgeSplitter

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score,roc_auc_score,average_precision_score
from sklearn.model_selection import train_test_split
import numpy as np
import networkx as nx
import os

Using backend: pytorch


In [5]:
!pip install dgl-cu101

Collecting dgl-cu101
[?25l  Downloading https://files.pythonhosted.org/packages/b9/02/9eba53d81b4eff438cd306775f07f164bdb8708f0fe5175cb122852e50e0/dgl_cu101-0.4.3.post2-cp36-cp36m-manylinux1_x86_64.whl (16.9MB)
[K     |████████████████████████████████| 16.9MB 1.8MB/s 
Installing collected packages: dgl-cu101
Successfully installed dgl-cu101-0.4.3.post2


In [3]:
import dgl
from dgl import DGLGraph
dgl.load_backend('pytorch')
from dgl.nn.pytorch import conv as dgl_conv

Using backend: pytorch


In [4]:
class GCNModel(nn.Module):
    def __init__(self,in_feats,n_hidden,out_dim,layer_dim,hidden_layer_dim):
        super(GCNModel, self).__init__()
        

        self.gcn_layer_1 = dgl_conv.GraphConv(in_feats, n_hidden)

        self.gcn_layer_2 = dgl_conv.GraphConv(n_hidden, out_dim)

        self.lin_layer_1 = nn.Linear(layer_dim,hidden_layer_dim)

        self.lin_layer_2 = nn.Linear(hidden_layer_dim,2)

    def forward(self, g, features, node_pairs, link_labels):
        x = features
        x = self.gcn_layer_1(g, features)
        x = F.relu(x)

        x = self.gcn_layer_2(g,x)
        x = F.relu(x)

        src = node_pairs[:,0]
        dst = node_pairs[:,1]

        emb_src = x[src]
        emb_dst = x[dst]

        z_tensor = torch.mul(emb_src,emb_dst)
        z_tensor = self.lin_layer_1(z_tensor)
        z_tensor = F.relu(z_tensor)
        z_tensor = self.lin_layer_2(z_tensor)

        return z_tensor
    
    def calculate_loss(self, features, g, node_pairs, link_labels):
      pred = self.forward(features, g, node_pairs, link_labels)
      return F.cross_entropy(pred,link_labels)
    
    def predict(self, features, g, node_pairs, link_labels):
        pred = self.forward(features, g, node_pairs, link_labels)
        loss = F.cross_entropy(pred, link_labels)
        return loss, pred


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [6]:
raw_edge_list = open('/content/drive/My Drive/Colab Notebooks/DSLab/yeast.edgelist','r')
G = nx.Graph()
G = nx.parse_edgelist(raw_edge_list, delimiter='\t', create_using=G,nodetype=str, data=(('weight', float),))
# Get graph edges and nodes from networkx graph object
nodes = G.nodes
edges = G.edges

print("Graph's Nodes : {} / Edges : {}".format(len(nodes), len(edges)))

Graph's Nodes : 6526 / Edges : 532180


In [7]:
G.remove_edges_from(nx.selfloop_edges(G))

In [8]:
G = nx.relabel.convert_node_labels_to_integers(G, first_label=0, ordering='default', label_attribute=None)

In [9]:
es_test = EdgeSplitter(G)
graph_test, examples_test, link_labels_test = es_test.train_test_split(p=0.25, method="global")

** Sampled 132623 positive and 132623 negative edges. **


In [10]:
es_train = EdgeSplitter(graph_test, G)
graph_train, examples_train, link_labels_train = es_train.train_test_split(
    p=0.25, method="global")
examples_train,examples_val, link_labels_train, link_label_val = train_test_split(examples_train,link_labels_train,test_size=0.033)

** Sampled 99468 positive and 99468 negative edges. **


In [11]:
train_nodes = list(graph_train.nodes)

In [12]:
adj = nx.adjacency_matrix(graph_train)

In [13]:
#features =np.matrix([[i, i] for i in range(adj.shape[0])])
features = np.identity(adj.shape[0])
#features = Variable(torch.FloatTensor(features), requires_grad=True)
features

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [14]:
in_feats = features.shape[0]
n_hidden = 502   #evaluate for 256, 128, 64, 32
out_dim =  50
weight_decay = 5e-4
n_epochs = 20    #evaluate for 5 to 20
lr = 0.01        #evaluate for  0.001, 0.01, and 0.1
layer_dim = 50      
hidden_layer_dim = 32
batch_size = 128  #evaluate for 64, 128 and 256 

In [15]:
from torch.utils.data import Dataset,DataLoader

In [16]:
class EdgeDataset(Dataset):
    def __init__(self, x, y):
        super(EdgeDataset, self).__init__()
        assert x.shape[0] == y.shape[0] # assuming shape[0] = dataset size
        self.x = x
        self.y = y
    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [17]:
traindata = EdgeDataset(examples_train, link_labels_train)
testdata = EdgeDataset(examples_test,link_labels_test)
edge_loader = DataLoader(traindata, batch_size=batch_size, shuffle=True)

In [18]:
'''def get_edge_batch(loader=edge_loader):
  samples = []
  for i,data in enumerate(loader):
    samples.append(data)
  num_batches = len(samples)
  return num_batches,samples'''

'def get_edge_batch(loader=edge_loader):\n  samples = []\n  for i,data in enumerate(loader):\n    samples.append(data)\n  num_batches = len(samples)\n  return num_batches,samples'

In [19]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [22]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [None]:
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [None]:
from torch.autograd import Variable
g = DGLGraph()
g.from_networkx(graph_train)
g.readonly()
model = GCNModel(in_feats,n_hidden, out_dim,layer_dim,hidden_layer_dim)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

print("Training for {} epochs.".format(n_epochs))


losses = []
train_rocs = []
val_losses = []
val_rocs = []
train_acc = 0.0
loss = 0.0
for epoch in range(n_epochs):
  print('\n', end='')
  for i,data in enumerate(edge_loader):
    #print("Step {}/{}......".format(i+1,))
    pairs, edges = data
    pairs = Variable(pairs).cuda()
    edges = Variable(edges).cuda()
    
    optimizer.zero_grad()
    loss, pred = model.predict(g.to(device),torch.FloatTensor(features).cuda(), pairs, edges)

    losses.append(loss)
    
    loss.backward()
    optimizer.step()

    pred = F.log_softmax(pred, dim=1)
    pred = pred.detach().to("cpu").numpy()
    pred = np.argmax(pred, axis=1)
    y_pred = []
    y_pred = np.append(y_pred, pred)
    train_roc = roc_auc_score(edges.detach().to("cpu").numpy(), y_pred)
    train_rocs.append(train_roc)
    print("\repoch = {}/{} ({}/{}) - loss = {:.4f} roc_auc_score = {:.4f}"
    .format(epoch+1,n_epochs,i+1,int(examples_train.shape[0]/batch_size),
            loss,train_roc),end='',flush=True)

  #validation scores
  val_loss, val_pred = model.predict(g.to(device),torch.FloatTensor(features).cuda(), 
                                     torch.tensor(examples_val).cuda(), 
                                     torch.tensor(link_label_val).cuda())
  
  val_pred = F.log_softmax(val_pred, dim=1)
  val_pred = val_pred.detach().to("cpu").numpy()
  val_pred = np.argmax(val_pred, axis=1)
  val_y_pred = []
  val_y_pred = np.append(val_y_pred, val_pred)
  val_roc = roc_auc_score(link_label_val, val_y_pred)
  val_ap = average_precision_score(link_label_val,val_y_pred)
  val_rocs.append(val_roc)
  print("\repoch = {}/{} - loss = {:.4f} roc_auc_score = {:.4f} val_loss = {:.4f} val_roc_auc_score = {:.4f} val_avg_precision = {:.4f}"
  .format(epoch+1,n_epochs,loss,train_roc,val_loss,val_roc,val_ap),end='',flush=True)

###Predict test set using the trained model

In [None]:
g_test = DGLGraph()
g_test.from_networkx(graph_test)
g.readonly()
test_losses = []
test_preds = []
for i,Data in enumerate(test_loader):
  node_pairs, labels_test = Data
  with torch.no_grad():
    test_loss, tp = model.predict(g_test,torch.FloatTensor(features), torch.tensor(node_pairs), torch.tensor(labels_test))
    test_losses.append(test_loss)
    test_preds = np.append(test_preds,tp)


  if __name__ == '__main__':
