### **Team 11: Predicting Protein Interactions Using GCNs on the Yeast Dataset**

**Note:** This notebook requires CUDA 10.1 (it installs `dgl-cu101`). If your machine does not have a compatible GPU, run the notebook on Google Colab with a GPU runtime enabled.

In [None]:
!pip install stellargraph

Collecting stellargraph
[?25l  Downloading https://files.pythonhosted.org/packages/74/78/16b23ef04cf6fb24a7dea9fd0e03c8308a56681cc5efe29f16186210ba04/stellargraph-1.2.1-py3-none-any.whl (435kB)
[K     |▊                               | 10kB 22.9MB/s eta 0:00:01[K     |█▌                              | 20kB 24.9MB/s eta 0:00:01[K     |██▎                             | 30kB 29.8MB/s eta 0:00:01[K     |███                             | 40kB 13.9MB/s eta 0:00:01[K     |███▊                            | 51kB 14.8MB/s eta 0:00:01[K     |████▌                           | 61kB 12.1MB/s eta 0:00:01[K     |█████▎                          | 71kB 12.5MB/s eta 0:00:01[K     |██████                          | 81kB 11.9MB/s eta 0:00:01[K     |██████▊                         | 92kB 11.3MB/s eta 0:00:01[K     |███████▌                        | 102kB 11.3MB/s eta 0:00:01[K     |████████▎                       | 112kB 11.3MB/s eta 0:00:01[K     |█████████                       | 

In [None]:
from stellargraph.data import EdgeSplitter

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score,roc_auc_score,average_precision_score
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
import numpy as np
import pandas as pd
import networkx as nx
import os
import time

In [None]:
!pip install dgl-cu101

Collecting dgl-cu101
[?25l  Downloading https://files.pythonhosted.org/packages/b9/02/9eba53d81b4eff438cd306775f07f164bdb8708f0fe5175cb122852e50e0/dgl_cu101-0.4.3.post2-cp36-cp36m-manylinux1_x86_64.whl (16.9MB)
[K     |████████████████████████████████| 16.9MB 202kB/s 
Installing collected packages: dgl-cu101
Successfully installed dgl-cu101-0.4.3.post2


In [None]:
import dgl
from dgl import DGLGraph
dgl.load_backend('pytorch')
from dgl.nn.pytorch import conv as dgl_conv

Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


DGL backend not selected or invalid.  Assuming PyTorch for now.
Using backend: pytorch
Using backend: pytorch


In [None]:
class GCNModel(nn.Module):
    """Two graph-convolution layers followed by a two-layer MLP that scores node pairs.

    Node embeddings come from two stacked ``GraphConv`` layers with ReLU. For each
    candidate pair the two endpoint embeddings are multiplied element-wise and fed
    through ``lin_layer_1`` -> ReLU -> ``lin_layer_2``, producing 2 logits per pair.
    """

    def __init__(self,in_feats,n_hidden,out_dim,layer_dim,hidden_layer_dim):
        """Create the layers.

        Args:
            in_feats: input feature size of the first graph convolution.
            n_hidden: output size of the first graph convolution.
            out_dim: output size of the second graph convolution (embedding size).
            layer_dim: accepted for backward compatibility; not used by the model.
            hidden_layer_dim: hidden size of the pair-scoring MLP.
        """
        super(GCNModel, self).__init__()

        self.gcn_layer_1 = dgl_conv.GraphConv(in_feats, n_hidden)

        self.gcn_layer_2 = dgl_conv.GraphConv(n_hidden, out_dim)

        self.lin_layer_1 = nn.Linear(out_dim, hidden_layer_dim)

        self.lin_layer_2 = nn.Linear(hidden_layer_dim,2)

    def forward(self, g, features, node_pairs, link_labels=None):
        """Return one row of 2 logits per entry of ``node_pairs``.

        Args:
            g: graph passed to both graph-convolution layers.
            features: node feature matrix passed to the first graph convolution.
            node_pairs: 2-D index tensor; column 0 holds source node ids and
                column 1 holds destination node ids.
            link_labels: unused here; kept so existing call sites keep working.
        """
        x = self.gcn_layer_1(g, features)
        x = F.relu(x)

        x = self.gcn_layer_2(g,x)
        x = F.relu(x)

        src = node_pairs[:,0]
        dst = node_pairs[:,1]

        emb_src = x[src]
        emb_dst = x[dst]

        # Element-wise product of the two endpoint embeddings is the pair representation.
        z_tensor = torch.mul(emb_src,emb_dst)
        z_tensor = self.lin_layer_1(z_tensor)
        z_tensor = F.relu(z_tensor)
        z_tensor = self.lin_layer_2(z_tensor)
        return z_tensor

    def predict(self, g, features, node_pairs, link_labels):
        """Run ``forward`` and return ``(cross-entropy loss, logits)``.

        The parameter order is ``(g, features, ...)``, matching ``forward`` and the
        way every call site in this notebook already passes its arguments
        positionally (graph first). Previously the two names were declared in the
        opposite order, so keyword calls would have been forwarded swapped.
        """
        pred = self.forward(g, features, node_pairs, link_labels)
        loss = F.cross_entropy(pred, link_labels)
        return loss, pred


In [None]:
# Colab only: mount Google Drive at /content/drive; the edge list and the result
# CSVs used later in this notebook are read from / written to paths under that mount.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Parse the tab-separated edge list (string node ids, one float 'weight' per edge) into an
# undirected graph. The `with` block guarantees the file handle is closed once parsing is done.
with open('/content/drive/My Drive/Colab Notebooks/DSLab/yeast.edgelist','r') as raw_edge_list:
    G = nx.parse_edgelist(raw_edge_list, delimiter='\t', create_using=nx.Graph(),nodetype=str, data=(('weight', float),))
# Get graph edges and nodes from networkx graph object
nodes = G.nodes
edges = G.edges

print("Graph's Nodes : {} / Edges : {}".format(len(nodes), len(edges)))

Graph's Nodes : 6526 / Edges : 532180


In [None]:
# Drop self-loops. The self-loop edges are materialised into a list first so the graph
# is not being mutated while its own edge generator is still being consumed.
G.remove_edges_from(list(nx.selfloop_edges(G)))

In [None]:
# Replace the string node ids with consecutive integers starting at 0
# (default ordering, original labels are not kept as an attribute).
G = nx.convert_node_labels_to_integers(G)

In [None]:
# Sample test examples (positive and negative node pairs) from G with p=0.1.
# graph_test is the graph returned by the splitter and is reused below both to
# sample training examples and as the graph used for test-time message passing.
# A fixed seed makes the split reproducible across Restart & Run All.
es_test = EdgeSplitter(G)
graph_test, examples_test, link_labels_test = es_test.train_test_split(p=0.1, method="local", seed=42)

  p=p, method=method, probs=probs, keep_connected=keep_connected


** Sampled 53049 positive and 53049 negative edges. **


In [None]:
# Sample training examples from graph_test (p=0.7); graph_train is the graph the GCN
# is trained on. Both random steps are seeded so reruns produce the same split.
es_train = EdgeSplitter(graph_test, G)
graph_train, examples_train, link_labels_train = es_train.train_test_split(
    p=0.7, method="local", seed=42)
# Carve a small validation set (3.3% of the sampled examples) out of the training examples.
examples_train,examples_val, link_labels_train, link_label_val = train_test_split(examples_train,link_labels_train,test_size=0.033,random_state=42)

  p=p, method=method, probs=probs, keep_connected=keep_connected


** Sampled 334212 positive and 334212 negative edges. **


In [None]:
def preprocess_graph(adj):
  """Return the normalised adjacency matrix (with self-loops) as a dense float32 tensor.

  With A_ = adj + I and D = diag(row sums of A_), the result is
  (A_ D^-1/2)^T D^-1/2, converted from a scipy COO matrix to a dense torch tensor.

  Args:
    adj: square adjacency matrix; anything accepted by ``scipy.sparse.coo_matrix``.

  Returns:
    Dense ``torch.float32`` tensor with the same shape as ``adj``.

  Side effects:
    Prints the shape and the type of the intermediate scipy matrix.
  """
  adj = sp.coo_matrix(adj)
  adj_ = adj + sp.eye(adj.shape[0])
  rowsum = np.array(adj_.sum(1))
  degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5).flatten())
  features = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt).tocoo()
  print(features.shape)

  print(type(features))

  # Build the sparse tensor with torch.sparse_coo_tensor instead of the deprecated
  # legacy torch.sparse.FloatTensor(indices, values, size) constructor.
  indices = torch.as_tensor(np.vstack((features.row, features.col)), dtype=torch.long)
  values = torch.as_tensor(features.data, dtype=torch.float32)
  shape = torch.Size(features.shape)

  return torch.sparse_coo_tensor(indices, values, shape).to_dense()

In [None]:
# #features =np.matrix([[i, i] for i in range(adj.shape[0])])
# Node features = the normalised adjacency matrix of the training graph, as a dense tensor
# (one row per node); create_model_and_train reads this global.
features = preprocess_graph(nx.adjacency_matrix(graph_train))
# #features = Variable(torch.FloatTensor(features), requires_grad=True)
# features

(6526, 6526)
<class 'scipy.sparse.coo.coo_matrix'>


**Hyperparameter search with `RandomizedSearchCV` (draft — the cell below is commented out and not run)**

In [None]:
# from sklearn.model_selection import RandomizedSearchCV

# param_dist = {"n_hidden": [32, 64, 128, 256],
#               "epochs": np.range(5, 21),
#               "lr": [0.1, 0.01, 0.001],
#               "batch_size": [64, 128, 256]}

# gs = GridSearchCV(net, params, refit=False, scoring='r2', verbose=1, cv=10)

# gs.fit(X_trf, y_trf)

In [None]:
from torch.utils.data import Dataset,DataLoader

In [None]:
class EdgeDataset(Dataset):
    """Map-style dataset pairing each example in ``x`` with its label in ``y``."""

    def __init__(self, x, y):
        """Store ``x`` and ``y``; both must have the same size along their first axis."""
        super().__init__()
        n_examples, n_labels = x.shape[0], y.shape[0]
        assert n_examples == n_labels  # first axis is the dataset size
        self.x, self.y = x, y

    def __len__(self):
        """Number of examples, taken from the first axis of ``y``."""
        n_labels = self.y.shape[0]
        return n_labels

    def __getitem__(self, index):
        """Return the ``(example, label)`` tuple stored at ``index``."""
        example = self.x[index]
        label = self.y[index]
        return example, label

In [None]:

# `batch_size` only exists as a parameter of create_model_and_train, so using it here
# raised NameError. The evaluation loader gets its own explicit batch size instead.
test_batch_size = 512
testdata = EdgeDataset(examples_test,link_labels_test)

test_loader = DataLoader(testdata, batch_size=test_batch_size, shuffle=False)

NameError: ignored

In [None]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [None]:
from torch.autograd import Variable

In [None]:
def create_model_and_train(lr,batch_size,n_hidden,hidden_layer_dim):
  """Build a GCNModel on the training graph, train it for 20 epochs and return the metrics.

  Reads these notebook globals: graph_train, features, examples_train,
  link_labels_train, examples_val, link_label_val and device.

  Args:
    lr: learning rate for the Adam optimizer.
    batch_size: number of node pairs per training mini-batch.
    n_hidden: output size of the first graph-convolution layer.
    hidden_layer_dim: hidden size of the pair-scoring MLP.

  Returns:
    dict of equal-length lists with one entry per epoch: 'epochs', 'train_losses',
    'train_rocs', 'train_aps', 'val_losses', 'val_rocs', 'val_aps',
    'times_per_epoch', plus the hyperparameters repeated once per epoch
    ('learning_rate', 'batch_size', 'hidden1_dim', 'hidden_layer_dim').
    The train_* values are those of the last mini-batch of each epoch.
    ROC AUC and average precision are computed on arg-max class predictions.

  NOTE(review): the trained model is not returned, so later cells cannot reach it.
  """
  g = DGLGraph()
  g.from_networkx(graph_train)
  g.readonly()
  n_epochs=20
  out_dim = 80
  # in_feats is the per-node feature width, i.e. the second axis of the feature matrix.
  model = GCNModel(in_feats = features.shape[1],n_hidden=n_hidden,out_dim=out_dim,layer_dim=out_dim,hidden_layer_dim=hidden_layer_dim)
  model.to(device)
  optimizer = torch.optim.Adam(model.parameters(), lr=lr)
  traindata = EdgeDataset(examples_train, link_labels_train)
  edge_loader = DataLoader(traindata, batch_size=batch_size, shuffle=True)
  n_batches = len(edge_loader)
  print("Training for {} epochs.".format(n_epochs))

  # Move everything that does not change between batches to the target device once,
  # instead of re-copying the dense feature matrix on every step. Using `device`
  # (rather than a hard-coded .cuda()) also keeps the function usable on CPU.
  graph_on_device = g.to(device)
  node_features = torch.as_tensor(features, dtype=torch.float32).to(device)
  val_pairs = torch.tensor(examples_val).to(device)
  val_labels = torch.tensor(link_label_val).to(device)

  eval_report = {
      'train_losses':[],
      'train_rocs':[],
      'train_aps':[],
      'val_losses':[],
      'val_rocs':[],
      'val_aps':[],
      'times_per_epoch':[],
      'epochs':[],
      'learning_rate': list(np.full((n_epochs,),lr)),
      'batch_size':list(np.full((n_epochs,),batch_size)),
      'hidden1_dim':list(np.full((n_epochs,),n_hidden)) ,
      'hidden_layer_dim':list(np.full((n_epochs,),hidden_layer_dim))
  }

  for epoch in range(n_epochs):
    eval_report['epochs'].append(epoch+1)
    t = time.time()
    print('\n', end='')

    # Hold the values of the most recent mini-batch; plain floats so an empty
    # loader still yields a well-formed report row.
    train_loss = 0.0
    train_roc = 0.0
    train_ap = 0.0
    for i,data in enumerate(edge_loader):
      pairs, edges = data
      pairs = pairs.to(device)
      edges = edges.to(device)

      optimizer.zero_grad()
      loss, pred = model.predict(graph_on_device, node_features, pairs, edges)
      loss.backward()
      optimizer.step()

      train_loss = loss.item()
      batch_true = edges.detach().to("cpu").numpy()
      # arg-max over the two logits gives the predicted class (softmax does not change it).
      batch_pred = np.argmax(pred.detach().to("cpu").numpy(), axis=1)
      train_roc = roc_auc_score(batch_true, batch_pred)
      train_ap = average_precision_score(batch_true, batch_pred)
      print("\repoch = {}/{} ({}/{}) - loss = {:.4f} roc_auc_score = {:.4f}"
      .format(epoch+1,n_epochs,i+1,n_batches,
              train_loss,train_roc),end='',flush=True)

    eval_report['train_losses'].append(train_loss)
    eval_report['train_rocs'].append(train_roc)
    eval_report['train_aps'].append(train_ap)

    #validation scores (no gradients needed, so skip building the autograd graph)
    with torch.no_grad():
      val_loss, val_logits = model.predict(graph_on_device, node_features,
                                           val_pairs, val_labels)
    val_loss = val_loss.item()
    val_pred = np.argmax(val_logits.to("cpu").numpy(), axis=1)
    val_roc = roc_auc_score(link_label_val, val_pred)
    val_ap = average_precision_score(link_label_val,val_pred)
    epoch_time = float(time.time()-t)

    eval_report['val_losses'].append(val_loss)
    eval_report['val_rocs'].append(val_roc)
    eval_report['val_aps'].append(val_ap)
    eval_report['times_per_epoch'].append(epoch_time)

    print("\repoch = {}/{} - loss = {:.4f} roc_auc_score = {:.4f} val_loss = {:.4f} val_roc_auc_score = {:.4f} val_avg_precision = {:.4f} time= {:.5f}"
    .format(epoch+1,n_epochs,train_loss,train_roc,val_loss,val_roc,val_ap, epoch_time),end='',flush=False)
  return eval_report

In [None]:
#generate csv (do not run every time)
#import pandas as pd
#df = pd.DataFrame(columns=['epochs','learning_rate','batch_size','hidden1_dim',
#                           'hidden_layer_dim','train_losses','train_rocs',
#                           'train_aps','val_losses','val_rocs','val_aps',
#                           'times_per_epoch'])
#df.to_csv('/content/drive/My Drive/Colab Notebooks/DSLab/eval_report.csv')

In [None]:
def append_to_csv(eval_report, csv_path='/content/drive/My Drive/Colab Notebooks/DSLab/eval_report.csv'):
  """Append the per-epoch rows of ``eval_report`` to the report CSV.

  Args:
    eval_report: dict of equal-length lists as returned by create_model_and_train.
    csv_path: CSV file to extend; defaults to the shared report on Drive.
      If the file does not exist yet it is created from ``eval_report`` alone.

  Raises:
    KeyError: if ``eval_report`` lacks one of the expected columns.
  """
  columns = ['epochs','learning_rate','batch_size','hidden1_dim',
             'hidden_layer_dim','train_losses','train_rocs',
             'train_aps','val_losses','val_rocs','val_aps',
             'times_per_epoch']
  df2 = pd.DataFrame({name: eval_report[name] for name in columns})
  if os.path.exists(csv_path):
    df = pd.read_csv(csv_path, index_col=0)
    df = pd.concat([df,df2],ignore_index=True)
  else:
    # First run: nothing to append to, so the new rows become the whole report.
    df = df2
  df.to_csv(csv_path)

### **Start Evaluation**

In [None]:
#lrs = [0.001, 0.01,0.1]
#_hiddens = [256]
#hidden_layer_dims = [64,32,16,8]
#batch_sizes = [64,128,256]

#19 done(1)
#20 done(1)
#21 not required
#22 training(1).....
#23 training(2).....
#24 256 0.01 64
#25 done(2)
#26 done(2)
#27 256 0.1 64


# Train one hyperparameter configuration; eval_report holds the per-epoch metrics.
# Uncomment the append_to_csv call to persist the run to the shared report CSV.
eval_report = create_model_and_train(lr=0.01,batch_size=512,n_hidden=256,hidden_layer_dim=16)
#append_to_csv(eval_report)

Training for 20 epochs.

epoch = 1/20 - loss = 0.5864 roc_auc_score = 0.6959 val_loss = 0.5940 val_roc_auc_score = 0.6737 val_avg_precision = 0.6174 time= 71.72142
epoch = 2/20 - loss = 0.5339 roc_auc_score = 0.7233 val_loss = 0.5949 val_roc_auc_score = 0.6763 val_avg_precision = 0.6164 time= 71.91388
epoch = 3/20 - loss = 0.5715 roc_auc_score = 0.7001 val_loss = 0.5851 val_roc_auc_score = 0.6822 val_avg_precision = 0.6201 time= 71.86156
epoch = 4/20 - loss = 0.5558 roc_auc_score = 0.6870 val_loss = 0.5831 val_roc_auc_score = 0.6840 val_avg_precision = 0.6271 time= 71.80244
epoch = 5/20 - loss = 0.5921 roc_auc_score = 0.6684 val_loss = 0.5830 val_roc_auc_score = 0.6850 val_avg_precision = 0.6238 time= 71.52719
epoch = 6/20 - loss = 0.5825 roc_auc_score = 0.7048 val_loss = 0.5819 val_roc_auc_score = 0.6868 val_avg_precision = 0.6255 time= 71.78729
epoch = 7/20 - loss = 0.5614 roc_auc_score = 0.7130 val_loss = 0.5815 val_roc_auc_score = 0.6858 val_avg_precision = 0.6236 time= 71.82686
ep

In [None]:
# Score the held-out test pairs, batch by batch, using graph_test for message passing.
# NOTE(review): `model` is not bound at notebook scope in the cells shown here
# (create_model_and_train only returns eval_report) — confirm where it comes from.
g_test = DGLGraph()
g_test.from_networkx(graph_test)
g_test.readonly()  # was `g.readonly()`, but `g` only exists inside create_model_and_train

# Move the graph and the dense feature matrix to the device once, not once per batch.
g_test_on_device = g_test.to(device)
test_features = torch.as_tensor(features, dtype=torch.float32).to(device)

test_losses = []
test_preds = []
test_true = []

with torch.no_grad():
  for i,Data in enumerate(test_loader):
    node_pairs, labels_test = Data
    # The loader already yields tensors; .to(device) avoids re-wrapping them with torch.tensor().
    test_loss, test_pred = model.predict(g_test_on_device, test_features, node_pairs.to(device), labels_test.to(device))
    # Kept as (CPU) tensors because the results cell reads test_losses[-1].data.tolist().
    test_losses.append(test_loss.to("cpu"))
    # arg-max over the two logits gives the predicted class.
    test_pred = np.argmax(test_pred.to("cpu").numpy(), axis=1)

    test_preds = np.append(test_preds, test_pred)
    test_true = np.append(test_true,labels_test.numpy())

  if sys.path[0] == '':


In [None]:
# Test-set metrics over all batches collected above.
# NOTE(review): test_preds holds arg-max class labels (0/1), not probabilities, so both
# scores are computed from hard predictions — confirm this is the intended definition.
test_roc = roc_auc_score(test_true, test_preds)
test_ap = average_precision_score(test_true,test_preds)
print("Test ROC AUC Score = {:.4f}\nTest Average Precision Score = {:.4f}".format(test_roc,test_ap))

Test ROC AUC Score = 0.8058
Test Average Precision Score = 0.7469


In [None]:
# df = pd.DataFrame(columns=['g1i', 'g2i', 'c1i', 'c2i', 'n_epochs', 'lr', 'batch_size', 
#                            'avgtime', 'train_loss', 'train_roc_auc', 'val_loss', 'val_roc_auc', 'val_avg_pre', 'test_roc_auc', 'test_avg_pre'])

df = pd.read_csv('/content/drive/My Drive/Project/hype.csv', index_col=0)

# Summarise the latest run as one row. The values are read from eval_report (returned by
# create_model_and_train) and from the test cells above; the bare names this cell used
# before (in_feats, n_hidden, losses, tpes, ...) are not defined anywhere in the notebook.
# NOTE(review): the mapping from the old names to eval_report keys is inferred — please confirm.
hype_row = {
   'g1i' : features.shape[0],                      # in_feats as passed to GCNModel
   'g2i' : eval_report['hidden1_dim'][-1],         # n_hidden
   'c1i' : 80,                                     # layer_dim; mirrors out_dim hard-coded in create_model_and_train — keep in sync
   'c2i' : eval_report['hidden_layer_dim'][-1],
   'n_epochs' : len(eval_report['epochs']),
   'lr' : eval_report['learning_rate'][-1],
   'batch_size' : eval_report['batch_size'][-1],
   'avgtime' : np.mean(np.array(eval_report['times_per_epoch'])),
   'train_loss' : eval_report['train_losses'][-1],
   'train_roc_auc' : eval_report['train_rocs'][-1],
   'val_loss' : eval_report['val_losses'][-1],
   'val_roc_auc' : eval_report['val_rocs'][-1],
   'val_avg_pre' : eval_report['val_aps'][-1],
   'test_roc_auc' : test_roc,
   'test_avg_pre' : test_ap,
   'test_loss' : test_losses[-1].data.tolist()
}

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df = pd.concat([df, pd.DataFrame([hype_row])], ignore_index=True)

In [None]:
# Show the accumulated hyperparameter-run table (one row per recorded run).
print(df)

      g1i    g2i   c1i  c2i  ...  val_avg_pre  test_roc_auc  test_avg_pre  test_loss
0  6526.0   64.0   8.0  6.0  ...     0.738393      0.000000      0.000000   0.000000
1  6526.0   64.0   8.0  6.0  ...     0.744686      0.000000      0.000000   0.000000
2  6526.0   64.0   8.0  6.0  ...     0.732694      0.000000      0.000000   0.000000
3  6526.0   64.0   8.0  6.0  ...     0.721398      0.798542      0.732627   0.000000
4  6526.0   64.0   8.0  6.0  ...     0.717607      0.795744      0.733527   0.442549
5  6526.0   64.0   8.0  6.0  ...     0.712938      0.787661      0.724660   0.464991
6  6526.0   64.0   8.0  6.0  ...     0.721496      0.793060      0.736295   0.410822
7  6526.0  128.0  32.0  8.0  ...     0.736307      0.805799      0.746854   0.437984

[8 rows x 16 columns]


In [None]:
# Write the table, including the row appended above, back to the CSV it was read from.
df.to_csv('/content/drive/My Drive/Project/hype.csv')