In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F
import dgl
import dgl.function as fn
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle
import warnings
warnings.filterwarnings("ignore")

#local files
from MLP import MLP
from GIN_CNN import GIN_CNN

torch.manual_seed(2)
np.random.seed(2) 

In [2]:
dgl.__version__ == '0.2'

True

In [3]:
with open('../graphs_amino.pickle', 'rb') as gr:
    all_A = pickle.load(gr)
    
with open('../lables_amino.pickle', 'rb') as la:
    label = pickle.load(la)

In [4]:
np.unique(label, return_counts=True)

(array([0, 1]), array([363, 632]))

In [5]:
label = list(label)
max_label = int(max(label)) + 1
print(max_label)

2


In [6]:
#undersampling
class_1 = np.random.choice(np.where(np.array(label)==1)[0], len(np.where(np.array(label)==0)[0]), replace=False)
inds = np.append(class_1, np.where(np.array(label)==0)[0])
inds = np.random.permutation(inds)
all_A = list(np.array(all_A)[inds])
label = list(np.array(label)[inds])

# Graph Isomorphism Network

In [7]:
def train(model, train_graphs, optimizer, epoch):
    model.train()

    total_iters = iters_per_epoch
#     pbar = tqdm(range(total_iters), unit='batch')
    pbar = range(total_iters)

    loss_accum = 0
    n_iter = 0
    for pos in pbar:
        selected_idx = np.random.permutation(len(train_graphs))[:batch_size]
                
        batch_graph = [train_graphs[idx][0] for idx in selected_idx]
        labels = torch.FloatTensor([train_graphs[idx][1] for idx in selected_idx])
        
        output = model(batch_graph)
        
        loss = criterion(output, labels.view_as(output))

        #backprop
        if optimizer is not None:
            optimizer.zero_grad()
            loss.backward()         
            optimizer.step()
        

        loss = loss.detach().numpy()
        loss_accum += loss
        n_iter += 1

        #report
#         pbar.set_description('epoch: %d' % (epoch))

    average_loss = loss_accum/n_iter
    print(f"epoch: {epoch}, \t loss training: {average_loss}", end='\t')
    
    return average_loss

###pass data to model with minibatch during testing to avoid memory overflow (does not perform backpropagation)
def pass_data_iteratively(model, graphs, minibatch_size = 64):
    model.eval()
    output = []
    idx = np.arange(len(graphs))
    for i in range(0, len(graphs), minibatch_size):
        sampled_idx = idx[i:i+minibatch_size]
        if len(sampled_idx) == 0:
            continue
        output.append(model([graphs[j] for j in sampled_idx]).detach())
    return torch.cat(output, 0)

def test(model, train_graphs, test_graphs, epoch):
    model.eval()
    
    
    batch_graph_train = [train_graph_[0] for train_graph_ in train_graphs]
    labels = torch.FloatTensor([train_graph_[1] for train_graph_ in train_graphs])
    output = pass_data_iteratively(model, batch_graph_train)
    output = torch.round(torch.sigmoid(output))
    correct = output.eq(labels.view_as(output)).sum().cpu().item()
    acc_train = correct / float(len(train_graphs))
    print("accuracy train: %f" % (acc_train), end='\t')
    
    
    #############################################################

    batch_graph = [test_graph_[0] for test_graph_ in test_graphs]
    labels = torch.FloatTensor([test_graph_[1] for test_graph_ in test_graphs])
        
    #### we will not use pass_data_iteratively for now as we do not have a lot of data
    output = model(batch_graph)
    output = torch.round(torch.sigmoid(output))

    correct = output.eq(labels.view_as(output)).sum().item()
    acc_test = correct / float(len(test_graphs))

    print(f"accuracy test: {acc_test}") #accuracy train: {acc_train};

    return acc_test

In [8]:
arr = []
for z in zip(all_A, label):
    arr.append(z)

trainset, testset = train_test_split(arr)

In [9]:
batch_size=64
# 'input batch size for training (default: 32)'
iters_per_epoch=int(len(trainset)/batch_size)
# 'number of iterations per each epoch (default: 50)'
epochs=30
#'number of epochs to train (default: 350)'
lr=0.01
#'learning rate (default: 0.01)'
num_layers = 2
#'number of layers INCLUDING the input one (default: 5)'
num_mlp_layers=2
#'number of layers for MLP EXCLUDING the input one (default: 2). 1 means linear model.'
hidden_dim=8
#'number of hidden units (default: 64)'
final_dropout=0.1
#'final layer dropout (default: 0.5)'

print_mode=0

num_classes = 1

In [10]:
criterion = nn.BCEWithLogitsLoss()


model = GIN_CNN(num_layers, num_mlp_layers, trainset[0][0].ndata['h'].shape[1], hidden_dim, num_classes, final_dropout)

optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.2)

losses=[]
acc = []

for epoch in range(1, epochs + 1):
    scheduler.step()

    avg_loss = train(model, trainset, optimizer, epoch)
    losses.append(avg_loss)
    acc_test = test(model, trainset, testset, epoch)
    acc.append(acc_test)


    if print_mode==1:
        print(model.edge_features)
        print('_____________________')

epoch: 1, 	 loss training: 1.1846226304769516	accuracy train: 0.494485	accuracy test: 0.4835164835164835
epoch: 2, 	 loss training: 0.8808127045631409	accuracy train: 0.477941	accuracy test: 0.5054945054945055
epoch: 3, 	 loss training: 0.7640379443764687	accuracy train: 0.472426	accuracy test: 0.489010989010989
epoch: 4, 	 loss training: 0.7723512127995491	accuracy train: 0.511029	accuracy test: 0.521978021978022
epoch: 5, 	 loss training: 0.733861654996872	accuracy train: 0.512868	accuracy test: 0.5274725274725275
epoch: 6, 	 loss training: 0.690263070166111	accuracy train: 0.560662	accuracy test: 0.5769230769230769
epoch: 7, 	 loss training: 0.6783007308840752	accuracy train: 0.602941	accuracy test: 0.5274725274725275
epoch: 8, 	 loss training: 0.6744397431612015	accuracy train: 0.586397	accuracy test: 0.5604395604395604
epoch: 9, 	 loss training: 0.6961414739489555	accuracy train: 0.514706	accuracy test: 0.5054945054945055
epoch: 10, 	 loss training: 0.6850193440914154	accuracy tra