In [1]:
import os
import sys
import math
import pickle
import argparse
import numpy as np
import torch
import torch.optim as optim
import torch.nn.functional as F
from model import BACPI
from model import BACPI_DEEPCCA
from utils import *
from data_process import training_data_process

In [2]:
# Run configuration. These module-level names replace the command-line
# options the script version exposed through argparse.

# Valid choices for the prediction task and the dataset. Every key maps to
# itself, so the dicts act as a checked menu: a typo raises KeyError.
tasks = {'interaction': 'interaction', "affinity": 'affinity'}
datasets = {'human': 'human','celegans': 'celegans', 'EC50': 'EC50','IC50': 'IC50',
           'Kd': 'Kd','Ki': 'Ki'}

# Requested execution mode: 'gpu' or 'cpu'.
mode = 'gpu'

# Logging switch: 0 = do not write the per-batch log to stdout, 1 = write it.
# This must be a scalar; a list such as [0, 1] is always truthy and would make
# the "0" setting impossible to select.
verbose = 1

# Hyper-parameters
lr = 0.0005        # initial learning rate
step_size = 10     # step size of the lr scheduler
gamma = 0.5        # lr decay rate
batch_size = 8     # batch size (the script's command-line default was 16)
num_epochs = 20    # number of epochs

# Graph attention layer
gat_dim = 50       # dimension of node feature in graph attention layer
num_head = 3       # number of graph attention layer heads
dropout = 0.1
alpha = 0.1        # LeakyReLU alpha

# Feature dimensions
comp_dim = 80      # dimension of compound atoms feature
prot_dim = 80      # dimension of protein amino feature
latent_dim = 80    # dimension of compound and protein feature

# CNN / output head
window = 5         # window size of cnn model
layer_cnn = 3      # number of layers in cnn model
layer_out = 3      # number of output layers in prediction model

In [3]:
# Pick the compute device. CUDA is used only when it was requested through
# `mode` AND is actually available; `device` is always a torch.device so that
# downstream code never has to handle a bare string.
use_cuda = mode == 'gpu' and torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print('The code is running on the', device)

## modify by selecting dataset type and interaction/affinity ##
task = tasks['interaction']
dataset = datasets['human']
##

# Processed data lives under ../datasets/<task>/<dataset>; build it on first use.
data_dir = '../datasets/' + task + '/' + dataset
if not os.path.isdir(data_dir):
    training_data_process(task, dataset)

The code is run on the cuda


In [4]:
# Load the processed datasets.
#
## Newly added: helper that loads the extra feature array (new_feature_<split>.npy)

def load_new_feature(datadir, target_type, datapack):
    """Load a saved ``new_feature_<target_type>.npy`` array and append it to ``datapack``.

    Parameters
    ----------
    datadir : str
        Dataset root directory (without a trailing slash).
    target_type : str or None
        Split name such as ``'train'`` or ``'test'``. When truthy, the file is
        read from ``<datadir>/<target_type>/new_feature_<target_type>.npy``.
        When empty or ``None``, it is read from ``<datadir>/new_feature_.npy``.
    datapack : list
        Existing list of per-field arrays. It is modified in place.

    Returns
    -------
    list
        The same ``datapack`` object, with the loaded array as its last element.
    """
    # Normalise None to '' so the filename concatenation below cannot raise
    # TypeError on the "no split" path.
    suffix = target_type or ''
    if suffix:
        dir_input = datadir + '/' + suffix + '/'
    else:
        dir_input = datadir + '/'
    feature_path = dir_input + 'new_feature_' + suffix + '.npy'
    print(feature_path)
    new_feature = np.load(feature_path)
    datapack.append(new_feature)
    return datapack

print('Load data...')
# Base fields for each split, read from the processed dataset directory.
train_data, test_data = load_data(data_dir, 'train'), load_data(data_dir, 'test')
# Newly added: extend each split with its saved new_feature_<split>.npy array
# (train first, then test).
train_data, test_data = (
    load_new_feature(data_dir, 'train', train_data),
    load_new_feature(data_dir, 'test', test_data),
)

Load data...
../datasets/interaction/human/train/new_feature_train.npy
../datasets/interaction/human/test/new_feature_test.npy


In [5]:
# Hold out 10% of the training split as a development set.
# NOTE(review): this rebinds train_data, so re-running the cell splits the
# already-reduced training set again; re-run the loading cell first.
train_data, dev_data = split_data(train_data, 0.1)

# Load the atom / amino-acid dictionaries. Context managers make sure the file
# handles are closed. pickle can execute arbitrary code on load, so these
# files must come from a trusted preprocessing run.
with open(data_dir + '/atom_dict', 'rb') as atom_file:
    atom_dict = pickle.load(atom_file)
with open(data_dir + '/amino_dict', 'rb') as amino_file:
    amino_dict = pickle.load(amino_file)

# train_data has [compounds, adjacencies, fingerprint, proteins, interactions, features_created_by_deepcca]

In [6]:
def train_eval(model, task, data_train, data_dev, data_test, device, params):
    """Train ``model`` and report train/dev/test metrics after every epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise. It is called with the padded atom/adjacency/amino
        tensors, their masks, the fingerprint batch and the extra feature batch.
    task : str
        ``'affinity'`` (MSE loss, best epoch = lowest dev RMSE) or
        ``'interaction'`` (cross-entropy loss, best epoch = highest dev AUC).
    data_train, data_dev, data_test : sequence
        Parallel per-field containers, indexed by sample along the first axis.
        The training fields are indexed with an integer index array, so they
        presumably need to be numpy arrays; verify against the data loader.
    device : torch.device or str
        Forwarded to ``batch2tensor``.
    params : dict
        Must contain ``'lr'``, ``'batch_size'``, ``'num_epochs'`` and
        ``'verbose'``. ``'step_size'`` and ``'gamma'`` configure the StepLR
        scheduler and default to 10 and 0.5 when absent.

    Returns
    -------
    list or None
        Test-set metrics of the epoch with the best dev score:
        ``[rmse, pearson, spearman]`` for affinity and ``[auc, acc, aupr]``
        for interaction. ``None`` if ``task`` is not recognised or no epoch
        improved on the initial best score (including ``num_epochs == 0``).
    """
    if task == 'affinity':
        criterion = F.mse_loss
        best_res = 2 ** 10
    elif task == 'interaction':
        criterion = F.cross_entropy
        best_res = 0
    else:
        print("Please choose a correct mode!!!")
        return

    optimizer = optim.Adam(model.parameters(), lr=params['lr'], weight_decay=0, amsgrad=True)
    # Honour the configured schedule instead of hard-coding it; the defaults
    # keep the previous behaviour for callers that omit these keys.
    scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=params.get('step_size', 10),
        gamma=params.get('gamma', 0.5),
    )

    # Defined up front so the final return cannot hit an unbound name when no
    # epoch ever improves the dev score.
    res = None
    idx = np.arange(len(data_train[0]))
    batch_size = params['batch_size']
    num_batches = math.ceil(len(data_train[0]) / batch_size)

    for epoch in range(params['num_epochs']):
        print('epoch: {}'.format(epoch))
        np.random.shuffle(idx)
        model.train()
        pred_labels = []
        predictions = []
        labels = []
        for i in range(num_batches):
            batch_idx = idx[i * batch_size: (i + 1) * batch_size]
            batch_data = [data_train[di][batch_idx] for di in range(len(data_train))]
            # added new feature
            atoms_pad, atoms_mask, adjacencies_pad, batch_fps, amino_pad, amino_mask, label, batch_new_feature = batch2tensor(batch_data, device)

            pred = model(atoms_pad, atoms_mask, adjacencies_pad, amino_pad, amino_mask, batch_fps, batch_new_feature)
            if task == 'affinity':
                loss = criterion(pred.float(), label.float())
                predictions += pred.detach().cpu().numpy().reshape(-1).tolist()
                labels += label.cpu().numpy().reshape(-1).tolist()
            elif task == 'interaction':
                loss = criterion(pred.float(), label.view(label.shape[0]).long())
                ys = F.softmax(pred, 1).detach().cpu().numpy()
                pred_labels += [np.argmax(row) for row in ys]
                # Column 1 is used as the positive-class score.
                predictions += [row[1] for row in ys]
                labels += label.cpu().numpy().reshape(-1).tolist()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if params['verbose']:
                # '\r' rewrites the same console line for a compact progress log.
                sys.stdout.write('\repoch:{}, batch:{}/{}, loss:{}'.format(epoch, i, num_batches - 1, loss.item()))
                sys.stdout.flush()

        if task == 'affinity':
            print(' ')
            predictions = np.array(predictions)
            labels = np.array(labels)
            rmse_train, pearson_train, spearman_train = regression_scores(labels, predictions)
            print('Train rmse:{}, pearson:{}, spearman:{}'.format(rmse_train, pearson_train, spearman_train))

            rmse_dev, pearson_dev, spearman_dev = test(model, task, data_dev, batch_size, device)
            print('Dev rmse:{}, pearson:{}, spearman:{}'.format(rmse_dev, pearson_dev, spearman_dev))

            rmse_test, pearson_test, spearman_test = test(model, task, data_test, batch_size, device)
            print( 'Test rmse:{}, pearson:{}, spearman:{}'.format(rmse_test, pearson_test, spearman_test))

            # Model selection on the dev set: lower RMSE is better.
            if rmse_dev < best_res:
                best_res = rmse_dev
                # torch.save(model, '../checkpoint/best_model_affinity.pth')
                res = [rmse_test, pearson_test, spearman_test]

        else:
            print(' ')
            pred_labels = np.array(pred_labels)
            predictions = np.array(predictions)
            labels = np.array(labels)
            auc_train, acc_train, apur_train = classification_scores(labels, predictions, pred_labels)
            print('Train auc:{}, acc:{}, aupr:{}'.format(auc_train, acc_train, apur_train))

            auc_dev, acc_dev, aupr_dev = test(model, task, data_dev, batch_size, device)
            print('Dev auc:{}, acc:{}, aupr:{}'.format(auc_dev, acc_dev, aupr_dev))

            auc_test, acc_test, aupr_test = test(model, task, data_test, batch_size, device)
            print('Test auc:{}, acc:{}, aupr:{}'.format(auc_test, acc_test, aupr_test))

            # Model selection on the dev set: higher AUC is better.
            if auc_dev > best_res:
                best_res = auc_dev
                # torch.save(model, '../checkpoint/best_model_interaction.pth')
                res = [auc_test, acc_test, aupr_test]

        scheduler.step()
    return res


def test(model, task, data_test, batch_size, device):
    """Evaluate ``model`` on ``data_test`` and return three summary metrics.

    The model is switched to eval mode and run batch by batch, in order,
    without gradient tracking. For ``task == 'affinity'`` the result is
    ``(rmse, pearson, spearman)`` from ``regression_scores``; for any other
    task it is ``(auc, acc, aupr)`` from ``classification_scores``.
    """
    model.eval()
    is_regression = task == 'affinity'
    scores, hard_labels, targets = [], [], []

    n_fields = len(data_test)
    n_batches = math.ceil(len(data_test[0]) / batch_size)
    for batch_no in range(n_batches):
        lo = batch_no * batch_size
        hi = (batch_no + 1) * batch_size
        batch_data = [data_test[field][lo:hi] for field in range(n_fields)]
        (atoms_pad, atoms_mask, adjacencies_pad, batch_fps,
         amino_pad, amino_mask, label, batch_new_feature) = batch2tensor(batch_data, device)

        with torch.no_grad():
            pred = model(atoms_pad, atoms_mask, adjacencies_pad, amino_pad, amino_mask, batch_fps, batch_new_feature)

        if is_regression:
            scores += pred.cpu().detach().numpy().reshape(-1).tolist()
        else:
            probs = F.softmax(pred, 1).to('cpu').data.numpy()
            hard_labels += [np.argmax(row) for row in probs]
            # Column 1 is used as the score for the positive class.
            scores += [row[1] for row in probs]
        targets += label.cpu().numpy().reshape(-1).tolist()

    hard_labels = np.array(hard_labels)
    scores = np.array(scores)
    targets = np.array(targets)

    if is_regression:
        rmse, pearson, spearman = regression_scores(targets, scores)
        return rmse, pearson, spearman
    auc, acc, aupr = classification_scores(targets, scores, hard_labels)
    return auc, acc, aupr

In [7]:
print('training...')
# Bundle every tunable into one dict; it is passed to both the model
# constructor and the training loop.
params = {'lr':lr, 'step_size':step_size, 'gamma':gamma, 'batch_size':batch_size,
        'num_epochs':num_epochs, 'verbose':verbose, 'gat_dim':gat_dim, 'num_head':num_head,
         'dropout':dropout, 'alpha':alpha, 'comp_dim':comp_dim, 'prot_dim':prot_dim,
          'latent_dim':latent_dim,"window":window, 'layer_cnn':layer_cnn, 'layer_out':layer_out}

# BACPI_DEEPCCA is the variant that also consumes the extra feature array;
# the baseline would be BACPI(task, len(atom_dict), len(amino_dict), params).
model = BACPI_DEEPCCA(task, len(atom_dict), len(amino_dict), params)
model.to(device)

res = train_eval(model, task, train_data, dev_data, test_data, device, params)

print('Finish training!')
if res is None:
    # train_eval returns None when the task is not recognised; say so instead
    # of ending silently with no metrics.
    print('No test result available: train_eval returned None.')
elif task == 'affinity':
    print('Finally test result of rmse:{}, pearson:{}, spearman:{}'.format(res[0], res[1], res[2]))
elif task == 'interaction':
    print('Finally test result of auc:{}, acc:{}, aupr:{}'.format(res[0], res[1], res[2]))

training...
epoch: 0
atoms_vector.shape torch.Size([8, 72, 80])
epoch:0, batch:0/1819, loss:0.6912870407104492atoms_vector.shape torch.Size([8, 67, 80])
epoch:0, batch:1/1819, loss:0.6856408715248108atoms_vector.shape torch.Size([8, 52, 80])
epoch:0, batch:2/1819, loss:0.6646974682807922atoms_vector.shape torch.Size([8, 60, 80])
epoch:0, batch:3/1819, loss:0.6508852243423462atoms_vector.shape torch.Size([8, 50, 80])
epoch:0, batch:4/1819, loss:0.5821177363395691atoms_vector.shape torch.Size([8, 85, 80])
epoch:0, batch:5/1819, loss:0.6178185939788818atoms_vector.shape torch.Size([8, 75, 80])
epoch:0, batch:6/1819, loss:0.5621717572212219atoms_vector.shape torch.Size([8, 71, 80])
epoch:0, batch:7/1819, loss:0.29334497451782227atoms_vector.shape torch.Size([8, 92, 80])
epoch:0, batch:8/1819, loss:0.2130964696407318atoms_vector.shape torch.Size([8, 66, 80])
epoch:0, batch:9/1819, loss:0.4148363769054413atoms_vector.shape torch.Size([8, 47, 80])
epoch:0, batch:10/1819, loss:0.05949272960424