In [1]:
import pickle
import numpy as np
import os
import torch
import gc

In [2]:
#  making feature vectors of seq, one-hot encoding

print('Making Training dataset...')
ecfp = np.load('./dataset_hard'+'/cv_'+str(0)+'/train_fingerprint.npy')
ecfp = np.asarray(ecfp, dtype='float32').reshape(-1,1024)

file_interactions=np.load('./dataset_hard'+'/cv_'+str(0)+'/train_interaction.npy')
print('Loading labels: train_interaction.npy')
cID = np.load('./dataset_hard'+'/cv_'+str(0)+'/train_chemIDs.npy')
print('Loading chemIDs: train_chemIDs.npy')
with open('./dataset_hard'+'/cv_'+str(0)+'/train_proIDs.txt') as f:
    pID = [s.strip() for s in f.readlines()]
print('Loading proIDs: train_proIDs.txt')
n2v_c, n2v_p = [], []
with open('./modelpp.pickle', mode='rb') as f:
    modelpp = pickle.load(f)
with open('./modelcc.pickle', mode='rb') as f:
    modelcc = pickle.load(f)
for j in cID:
    n2v_c.append(modelcc.wv[str(j)])
for k in pID:
    n2v_p.append(modelpp.wv[k])
interactions = np.asarray(file_interactions, dtype='int32').reshape(-1,1)
n2vc = np.asarray(n2v_c, dtype='float32').reshape(-1, 128)
n2vp = np.asarray(n2v_p, dtype='float32').reshape(-1, 128)
#reset memory
del n2v_c, n2v_p, cID, pID, modelcc, modelpp, file_interactions
gc.collect()

file_sequences=np.load('./dataset_hard'+'/cv_'+str(0)+'/train_reprotein.npy')
print('Loading sequences: train_reprotein.npy', flush=True)
sequences = np.asarray(file_sequences, dtype='float32').reshape((-1, 1, 5762, 20))
# reset memory
del file_sequences
gc.collect()

print('interactions.shape: ', interactions.shape, 'ecfp.shape: ', ecfp.shape,'sequences.shape: ',  sequences.shape, 'n2vc.shape:', n2vc.shape,'n2vp.shape:', n2vp.shape, flush=True)

Making Training dataset...
Loading labels: train_interaction.npy
Loading chemIDs: train_chemIDs.npy
Loading proIDs: train_proIDs.txt
Loading sequences: train_reprotein.npy
interactions.shape:  (14196, 1) ecfp.shape:  (14196, 1024) sequences.shape:  (14196, 1, 5762, 20) n2vc.shape: (14196, 128) n2vp.shape: (14196, 128)


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('use', device)

use cuda


In [36]:
import torch.nn as nn
import torch.nn.functional as F
import time
import ignite.metrics

# prosize: 5762, plensize:20
# j1:33, s1:1, pf1:64 = window-size, stride-step, No. of filters of first protein-CNN convolution layer
# ja1:17 sa1:1 = window-size, stride-step of first protein-CNN average-pooling layer
# j2:23,s2:1, pf2:64 = second protein-CNN convolution layer
# ja2:11, sa2:1 = second protein-CNN average-pooling layer
# j3:33, s3:1, pf3:32 = third protein-CNN convolution layer
# ja3:17, sa3:1 third protein-CNN average-pooling layer
# n_hid3:70, n_hid4:80, n_hid5:60, n_out:1
class DeepCNN(nn.Module):
    def __init__(self):
        super(DeepCNN, self).__init__()
        # first conv of seq_cnn
        self.conv1_pro = nn.Conv2d(1, 64, (33, 20), stride= (1, ), padding=(33//2, 0))
        self.bn1_pro = nn.BatchNorm2d(64)
        # second conv of seq_cnn
        self.conv2_pro = nn.Conv2d(64, 64, (23, 1), stride= (1, ), padding=(23//2, 0))
        self.bn2_pro = nn.BatchNorm2d(64)
        # third conv of seq_cnn
        self.conv3_pro = nn.Conv2d(64, 32, (33, 1), stride=(1, ), padding=(33//2, 0))
        self.bn3_pro = nn.BatchNorm2d(32)
        self.fc3_pro = nn.Linear(1, 70)
        self.fc4 = nn.Linear(1152, 80) # 1024 + 128
        self.fc5 = nn.Linear(80, 60) # nhid4, nhid5
        self.fc4_pro = nn.Linear(2368, 80) # 2240+128
        self.fc5_pro = nn.Linear(80, 60)
        self.fc6 = nn.Linear(3600, 1)  #


        self.m1 = (5762+(33//2*2)-33)//1+1
        # print('m1', self.m1)
        self.m2 = (self.m1+(17//2*2)-17)//1+1
        # print('m2', self.m2)
        self.m3 = (self.m2+(23//2*2)-23)//1+1
        # print('m3', self.m3)
        self.m4 = (self.m3+(11//2*2)-11)//1+1
        # print('m4', self.m4)
        self.m5 = (self.m4+(33//2*2)-33)//1+1
        # print('m5', self.m5)
        self.m6 = (self.m5+(17//2*2)-17)//1+1
        # print('m6', self.m6)

    def forward(self, seq):
        seq = self.conv1_pro(seq)  # first conv
        seq = self.bn1_pro(seq)    # batch norm
        seq = F.leaky_relu(seq)    # leaky_relu activation
        seq = F.dropout(seq, p=0.2) # dropout
        seq = F.avg_pool2d(seq, (17, 1), stride=1, padding=(17//2, 0)) # avg_pooling

        seq = self.conv2_pro(seq)
        seq = self.bn2_pro(seq)
        seq = F.leaky_relu(seq)
        seq = F.dropout(seq, p=0.2)
        seq = F.avg_pool2d(seq, (11, 1), stride=1, padding=(11//2, 0))

        seq = self.conv3_pro(seq)
        seq = self.bn3_pro(seq)
        seq = F.leaky_relu(seq)
        seq = F.dropout(seq, p=0.2)
        seq = F.avg_pool2d(seq, (17, 1), stride=1, padding=(17//2, 0))
        seq_protein = F.max_pool2d(seq, (self.m6, 1))
        # fully-connect fc3
        seq_protein = F.leaky_relu(self.fc3_pro(seq_protein))
        seq_protein = F.dropout(seq_protein, p=0.2)
        return seq_protein

    def cos_similarity(self, fp, seq_, n2c, n2p):
        x_compound = fp
        x_compound = self.fc4(torch.concat((x_compound, n2c)))
        x_compound = F.dropout(F.leaky_relu(x_compound), p=0.2)
        x_compound = F.dropout(F.leaky_relu(self.fc5(x_compound)), p=0.2)
        x_protein = self.predict_pro(seq_)
        x_protein = self.fc4_pro(torch.cat((x_protein, n2p)))
        x_protein = F.dropout(F.leaky_relu(x_protein), p=0.2)
        #print(x_protein.shape)
        x_protein = F.dropout(F.leaky_relu(self.fc5_pro(x_protein)), p=0.2)
        #print(x_protein.shape)
        y = x_compound * x_protein
        return y

    def __call__(self, fp, seq_, n2c, n2p, interaction):
        z = self.cos_similarity(ecfp, sequences, n2vc, n2vp)
        print('Z shape:', z.shape)
        Z = self.fc6(z)

        loss = F.cosine_similarity(Z, interactions)
        # loss = tf.compat.v1.losses.sigmoid_cross_entropy(Z, interactions)
        # ---------------------------------------------------------------
        accuracy = ignite.metrics.Accuracy(Z, interactions)
        # accuracy_ = tf.keras.metrics.binary_accuracy(Z, interactions) #---
        # ---------------------------------------------------------------
        print({'loss': loss, 'accuracy': accuracy}, self)
        return loss


model = DeepCNN()
model.to(device)


DeepCNN(
  (conv1_pro): Conv2d(1, 64, kernel_size=(33, 20), stride=(1,), padding=(16, 0))
  (bn1_pro): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2_pro): Conv2d(64, 64, kernel_size=(23, 1), stride=(1,), padding=(11, 0))
  (bn2_pro): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3_pro): Conv2d(64, 32, kernel_size=(33, 1), stride=(1,), padding=(16, 0))
  (bn3_pro): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [37]:
START = time.time()
features = []
for i in range(14196):
    seq_pro = torch.from_numpy(sequences[i].astype(np.float32)).clone()
    seq_pro = seq_pro.reshape(1, 1, 5762, 20)
    #print(seq.shape)
    seq = seq_pro.to(device)
    with torch.no_grad():
        feature = model(seq)
    features.append(feature.cpu().detach().numpy().reshape(-1))
features = np.array(features)
print(features.shape)

END = time.time()
print('Total time is {} sec．'.format(END-START))



(14196, 32)
Total time is 39.66327261924744 sec．


In [52]:
import chainer
import chainer.links as L
#import chainer.functions as F
#from chainer import datasets
# print(ecfp.shape, n2vc.shape)

# train_dataset = datasets.TupleDataset(ecfp, sequences, n2vc, n2vp, interactions)
# train_dataset in chainer:
# (ecfp, sequences, n2vc, n2vp, interactions)
# ...
# (ecfp, sequences, n2vc, n2vp, interactions)  14196 x 5
dataset_pytorch = []

for i in range(14196):
    dataset_pytorch.append((torch.from_numpy(ecfp[i, :].astype(np.float32)).clone(),
                            torch.from_numpy(sequences[i].astype(np.float32)).clone(),
                            torch.from_numpy(n2vc[i, :].astype(np.float32)).clone(),
                            torch.from_numpy(n2vp[i, :].astype(np.float32)).clone(),
                            torch.from_numpy(interactions[i].astype(np.float32)).clone()))

# print(len(dataset_pytorch), len(dataset_pytorch[0]))

n = int(0.8 * len(dataset_pytorch))
train_dataset_pytorch, valid_dataset_pytorch = dataset_pytorch[:n], dataset_pytorch[n:]
print('train: ', len(train_dataset_pytorch), flush=True)
print('valid: ', len(valid_dataset_pytorch), flush=True)
train_loader = torch.utils.data.DataLoader(train_dataset_pytorch, batch_size=100, shuffle=True)
test_loader = torch.utils.data.DataLoader(valid_dataset_pytorch, batch_size=100, shuffle=True)
#ecfp_ = torch.from_numpy(ecfp.astype(np.float32)).clone()
#n2vc_ = torch.from_numpy(n2vc.astype(np.float32)).clone()
#a = torch.concat(ecfp_.to(device), n2vc_.to(device))
# a = F.concat(ecfp, n2vc)

# print(a.shape)






<class 'chainer.datasets.tuple_dataset.TupleDataset'>
5
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]], dtype=float32), array([ 0.72283   ,  0.3805213 , -0.0905413 ,  0.39402148,  0.4876889 ,
       -0.09808945,  0.31464857,  0.07862262, -0.29625794,  0.4016296 ,
       -0.01218679, -0.50566816,  0.26562607, -0.10225522,  0.3340107 ,
        0.23728873,  0.02845587, -0.06437396,  0.00943288,  0.31212628,
       -0.16052926,  0.44658536, -0.13911773,  0.23581393,  0.05283666,
        0.36045703,  0.5515379 ,  0.23126523, -0.00510399, -0.29908812,
       -0.06842223,  0.19955993,  0.07587766, -0.83031344, -0.28564745,
        0.2915692 , -0.1540279 , -0.41768283, -0.18621375,  0.3890175 ,
        0.02277567,  0.41691542,  0.2373718 , -0.

In [44]:
from ignite.handlers import ModelCheckpoint
from ignite.metrics import Accuracy, Loss
from ignite.engine import create_supervised_trainer, create_supervised_evaluator, Events

# train initialize

output_dir ='./result/dataset_hard'+'/'+'ecfpN2vc_mSGD'+'/'+'pattern'+str(0)
os.makedirs(output_dir)

#-------------------------------
#reset memory again
del sequences, interactions, ecfp, n2vc, n2vp
gc.collect()


optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.00001)

print('Trainer is setting up...', flush=True)

trainer = create_supervised_trainer(model, optimizer, F.nll_loss, device=device)
evaluator = create_supervised_evaluator(model, metrics={'accuracy': Accuracy(), 'nll': Loss(F.nll_loss)}, device=device)
training_history = {'accuracy': [], 'loss': []}
validation_history = {'accuracy': [], 'loss': []}

@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(engine):
    evaluator.run(train_loader)
    metrics = evaluator.state.metrics
    avg_accuracy = metrics['accuracy']
    avg_nll = metrics['nll']
    training_history['accuracy'].append(avg_accuracy)
    training_history['loss'].append(avg_nll)
    print(
        "Training Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll)
    )

@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(engine):
    evaluator.run(test_loader)
    metrics = evaluator.state.metrics
    avg_accuracy = metrics['accuracy']
    avg_nll = metrics['nll']
    validation_history['accuracy'].append(avg_accuracy)
    validation_history['loss'].append(avg_nll)
    print(
        "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
            .format(engine.state.epoch, avg_accuracy, avg_nll))

checkpointer = ModelCheckpoint(
    './models',
    'model',
    save_interval=1,
    n_saved=2,
    create_dir=True,
    save_as_state_dict=True,
    require_empty=False,
)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {'Model': model})

del model, trainer
gc.collect()