In [1]:
import pickle
import numpy as np
import os
import torch
import gc

In [2]:
#  making feature vectors of seq, one-hot encoding

print('Making Training dataset...')
ecfp = np.load('./dataset_hard'+'/cv_'+str(0)+'/train_fingerprint.npy')
ecfp = np.asarray(ecfp, dtype='float32').reshape(-1,1024)

file_interactions=np.load('./dataset_hard'+'/cv_'+str(0)+'/train_interaction.npy')
print('Loading labels: train_interaction.npy')
cID = np.load('./dataset_hard'+'/cv_'+str(0)+'/train_chemIDs.npy')
print('Loading chemIDs: train_chemIDs.npy')
with open('./dataset_hard'+'/cv_'+str(0)+'/train_proIDs.txt') as f:
    pID = [s.strip() for s in f.readlines()]
print('Loading proIDs: train_proIDs.txt')
n2v_c, n2v_p = [], []
with open('./modelpp.pickle', mode='rb') as f:
    modelpp = pickle.load(f)
with open('./modelcc.pickle', mode='rb') as f:
    modelcc = pickle.load(f)
for j in cID:
    n2v_c.append(modelcc.wv[str(j)])
for k in pID:
    n2v_p.append(modelpp.wv[k])
interactions = np.asarray(file_interactions, dtype='int32').reshape(-1,1)
n2vc = np.asarray(n2v_c, dtype='float32').reshape(-1, 128)
n2vp = np.asarray(n2v_p, dtype='float32').reshape(-1, 128)
#reset memory
del n2v_c, n2v_p, cID, pID, modelcc, modelpp, file_interactions
gc.collect()

file_sequences=np.load('./dataset_hard'+'/cv_'+str(0)+'/train_reprotein.npy')
print('Loading sequences: train_reprotein.npy', flush=True)
sequences = np.asarray(file_sequences, dtype='float32').reshape((-1, 1, 5762, 20))
# reset memory
del file_sequences
gc.collect()

print('interactions.shape: ', interactions.shape, 'ecfp.shape: ', ecfp.shape,'sequences.shape: ',  sequences.shape, 'n2vc.shape:', n2vc.shape,'n2vp.shape:', n2vp.shape, flush=True)

Making Training dataset...
Loading labels: train_interaction.npy
Loading chemIDs: train_chemIDs.npy
Loading proIDs: train_proIDs.txt
Loading sequences: train_reprotein.npy
interactions.shape:  (14196, 1) ecfp.shape:  (14196, 1024) sequences.shape:  (14196, 1, 5762, 20) n2vc.shape: (14196, 128) n2vp.shape: (14196, 128)


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('use', device)

use cuda


In [36]:
import torch.nn as nn
import torch.nn.functional as F
import time


# prosize: 5762, plensize:20
# j1:33, s1:1, pf1:64 = window-size, stride-step, No. of filters of first protein-CNN convolution layer
# ja1:17 sa1:1 = window-size, stride-step of first protein-CNN average-pooling layer
# j2:23,s2:1, pf2:64 = second protein-CNN convolution layer
# ja2:11, sa2:1 = second protein-CNN average-pooling layer
# j3:33, s3:1, pf3:32 = third protein-CNN convolution layer
# ja3:17, sa3:1 third protein-CNN average-pooling layer
# n_hid3:70, n_hid4:80, n_hid5:60, n_out:1
class DeepCNN(nn.Module):
    def __init__(self):
        super(DeepCNN, self).__init__()
        # first conv of seq_cnn
        self.conv1_pro = nn.Conv2d(1, 64, (33, 20), stride= (1, ), padding=(33//2, 0))
        self.bn1_pro = nn.BatchNorm2d(64)
        # second conv of seq_cnn
        self.conv2_pro = nn.Conv2d(64, 64, (23, 1), stride= (1, ), padding=(23//2, 0))
        self.bn2_pro = nn.BatchNorm2d(64)
        # third conv of seq_cnn
        self.conv3_pro = nn.Conv2d(64, 32, (33, 1), stride=(1, ), padding=(33//2, 0))
        self.bn3_pro = nn.BatchNorm2d(32)
        fc3_pro = nn.Linear(115240, 70) # 5762 x 20

        self.m1 = (5762+(33//2*2)-33)//1+1
        # print('m1', self.m1)
        self.m2 = (self.m1+(17//2*2)-17)//1+1
        # print('m2', self.m2)
        self.m3 = (self.m2+(23//2*2)-23)//1+1
        # print('m3', self.m3)
        self.m4 = (self.m3+(11//2*2)-11)//1+1
        # print('m4', self.m4)
        self.m5 = (self.m4+(33//2*2)-33)//1+1
        # print('m5', self.m5)
        self.m6 = (self.m5+(17//2*2)-17)//1+1
        # print('m6', self.m6)

    def forward(self, seq):
        seq = self.conv1_pro(seq)  # first conv
        seq = self.bn1_pro(seq)    # batch norm
        seq = F.leaky_relu(seq)    # leaky_relu activation
        seq = F.dropout(seq, p=0.2) # dropout
        seq = F.avg_pool2d(seq, (17, 1), stride=1, padding=(17//2, 0)) # avg_pooling

        seq = self.conv2_pro(seq)
        seq = self.bn2_pro(seq)
        seq = F.leaky_relu(seq)
        seq = F.dropout(seq, p=0.2)
        seq = F.avg_pool2d(seq, (11, 1), stride=1, padding=(11//2, 0))

        seq = self.conv3_pro(seq)
        seq = self.bn3_pro(seq)
        seq = F.leaky_relu(seq)
        seq = F.dropout(seq, p=0.2)
        seq = F.avg_pool2d(seq, (17, 1), stride=1, padding=(17//2, 0))
        seq_protein = F.max_pool2d(seq, (self.m6, 1))
        return seq_protein


model = DeepCNN()
model.to(device)


DeepCNN(
  (conv1_pro): Conv2d(1, 64, kernel_size=(33, 20), stride=(1,), padding=(16, 0))
  (bn1_pro): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2_pro): Conv2d(64, 64, kernel_size=(23, 1), stride=(1,), padding=(11, 0))
  (bn2_pro): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3_pro): Conv2d(64, 32, kernel_size=(33, 1), stride=(1,), padding=(16, 0))
  (bn3_pro): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [37]:
START = time.time()
features = []
for i in range(14196):
    seq_pro = torch.from_numpy(sequences[i].astype(np.float32)).clone()
    seq_pro = seq_pro.reshape(1, 1, 5762, 20)
    #print(seq.shape)
    seq = seq_pro.to(device)
    with torch.no_grad():
        feature = model(seq)
    features.append(feature.cpu().detach().numpy().reshape(-1))
features = np.array(features)
print(features.shape)

END = time.time()
print('Total time is {} sec．'.format(END-START))



(14196, 32)
Total time is 39.66327261924744 sec．


In [41]:
import chainer
import chainer.links as L
import chainer.functions as F
print(ecfp.shape, n2vc.shape)
ecfp_ = torch.from_numpy(ecfp.astype(np.float32)).clone()
n2vc_ = torch.from_numpy(n2vc.astype(np.float32)).clone()
a = torch.concat(ecfp_.to(device), n2vc_.to(device))
# a = F.concat(ecfp, n2vc)

print(a.shape)






(14196, 1024) (14196, 128)


TypeError: concat() received an invalid combination of arguments - got (Tensor, Tensor), but expected one of:
 * (tuple of Tensors tensors, int dim, *, Tensor out)
 * (tuple of Tensors tensors, name dim, *, Tensor out)
