# logistic regression

In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
from tensorflow.examples.tutorials.mnist import input_data

In [51]:
def load_drug_target_interaction_adjacency_matrix(dir):
    """Load a whitespace-delimited drug-target adjacency matrix.

    The table is read with pandas; its row count is reported as the number
    of drugs and its column count as the number of targets.

    Args:
        dir: Path (or file-like object) of the table to read.

    Returns:
        pandas.DataFrame whose index labels have been converted to strings.
    """
    # ``sep=r'\s+'`` is the documented equivalent of the deprecated
    # ``delim_whitespace=True`` keyword and works across pandas versions.
    mat = pd.read_table(dir, sep=r'\s+')
    print('# of drug: ' + str(mat.shape[0]))
    print('# of target: ' + str(mat.shape[1]) + '\n')
    # String labels keep later ``.loc`` lookups with string ids consistent.
    mat.index = mat.index.map(str)
    return mat


def load_drug_target_interaction_dict(dir):
    """Load a two-column interaction list into a drug -> targets mapping.

    The file is read without a header; the first column is named
    ``target`` and the second ``drug``.

    Args:
        dir: Path (or file-like object) of the whitespace-delimited list.

    Returns:
        dict mapping each drug value to the list of target values that
        appear with it, in file order.
    """
    # ``sep=r'\s+'`` replaces the deprecated ``delim_whitespace=True``.
    mat = pd.read_table(dir, sep=r'\s+', header=None,
                        names=['target', 'drug'])
    dict_DTI = dict()
    for target, drug in mat.values:
        dict_DTI.setdefault(drug, []).append(target)

    # Every row is one interaction, so the row count is the total.
    print('# of drug-target interactions: ' + str(len(mat)))
    return dict_DTI


def load_drug_descriptor_matrix(dir):
    """Load a whitespace-delimited drug descriptor table.

    The row count is reported as the number of drugs and the column count
    as the number of features per drug.

    Args:
        dir: Path (or file-like object) of the table to read.

    Returns:
        pandas.DataFrame whose index labels have been converted to strings.
    """
    # ``sep=r'\s+'`` replaces the deprecated ``delim_whitespace=True``.
    mat = pd.read_table(dir, sep=r'\s+')
    print('# of drugs: ' + str(mat.shape[0]))
    print('# of features of a drug: ' + str(mat.shape[1]) + '\n')
    mat.index = mat.index.map(str)
    return mat


def load_target_descriptor_matrix(dir):
    """Load a whitespace-delimited target descriptor table.

    The row count is reported as the number of targets and the column
    count as the number of features per target.

    Args:
        dir: Path (or file-like object) of the table to read.

    Returns:
        pandas.DataFrame whose index labels have been converted to strings.
    """
    # ``sep=r'\s+'`` replaces the deprecated ``delim_whitespace=True``.
    mat = pd.read_table(dir, sep=r'\s+')
    print('# of targets: ' + str(mat.shape[0]))
    print('# of features of a target: : ' + str(mat.shape[1]) + '\n')
    mat.index = mat.index.map(str)
    return mat


def load_drug_target_pair_matrix(dir):
    """Load a whitespace-delimited drug-target pair feature table.

    The row count is reported as the number of pairs and the column count
    as the number of features per pair.

    Args:
        dir: Path (or file-like object) of the table to read.

    Returns:
        pandas.DataFrame whose index labels have been converted to strings.
    """
    # ``sep=r'\s+'`` replaces the deprecated ``delim_whitespace=True``.
    mat = pd.read_table(dir, sep=r'\s+')
    print('# of drug-target pairs: ' + str(mat.shape[0]))
    print('# of features of a drug-target pair: ' + str(mat.shape[1]))
    mat.index = mat.index.map(str)
    return mat


def load_etc(dir):
    """Placeholder loader: accepts a path, does nothing, returns None."""
    return None

def get_DTI_dict_from_adjmat(adjmat):
    """Turn an adjacency matrix into a drug -> interacting-targets mapping.

    Args:
        adjmat: pandas.DataFrame with one row per drug and one column per
            target; a cell equal to 1 marks an interaction.

    Returns:
        dict keyed by row label, each value the list of column labels whose
        cell equals 1 for that row. The total number of interactions found
        is printed.
    """
    # Transpose so that every drug becomes a column selectable by label.
    per_drug = adjmat.T
    interactions = {}
    total = 0
    for drug_id in per_drug.columns:
        is_hit = per_drug[drug_id] == 1
        hit_targets = per_drug.index[is_hit].tolist()
        interactions[drug_id] = hit_targets
        total += len(hit_targets)
    print('# of drug-target interactions: ' + str(total))
    return interactions

In [53]:
# Folder holding the dataset files (machine-specific absolute Windows path).
dir_dataset = 'C:\\Users\\csjeong\\Desktop\\research\\dataset\\conv_DTI\\2012, Tabei'

# Locations of the three input tables inside the dataset folder.
dir_DTI_adjmat = '{}\\inter_admat.txt'.format(dir_dataset)
dir_drug = '{}\\drug_repmat.txt'.format(dir_dataset)
dir_target = '{}\\target_repmat.txt'.format(dir_dataset)

# Load the tables in the same order as before so the printed summaries match,
# then derive the drug -> targets mapping from the adjacency matrix.
matrix_DTI = load_drug_target_interaction_adjacency_matrix(dir_DTI_adjmat)
matrix_drug = load_drug_descriptor_matrix(dir_drug)
matrix_target = load_target_descriptor_matrix(dir_target)
dict_DTI = get_DTI_dict_from_adjmat(matrix_DTI)


# of drug: 1862
# of target: 1554

# of drugs: 1862
# of features of a drug: 881

# of targets: 1554
# of features of a target: : 876

# of drug-target interactions: 4809


In [4]:
# Table dimensions: features are columns, samples are rows.
n_feature_drug = len(matrix_drug.keys())
n_feature_target = len(matrix_target.keys())
n_sample_drug = len(matrix_drug.index)
n_sample_target = len(matrix_target.index)

# Build one feature vector per known (drug, target) interaction by
# concatenating the drug descriptor row with the target descriptor row.
pos_label = set()
pos_rows = []
for drug, targets in dict_DTI.items():
    for target in targets:
        pair_name = str(drug) + '_' + target
        pos_label.add(pair_name)
        pair_features = pd.concat([matrix_drug.loc[drug], matrix_target.loc[target]])
        pos_rows.append(pair_features.rename(pair_name))

# Assemble the frame once instead of joining inside the loop: each join
# re-copies the growing frame, which is quadratic in the number of pairs.
if pos_rows:
    pos_sample = pd.concat(pos_rows, axis=1).T
else:
    pos_sample = pd.DataFrame()
print(pos_sample.shape)
print('# of positive samples: ' + str(len(pos_label)))


(4809, 1757)
# of positive samples: 4809


In [54]:
# Draw as many negative pairs as there are positive ones.
n_positive = len(pos_label)
n_negative = n_positive

# Every possible '<drug>_<target>' label (targets outer, drugs inner).
neg_label_total = {
    str(drug_id) + '_' + str(target_id)
    for target_id in matrix_target.index
    for drug_id in matrix_drug.index
}
# Size of the full pair space, printed before known positives are removed.
print(len(neg_label_total))

# Keep only pairs that are not known interactions, then sample without
# replacement from what is left.
neg_label_total = neg_label_total - pos_label
neg_candidates = list(neg_label_total)
neg_label = np.random.choice(neg_candidates, size=n_negative, replace=False)


2893548


In [57]:
# Rebuild the set of '<drug>_<target>' labels for all known interactions.
pos_label = {
    str(drug_id) + '_' + target_id
    for drug_id, target_ids in dict_DTI.items()
    for target_id in target_ids
}
print(pos_label)

{'4064_GBRA5_HUMAN', '5326888_FA7_HUMAN', '449328_ADH1G_HUMAN', '5468_PGH2_HUMAN', '9864_CAH2_HUMAN', '87642_HMDH_HUMAN', '439554_FCERA_HUMAN', '439153_NDUS3_HUMAN', '176_TTHY_HUMAN', '4116_AT2C1_HUMAN', '6022_HSP71_HUMAN', '60149_5HT2C_HUMAN', '185698_CTLA4_HUMAN', '94191_BPI_HUMAN', '7405_AMYS_HUMAN', '26879_ALBU_HUMAN', '5816_PH4H_HUMAN', '4636_ADA1A_HUMAN', '311_PDE5A_HUMAN', '941361_CAC1H_HUMAN', '54454_TNFC_HUMAN', '5280360_PE2R3_HUMAN', '439153_NDUV1_HUMAN', '1060_MOT4_HUMAN', '44257_NOS2A_HUMAN', '7405_AMYP_HUMAN', '167250_ITB4_HUMAN', '60164_RXRB_HUMAN', '5360696_ACHA2_HUMAN', '4184_HRH1_HUMAN', '168120_PDE4B_HUMAN', '5287643_AMYS_HUMAN', '750_GLYM_HUMAN', '5066_PDIA1_HUMAN', '6262_ARGI2_HUMAN', '5880_ST2A1_HUMAN', '6022_DCK_HUMAN', '5509_PGH1_HUMAN', '5326847_PTGD2_HUMAN', '441290_ACM2_HUMAN', '448294_BGAT_HUMAN', '74989_UQCR1_HUMAN', '3660_CDC2_HUMAN', '34040_ADA1A_HUMAN', '176_ADH7_HUMAN', '33887_AT1A1_HUMAN', '2733_KCMA1_HUMAN', '443940_ADRB1_HUMAN', '3108_MDR1_HUMAN', '31

In [55]:
# Build one feature vector per sampled negative pair by concatenating the
# drug descriptor row with the target descriptor row.
# dtype_drug=np.issubdtype(matrix_drug.index.dtype,np.number)
# dtype_target=np.issubdtype(matrix_target.index.dtype,np.number)
neg_rows = []
for neg in neg_label:
    # Split on the first underscore only: the text after it is used whole as
    # the target label, which may itself contain underscores.
    drug, target = neg.split('_', 1)
    pair_features = pd.concat([matrix_drug.loc[drug], matrix_target.loc[target]])
    neg_rows.append(pair_features.rename(str(neg)))

# Assemble the frame once instead of joining inside the loop: each join
# re-copies the growing frame, which is quadratic in the number of pairs.
if neg_rows:
    neg_sample = pd.concat(neg_rows, axis=1).T
else:
    neg_sample = pd.DataFrame()
print(neg_sample.shape)
neg_sample.to_csv('neg_sample.txt', sep='\t')
    

False
(4809, 1757)


In [56]:
print(neg_sample)

                     SUB1  SUB2  SUB3  SUB4  SUB5  SUB6  SUB7  SUB8  SUB9  \
3341_DHI1_HUMAN         1     1     1     0     0     0     0     0     0   
79034_SCN9A_HUMAN       1     1     1     0     0     0     0     0     0   
9547884_NOS2C_HUMAN     1     1     0     0     0     0     0     0     0   
5289420_SYLC_HUMAN      1     1     0     0     0     0     0     0     0   
43232_ADH1B_HUMAN       1     1     1     0     0     0     0     0     0   
7741_PTN1_HUMAN         1     1     0     0     0     0     0     0     0   
5880_SUCR1_HUMAN        1     1     1     0     0     0     0     0     0   
3715_KPCB_HUMAN         1     1     1     0     0     0     0     0     0   
448662_S38A3_HUMAN      1     1     1     0     0     0     0     0     0   
8064_GRIK2_HUMAN        1     1     0     0     0     0     0     0     0   
4369379_IP3KA_HUMAN     1     1     1     0     0     0     0     0     0   
31640_MGR5_HUMAN        1     1     0     0     0     0     0     0     0   

In [None]:

# d1 = {'col1': np.array(range(1,3)), 'col2': np.array(range(3,5))}
# d2 = {'aa1': np.array(range(1,4)), 'aa2': np.array(range(4,7)),'aa3':np.array(range(7,10))}
# a = pd.DataFrame(data=d1, index=['a','b'])
# print(a)
# b = pd.DataFrame(data=d2,index=['x','y','z'])
# print(b)
# n_feature_drug=len(a.keys())
# n_feature_target=len(b.keys())
# n_sample_drug=len(a.index)
# n_sample_target=len(b.index)
# index=list(a.index)*5
# c=pd.DataFrame(pd.np.tile(a, (n_sample_target, 1)),columns=a.keys())
# ind_pair=[]
# for ind1 in b.index:
#     for ind2 in a.index:
#         ind_pair.append(str(ind2)+'_'+str(ind1))

# d=pd.DataFrame(np.reshape(pd.np.tile(b,(1,n_sample_drug)),(n_sample_drug*n_sample_target,n_sample_target)),
#                columns=b.keys())
# print(d)

# kp=pd.concat([c,d], axis=1)
# kp.index=ind_pair
# print(kp)
# # matrix_DTpair=my_tensordot(matrix_drug,matrix_target)
# kp.to_csv('test.txt',sep='\t')

In [None]:
# Softmax regression graph (TensorFlow 1.x graph API).
# NOTE(review): image_size and n_class are not defined in the cells shown
# here -- confirm they are set before this cell runs.
x = tf.placeholder(tf.float32, [None, image_size])           # input features
W = tf.Variable(tf.zeros([image_size, n_class]), name='W')   # weights, zero-initialised
b = tf.Variable(tf.zeros([n_class]), name='b')               # biases, zero-initialised
logits = tf.matmul(x, W) + b
y = tf.nn.softmax(logits)                                    # predicted class probabilities
y_ = tf.placeholder(tf.float32, [None, n_class])             # target labels

In [None]:
# Show the variables currently registered as trainable in the default graph.
trainable_vars = tf.trainable_variables()
print(trainable_vars)

In [None]:
# Cross-entropy loss computed from the raw logits with the fused op.
# Taking tf.log of the softmax output directly yields NaN/inf once any
# probability underflows to 0; the fused op is numerically stable.
# The logits are rebuilt here from x, W and b so this cell relies only on
# the model's placeholders and variables.
logits = tf.matmul(x, W) + b
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))
# NOTE(review): lr is not defined in the cells shown here -- confirm it is
# set before this cell runs.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr).minimize(cost)
# accuracy_train=tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y,1),tf.argmax(y_,1)),tf.float32))

print(cost)
print(optimizer)
sess = tf.Session()


In [None]:
# Pull one batch of 100 examples and report the shapes of inputs and labels.
# NOTE(review): mnist is not defined in the cells shown here -- confirm the
# dataset is loaded before this cell runs.
batch_xs, batch_ys = mnist.train.next_batch(100)
for batch_part in (batch_xs, batch_ys):
    print(batch_part.shape)
# print(batch_xs[0,:])
# print(batch_ys[0,:])

In [None]:
# Initialise all graph variables before the first training step.
init = tf.global_variables_initializer()
sess.run(init)

# NOTE(review): mnist, batch_size and training_epochs are not defined in the
# cells shown here -- confirm they are set before this cell runs.
total_batch = int(mnist.train.num_examples / batch_size)
avg_cost_list = []
for epoch in range(training_epochs):
    # accuracy_list=[]
    avg_cost = 0.

    for step in range(total_batch):
        batch_xs, batch_ys = mnist.train.next_batch(batch_size)
        feed = {x: batch_xs, y_: batch_ys}
        # One gradient step, then re-evaluate the cost on the same batch.
        sess.run(optimizer, feed_dict=feed)
        batch_cost = sess.run(cost, feed_dict=feed)
        avg_cost += batch_cost / total_batch
    avg_cost_list.append(avg_cost)

# Average cost per epoch.
plt.plot(range(training_epochs), avg_cost_list)
# show training accuracy per iteration within one epoch
#         accuracy_list.append(sess.run(accuracy_train,feed_dict={x:batch_xs,y_:batch_ys}))
# itr=range(total_batch)
# acc=accuracy_list
# plt.plot(itr,acc)
# plt.xlabel('iteraction')
# plt.ylabel('training accuracy')
# plt.show()

In [None]:
# Test-set accuracy: fraction of examples whose arg-max prediction matches
# the arg-max of the label vector.
predicted_class = tf.argmax(y, 1)
true_class = tf.argmax(y_, 1)
correct_prediction = tf.equal(predicted_class, true_class)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, dtype='float32'))

test_feed = {x: mnist.test.images, y_: mnist.test.labels}
ac = sess.run(accuracy, feed_dict=test_feed)
print(str(ac * 100) + '%')


In [None]:
# Close the TensorFlow session created in the cost/optimizer cell.
sess.close()