In [1]:
import tensorflow as tf
import numpy as np
import h5py
import matplotlib.pyplot as plt
import random
import csv
import os
import collections
import time
import math
import pickle
import re
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Input
from tensorflow.keras.layers import Reshape, MaxPooling2D
from tensorflow.keras.layers import Conv2D, Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.python.keras.models import load_model


In [2]:
# Show the TensorFlow version this notebook was executed with.
tf.__version__

'1.15.4'

In [3]:
# Show the h5py version this notebook was executed with.
h5py.__version__

'2.10.0'

In [4]:
# Read the whole training label file into memory. The context manager closes
# the handle as soon as the text has been read; the bare open() left it open.
with open("SR-ARE-train/names_labels.txt","r") as train_labels:
    content = train_labels.read()

In [5]:
# Break the raw label text into one flat token list, splitting on every comma
# and every newline.
content_list = re.compile(",|\n").split(content)

# Print how many tokens equal "0" and how many equal "1".
print(*(content_list.count(flag) for flag in ("0", "1")))

6069 1098


In [6]:
# Pair each even-position token (key) with the odd-position token that follows
# it, converted to float. zip() stops at the shorter slice, so an unpaired
# trailing token is ignored.
drug_dict = dict(zip(content_list[0::2], map(float, content_list[1::2])))

In [7]:
# Read the whole training SMILES file into memory. The context manager closes
# the handle once the text has been read; the bare open() left it open.
with open("SR-ARE-train/names_smiles.txt","r") as train_smiles:
    dtent = train_smiles.read()

In [8]:
# Split the SMILES text on commas and newlines into a flat token list.
dtent_list = re.compile(",|\n").split(dtent)
# Map every even-position token to the token that follows it; an unpaired
# trailing token is dropped because zip() stops at the shorter slice.
drug_name_dict = dict(zip(dtent_list[0::2], dtent_list[1::2]))

In [9]:
# Load the pickled training payload. The context manager closes the handle
# after unpickling; the bare open() left it open.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open("SR-ARE-train/names_onehots.pickle","rb") as train_one_hot:
    rtent = pickle.load(train_one_hot)

In [10]:
# Inspect the unpickled payload: container type, type of the "onehots" entry,
# and that entry's shape, one per line.
print(type(rtent), type(rtent["onehots"]), rtent["onehots"].shape, sep="\n")


<class 'dict'>
<class 'numpy.ndarray'>
(7167, 70, 325)


In [11]:
# Show the container type of the "names" entry of the unpickled payload.
print(type(rtent["names"]))

<class 'list'>


In [12]:
# Inputs are the "onehots" entry; `label` holds the matching names.
x_train = rtent["onehots"]
label = rtent["names"]
# Look up each name in drug_dict, keeping the order of `label`, so the
# targets line up with x_train.
zlabel = [drug_dict[name] for name in label]
y_train = np.array(zlabel, dtype=float)
print(x_train.shape, y_train.shape, sep="\n")


(7167, 70, 325)
(7167,)


In [13]:
# Number of model outputs and number of image channels.
no_class = 1
no_channel = 1

# Per-sample dimensions taken from axes 1 and 2 of the training array.
drug_height = x_train.shape[1]
drug_width = x_train.shape[2]
drug_shape = (drug_height, drug_width)
# Same shape with a trailing channel axis appended.
drug_full_shape = drug_shape + (no_channel,)
print(drug_height, drug_width, drug_shape, drug_full_shape, sep="\n")

70
325
(70, 325)
(70, 325, 1)


In [14]:
def find_formula(dictlist, namelabel, y, limit=9):
    """Print name, looked-up formula and target value for the leading entries.

    Args:
        dictlist: Mapping from a name to the value printed next to it.
        namelabel: Sequence of names; each must be a key of ``dictlist``.
        y: Sequence of values aligned position-by-position with ``namelabel``.
        limit: Maximum number of rows to print (default 9). ``None`` prints
            every available row.

    Raises:
        KeyError: If a name is not present in ``dictlist``.
    """
    # Slice instead of indexing a fixed range(9): inputs shorter than the
    # limit simply print fewer rows rather than raising IndexError.
    for name, target in zip(namelabel[:limit], y[:limit]):
        print(name, dictlist[name], target)

In [15]:
# Print the first nine training names with their drug_name_dict entry and target value.
find_formula(drug_name_dict,label[0:9],y_train[0:9])

NCGC00254346-01 CC(C)=CCC\C(C)=C/CO 0.0
NCGC00254668-01 FC1=CC=CC(F)=C1C(=O)NC(=O)NC2=C(F)C=C(OC3=C(Cl)C=C(C=C3)C(F)(F)F)C=C2 1.0
NCGC00015646-05 Cl.CC1=CNN=C1 0.0
NCGC00255698-01 CC1(C)[C@H](C=C(Cl)Cl)[C@@H]1C(=O)O[C@@H](C#N)C2=CC=CC(OC3=CC=CC=C3)=C2 0.0
NCGC00160628-03 C[C@@H]1O[C@@H](OC[C@H]2O[C@@H](OC3=C(OC4=C(C3=O)C(O)=CC(O)=C4)C5=CC(O)=C(O)C=C5)[C@H](O)[C@@H](O)[C@@H]2O)[C@H](O)[C@H](O)[C@H]1O 0.0
NCGC00255567-01 CCCCCCOC1=C(C=CC=C1)C(O)=O 0.0
NCGC00262943-01 OCCNCCO.CC(OC(=O)C1(C)CCC(C(O)=O)C1(C)C)C2=CC=C(C)C=C2 0.0
NCGC00256129-01 CCCCOC(=O)CCCCCCCC(=O)OCCCC 0.0
NCGC00016666-06 Br.COC1=C2CN(C)CCC2=CC3=C1OCO3 0.0


In [16]:
def find_example_error(pred, dictlist, namelabel, y):
    """Print up to nine samples whose predicted label differs from the target.

    Args:
        pred: Predicted class labels, one per sample. Shapes such as (N,) or
            (N, 1) are accepted. Values are compared to ``y`` with ``!=``, so
            they must already be class labels, not raw scores.
        dictlist: Mapping from a name to the value printed next to it.
        namelabel: Sequence of names aligned with ``pred`` and ``y``.
        y: True labels, one per sample.

    Raises:
        ValueError: If ``pred``, ``namelabel`` and ``y`` differ in length.
    """
    # Flatten both arrays so an (N, 1) prediction compared with an (N,) target
    # does not broadcast into an (N, N) matrix.
    pred = np.asarray(pred).reshape(-1)
    y = np.asarray(y).reshape(-1)
    # Boolean-mask indexing needs an array; a plain Python list would fail.
    names = np.asarray(namelabel)
    if not (len(pred) == len(y) == len(names)):
        raise ValueError(
            "pred, namelabel and y must have the same number of entries"
        )

    incorrect = pred != y
    # Filter the names AND the targets with the same mask so each printed
    # target belongs to the printed name.
    find_formula(dictlist, names[incorrect][0:9], y[incorrect][0:9])

In [14]:
# Convolutional binary classifier, declared as one ordered list of layers.
# Every convolution uses a (1, 16) kernel and every pooling window has height
# 1, so filtering and down-sampling act along the width axis only.
model = Sequential([
    InputLayer(input_shape=(drug_height, drug_width)),
    # Append the channel axis expected by Conv2D.
    Reshape(drug_full_shape),

    Conv2D(filters=8, kernel_size=(1, 16), strides=1, padding='same',
           activation='relu', name="conv1"),
    MaxPooling2D(pool_size=(1, 2), strides=(1, 2), padding='same'),

    Conv2D(filters=16, kernel_size=(1, 16), strides=1, padding='same',
           activation='relu', name="conv2"),
    MaxPooling2D(pool_size=(1, 2), strides=(1, 2), padding='same'),

    Conv2D(filters=32, kernel_size=(1, 16), strides=1, padding='same',
           activation='relu', name="conv3"),
    MaxPooling2D(pool_size=(1, 4), strides=(1, 4), padding='same'),

    Flatten(),
    Dense(140, activation="relu"),
    # Sigmoid output with `no_class` units.
    Dense(no_class, activation="sigmoid"),
])



Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [15]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape (Reshape)            (None, 70, 325, 1)        0         
_________________________________________________________________
conv1 (Conv2D)               (None, 70, 325, 8)        136       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 70, 163, 8)        0         
_________________________________________________________________
conv2 (Conv2D)               (None, 70, 163, 16)       2064      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 70, 82, 16)        0         
_________________________________________________________________
conv3 (Conv2D)               (None, 70, 82, 32)        8224      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 70, 21, 32)        0

In [16]:
# Adam with a step size of 1e-4. `learning_rate` is the supported keyword;
# `lr` is a deprecated alias that newer Keras releases no longer accept.
optimizer = Adam(learning_rate=1e-4)

In [17]:
# Configure training: binary cross-entropy loss, the optimizer created above,
# and accuracy reported as an extra metric.
model.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=['accuracy'],
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [66]:
# Train on the full training arrays for 10 epochs with mini-batches of 64.
# No validation data is passed, so the log only covers the training set.
model.fit(x=x_train, y=y_train, epochs=10, batch_size=64)

Train on 7167 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x151822fcc50>

In [18]:
# Read the whole test label file into memory. The context manager closes the
# handle once the text has been read; the bare open() left it open.
with open("SR-ARE-test/names_labels.txt","r") as test_labels:
    ttcontent = test_labels.read()

In [19]:
# Break the raw test label text into one flat token list, splitting on every
# comma and every newline.
ttcontent_list = re.compile(",|\n").split(ttcontent)
# Print how many tokens equal "0" and how many equal "1".
print(*(ttcontent_list.count(flag) for flag in ("0", "1")))

186 48


In [20]:
# Pair each even-position token (key) with the odd-position token that follows
# it, converted to float. zip() stops at the shorter slice, so an unpaired
# trailing token is ignored.
ttdrug_dict = dict(zip(ttcontent_list[0::2], map(float, ttcontent_list[1::2])))

In [21]:
# Read the whole test SMILES file into memory. The context manager closes the
# handle once the text has been read; the bare open() left it open.
with open("SR-ARE-test/names_smiles.txt","r") as test_smiles:
    ttdtent = test_smiles.read()

In [22]:
# Split the test SMILES text on commas and newlines into a flat token list.
ttdtent_list = re.compile(",|\n").split(ttdtent)
# Map every even-position token to the token that follows it; an unpaired
# trailing token is dropped because zip() stops at the shorter slice.
ttdrug_name_dict = dict(zip(ttdtent_list[0::2], ttdtent_list[1::2]))
print(len(ttdrug_name_dict))

234


In [23]:
# Load the pickled test payload. The context manager closes the handle after
# unpickling; the bare open() left it open.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open("SR-ARE-test/names_onehots.pickle","rb") as test_one_hot:
    ttrtent = pickle.load(test_one_hot)

In [24]:
# Inputs are the "onehots" entry; `ttlabel` holds the matching names.
x_test = ttrtent["onehots"]
ttlabel = ttrtent["names"]
# Look up each name in ttdrug_dict, keeping the order of `ttlabel`, so the
# targets line up with x_test.
ttzlabel = [ttdrug_dict[name] for name in ttlabel]
y_test = np.array(ttzlabel, dtype=float)

In [68]:
# Evaluate the current model on the test arrays and keep the returned values
# in `result`.
result = model.evaluate(x=x_test,y=y_test)



In [39]:
# Pair each entry of model.metrics_names with the value at the same position
# in `result` and print one "name value" line per pair.
for name, value in zip(model.metrics_names, result):
    print(name, value)

loss 0.9232861667107313
acc 0.7777778


In [67]:
# Save the current model to disk.
path_model = 'model/model1-3.keras'
# The save call writes to this path but does not create missing parent
# folders, so make sure 'model/' exists first.
os.makedirs(os.path.dirname(path_model), exist_ok=True)
model.save(path_model)

In [25]:
# Load a saved model from disk, replacing whatever `model` currently holds.
# NOTE(review): this path ('model1-2') is not the file written by the save
# cell ('model1-3'), so the predictions below may come from a different model
# than the one evaluated above. Confirm this is intended.
path_model = 'model/model1-2.keras'
model = tf.keras.models.load_model(path_model)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [55]:
# Run the current model on the test inputs and print the returned array.
# NOTE(review): printing the whole array floods the notebook output; showing
# a short slice would be easier to read.
pred_test = model.predict(x=x_test)
print(pred_test)

[[2.80839205e-03]
 [9.53189969e-01]
 [6.93500042e-05]
 [1.95192993e-02]
 [1.40789747e-02]
 [2.62403488e-03]
 [2.44379044e-06]
 [5.97563326e-01]
 [0.00000000e+00]
 [8.64267349e-07]
 [8.90887856e-01]
 [4.26173210e-06]
 [0.00000000e+00]
 [0.00000000e+00]
 [7.36922026e-03]
 [1.83254480e-04]
 [6.31746650e-03]
 [2.80559063e-04]
 [1.00407004e-03]
 [4.18410599e-02]
 [1.71029389e-01]
 [5.36511242e-01]
 [2.78132975e-01]
 [8.97166729e-02]
 [5.96046448e-08]
 [3.83682698e-01]
 [0.00000000e+00]
 [4.05907631e-05]
 [1.26302242e-04]
 [8.36610794e-04]
 [9.02622938e-04]
 [5.77867031e-05]
 [7.41117239e-01]
 [6.71327114e-04]
 [8.39582086e-03]
 [9.80225086e-01]
 [1.05202198e-03]
 [6.57696128e-01]
 [4.25167680e-02]
 [5.04138172e-02]
 [9.56730366e-01]
 [9.36998963e-01]
 [9.41836476e-01]
 [6.05339408e-02]
 [4.35549021e-03]
 [0.00000000e+00]
 [1.29915774e-02]
 [5.99199593e-01]
 [8.05991769e-01]
 [2.91242421e-01]
 [1.86203420e-02]
 [9.32574272e-04]
 [8.18366110e-02]
 [5.44548035e-03]
 [1.78813934e-07]
 [1.000000

In [56]:
# Threshold the predictions at 0.5: True where the value is >= 0.5.
checking = pred_test >= 0.5
print(checking)

[[False]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [False]
 [ True]
 [False]
 [False]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [ True]
 [False]
 [False]
 [ True]
 [False]
 [ True]
 [False]
 [False]
 [ True]
 [ True]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [ True]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [ True]
 [ True]
 [False]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [False]
 [ True]
 [False]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [False]
 [ True]
 [False]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [ True]
 [ True]
 

In [28]:
# Cast the boolean mask to integers (True -> 1, False -> 0).
answer = checking.astype(int)
print(answer)

[[0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]


In [93]:
# Write one predicted label per line to labels.txt. The context manager
# closes the file even if a write fails part-way through.
with open("labels.txt", "w") as f:
    for row in answer:
        # Each row of `answer` holds a single label in its first column.
        f.write(str(row[0]) + "\n")