In [18]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import random
import csv
import os
import collections
import time
import math
import pickle
import re
from tensorflow import keras
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import InputLayer, Input
from tensorflow.keras.layers import Reshape, MaxPooling2D
from tensorflow.keras.layers import Conv2D, Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import SGD
from tensorflow.python.keras.models import load_model

In [2]:
tf.__version__

'1.15.4'

In [3]:
train_labels = open("SR-ARE-train/names_labels.txt","r")
content = train_labels.read()
#print(content)

In [4]:
content_list = re.split(",|\n",content)
#print(content_list)
#print(len(content_list))

print(content_list.count("0"),content_list.count("1"))

6069 1098


In [5]:
drug_dict = {content_list[2*i]:float(content_list[2*i+1]) for i in range(len(content_list)//2)}
#print(drug_dict)
#print(len(drug_dict))

In [6]:
train_smiles = open("SR-ARE-train/names_smiles.txt","r")
dtent = train_smiles.read()
#print(dtent)

In [7]:
dtent_list = re.split(",|\n",dtent)
drug_name_dict = {dtent_list[2*i]:dtent_list[2*i+1] for i in range(len(dtent_list)//2)}
#print(drug_name_dict)
#print(len(drug_name_dict))

In [8]:
train_one_hot = open("SR-ARE-train/names_onehots.pickle","rb")
rtent = pickle.load(train_one_hot)
#print(rtent)

In [9]:
print(type(rtent))
print(type(rtent["onehots"]))
print(rtent["onehots"].shape)


<class 'dict'>
<class 'numpy.ndarray'>
(7167, 70, 325)


In [10]:
print(type(rtent["names"]))

<class 'list'>


In [11]:
x_train = rtent["onehots"]
label = rtent["names"]
zlabel = [drug_dict[label[i]] for i in range(len(label))]
#print(len(zlabel))
y_train = np.array(zlabel,float)
print(x_train.shape)
print(y_train.shape)


(7167, 70, 325)
(7167,)


In [12]:
correct = y_train >= 0.5
print(correct)
true_label = correct[correct].astype(int)
print(true_label)
print(len(true_label))
true_find = x_train[correct]
print(len(true_find))

incorrect = y_train <= 0.5
false_label = incorrect[incorrect].astype(int)
false_find = x_train[incorrect]
print(len(false_label))

[False  True False ... False False  True]
[1 1 1 ... 1 1 1]
1098
1098
6069


In [13]:
weight_for_zero = 1.0 / len(false_label)
weight_for_one = 1.0 / len(true_label)
print(weight_for_zero,weight_for_one)

0.00016477179106936892 0.0009107468123861566


In [15]:
drug_height = x_train.shape[1]
print(drug_height)
drug_width = x_train.shape[2]
print(drug_width)
drug_shape = (drug_height , drug_width)
print(drug_shape)
drug_full_shape = (drug_height , drug_width,1)
print(drug_full_shape)
no_class = 1
no_channel = 1

70
325
(70, 325)
(70, 325, 1)


In [36]:
def find_formula(dictlist,namelabel,y):
    for i in range(9):
        print(namelabel[i],dictlist[namelabel[i]],y[i])

In [14]:
find_formula(drug_name_dict,label[0:9],y_train[0:9])

NCGC00254346-01 CC(C)=CCC\C(C)=C/CO 0.0
NCGC00254668-01 FC1=CC=CC(F)=C1C(=O)NC(=O)NC2=C(F)C=C(OC3=C(Cl)C=C(C=C3)C(F)(F)F)C=C2 1.0
NCGC00015646-05 Cl.CC1=CNN=C1 0.0
NCGC00255698-01 CC1(C)[C@H](C=C(Cl)Cl)[C@@H]1C(=O)O[C@@H](C#N)C2=CC=CC(OC3=CC=CC=C3)=C2 0.0
NCGC00160628-03 C[C@@H]1O[C@@H](OC[C@H]2O[C@@H](OC3=C(OC4=C(C3=O)C(O)=CC(O)=C4)C5=CC(O)=C(O)C=C5)[C@H](O)[C@@H](O)[C@@H]2O)[C@H](O)[C@H](O)[C@H]1O 0.0
NCGC00255567-01 CCCCCCOC1=C(C=CC=C1)C(O)=O 0.0
NCGC00262943-01 OCCNCCO.CC(OC(=O)C1(C)CCC(C(O)=O)C1(C)C)C2=CC=C(C)C=C2 0.0
NCGC00256129-01 CCCCOC(=O)CCCCCCCC(=O)OCCCC 0.0
NCGC00016666-06 Br.COC1=C2CN(C)CCC2=CC3=C1OCO3 0.0


In [37]:
def find_example_error(pred,dictlist,namelabel,y):
    incorrect = (pred != y)
    namelabel = namelabel[incorrect]
    pred = pred[incorrect]
    find_formula(dictlist,namelabel[0:9],y[0:9])

<tensorflow.python.keras.regularizers.L1L2 at 0x2573b0db550>

In [16]:
model = Sequential()
model.add(InputLayer(input_shape=(drug_height,drug_width,)))

model.add(Reshape(drug_full_shape))

model.add(Conv2D(kernel_size=(1,16),strides=1,filters=8,padding='same',activation='relu',name="conv1",kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(1,4), strides=(1,4),padding='same'))

model.add(Conv2D(kernel_size=(1,16),strides=1,filters=16,padding='same',activation='relu',name="conv2",kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(1,4), strides=(1,4),padding='same'))

model.add(Conv2D(kernel_size=(1,16),strides=1,filters=32,padding='same',activation='relu',name="conv3",kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(1,4), strides=(1,4),padding='same'))

model.add(Flatten())

model.add(Dense(no_class,activation="sigmoid",kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))



Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [17]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape (Reshape)            (None, 70, 325, 1)        0         
_________________________________________________________________
conv1 (Conv2D)               (None, 70, 325, 8)        136       
_________________________________________________________________
batch_normalization (BatchNo (None, 70, 325, 8)        32        
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 70, 82, 8)         0         
_________________________________________________________________
conv2 (Conv2D)               (None, 70, 82, 16)        2064      
_________________________________________________________________
batch_normalization_1 (Batch (None, 70, 82, 16)        64        
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 70, 21, 16)        0

In [19]:
optimizer = SGD(lr=0.01)

In [20]:
metrics = [
    keras.metrics.BinaryAccuracy(name="accuracy"),
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
]

In [21]:
class_weight = {0: weight_for_zero, 1: weight_for_one}

In [22]:
model.compile(optimizer=optimizer,loss="binary_crossentropy", metrics=metrics)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [45]:
model.fit(x=x_train, y=y_train, epochs=5, batch_size=128,class_weight=class_weight,validation_data=(x_test, y_test))

Train on 7167 samples, validate on 234 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x29a8cdc8fd0>

In [None]:
#30 epochs

In [23]:
test_labels = open("SR-ARE-test/names_labels.txt","r")
ttcontent = test_labels.read()

In [24]:
ttcontent_list = re.split(",|\n",ttcontent)
#print(ttcontent_list)
#print(len(ttcontent_list))
print(ttcontent_list.count("0"),ttcontent_list.count("1"))

186 48


In [25]:
ttdrug_dict = {ttcontent_list[2*i]:float(ttcontent_list[2*i+1]) for i in range(len(ttcontent_list)//2)}
#print(drug_dict)
#print(len(drug_dict))

In [26]:
test_smiles = open("SR-ARE-test/names_smiles.txt","r")
ttdtent = test_smiles.read()
#print(ttdtent)

In [27]:
ttdtent_list = re.split(",|\n",ttdtent)
ttdrug_name_dict = {ttdtent_list[2*i]:ttdtent_list[2*i+1] for i in range(len(ttdtent_list)//2)}
#print(ttdrug_name_dict)
print(len(ttdrug_name_dict))

234


In [28]:
test_one_hot = open("SR-ARE-test/names_onehots.pickle","rb")
ttrtent = pickle.load(test_one_hot)
#print(rtent)

In [29]:
x_test = ttrtent["onehots"]
ttlabel = ttrtent["names"]
ttzlabel = [ttdrug_dict[ttlabel[i]] for i in range(len(ttlabel))]
#print(len(zlabel))
y_test = np.array(ttzlabel,float)

In [46]:
result = model.evaluate(x=x_test,y=y_test)



In [31]:
for name, value in zip(model.metrics_names, result):
    print(name, value)

loss 1.0538651107722878
accuracy 0.36324787
fn 27.0
fp 122.0
tn 64.0
tp 21.0
precision 0.14685315
recall 0.4375


In [47]:
## save model
path_model = 'model/model3-3.keras'
model.save(path_model)

In [24]:
path_model = 'model/model3-2.keras'
model = tf.keras.models.load_model(path_model)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [37]:
pred_test = model.predict(x=x_test)
print(pred_test)

[[0.5103441 ]
 [0.5074876 ]
 [0.51405525]
 [0.50875705]
 [0.50796735]
 [0.5097908 ]
 [0.5140344 ]
 [0.5122361 ]
 [0.5159416 ]
 [0.5135099 ]
 [0.5073363 ]
 [0.5059578 ]
 [0.51180446]
 [0.53566784]
 [0.51156247]
 [0.5101705 ]
 [0.5139971 ]
 [0.50937355]
 [0.5125999 ]
 [0.50897247]
 [0.50649524]
 [0.5098321 ]
 [0.5137751 ]
 [0.5126956 ]
 [0.518037  ]
 [0.513056  ]
 [0.51674235]
 [0.5099023 ]
 [0.5067698 ]
 [0.51355964]
 [0.5074271 ]
 [0.50878876]
 [0.5114385 ]
 [0.5116914 ]
 [0.51758784]
 [0.50911534]
 [0.5120421 ]
 [0.50612015]
 [0.5143452 ]
 [0.50985754]
 [0.51492786]
 [0.5081566 ]
 [0.5080467 ]
 [0.50814956]
 [0.51106536]
 [0.5048374 ]
 [0.508334  ]
 [0.5106522 ]
 [0.5077485 ]
 [0.5118264 ]
 [0.51112336]
 [0.5095141 ]
 [0.51183873]
 [0.5150552 ]
 [0.51486003]
 [0.51372457]
 [0.5117149 ]
 [0.5022941 ]
 [0.51337016]
 [0.5083797 ]
 [0.50894463]
 [0.51046   ]
 [0.5128305 ]
 [0.5066146 ]
 [0.5089966 ]
 [0.5094206 ]
 [0.507725  ]
 [0.51077443]
 [0.5103711 ]
 [0.5128319 ]
 [0.50868464]
 [0.51

In [33]:
checking = pred_test >= 0.5
print(checking)

[[False]
 [False]
 [ True]
 [ True]
 [False]
 [False]
 [ True]
 [ True]
 [ True]
 [ True]
 [False]
 [False]
 [False]
 [ True]
 [ True]
 [False]
 [ True]
 [False]
 [ True]
 [False]
 [False]
 [ True]
 [ True]
 [ True]
 [ True]
 [ True]
 [ True]
 [ True]
 [False]
 [ True]
 [False]
 [False]
 [ True]
 [ True]
 [ True]
 [ True]
 [ True]
 [False]
 [ True]
 [ True]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [False]
 [False]
 [ True]
 [False]
 [ True]
 [False]
 [ True]
 [ True]
 [ True]
 [ True]
 [ True]
 [ True]
 [False]
 [ True]
 [False]
 [False]
 [ True]
 [ True]
 [False]
 [False]
 [False]
 [False]
 [False]
 [ True]
 [ True]
 [ True]
 [ True]
 [False]
 [ True]
 [False]
 [False]
 [ True]
 [ True]
 [False]
 [False]
 [ True]
 [ True]
 [ True]
 [False]
 [False]
 [ True]
 [ True]
 [False]
 [ True]
 [False]
 [ True]
 [False]
 [ True]
 [False]
 [ True]
 [ True]
 [ True]
 [False]
 [ True]
 [ True]
 [False]
 [ True]
 [ True]
 [ True]
 [ True]
 [False]
 [ True]
 [False]
 [ True]
 [ True]
 [False]
 

In [34]:
answer = checking.astype(int)
print(answer)

[[0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]


In [93]:
f = open("labels.txt", "w")
for i in range(len(answer)):
    f.write(str(answer[i][0]) + "\n")
f.close()