In [17]:
from keras.models import Model
from keras.layers import Dense, Flatten, Input, Lambda, Dropout
from keras.layers.merge import concatenate
from keras import backend as K
import molecule_vae
from molecule_vae import get_zinc_tokenizer
import zinc_grammar
from vectorized_cmap import computecs
import h5py

# VAE + Sequential merged model

In [18]:
# Sequential branch: a dense stack over the (978, 2) input that is later fed
# with seq_test_data (the gene expression test set).
# NOTE(review): presumably 978 genes x 2 values per gene - confirm against the data pipeline.
visible_1 = Input(shape=(978, 2))
# Flatten the 2-D input into a single vector so Dense layers can consume it.
flaten_1_sq = Flatten()(visible_1)
# Dense widths shrink 1024 -> 512 -> 256 -> 128 -> 56, with Dropout(0.2) after each of the first four.
dense_11 = Dense(1024, activation='relu')(flaten_1_sq)
drop_1 = Dropout(0.2)(dense_11)
dense_12 = Dense(512, activation='relu')(drop_1)
drop_2 = Dropout(0.2)(dense_12)
dense_13 = Dense(256, activation='relu')(drop_2)
drop_3 = Dropout(0.2)(dense_13)
dense_14 = Dense(128, activation='relu')(drop_3)
drop_4 = Dropout(0.2)(dense_14)
# 56-unit feature layer: this tensor (not output_1) is what the merged model concatenates
# with the 56-dimensional VAE latent.
dense_15 = Dense(56, activation='relu')(drop_4)
# Stand-alone single-value head and model for this branch. The merged model is built
# from dense_15, so `sequential` itself is not used again in this notebook.
output_1 = Dense(1, activation='linear')(dense_15)
sequential = Model(inputs=visible_1, outputs=output_1)
# VAE
def sampling(args):
    """Draw a latent sample from a pair of encoder outputs.

    Args:
        args: two-element sequence ``(mean, log_variance)`` of backend tensors.
            The noise tensor is created with 56 columns, so both are expected
            to be 56 wide.

    Returns:
        ``mean + exp(log_variance / 2) * noise``, where ``noise`` is drawn from
        a normal distribution with mean 0 and standard deviation 0.01.
    """
    mean, log_variance = args
    # The batch dimension is only known at run time, so read it from the tensor.
    n_rows = K.shape(mean)[0]
    noise = K.random_normal(shape=(n_rows, 56), mean=0., stddev = 0.01)
    std_dev = K.exp(log_variance / 2)
    return mean + std_dev * noise
# Weights file handed to the grammar VAE wrapper.
grammar_weights = '../../data/vae.hdf5'
# Wrapper object from molecule_vae; only its encoder (vae.encoderMV) is used below.
grammar_model = molecule_vae.ZincGrammarModel(grammar_weights)
# encoderMV exposes two output tensors; they are passed to sampling() as (mean, log-variance).
# NOTE(review): assumes the second output really is a log-variance, as sampling() treats it -
# confirm in molecule_vae.
z_mn, z_var = grammar_model.vae.encoderMV.output
visible_vae = grammar_model.vae.encoderMV.input
# Sample a 56-dimensional latent vector from the two encoder outputs.
output_vae = Lambda(sampling, output_shape=(56,), name='lambda')([z_mn, z_var])
# NOTE: this rebinds grammar_model from the ZincGrammarModel wrapper to a Keras Model
# (encoder input -> sampled latent); the wrapper is no longer reachable by this name.
grammar_model = Model(inputs=visible_vae, outputs=output_vae)
# merge input models: join the 56-unit sequential features with the sampled VAE latent
merge = concatenate([dense_15, output_vae])
# interpretation model: two small hidden layers followed by a single linear output
hidden1 = Dense(10, activation='relu')(merge)
hidden2 = Dense(10, activation='relu')(hidden1)
output = Dense(1, activation='linear')(hidden2)
# Two-input model; the input order is [sequential input, VAE input] and must be
# respected when calling predict().
model = Model(inputs=[visible_1, visible_vae], outputs=output)
# summarize layers
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line after the table.
model.summary()

Model: "model_21"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (None, 978, 2)       0                                            
__________________________________________________________________________________________________
flatten_3 (Flatten)             (None, 1956)         0           input_11[0][0]                   
__________________________________________________________________________________________________
dense_23 (Dense)                (None, 1024)         2003968     flatten_3[0][0]                  
__________________________________________________________________________________________________
dropout_9 (Dropout)             (None, 1024)         0           dense_23[0][0]                   
___________________________________________________________________________________________

# Load the model weights

In [19]:
# Weights file for the merged two-input model. load_weights() is called on the
# `model` object built in the model-definition cell, so that cell must run first.
model_weight = '../../data/vae_sequential.h5'
model.load_weights(model_weight)

# Test the model

In [20]:
# Load test set for gene expression data
# Each file is opened in a `with` block so the HDF5 handle is closed as soon as
# its dataset has been copied into memory ([:] reads the whole dataset).
with h5py.File('../../data/gene_exp_test_data.h5', 'r') as h5f:
    seq_test_data = h5f['data'][:]
# Load test set for the VAE input
with h5py.File('../../data/vae_test_data.h5', 'r') as h5f:
    vae_test_data = h5f['data'][:]

print(seq_test_data.shape)
print(vae_test_data.shape)

(910, 978, 2)
(910, 277, 76)


In [21]:
vae_input = vae_test_data
sequential_input = seq_test_data
# Input order must match Model(inputs=[visible_1, visible_vae]): sequential first, VAE second.
# NOTE: the model contains the sampling Lambda (K.random_normal), so repeated runs
# are not expected to produce bit-identical scores.
ytest = model.predict([sequential_input, vae_input])

In [22]:
# Sanity check: number of predictions returned (should equal the number of test rows).
len(ytest)

910

In [23]:
import pandas as pd
# Drug names and SMILES strings. Rows are paired by position with the predictions
# in the next cell, so the row order of this file matters.
df_drugs = pd.read_csv('../../data/vae_drugs.csv')
df_drugs

Unnamed: 0,drug_name,SMILES,standardized_SMILES
0,acadesine,C1=NC(=C(N1C2C(C(C(O2)CO)O)O)N)C(=O)N,NC(=O)c1ncn(C2OC(CO)C(O)C2O)c1N
1,acamprosate,CC(=O)NCCCS(=O)(=O)O,CC(=O)NCCCS(=O)(=O)O
2,acarbose,CC1C(C(C(C(O1)OC2C(OC(C(C2O)O)OC3C(OC(C(C3O)O)...,CC1OC(OC2C(CO)OC(OC3C(CO)OC(O)C(O)C3O)C(O)C2O)...
3,acea,CCCCCC=CCC=CCC=CCC=CCCCC(=O)NCCCl,CCCCCC=CCC=CCC=CCC=CCCCC(=O)NCCCl
4,acebutolol,CCCC(=O)NC1=CC(=C(C=C1)OCC(CNC(C)C)O)C(=O)C,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(C(C)=O)c1
...,...,...,...
905,fluvastatin,CC(C)N1C2=CC=CC=C2C(=C1C=CC(CC(CC(=O)O)O)O)C3=...,CC(C)n1c(C=CC(O)CC(O)CC(=O)O)c(-c2ccc(F)cc2)c2...
906,fluvoxamine,COCCCCC(=NOCCN)C1=CC=C(C=C1)C(F)(F)F,COCCCCC(=NOCCN)c1ccc(C(F)(F)F)cc1
907,fmk,CC1=CC=C(C=C1)C2=C(N(C3=NC=NC(=C23)N)CCCO)C(=O)CF,Cc1ccc(-c2c(C(=O)CF)n(CCCO)c3ncnc(N)c23)cc1
908,folic-acid,C1=CC(=CC=C1C(=O)NC(CCC(=O)O)C(=O)O)NCC2=CN=C3...,Nc1nc2ncc(CNc3ccc(C(=O)NC(CCC(=O)O)C(=O)O)cc3)...


In [24]:
# Predictions are paired with drugs purely by position (row i of df_drugs <-> ytest[i]).
# Fail early with a clear message if the two do not line up.
assert len(ytest) == len(df_drugs), "predictions and df_drugs must have the same number of rows"
result_df = pd.DataFrame(columns=['drug_name', 'SMILES', 'score'])
result_df['drug_name'] = list(df_drugs['drug_name'])
# Note: the 'SMILES' column holds the *standardized* SMILES, not the raw string.
result_df['SMILES'] = list(df_drugs['standardized_SMILES'])
# The model has a single linear output, so predict() yields one value per row;
# flatten explicitly instead of relying on pandas to accept an (n, 1) array.
result_df['score'] = ytest.ravel()
# Highest predicted score first.
result_df = result_df.sort_values('score', ascending=False)
result_df

Unnamed: 0,drug_name,SMILES,score
536,cyclothiazide,NS(=O)(=O)c1cc2c(cc1Cl)NC(C1CC3C=CC1C3)NS2(=O)=O,0.877037
719,efaroxan,CCC1(C2=NCCN2)Cc2ccccc2O1,0.753360
175,astemizole,COc1ccc(CCN2CCC(Nc3nc4ccccc4n3Cc3ccc(F)cc3)CC2...,0.648095
433,chrysophanic-acid,Cc1cc(O)c2c(c1)C(=O)c1cccc(O)c1C2=O,0.623187
641,diisononyl-phthalate,CC(C)CCCCCCOC(=O)c1ccccc1C(=O)OCCCCCCC(C)C,0.602389
...,...,...,...
119,amprenavir,CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(...,-0.518918
24,actinomycin-d,Cc1c2oc3c(C)ccc(C(=O)NC4C(=O)NC(C(C)C)C(=O)N5C...,-0.522309
31,adipamide,NC(=O)CCCCC(N)=O,-0.541850
84,amanitin,CCC(C)C1NC(=O)CNC(=O)C2Cc3c([nH]c4cc(O)ccc34)S...,-0.550782


In [None]:
# Write the full ranking and, separately, its first 100 rows
# (result_df is already sorted by score, highest first).
for out_path, frame in (
    ('../../data/combined_model_results_910.csv', result_df),
    ('../../data/combined_model_results_top_100.csv', result_df.head(100)),
):
    frame.to_csv(out_path, index=False)

# Additional work - delete afterwards

In [32]:
import pandas as pd
# Side-by-side results table. Its 'SMILES.1' column is empty at this point and is
# filled in by the next cell from df_drugs.
df_results = pd.read_csv('../../data/results.csv')
df_results

Unnamed: 0,drug_name,id,SMILES,score,drug_name.1,SMILES.1,score.1
0,Phenoxybenzamine,ZINC000053045055,C[C@H](COc1ccccc1)N(CCCl)Cc1ccccc1,0.560407,cyclothiazide,,0.929265
1,Ketoprofen,ZINC000000005560,C[C@H](C(=O)O)c1cccc(C(=O)c2ccccc2)c1,0.555724,efaroxan,,0.797355
2,Clomifeno,ZINC000012402836,CCN(CC)CCOc1ccc(/C(=C(\Cl)c2ccccc2)c2ccccc2)cc1,0.548674,cpd-b,,0.650772
3,,ZINC000169621220,C[C@@H]1C/C=C/C=C/C=C/C=C/[C@H](O[C@H]2O[C@H](...,0.547296,astemizole,,0.649289
4,Remifentanil,ZINC000000538283,CCC(=O)N(c1ccccc1)C1(C(=O)OC)CCN(CCC(=O)OC)CC1,0.545854,fenoprofen,,0.614407
...,...,...,...,...,...,...,...
95,,ZINC000100008319,C[N+]1(C)[C@H]2C[C@H](OC(=O)C(O)(c3cccs3)c3ccc...,0.473283,florfenicol,,0.439168
96,Dtic,ZINC000018099446,CN(C)/N=N\c1[nH]cnc1C(N)=O,0.471676,debrisoquine,,0.438553
97,Atropine,ZINC000100009278,CN1[C@H]2CC[C@@H]1C[C@H](OC(=O)[C@@H](CO)c1ccc...,0.471501,alosetron,,0.438176
98,Isocarboxacide,ZINC000000001587,Cc1cc(C(=O)NNCc2ccccc2)no1,0.470706,digitoxin,,0.438079


In [33]:
# Fill 'SMILES.1' by looking up each 'drug_name.1' in df_drugs.
# A name -> standardized SMILES lookup keeps the result aligned row-for-row with
# df_results. The previous nested loop appended one entry per match, so a name
# that was missing from (or repeated in) df_drugs changed the list length and
# broke the column assignment.
smiles_by_name = (
    df_drugs
    .drop_duplicates(subset='drug_name')  # a repeated name resolves to its first entry
    .set_index('drug_name')['standardized_SMILES']
)
# Names without an entry in df_drugs become NaN instead of raising.
df_results['SMILES.1'] = df_results['drug_name.1'].map(smiles_by_name)
# Kept so any later cell that reads this list still finds it.
standardized_SMILES = list(df_results['SMILES.1'])
df_results

Unnamed: 0,drug_name,id,SMILES,score,drug_name.1,SMILES.1,score.1
0,Phenoxybenzamine,ZINC000053045055,C[C@H](COc1ccccc1)N(CCCl)Cc1ccccc1,0.560407,cyclothiazide,NS(=O)(=O)c1cc2c(cc1Cl)NC(C1CC3C=CC1C3)NS2(=O)=O,0.929265
1,Ketoprofen,ZINC000000005560,C[C@H](C(=O)O)c1cccc(C(=O)c2ccccc2)c1,0.555724,efaroxan,CCC1(C2=NCCN2)Cc2ccccc2O1,0.797355
2,Clomifeno,ZINC000012402836,CCN(CC)CCOc1ccc(/C(=C(\Cl)c2ccccc2)c2ccccc2)cc1,0.548674,cpd-b,Cc1cc(Oc2ncc(C(F)(F)F)cc2Cl)ccc1CC1SC(=O)NC1=O,0.650772
3,,ZINC000169621220,C[C@@H]1C/C=C/C=C/C=C/C=C/[C@H](O[C@H]2O[C@H](...,0.547296,astemizole,COc1ccc(CCN2CCC(Nc3nc4ccccc4n3Cc3ccc(F)cc3)CC2...,0.649289
4,Remifentanil,ZINC000000538283,CCC(=O)N(c1ccccc1)C1(C(=O)OC)CCN(CCC(=O)OC)CC1,0.545854,fenoprofen,CC(C(=O)O)c1cccc(Oc2ccccc2)c1,0.614407
...,...,...,...,...,...,...,...
95,,ZINC000100008319,C[N+]1(C)[C@H]2C[C@H](OC(=O)C(O)(c3cccs3)c3ccc...,0.473283,florfenicol,CS(=O)(=O)c1ccc(C(O)C(CF)NC(=O)C(Cl)Cl)cc1,0.439168
96,Dtic,ZINC000018099446,CN(C)/N=N\c1[nH]cnc1C(N)=O,0.471676,debrisoquine,N=C(N)N1CCc2ccccc2C1,0.438553
97,Atropine,ZINC000100009278,CN1[C@H]2CC[C@@H]1C[C@H](OC(=O)[C@@H](CO)c1ccc...,0.471501,alosetron,Cc1[nH]cnc1CN1CCc2c(c3ccccc3n2C)C1=O,0.438176
98,Isocarboxacide,ZINC000000001587,Cc1cc(C(=O)NNCc2ccccc2)no1,0.470706,digitoxin,CC1OC(OC2C(O)CC(OC3C(O)CC(OC4CCC5(C)C(CCC6C5CC...,0.438079


In [35]:
# NOTE: this writes back to the same path df_results was read from, replacing the
# original results.csv on disk with the version that has 'SMILES.1' filled in.
df_results.to_csv('../../data/results.csv', index=False)

# Check whether the test drugs are already present in the training set

In [1]:
import pandas as pd
# Training-set drugs, with both the raw SMILES and the standardized_SMILES columns.
df_train = pd.read_csv('../../data/training_dataset_smiles_standardized.csv')
df_train


Unnamed: 0,DRUG_NAME,SMILES,standardized_SMILES
0,Levetiracetam,CCC(C(=O)N)N1CCCC1=O,CCC(C(N)=O)N1CCCC1=O
1,Daptomycin,CCCCCCCCCC(=O)NC(CC1=CNC2=CC=CC=C21)C(=O)NC(CC...,CCCCCCCCCC(=O)NC(Cc1c[nH]c2ccccc12)C(=O)NC(CC(...
2,Lansoprazole,CC1=C(C=CN=C1CS(=O)C2=NC3=CC=CC=C3N2)OCC(F)(F)F,Cc1c(OCC(F)(F)F)ccnc1CS(=O)c1nc2ccccc2[nH]1
3,Adapalene,COC1=C(C=C(C=C1)C2=CC3=C(C=C2)C=C(C=C3)C(=O)O)...,COc1ccc(-c2ccc3cc(C(=O)O)ccc3c2)cc1C12CC3CC(CC...
4,Granisetron HCl,CN1C2CCCC1CC(C2)NC(=O)C3=NN(C4=CC=CC=C43)C.Cl,CN1C2CCCC1CC(NC(=O)c1nn(C)c3ccccc13)C2.Cl
...,...,...,...
1733,Scopolamine hydrobromide,CN1C2CC(CC1C3C2O3)OC(=O)C(CO)C4=CC=CC=C4.Br,Br.CN1C2CC(OC(=O)C(CO)c3ccccc3)CC1C1OC12
1734,Chitosamine hydrochloride,C(C(C(C(C(C=O)N)O)O)O)O.Cl,Cl.NC(C=O)C(O)C(O)C(O)CO
1735,Nicotine,CN1CCCC1C2=CN=CC=C2,CN1CCCC1c1cccnc1
1736,Cepharanthine,CN1CCC2=CC3=C(C4=C2C1CC5=CC=C(C=C5)OC6=C(C=CC(...,COc1ccc2cc1Oc1ccc(cc1)CC1c3c(cc4c(c3Oc3cc5c(cc...


In [4]:
# Re-load the rankings written earlier in this notebook: all 910 test drugs and the top 100.
# Their 'SMILES' column was populated from standardized_SMILES when the files were written.
df_test_910 = pd.read_csv('../../data/combined_model_results_910.csv')
df_test_100 = pd.read_csv('../../data/combined_model_results_top_100.csv')
df_test_910

Unnamed: 0,drug_name,SMILES,score
0,cyclothiazide,NS(=O)(=O)c1cc2c(cc1Cl)NC(C1CC3C=CC1C3)NS2(=O)=O,0.872713
1,efaroxan,CCC1(C2=NCCN2)Cc2ccccc2O1,0.764166
2,astemizole,COc1ccc(CCN2CCC(Nc3nc4ccccc4n3Cc3ccc(F)cc3)CC2...,0.655603
3,chrysophanic-acid,Cc1cc(O)c2c(c1)C(=O)c1cccc(O)c1C2=O,0.623146
4,diisononyl-phthalate,CC(C)CCCCCCOC(=O)c1ccccc1C(=O)OCCCCCCC(C)C,0.608616
...,...,...,...
905,amprenavir,CC(C)CN(CC(O)C(Cc1ccccc1)NC(=O)OC1CCOC1)S(=O)(...,-0.516846
906,actinomycin-d,Cc1c2oc3c(C)ccc(C(=O)NC4C(=O)NC(C(C)C)C(=O)N5C...,-0.523516
907,adipamide,NC(=O)CCCCC(N)=O,-0.532908
908,amanitin,CCC(C)C1NC(=O)CNC(=O)C2Cc3c([nH]c4cc(O)ccc34)S...,-0.540386


In [13]:
def _overlap_with_training(df_test, train_smiles):
    """Return ``[SMILES, drug_name]`` pairs from df_test whose SMILES is in train_smiles.

    Args:
        df_test: DataFrame with 'SMILES' and 'drug_name' columns.
        train_smiles: set of SMILES strings seen in the training data.

    Returns:
        List of two-element lists, in the row order of df_test.
    """
    return [
        [smiles, name]
        for smiles, name in zip(df_test['SMILES'], df_test['drug_name'])
        if smiles in train_smiles
    ]


# The test files' 'SMILES' column holds *standardized* SMILES (it was filled from
# df_drugs['standardized_SMILES']), so it has to be compared with the training
# standardized_SMILES as well. Checking only the raw training 'SMILES' can match
# nothing but compounds whose raw and standardized strings happen to be identical.
# The raw column stays in the set so every previously reported match is still found.
# Building the set once also avoids re-creating a list for every test row.
train_smiles = set(df_train['SMILES']) | set(df_train['standardized_SMILES'])
train_test_910_count = _overlap_with_training(df_test_910, train_smiles)
train_test_100_count = _overlap_with_training(df_test_100, train_smiles)
print(train_test_910_count)
print(train_test_100_count)

[['CC(O)(P(=O)(O)O)P(=O)(O)O', 'etidronic-acid'], ['COC(=O)C=CC(=O)OC', 'dimethyl-fumarate']]
[]
