In [1]:
import os

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
import tqdm.notebook as tqdm

In [54]:
import tensorflow as tf
#tf.config.set_visible_devices([], 'GPU')

In [6]:
from tensorflow.keras.utils import to_categorical

In [7]:
import tensorflow.keras as keras

In [8]:
templates_all = pd.read_csv("Purified_Templates.csv")

In [9]:
reactions_all = pd.read_csv("Purified_Reactions.csv")

In [10]:
templates_single = templates_all.drop_duplicates(subset="reaction_smarts")

In [11]:
fp = pd.read_csv("Purified_Fingerprints.csv")

In [12]:
smarts_templates = templates_single['reaction_smarts'].to_list()
my_map = dict(list(enumerate(smarts_templates)))

In [13]:
inv_map = {v: k for k, v in my_map.items()}

In [14]:
def fp_from_fp_db(fp_db):
    res = np.zeros(2048)
    list_idx = fp_db.split(" ")
    for el in list_idx[:-1]:
        lfp = el.split("-")
        n = len(lfp)
        if n == 1:
            res[int(lfp[0])] = 1
        elif n==2:
            res[int(lfp[0])] = int(lfp[-1])
        else:
            res[int(lfp[0])] = -int(lfp[-1])
    return res

In [15]:
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, batch_size=32, n_channels=1,
                 n_classes=10, shuffle=True, _filename_fp="", _filename_templates = "", _data_fp = None, _data_templates=None):
        'Initialization'
        
        self.batch_size = batch_size
        
        self.list_IDs = list_IDs
        
        self.n_channels = n_channels
        
        self.n_classes = n_classes
        
        self.shuffle = shuffle
        
        self.p_bar = tqdm.tqdm(total = self.__len__())
        
        if (_filename_fp != "" and _filename_templates != ""):
            self.filename_fp = _filename_fp
            self.filename_templates = _filename_templates
        
            self.data_fp = pd.read_csv(_filename_fp)
            self.data_templates = pd.read_csv(_filename_templates)
        else:
            self.data_fp = _data_fp
            self.data_templates = _data_templates
            
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)
        
        self.p_bar.update(1)
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples' 
        # Initialization
        X = []
        y = np.empty((self.batch_size), dtype=int)

        # Generate data
        for i,val in enumerate(list_IDs_temp):
            # Store sample
           # print(val)
            r = fp_from_fp_db(self.data_fp.iloc[val,1])-fp_from_fp_db(self.data_fp.iloc[val,2])
            X.append(r)
            # Store class
            y[i] = inv_map[self.data_templates.iloc[val,4]]
        X = np.array(X)
        y = keras.utils.to_categorical(y, num_classes=self.n_classes)
        return np.array(X), y

In [16]:
n_s = templates_single.shape[0]
batch_size_val = 256
nb_epochs = 50

In [17]:
from keras.models import Sequential
from keras.layers import Dense,Dropout
from tensorflow.keras.optimizers import Adam
adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0)

model = Sequential()
model.add(Dense(512,input_shape=(2048,),activation="elu"))
model.add(Dropout(0.4))
model.add(Dense(n_s, activation="softmax"))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 'top_k_categorical_accuracy'])
# Design model

In [18]:
training_generator = DataGenerator(np.arange(reactions_all.shape[0]//100), batch_size=batch_size_val, n_channels=1,
                 n_classes=n_s, shuffle=True, _data_fp = fp, _data_templates = templates_all)
# Train model on dataset

model.fit_generator(generator=training_generator,
                    use_multiprocessing=False,
                    verbose=2,
                    initial_epoch=0,
                    max_queue_size=20,
                    epochs=nb_epochs,
                    workers=20)

  0%|          | 0/66 [00:00<?, ?it/s]

  model.fit_generator(generator=training_generator,


Epoch 1/50
66/66 - 26s - loss: 11.1443 - accuracy: 0.0392 - top_k_categorical_accuracy: 0.0739 - 26s/epoch - 398ms/step
Epoch 2/50
66/66 - 25s - loss: 8.2690 - accuracy: 0.1181 - top_k_categorical_accuracy: 0.2103 - 25s/epoch - 384ms/step
Epoch 3/50
66/66 - 25s - loss: 6.3615 - accuracy: 0.2605 - top_k_categorical_accuracy: 0.3924 - 25s/epoch - 384ms/step
Epoch 4/50
66/66 - 26s - loss: 4.6745 - accuracy: 0.4112 - top_k_categorical_accuracy: 0.5646 - 26s/epoch - 397ms/step
Epoch 5/50
66/66 - 25s - loss: 3.2303 - accuracy: 0.5447 - top_k_categorical_accuracy: 0.6916 - 25s/epoch - 384ms/step
Epoch 6/50
66/66 - 25s - loss: 2.1077 - accuracy: 0.6574 - top_k_categorical_accuracy: 0.8284 - 25s/epoch - 385ms/step
Epoch 7/50
66/66 - 25s - loss: 1.2736 - accuracy: 0.7808 - top_k_categorical_accuracy: 0.9440 - 25s/epoch - 386ms/step
Epoch 8/50
66/66 - 25s - loss: 0.7841 - accuracy: 0.8624 - top_k_categorical_accuracy: 0.9811 - 25s/epoch - 380ms/step
Epoch 9/50
66/66 - 25s - loss: 0.5001 - accurac

<keras.callbacks.History at 0x2510267a8b0>

In [20]:
model.save("our_recommender_model")

INFO:tensorflow:Assets written to: our_recommender_model\assets


In [55]:
ink = [fp_from_fp_db(fp.iloc[x,1])-fp_from_fp_db(fp.iloc[x,2]) for x in range(17000,30000)]
#print(ink.shape)
v = model.predict(np.array(ink))

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

In [None]:
v.shape

In [None]:
res = [max(v[i])]

In [47]:
max_index

82

In [49]:
my_map[max_index]

'[C:1]-[NH;D2;+0:2]-[CH2;D2;+0:4]-[C:3]-[OH;D1;+0:5]>>[C:1]-[NH2;D1;+0:2].[C:3]1-[CH2;D2;+0:4]-[O;H0;D2;+0:5]-1'

In [50]:
inv_map[templates_all.iloc[20000,4]]

82

In [38]:
v[0][1975]

0.0019511969