# Recommender pour les fausses réactions

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tqdm.notebook as tqdm
import tensorflow as tf
#tf.config.set_visible_devices([], 'GPU')
import tensorflow.keras as keras
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense,Dropout
from tensorflow.keras.optimizers import Adam

In [3]:
templates_all = pd.read_csv("Purified_Templates.csv")
reactions_all = pd.read_csv("Purified_Reactions.csv")
templates_single = templates_all.drop_duplicates(subset="reaction_smarts")

In [9]:
my_map = dict(list(enumerate(smarts_templates)))
inv_map = {v: k for k, v in my_map.items()}

In [19]:
#On sépare les données sous forme d'une liste d'indices
indexes = np.arange(200000)
val_prop = 0.1
indexes_train, indexes_test = train_test_split(indexes, random_state=42, test_size=val_prop)

In [13]:
def fp_from_fp_db(fp_db):
    '''
    Cette fonction permet de recréer le fingerprint de taille 2048
    à partir du fingerprint codé en mémoire
    '''
    res = np.zeros(2048)
    #On initialise un array de 2048 zéros
    list_idx = fp_db.split(" ")
    #On récupère tous les indices non nuls du fingerprint
    for el in list_idx[:-1]:
        lfp = el.split("-")
        n = len(lfp)
        #En fonction des cas, on complète avec la bonne valeur
        if n == 1:
            res[int(lfp[0])] = 1
        elif n==2:
            res[int(lfp[0])] = int(lfp[-1])
        else:
            res[int(lfp[0])] = -int(lfp[-1])
    return res

In [14]:
class DataGenerator(keras.utils.Sequence):
    #Générateur "classique" : voir Generateur Filtre pour un générateur entièrement commenté
    def __init__(self, list_IDs, batch_size=32, n_channels=1,
                 n_classes=10, shuffle=True, _filename_fp="", _filename_templates = "", _data_fp = None, _data_templates=None):
        #Initialisation des paramètres
        self.batch_size = batch_size
        
        self.list_IDs = list_IDs
        
        self.n_channels = n_channels
        
        self.n_classes = n_classes
        
        self.shuffle = shuffle
        
        self.p_bar = tqdm.tqdm(total = self.__len__())
        
        if (_filename_fp != "" and _filename_templates != ""):
            self.filename_fp = _filename_fp
            self.filename_templates = _filename_templates
        
            self.data_fp = pd.read_csv(_filename_fp)
            self.data_templates = pd.read_csv(_filename_templates)
        else:
            self.data_fp = _data_fp
            self.data_templates = _data_templates
            
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
       
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        list_IDs_temp = [self.list_IDs[k] for k in indexes]

    
        X, y = self.__data_generation(list_IDs_temp)
        
        self.p_bar.update(1)
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):

        X = []
        y = np.empty((self.batch_size), dtype=int)

        for i,val in enumerate(list_IDs_temp):
            r = fp_from_fp_db(self.data_fp.iloc[val,1])-fp_from_fp_db(self.data_fp.iloc[val,2])
            X.append(r)
            y[i] = inv_map[self.data_templates.iloc[val,4]]
            
        X = np.array(X)
        y = keras.utils.to_categorical(y, num_classes=self.n_classes)
        return np.array(X), y

In [15]:
#Sélection des paramètres
n_s = temp_2_unique.shape[0]
batch_size_val = 256
nb_epochs = 30

In [12]:
#Création du Modèle
adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0)

model = Sequential()
model.add(Dense(512,input_shape=(2048,),activation="elu", name = "Dense_1"))
model.add(Dropout(0.4, name = "Dropout"))
model.add(Dense(57959, activation="softmax", name = "Dense_2_sortie"))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 'top_k_categorical_accuracy'])
# Design model

In [21]:
training_generator = DataGenerator(indexes_train, batch_size=batch_size_val, n_channels=1,
                 n_classes=n_s, shuffle=True, _data_fp = fp_2, _data_templates = temp_2)

testing_generator = DataGenerator(indexes_test, batch_size=batch_size_val, n_channels=1, n_classes=n_s, shuffle=True, _data_fp = fp_2, _data_templates= temp_2)
# Train model on dataset

model.fit_generator(generator=training_generator,
                    use_multiprocessing=False,
                    verbose=2,
                    initial_epoch=0,
                    max_queue_size=20,
                    epochs=nb_epochs,
                    workers=20,
                    validation_data  = testing_generator)

  0%|          | 0/703 [00:00<?, ?it/s]

  0%|          | 0/78 [00:00<?, ?it/s]

  model.fit_generator(generator=training_generator,


Epoch 1/30
703/703 - 85s - loss: 0.5810 - accuracy: 0.9398 - top_k_categorical_accuracy: 0.9708 - val_loss: 2.8228 - val_accuracy: 0.7396 - val_top_k_categorical_accuracy: 0.7884 - 85s/epoch - 121ms/step
Epoch 2/30
703/703 - 84s - loss: 0.3617 - accuracy: 0.9463 - top_k_categorical_accuracy: 0.9771 - val_loss: 2.1100 - val_accuracy: 0.7966 - val_top_k_categorical_accuracy: 0.8759 - 84s/epoch - 119ms/step
Epoch 3/30
703/703 - 84s - loss: 0.2026 - accuracy: 0.9579 - top_k_categorical_accuracy: 0.9913 - val_loss: 1.8730 - val_accuracy: 0.8454 - val_top_k_categorical_accuracy: 0.8878 - 84s/epoch - 120ms/step
Epoch 4/30
703/703 - 84s - loss: 0.1238 - accuracy: 0.9751 - top_k_categorical_accuracy: 0.9984 - val_loss: 1.9175 - val_accuracy: 0.8506 - val_top_k_categorical_accuracy: 0.8883 - 84s/epoch - 119ms/step
Epoch 5/30
703/703 - 83s - loss: 0.1077 - accuracy: 0.9785 - top_k_categorical_accuracy: 0.9986 - val_loss: 1.9602 - val_accuracy: 0.8525 - val_top_k_categorical_accuracy: 0.8891 - 83s

KeyboardInterrupt: 

In [22]:
#On sauve le modèle
model.save("our_recommender_model")

INFO:tensorflow:Assets written to: our_recommender_model_2\assets
