In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
import tensorflow.keras as keras

In [2]:
import tqdm.notebook as tqdm

In [3]:
# Reaction-template table; its `reaction_smarts` column is de-duplicated in a later cell to build `my_map`.
templates_all = pd.read_csv("Purified_Templates.csv")

In [4]:
# Reaction records; `recommender_sample_lib` reads columns 1 and 2 positionally as reactant and product SMILES.
reactions_data = pd.read_json("uspto-reactions.json")

In [5]:
# Encoded fingerprint strings; columns 1 and 2 are decoded with `fp_from_fp_db` and subtracted to form the model input.
reactions_fp_data = pd.read_csv('Reactions_Fingerprint_True.csv')

In [6]:
from aizynthfinder.chem import Molecule, Reaction, MoleculeException
from aizynthfinder.training.utils import (
    Config,
    create_reactants_molecules,
    reverse_template,
    reaction_hash,
    reactants_to_fingerprint,
)
from aizynthfinder.utils.models import CUSTOM_OBJECTS, load_keras_model


In [7]:
# Saved Keras model whose per-row output is argsorted in `predictions` to pick the 20 highest-scoring indices.
model = keras.models.load_model("our_recommender_model_2")

In [8]:
def fp_from_fp_db(fp_db, size=2048):
    """Decode a sparse, whitespace-separated fingerprint string into a dense vector.

    Each token encodes one entry of the vector:

    * ``"idx"``        -> ``res[idx] = 1``
    * ``"idx-value"``  -> ``res[idx] = value``
    * ``"idx--value"`` (three or more ``-``-separated pieces) -> ``res[idx] = -value``

    Parameters
    ----------
    fp_db : str
        Encoded fingerprint. A trailing separator is optional; leading,
        trailing and repeated whitespace is ignored.
    size : int, default 2048
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``size``; positions not mentioned are 0.

    Raises
    ------
    AttributeError
        If ``fp_db`` is not a string (for example a missing value).
    ValueError
        If an index or value piece is not an integer literal.
    IndexError
        If an index falls outside ``[-size, size)``.
    """
    res = np.zeros(size)
    # str.split() with no argument discards empty tokens, so the final entry is
    # kept whether or not the string ends with a separator.
    for token in fp_db.split():
        parts = token.split("-")
        index = int(parts[0])
        if len(parts) == 1:
            res[index] = 1
        elif len(parts) == 2:
            res[index] = int(parts[-1])
        else:
            # "idx--value": the empty middle piece comes from a minus sign on the value.
            res[index] = -int(parts[-1])
    return res

In [9]:
def recommended_rows(row1, row2, top_k=20):
    """Return the ``top_k`` highest-scoring model output indices for each row.

    For every row in ``range(row1, row2)`` the model input is the decoded
    fingerprint from column 1 of ``reactions_fp_data`` minus the decoded
    fingerprint from column 2.

    Parameters
    ----------
    row1, row2 : int
        Half-open positional row range ``[row1, row2)`` in ``reactions_fp_data``.
    top_k : int, default 20
        Number of indices to keep per row.

    Returns
    -------
    numpy.ndarray
        Integer array with one row per input row and up to ``top_k`` columns,
        ordered from highest to lowest model output.
        Assumes ``model.predict`` returns a 2-D ``(rows, outputs)`` array -- TODO confirm.

    Notes
    -----
    Unlike ``predictions``, decoding errors are not caught here and propagate
    to the caller.
    """
    fingerprints = np.array([
        fp_from_fp_db(reactions_fp_data.iloc[row, 1]) - fp_from_fp_db(reactions_fp_data.iloc[row, 2])
        for row in range(row1, row2)
    ])
    scores = model.predict(fingerprints)
    # argsort orders each row ascending along the last axis; the reversal and the
    # truncation must act on that same axis (columns), not on the row axis.
    return np.argsort(scores, axis=-1)[:, ::-1][:, :top_k]

In [10]:
def predictions(row1, row2):
    """Rank model outputs for rows ``row1 .. row2 - 1`` of ``reactions_fp_data``.

    The model input for a row is the decoded fingerprint from column 1 minus
    the decoded fingerprint from column 2. A row whose fingerprints cannot be
    decoded is recorded in the skip list and fed to the model as an all-zero
    vector, so the returned list stays positionally aligned with the row range
    (entry ``k`` always belongs to row ``row1 + k``).

    Parameters
    ----------
    row1, row2 : int
        Half-open positional row range ``[row1, row2)``.

    Returns
    -------
    tuple[list[numpy.ndarray], list[int]]
        ``(ranked, skip)``: ``ranked[k]`` holds up to 20 output indices for row
        ``row1 + k``, highest model output first; ``skip`` lists the row numbers
        that could not be decoded (their ``ranked`` entries are placeholders).
    """
    fingerprints = []
    skip = []
    for row in range(row1, row2):
        try:
            diff = (fp_from_fp_db(reactions_fp_data.iloc[row, 1])
                    - fp_from_fp_db(reactions_fp_data.iloc[row, 2]))
        except Exception:
            # Best-effort: any decoding failure skips the row, but
            # KeyboardInterrupt / SystemExit are allowed to propagate.
            skip.append(row)
            diff = np.zeros(2048)
        fingerprints.append(diff)

    if not fingerprints:
        # Empty row range: nothing to feed to the model.
        return [], skip

    fingerprints = np.array(fingerprints)
    scores = model.predict(fingerprints)
    return [np.argsort(scores[i])[::-1][:20] for i in range(fingerprints.shape[0])], skip
    

In [11]:
# `pred` and `skip` are read as module-level globals by `recommender_sample_lib`,
# which indexes `pred[val - r1]`; they must be computed for the same row window
# that is later passed to that function.
pred,skip = predictions(0,10000)

In [42]:
def recommender_sample_lib(r1,r2):
    """Build and save a table of "false" reactions for rows ``r1 .. r2 - 1``.

    For each row of ``reactions_data`` in the range that is not listed in the
    global ``skip``, the templates ranked in the global ``pred`` are tried in
    order: each template is reversed with ``reverse_template`` and applied to
    the row's reactants. The first generated product that does NOT compare
    equal (``basic_compare``) to the recorded product is kept.

    Relies on module-level state: ``reactions_data``, ``my_map``, and ``pred`` /
    ``skip`` as returned by ``predictions(r1, r2)`` for the same window
    (``pred`` is indexed with ``val - r1``).

    Parameters
    ----------
    r1, r2 : int
        Half-open positional row range ``[r1, r2)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Reactifs`` (the row's reactant string) and ``Products`` (SMILES
        of the generated product), one row per reaction for which a differing
        product was found, ordered by descending source row index. The same
        frame is written to ``Data_frame_{r1}.csv``.
    """
    # Rows are visited in ascending order; skipped rows are excluded up front.
    candidate_rows = sorted(set(range(r1, r2)) - set(skip))
    records = []
    # Wrapping the iterable advances the bar once per visited row (including
    # rows that yield nothing) and closes it when the loop finishes.
    for val in tqdm.tqdm(candidate_rows):
        row_reactants = reactions_data.iloc[val, 1]
        row_products = reactions_data.iloc[val, 2]
        if not isinstance(row_reactants, str):
            # e.g. a missing value; nothing to build molecules from.
            continue
        mols = create_reactants_molecules(row_reactants)
        try:
            ref_mol = Molecule(smiles=row_products, sanitize=True)
        except MoleculeException:
            continue

        # Only assigned once a candidate is known to differ from the recorded
        # product, so a product that matched the reference (or a leftover from
        # an earlier template) can never be emitted by mistake.
        false_product = None
        for template_row in pred[val - r1]:
            smarts_fwd = reverse_template(my_map[template_row])
            try:
                candidate = Reaction(mols=mols, smarts=smarts_fwd).apply()[0][0]
            except (ValueError, IndexError):
                # Template not applicable or produced no outcome: try the next one.
                continue
            if candidate.basic_compare(ref_mol):
                # Same as the recorded product: not a false reaction.
                continue
            false_product = candidate
            break

        if not false_product:
            continue
        records.append({"Reactifs": row_reactants, "Products": false_product.smiles})

    # Build the frame once (no per-row concat); reversed so the most recently
    # processed row comes first.
    data_frame = pd.DataFrame(records[::-1], columns=["Reactifs", "Products"])
    data_frame.to_csv(f"Data_frame_{r1}.csv")
    return data_frame

In [37]:
# Distinct SMARTS strings among the first 200000 template rows, in order of
# first appearance.
smarts_templates = (
    templates_all[:200000]
    .drop_duplicates(subset='reaction_smarts')['reaction_smarts']
    .to_list()
)
# Position in the de-duplicated list -> SMARTS string; `recommender_sample_lib`
# looks templates up here by the indices stored in `pred`.
my_map = {position: smarts for position, smarts in enumerate(smarts_templates)}

In [38]:
# Reverse lookup of `my_map`: SMARTS string -> integer key.
inv_map = dict(zip(my_map.values(), my_map.keys()))

In [53]:
# Generate one false-reaction table per block of CHUNK_SIZE rows, for blocks 21..39.
# `pred` and `skip` are rebound on every iteration because `recommender_sample_lib`
# reads them as globals and expects them to match the row window it is given.
CHUNK_SIZE = 10000
l2 = []
# Wrapping the range gives the bar the true iteration count (19) and closes it
# when the loop ends.
p_b = tqdm.tqdm(range(21, 40))
for i in p_b:
    start, stop = CHUNK_SIZE * i, CHUNK_SIZE * (i + 1)
    pred, skip = predictions(start, stop)
    l2.append(recommender_sample_lib(start, stop))

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/9959 [00:00<?, ?it/s]

  0%|          | 0/9955 [00:00<?, ?it/s]

  0%|          | 0/9963 [00:00<?, ?it/s]

  0%|          | 0/9953 [00:00<?, ?it/s]

  0%|          | 0/9972 [00:00<?, ?it/s]

  0%|          | 0/9956 [00:00<?, ?it/s]

  0%|          | 0/9964 [00:00<?, ?it/s]

  0%|          | 0/9967 [00:00<?, ?it/s]

  0%|          | 0/9961 [00:00<?, ?it/s]

  0%|          | 0/9977 [00:00<?, ?it/s]

  0%|          | 0/9979 [00:00<?, ?it/s]

  0%|          | 0/9961 [00:00<?, ?it/s]

  0%|          | 0/9968 [00:00<?, ?it/s]

  0%|          | 0/9909 [00:00<?, ?it/s]

  0%|          | 0/9973 [00:00<?, ?it/s]

  0%|          | 0/9972 [00:00<?, ?it/s]

  0%|          | 0/9955 [00:00<?, ?it/s]

  0%|          | 0/9937 [00:00<?, ?it/s]

  0%|          | 0/9962 [00:00<?, ?it/s]

In [54]:
# Number of per-chunk DataFrames collected so far in `l2`.
len(l2)

19

In [46]:
# NOTE(review): `l` is not defined in any cell shown here, and this cell's execution
# count (46) predates the cell that resets `l2 = []` (53). On Restart & Run All this
# line would raise NameError -- confirm whether the cell should be removed.
l2.append(l)

In [55]:
# Stack the per-chunk DataFrames in `l2` into one table with a fresh 0..n-1 index.
dataaaa2 = pd.concat(l2,ignore_index=True)

In [56]:
# Append the newly generated rows to `dataaaa`.
# NOTE(review): `dataaaa` is not defined in any cell shown here -- presumably results
# kept from an earlier session; confirm where it comes from so the notebook can be re-run.
dattaaaa = pd.concat([dataaaa, dataaaa2], ignore_index=True)

In [57]:
# Display the combined table.
dattaaaa

Unnamed: 0,Reactifs,Products,Produits
0,C[O:2][C:3](=[O:33])/[CH:4]=[C:5](\[O:10][C:11...,C[O:8][C:6](C(=CC(=O)O)[O:10][c:11]1[c:12]([CH...,
1,O=C1[C:7]2[CH:8]=[CH:9][C:10]([O:12][CH2:13][C...,O=C(O)c1cc(=O)c2ccc(O[CH2:13][CH2:14][CH2:15][...,
2,[O:1]=[C:2]1[C:7]2[CH:8]=[CH:9][C:10]([O:12][C...,N#Cc1o[c:6]2[c:7]([c:2](=[O:1])[cH:3]1)[cH:8][...,
3,[O:1]=[C:2]1[C:7]2[CH:8]=[CH:9][C:10]([O:12][C...,c1c(-c2nnn[nH]2)o[c:6]2[c:7]([c:2]1=[O:1])[cH:...,
4,[O:1]=[C:2]1[C:7]2[CH:8]=[CH:9][C:10]([O:12][C...,O=C([O-])[c:4]1[cH:3][c:2](=[O:1])[c:7]2[c:6](...,
...,...,...,...
261820,[H-].[Al+3].[Li+].[H-].[H-].[H-].C(O[C:10]([CH...,OCC1[CH2:13][CH2:14][CH:15]([CH:18]2[O:19][C:2...,
261821,[F:1][CH2:2][CH2:3][CH2:4][Br:5].[C:6]1([P:12]...,[Br-],
261822,[F:1][C:2]1[CH:3]=[C:4](Br)[CH:5]=[C:6]([F:8])...,c1c[c:2]([F:1])[cH:7][c:6]([F:8])c1,
261823,[N:1]([CH2:4][CH2:5][CH2:6][Si:7]([O:14][CH2:1...,O=C(NC[CH2:5][CH2:6][Si:7]([O:8][CH2:9][CH3:10...,


In [52]:
# Persist the combined table; with to_csv defaults the index is written as an unnamed first column.
# NOTE(review): this cell's execution count (52) predates the cell that builds
# `dattaaaa` (56), so the file on disk may not match the table displayed -- re-run
# after the concat cells to be sure.
dattaaaa.to_csv('Data_False_Reactions_Recommen.csv')