In [4]:
import pandas as pd
import tensorflow as tf
import numpy as np

In [5]:
data = pd.read_csv("data_petit_petit.csv")

In [6]:
data.shape

(7, 3)

In [4]:
""" Module routines for pre-processing data for filter policy training
"""
import argparse
from typing import Sequence, Optional

import pandas as pd
import numpy as np
from scipy import sparse

from aizynthfinder.training.utils import (
    Config,
    split_and_save_data,
    smiles_to_fingerprint,
    reaction_to_fingerprints,
    split_reaction_smiles,
)


def _get_config(optional_args: Optional[Sequence[str]] = None) -> Config:
    parser = argparse.ArgumentParser(
        "Tool to pre-process a template library to be used to train a in-scope filter network policy"
    )
    parser.add_argument("config", help="the filename to a configuration file")
    args = parser.parse_args(optional_args)

    return Config(args.config)


def main(optional_args: Optional[Sequence[str]] = None) -> None:
    """Entry-point for the preprocess_filter tool"""
    config = _get_config(optional_args)

    true_dataset = pd.read_csv(
        config.filename("library"),
        index_col=False,
        header=0 if config["in_csv_headers"] else None,
        names=None if config["in_csv_headers"] else config["library_headers"][:-1],
        sep=config["csv_sep"],
    )
    true_dataset["true_product"] = 1
    false_dataset = pd.read_csv(
        config.filename("false_library"),
        index_col=False,
        header=0 if config["in_csv_headers"] else None,
        names=None if config["in_csv_headers"] else config["library_headers"][:-1],
        sep=config["csv_sep"],
    )
    false_dataset["true_product"] = 0
    dataset = true_dataset.append(false_dataset, sort=False)

    if config["reaction_smiles_column"]:
        dataset = split_reaction_smiles(dataset, config)

    print("Dataset loaded, generating Labels...", flush=True)
    labels = dataset["true_product"].to_numpy()
    split_and_save_data(labels, "labels", config)

    print("Labels created and split, generating Inputs...", flush=True)
    products = dataset[config["column_map"]["products"]].to_numpy()
    reactants = dataset[config["column_map"]["reactants"]].to_numpy()
    inputs = np.apply_along_axis(
        reaction_to_fingerprints, 0, [products, reactants], config
    ).astype(np.int8)
    inputs = sparse.lil_matrix(inputs.T).tocsr()
    split_and_save_data(inputs, "inputs2", config)

    inputs = np.apply_along_axis(smiles_to_fingerprint, 0, [products], config).astype(
        np.int8
    )
    inputs = sparse.lil_matrix(inputs.T).tocsr()
    split_and_save_data(inputs, "inputs", config)

    print("Inputs created and split, splitting Full Dataset...", flush=True)
    split_and_save_data(dataset, "library", config)


if __name__ == "__main__":
    main()
 

usage: Tool to pre-process a template library to be used to train a in-scope filter network policy [-h] config
Tool to pre-process a template library to be used to train a in-scope filter network policy: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [111]:
def get_reactant_product(dataframe, column_name):
    smiles_split = dataframe[[column_name]]
    reactants = []
    products = []
    for i in smiles_split.index:
        diviser = smiles_split[column_name][i].split(">")
        reactants.append(diviser[0])
        products.append(diviser[2])
    return reactants, products

#We do not consider the agent of the midlle

In [8]:
reactants, products = get_reactant_product(data)

In [9]:
reactants

['[Br:1][CH2:2][CH2:3][OH:4].[CH2:5]([S:7](Cl)(=[O:9])=[O:8])[CH3:6].CCOCC',
 '[Br:1][CH2:2][CH2:3][CH2:4][OH:5].[CH3:6][S:7](Cl)(=[O:9])=[O:8].CCOCC',
 '[CH2:1]([Cl:4])[CH2:2][OH:3].CCOCC.[CH2:10]([S:14](Cl)(=[O:16])=[O:15])[CH:11]([CH3:13])[CH3:12]',
 '[Br:1][CH2:2][CH2:3][OH:4].[CH2:5]([S:7](Cl)(=[O:9])=[O:8])[CH3:6].CCOCC',
 '[Br:1][CH2:2][CH2:3][CH2:4][OH:5].[CH3:6][S:7](Cl)(=[O:9])=[O:8].CCOCC',
 '[CH2:1]([Cl:4])[CH2:2][OH:3].CCOCC.[CH2:10]([S:14](Cl)(=[O:16])=[O:15])[CH:11]([CH3:13])[CH3:12]',
 '[Cl:1][C:2]1[N:3]=[CH:4][C:5]2[C:10]([CH:11]=1)=[C:9]([N+:12]([O-])=O)[CH:8]=[CH:7][CH:6]=2.O.[OH-].[Na+]']

In [10]:
products

['[CH2:5]([S:7]([O:4][CH2:3][CH2:2][Br:1])(=[O:9])=[O:8])[CH3:6]',
 '[CH3:6][S:7]([O:5][CH2:4][CH2:3][CH2:2][Br:1])(=[O:9])=[O:8]',
 '[CH2:10]([S:14]([O:3][CH2:2][CH2:1][Cl:4])(=[O:16])=[O:15])[CH:11]([CH3:13])[CH3:12]',
 '[CH2:5]([S:7]([O:4][CH2:3][CH2:2][Br:1])(=[O:9])=[O:8])[CH3:6]',
 '[CH3:6][S:7]([O:5][CH2:4][CH2:3][CH2:2][Br:1])(=[O:9])=[O:8]',
 '[CH2:10]([S:14]([O:3][CH2:2][CH2:1][Cl:4])(=[O:16])=[O:15])[CH:11]([CH3:13])[CH3:12]',
 '[Cl:1][C:2]1[N:3]=[CH:4][C:5]2[C:10]([CH:11]=1)=[C:9]([NH2:12])[CH:8]=[CH:7][CH:6]=2 |f:2.3|']

In [84]:

    return (
        Molecule(smiles=smiles)
        .fingerprint(
            config["fingerprint_radius"],
            config["fingerprint_len"],
        )
        .astype(np.int8)
    )

SyntaxError: 'return' outside function (<ipython-input-84-5370351c53f5>, line 1)

In [11]:
from aizynthfinder.chem import Molecule, MoleculeException

In [12]:
def fingerprint(smiles):
    return Molecule(smiles=smiles).fingerprint(2,2048).astype(np.int8)

In [13]:
reactant_fingerprints = []
for reac in reactants:
    reactant_fingerprints.append(fingerprint(reac))


In [14]:
reactant_fingerprints

[array([0, 0, 0, ..., 0, 0, 0], dtype=int8),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int8),
 array([0, 1, 0, ..., 0, 0, 0], dtype=int8),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int8),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int8),
 array([0, 1, 0, ..., 0, 0, 0], dtype=int8),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int8)]

In [103]:
data_large = pd.read_csv("reactions_large.csv", delimiter=';')
data_large

Unnamed: 0,ReactionSmiles,PatentNumber,Year
0,[Br:1][CH2:2][CH2:3][OH:4].[CH2:5]([S:7](Cl)(=...,US03930836,1976
1,[Br:1][CH2:2][CH2:3][CH2:4][OH:5].[CH3:6][S:7]...,US03930836,1976
2,[CH2:1]([Cl:4])[CH2:2][OH:3].CCOCC.[CH2:10]([S...,US03930836,1976
3,[Br:1][CH2:2][CH2:3][OH:4].[CH2:5]([S:7](Cl)(=...,US03930839,1976
4,[Br:1][CH2:2][CH2:3][CH2:4][OH:5].[CH3:6][S:7]...,US03930839,1976
...,...,...,...
193424,Cl[C:2]([F:27])([F:26])[C:3]1[N:8]=[C:7]([C:9]...,US04835279,1989
193425,Cl[C:2]1[C:7]([C:8]([O:10][CH3:11])=[O:9])=[C:...,US04835279,1989
193426,Cl[C:2]1[C:7]([C:8]([O:10][CH3:11])=[O:9])=[C:...,US04835279,1989
193427,Cl[C:2]1[C:7]([C:8]([O:10][CH3:11])=[O:9])=[C:...,US04835279,1989


In [178]:
reactants_large, products_large = get_reactant_product(data_large,"ReactionSmiles")
number = 150
reactants_large, products_large = reactants_large[:number], products_large[:number]

In [179]:
reactants_large_fingerprint = []
products_large_fingerprint = []
for reac in reactants_large:
    reactants_large_fingerprint.append(fingerprint(reac))
for prod in products_large:
    products_large_fingerprint.append(fingerprint(prod))

len(products_large_fingerprint)

150

In [180]:
syba_reac_score = []
n = len(reactants_large_fingerprint)
for i in range(n):
    syba_reac_score.append(0.1*syba.predict(smi=reactants_large[i]) +0.9*syba.predict(smi=products_large[i]))

In [181]:
#k = len(syba_reac_score)
#syba_reac_score = np.array(syba_reac_score)
#mea = syba_reac_score.mean()
#std = syba_reac_score.std()
#syba_reac_score = (syba_reac_score-mea)/std
syba_reac_score

[13.695508615519575,
 19.52607875243147,
 19.59054089859256,
 13.695508615519575,
 19.52607875243147,
 19.59054089859256,
 26.579339293967152,
 36.981782761857204,
 36.2495120558943,
 48.035406516130124,
 46.59302814095119,
 47.50298127311075,
 46.80738385981445,
 35.14878355367847,
 -9.157833656019148,
 -23.47957157682729,
 6.980376462997723,
 21.5923560522979,
 29.784548626507817,
 57.77509446241738,
 32.04616694269324,
 -2.086502633917258,
 60.722137653799535,
 66.91393767931494,
 -13.708247821326687,
 17.659539945399047,
 13.029746942486264,
 28.7382829203047,
 1.4019478220740287,
 40.49535499758619,
 22.689695856649628,
 11.797342337409985,
 24.678798248832173,
 22.414784518699655,
 29.229685351144568,
 32.13232035808455,
 45.24445963136218,
 20.481317577746758,
 20.481317577746758,
 -0.6610489765218374,
 17.58526540529345,
 8.684510208351695,
 26.461839632593225,
 13.593422485605034,
 24.010356881298584,
 9.267530093765206,
 13.17971127872727,
 26.722618624471302,
 7.508248568884

In [154]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [182]:
entree_data = []
for i in range(n):
    entree_data += [np.concatenate((reactants_large_fingerprint[i],products_large_fingerprint[i]))]
print(reactants_large_fingerprint)

[array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 1, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 1, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 1, 0, ..., 0, 0, 0], dtype=int8), array([0, 1, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 0, ..., 0, 0, 0], dtype=int8), array([0, 0, 1, ..., 0, 0, 0], 

In [141]:
len(entree_data[0])

4096

In [142]:
len(syba_reac_score)

100

In [183]:
x_train, x_test, y_train, y_test = train_test_split(entree_data, syba_reac_score, test_size=0.2)

In [184]:
for i in range(len(y_train)):
    y_train[i] = int(y_train[i])

In [185]:
log_regression_alg = LinearRegression()

In [186]:
log_regression_alg.fit(x_train,y_train)

LinearRegression()

In [187]:
test_predict = log_regression_alg.predict(x_test)

In [176]:
print(test_predict)

[ 7.09839718e+12  1.48594687e+12  3.58728485e+13 -7.76666716e+11
  1.44424364e+13  5.12835007e+12 -1.68822240e+13 -3.34619564e+12
  1.03524465e+12  6.30936151e+12  3.94895738e+13  2.72398068e+11
  4.19660580e+11 -6.16585545e+11  2.48265560e+12  5.12835007e+12
 -6.66650249e+11  4.29632079e+11 -1.53245453e+13  1.46212446e+12
 -4.36894325e+12  2.66791992e+01  7.09839718e+12  7.01987885e+12
  1.12711716e+12 -1.79873689e+12  5.77354290e+12 -3.02055677e+11
 -1.64231201e+12  3.43114247e+13]


In [188]:
print(y_test)

[24.678798248832173, 19.52607875243147, 21.139196570916184, 19.560285125592678, 50.8049130553514, 59.71032229722921, 64.99360554908222, 13.479472651249655, 50.6320817958367, 53.81201804733018, 6.809090638847915, 66.91393767931494, 46.93759781049551, 23.215649108898404, 11.463524477685464, 13.593422485605034, 10.796349447828733, 40.903791071181956, 25.359616009751157, 11.797342337409985, 13.281638229824289, 26.722618624471302, 21.5923560522979, 23.972585597567054, 33.60492179350062, 11.07920055139373, 54.11090622603745, 20.919577036914557, 27.267894351670126, 32.04616694269324]


In [189]:
smi_test = "CCC"

In [190]:
syba.predict(smi_test)

11.292091326780156

In [191]:
from aizynthfinder.interfaces import AiZynthApp

In [192]:
configfile=r"C:\Users\Yassine\Desktop\aizynthfinder-master\models_to_load\config.yml"
app = AiZynthApp(configfile)

Loading template-based expansion policy model from C:\Users\Yassine\Desktop\aizynthfinder-master\models_to_load\uspto_model.hdf5 to uspto
Loading templates from C:\Users\Yassine\Desktop\aizynthfinder-master\models_to_load\uspto_templates.hdf5 to uspto
Loading filter policy model from C:\Users\Yassine\Desktop\aizynthfinder-master\models_to_load\uspto_filter_model.hdf5 to uspto
Loading stock from C:\Users\Yassine\Desktop\aizynthfinder-master\models_to_load\zinc_stock.hdf5 to zinc
Selected as molecule cost: zero


Text(value='', continuous_update=False, description='SMILES')

Output(layout=Layout(border='1px solid silver', height='180px', width='50%'))

Tab(children=(HBox(children=(VBox(children=(Label(value='Stocks'), Checkbox(value=True, description='zinc', st…

HBox(children=(Button(description='Run Search', style=ButtonStyle()), Button(description='Extend Search', styl…

Output(layout=Layout(border='1px solid silver', height='320px', overflow='auto', width='99%'))

HBox(children=(Button(description='Show Reactions', style=ButtonStyle()), Dropdown(description='Routes: ', opt…

Output(layout=Layout(border='1px solid silver', width='99%'))