In [1]:
from __future__ import print_function

from rdkit import Chem
from rdkit.Chem import AllChem

import gzip, cPickle
import copy
import progressbar
import random

In [2]:
def cano(smiles):
    """Canonicalize a SMILES string by round-tripping it through RDKit.

    The string is parsed with ``Chem.MolFromSmiles`` and written back with
    ``Chem.MolToSmiles``.

    Args:
        smiles: SMILES string to canonicalize.

    Returns:
        The SMILES string produced by ``Chem.MolToSmiles`` for the parsed
        molecule.

    Raises:
        ValueError: if ``Chem.MolFromSmiles`` returns ``None`` (i.e. the
            string could not be parsed), instead of letting ``MolToSmiles``
            fail on ``None`` with an unrelated-looking error.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError('cannot parse SMILES: %r' % (smiles,))
    return Chem.MolToSmiles(mol)

def block(ch, smiles):
    """Return ``ch`` followed by the canonical form of ``smiles``.

    An empty ``smiles`` yields an empty string (no separator is emitted), so
    the result can be concatenated unconditionally onto another SMILES.
    """
    if smiles == '':
        return ''
    return ch + cano(smiles)

In [3]:
# Substrate SMILES lists, each stored as a gzip-compressed pickle.
with gzip.open('data/subst/acid.pkl.gz', 'rb') as acid_file:
    acid_list = cPickle.load(acid_file)

with gzip.open('data/subst/alcohol_1.pkl.gz', 'rb') as alcohol_file:
    alcohol_1_list = cPickle.load(alcohol_file)

with gzip.open('data/subst/amine_1.pkl.gz', 'rb') as amine_file:
    amine_1_list = cPickle.load(amine_file)

# Number of acids; the following cells reuse `length` to size their progress bars.
length = len(acid_list)
print(length)

357


In [5]:
# Accumulator for every generated reaction string ("reactants>agents>products").
# It is reset here, so this cell is the start of the dataset build.
rxns = []

# Seed the global RNG so that the random.sample() draws made in this cell (and
# in the later cells, when the notebook is run top to bottom) are repeatable.
# Without a seed every run produced a different dataset.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)

bar = progressbar.ProgressBar(max_value=length)

## Carboxylic acid

# Reaction templates (reactant SMARTS >> product SMARTS):
#   DIAZO  - acid oxygen becomes bonded to the diazo compound's carbon
#   RED    - C(=O)O is rewritten to C-O (the carbonyl oxygen is dropped)
#   RLI    - the acid O is replaced by the carbon attached to Li
#   CHLORI - the acid O is replaced by Cl
#   HVZ    - the CH2 next to the C(=O)O group gains a Br
#   ESTER  - the acid O is replaced by an oxygen of the second reactant
DIAZO = AllChem.ReactionFromSmarts('[C:1](=O)[O:2].[C:3]=[N+]=[N-]>>[C:1](=O)[O:2][C:3]')
RED = AllChem.ReactionFromSmarts('[C:1](=O)[O:2]>>[C:1][O:2]')
RLI = AllChem.ReactionFromSmarts('[C:1](=[O:2])O.[C:3][Li]>>[C:1](=[O:2])[C:3]')
CHLORI = AllChem.ReactionFromSmarts('[C:1](=[O:2])O>>[C:1](=[O:2])Cl')
HVZ = AllChem.ReactionFromSmarts('[CH2:1][C:2](=O)O>>[C:1](Br)[C:2](=O)O')
ESTER = AllChem.ReactionFromSmarts('[C:1](=[O:2])[O:3].[O:4]>>[C:1](=[O:2])[O:4]')

# Each entry: (name, reaction, second-reactant SMILES, agent SMILES).
# An empty string means "no second reactant" / "no agent".
acid_reactions_list = [

    ('DIAZO', DIAZO, ['C=[N+]=[N-]'], ['']),
    ('RED', RED, [''], ['[H-].[H-].[H-].[H-].[Al+3].[Li+]', 'B.C1CCOC1']),
    ('RLI', RLI, ['[Li]C', '[Li]CC', '[Li]CCC', '[Li]CCCC', '[Li]c1ccccc1'], ['']),
    ('CHLORI', CHLORI, [''], ['ClS(Cl)=O', 'ClC(=O)C(Cl)=O']),
    ('HVZ', HVZ, [''], ['BrBr.BrP(Br)Br']),
    ('ESTER', ESTER, [''], ['Cl', 'OS(O)(=O)=O']),

]

for i, acid_smi in enumerate(acid_list):
    acid = Chem.MolFromSmiles(acid_smi)

    for rxn_name, rxn, fixed_reagents, agents in acid_reactions_list:
        # ESTER ignores its (empty) fixed list and pairs the acid with
        # 10 entries drawn at random from alcohol_1_list.
        if rxn_name == 'ESTER':
            reagent_list = random.sample(alcohol_1_list, 10)
        else:
            reagent_list = fixed_reagents

        for reagent in reagent_list:
            if reagent == '':
                reactants = (acid,)
            else:
                reactants = (acid, Chem.MolFromSmiles(reagent))
            products = rxn.RunReactants(reactants)
            if len(products) == 0:
                # Template did not match this substrate/reagent pair.
                continue
            # Only the first product set returned by RunReactants is recorded.
            product_smi = [Chem.MolToSmiles(product) for product in products[0]]
            for sub_reagent in agents:
                reactant_side = cano(acid_smi) + block('.', reagent)
                rxns.append('>'.join([reactant_side, cano(sub_reagent), '.'.join(product_smi)]))

    bar.update(i)

bar.finish()

100% (357 of 357) |########################| Elapsed Time: 0:00:03 Time: 0:00:03


In [6]:
# Number of reaction strings in rxns after the carboxylic acid cell (when cells are run in order).
print(len(rxns))

10458


In [7]:
# Parse every acid once, then turn each into its acid chloride with CHLORI.
acid_mol = []
for smi in acid_list:
    acid_mol.append(Chem.MolFromSmiles(smi))

chloride_list = []
for mol in acid_mol:
    # First product of the first match returned by the template.
    first_product = CHLORI.RunReactants((mol,))[0][0]
    chloride_list.append(Chem.MolToSmiles(first_product))
print(len(chloride_list))

357


In [8]:
# Acid chloride templates. The Cl-bearing carbon is coupled with an atom of the
# second reactant (acid O, any O, or N); METAL attaches the Mg-bound carbon
# twice and keeps the oxygen; RED drops the Cl.
# Note: ESTER and RED rebind names already used by the carboxylic acid cell.
ANHYD = AllChem.ReactionFromSmarts('[C:1](=O)[Cl:2].[C:3](=O)[O:4]>>[C:1](=O)[O:4][C:3](=O)')
ESTER = AllChem.ReactionFromSmarts('[C:1](=O)[Cl:2].[O:3]>>[C:1](=O)[O:3]')
AMIDE = AllChem.ReactionFromSmarts('[C:1](=O)[Cl:2].[N:3]>>[C:1](=O)[N:3]')
CARBO = AllChem.ReactionFromSmarts('[C:1](=O)[Cl:2].[O:3]>>[C:1](=O)[O:3]')

METAL = AllChem.ReactionFromSmarts('[C:1](=[O:2])[Cl:3].[C,c:4][Mg+:5]>>[C:1]([*:4])([*:4])[O:2]')
RED = AllChem.ReactionFromSmarts('[C:1](=[O:2])[Cl:3]>>[C:1](=[O:2])')

# Each entry: (name, reaction, second-reactant SMILES, agent SMILES); '' means none.
chloride_reactions_list = [
    ('ANHYD', ANHYD, [''], ['']),
    ('ESTER', ESTER, [''], ['']),
    ('AMIDE', AMIDE, [''], ['']),
    ('CARBO', CARBO, ['O'], ['']),
    ('METAL', METAL,
     ['[Br-].[Mg+]C', '[Br-].[Mg+]CC', '[Br-].[Mg+]CCC',
      '[Br-].[Mg+]CCCC', '[Br-].[Mg+]c1ccccc1'],
     ['']),
    ('RED', RED, [''], ['[H-].[H-].[H-].[H-].[Al+3].[Li+]']),
]

# Reactions whose partner is drawn at random (10 per substrate) from a substrate list.
sampled_pools = {
    'ANHYD': acid_list,
    'ESTER': alcohol_1_list,
    'AMIDE': amine_1_list,
}

bar = progressbar.ProgressBar(max_value=length)

for i, chloride_smi in enumerate(chloride_list):
    chloride = Chem.MolFromSmiles(chloride_smi)

    for rxn_name, rxn, fixed_reagents, agents in chloride_reactions_list:
        if rxn_name in sampled_pools:
            reagent_list = random.sample(sampled_pools[rxn_name], 10)
        else:
            reagent_list = fixed_reagents

        for reagent in reagent_list:
            if reagent == '':
                reactants = (chloride,)
            else:
                reactants = (chloride, Chem.MolFromSmiles(reagent))
            products = rxn.RunReactants(reactants)
            if not products:
                continue
            # Keep only the first product set.
            product_smi = [Chem.MolToSmiles(product) for product in products[0]]
            for sub_reagent in agents:
                reactant_side = cano(chloride_smi) + block('.', reagent)
                rxns.append('>'.join([reactant_side, cano(sub_reagent), '.'.join(product_smi)]))

    bar.update(i)

bar.finish()

100% (357 of 357) |########################| Elapsed Time: 0:00:04 Time: 0:00:04


In [9]:
# Running total of reaction strings after the acid chloride cell (when cells are run in order).
print(len(rxns))

23667


In [10]:
# Build one anhydride per acid by reacting each acid chloride with the acid at
# the same index (ANHYD template; first product of the first match).
anhyd_list = []
for chloride, acid in zip(chloride_list, acid_list):
    reactants = (Chem.MolFromSmiles(chloride), Chem.MolFromSmiles(acid))
    anhydride = ANHYD.RunReactants(reactants)[0][0]
    anhyd_list.append(Chem.MolToSmiles(anhydride))
print(len(anhyd_list))

357


In [13]:
# Anhydride templates: the matched C(=O)O oxygen is replaced by an O or N of the
# second reactant. These rebind ESTER / AMIDE / CARBO from the acid chloride cell.
ESTER = AllChem.ReactionFromSmarts('[C:1](=O)[O:2].[O:3]>>[C:1](=O)[O:3]')
AMIDE = AllChem.ReactionFromSmarts('[C:1](=O)[O:2].[N:3]>>[C:1](=O)[N:3]')
CARBO = AllChem.ReactionFromSmarts('[C:1](=O)[O:2].[O:3]>>[C:1](=O)[O:3]')

# Each entry: (name, reaction, second-reactant SMILES, agent SMILES); '' means none.
anhydride_reactions_list = [
    ('ESTER', ESTER, [''], ['Cl', 'OS(O)(=O)=O']),
    ('AMIDE', AMIDE, [''], ['']),
    ('CARBO', CARBO, ['O'], ['']),
]

# Reactions whose partner is drawn at random (10 per substrate) from a substrate list.
sampled_pools = {
    'ESTER': alcohol_1_list,
    'AMIDE': amine_1_list,
}

bar = progressbar.ProgressBar(max_value=length)

for i, anhyd_smi in enumerate(anhyd_list):
    anhyd = Chem.MolFromSmiles(anhyd_smi)

    for rxn_name, rxn, fixed_reagents, agents in anhydride_reactions_list:
        if rxn_name in sampled_pools:
            reagent_list = random.sample(sampled_pools[rxn_name], 10)
        else:
            reagent_list = fixed_reagents

        for reagent in reagent_list:
            if reagent == '':
                reactants = (anhyd,)
            else:
                reactants = (anhyd, Chem.MolFromSmiles(reagent))
            products = rxn.RunReactants(reactants)
            if not products:
                continue
            # Keep only the first product set.
            product_smi = [Chem.MolToSmiles(product) for product in products[0]]
            for sub_reagent in agents:
                reactant_side = cano(anhyd_smi) + block('.', reagent)
                rxns.append('>'.join([reactant_side, cano(sub_reagent), '.'.join(product_smi)]))

    bar.update(i)

bar.finish()

100% (357 of 357) |########################| Elapsed Time: 0:00:04 Time: 0:00:04


In [14]:
# Running total of reaction strings after the anhydride cell (when cells are run in order).
print(len(rxns))

34734


In [15]:
# Ester substrates (gzip-compressed pickle of SMILES strings).
with gzip.open('data/subst/ester.pkl.gz', 'rb') as ester_file:
    ester_list = cPickle.load(ester_file)

# _DEP puts a -1 charge on a matched oxygen; _PRO sets a negatively charged
# oxygen back to charge 0 and attaches an explicit hydrogen atom.
_DEP = AllChem.ReactionFromSmarts('[O:1]>>[O-:1]')
_PRO = AllChem.ReactionFromSmarts('[O-:1]>>[O-0:1][H]')

# One alkoxide SMILES per entry of alcohol_1_list (first product of the first match).
alkoxide_list = []
for smi in alcohol_1_list:
    deprotonated = _DEP.RunReactants((Chem.MolFromSmiles(smi),))[0][0]
    alkoxide_list.append(Chem.MolToSmiles(deprotonated))

# `length` now refers to the ester list and sizes the next progress bar.
length = len(ester_list)
print(length)
print(len(alkoxide_list))

1036
1026


In [16]:
# Ester templates. Products are wrapped in "( ... )" so both fragments are
# returned together as one product molecule.
# ESTER: the ester carbon must have a carbon neighbour ($(C[#6])); its alkoxy
# oxygen is swapped for the incoming [O-] (charge reset to 0) and the old
# O-R fragment is kept as a leaving group.
# These assignments rebind ESTER / AMIDE / RED / METAL from earlier cells.
ESTER = AllChem.ReactionFromSmarts('[C$(C[#6]):1](=O)[O:2][#6:3].[O-:4]>>([C:1](=O)[O-0:4].[O:2][*:3])')
# hydrolysis included in esterification
AMIDE = AllChem.ReactionFromSmarts('[C:1](=O)[O:2].[N:3]>>([C:1](=O)[N:3].[O:2])')

RED = AllChem.ReactionFromSmarts('[C:1](=O)[O:2]>>([C:1]O.[O:2])')
METAL = AllChem.ReactionFromSmarts('[C:1](=[O:2])[O:3].[C,c:4][Mg+:5]>>([C:1]([*:4])([*:4])[O:2].[O:3])')

# Each entry: (name, reaction, second-reactant SMILES, agent SMILES); '' means none.
# For ESTER the last two fields are not used: reactants and agents are built in the loop.
ester_reactions_list = [
    
    ('ESTER', ESTER, [''], ['']),
    ('AMIDE', AMIDE, [''], ['']),
    
    ('METAL', METAL, ['[Br-].[Mg+]C', '[Br-].[Mg+]CC', '[Br-].[Mg+]CCC', \
                      '[Br-].[Mg+]CCCC', '[Br-].[Mg+]c1ccccc1'], ['']),
    ('RED', RED, [''], ['[H-].[H-].[H-].[H-].[Al+3].[Li+]']),
    
]

bar = progressbar.ProgressBar(max_value=length)

for i, ester_smi in enumerate(ester_list):
    ester = Chem.MolFromSmiles(ester_smi)
    
    for reaction in ester_reactions_list:
        # ESTER / AMIDE draw 10 random partners per substrate; the others use their fixed list.
        if reaction[0] == 'ESTER':
            reagent_list = random.sample(alkoxide_list,10)
            reagent_list.append('[OH-]') # for hydrolysis
        elif reaction[0] == 'AMIDE':
            reagent_list = random.sample(amine_1_list,10)
        else: reagent_list = reaction[2]
            
        for reagent in reagent_list:
            if reagent == '': products = reaction[1].RunReactants((ester,))
            else: products = reaction[1].RunReactants((ester, Chem.MolFromSmiles(reagent)))
            # No match for this substrate/reagent pair: nothing to record.
            if len(products) == 0: continue
            # Only the first product set is recorded.
            product_smi = [Chem.MolToSmiles(product) for product in products[0]]
            if reaction[0] == 'ESTER':
                # Neutral alcohol corresponding to the alkoxide (via _PRO, defined in the previous cell).
                alcohol_smi = Chem.MolToSmiles(_PRO.RunReactants((Chem.MolFromSmiles(reagent),))[0][0])
                # Three records share the same products (computed from the alkoxide run above):
                #   1. sodium alkoxide as reactant, with the alcohol as agent only if its SMILES is short
                #   2. neutral alcohol as reactant with 'Cl' as agent
                #   3. neutral alcohol as reactant with 'OS(O)(=O)=O' as agent
                # NOTE(review): alcohol_smi is the raw MolToSmiles output of the _PRO product and is not
                # passed through cano() before the length test — confirm the 5-character threshold is
                # measured on the intended string form.
                r_ester = [('[Na+].'+reagent, alcohol_smi if len(alcohol_smi) < 5 else ''), \
                           (alcohol_smi, 'Cl'), (alcohol_smi, 'OS(O)(=O)=O')]
                for r in r_ester:
                    rxns.append(cano(ester_smi) + block('.', r[0]) + '>' + cano(r[1]) + '>' + '.'.join(product_smi))
            else:
                for sub_reagent in reaction[3]:
                    rxns.append(cano(ester_smi) + block('.', reagent) + '>' + cano(sub_reagent) + '>' + '.'.join(product_smi))

    bar.update(i)
    
bar.finish()

100% (1036 of 1036) |######################| Elapsed Time: 0:00:17 Time: 0:00:17


In [17]:
# Running total of reaction strings after the ester cell (when cells are run in order).
print(len(rxns))

85498


In [18]:
# Amide substrates (gzip-compressed pickle of SMILES strings).
with gzip.open('data/subst/amide.pkl.gz', 'rb') as amide_file:
    amide_list = cPickle.load(amide_file)

# `length` now refers to the amide list and sizes the next progress bar.
length = len(amide_list)
print(length)

2479


In [19]:
# Amide templates. HYD / HYD_BASE replace the amide N by the reagent's oxygen
# and keep the N as a second fragment (HYD_BASE also resets that oxygen's
# charge to 0); RED removes the carbonyl oxygen. RED rebinds an earlier name.
HYD = AllChem.ReactionFromSmarts('[C:1](=O)[N:2].[O:3]>>([C:1](=O)[O:3].[N:2])')
HYD_BASE = AllChem.ReactionFromSmarts('[C:1](=O)[N:2].[O:3]>>([C:1](=O)[O-0:3].[N:2])')
RED = AllChem.ReactionFromSmarts('[C:1](=[O:2])[N:3]>>([C:1][N:3])')

# Each entry: (name, reaction, second-reactant SMILES, agent SMILES); '' means none.
amide_reactions_list = [
    ('HYD', HYD, ['O'], ['Cl', 'OS(O)(=O)=O']),
    ('HYD_BASE', HYD_BASE, ['[Na+].[OH-]'], ['O']),
    ('RED', RED, [''], ['[H-].[H-].[H-].[H-].[Al+3].[Li+]']),
]

bar = progressbar.ProgressBar(max_value=length)

for i, amide_smi in enumerate(amide_list):
    amide = Chem.MolFromSmiles(amide_smi)

    for rxn_name, rxn, reagent_list, agents in amide_reactions_list:
        for reagent in reagent_list:
            if reagent == '':
                reactants = (amide,)
            else:
                reactants = (amide, Chem.MolFromSmiles(reagent))
            products = rxn.RunReactants(reactants)
            if not products:
                continue
            # Keep only the first product set.
            product_smi = [Chem.MolToSmiles(product) for product in products[0]]
            for sub_reagent in agents:
                reactant_side = cano(amide_smi) + block('.', reagent)
                rxns.append('>'.join([reactant_side, cano(sub_reagent), '.'.join(product_smi)]))

    bar.update(i)

bar.finish()

100% (2479 of 2479) |######################| Elapsed Time: 0:00:02 Time: 0:00:02


In [20]:
# Substrates for the primary-amide reactions (gzip-compressed pickle of SMILES strings).
with gzip.open('data/subst/amide_1.pkl.gz', 'rb') as amide_1_file:
    amide_1_list = cPickle.load(amide_1_file)

# `length` now refers to amide_1_list and sizes the next progress bar.
length = len(amide_1_list)
print(length)

274


In [21]:
# primary amide
# HOFMANN: the C(=O)N unit is excised and N is bonded directly to the neighbouring carbon.
# DEHYD: C(=O)N is rewritten to C#N.
HOFMANN = AllChem.ReactionFromSmarts('[C:1]C(=O)N>>[C:1]N')
DEHYD = AllChem.ReactionFromSmarts('[C:1](=O)N>>[C:1]#N')

# Each entry: (name, reaction, second-reactant SMILES, agent SMILES); '' means none.
amide_1_reactions_list = [
    ('HOFMANN', HOFMANN, [''], ['BrBr.[Na+].[OH-]']),
    ('DEHYD', DEHYD, [''], ['ClP(Cl)(Cl)=O', 'O=P12OP3(=O)OP(=O)(O1)OP(=O)(O2)O3']),
]

bar = progressbar.ProgressBar(max_value=length)

for i, amide_1_smi in enumerate(amide_1_list):
    amide_1 = Chem.MolFromSmiles(amide_1_smi)

    for rxn_name, rxn, reagent_list, agents in amide_1_reactions_list:
        for reagent in reagent_list:
            if reagent == '':
                reactants = (amide_1,)
            else:
                reactants = (amide_1, Chem.MolFromSmiles(reagent))
            products = rxn.RunReactants(reactants)
            if not products:
                continue
            # Keep only the first product set.
            product_smi = [Chem.MolToSmiles(product) for product in products[0]]
            for sub_reagent in agents:
                reactant_side = cano(amide_1_smi) + block('.', reagent)
                rxns.append('>'.join([reactant_side, cano(sub_reagent), '.'.join(product_smi)]))

    bar.update(i)

bar.finish()

100% (274 of 274) |########################| Elapsed Time: 0:00:00 Time: 0:00:00


In [22]:
# Nitrile substrates (gzip-compressed pickle of SMILES strings).
with gzip.open('data/subst/nitrile.pkl.gz', 'rb') as nitrile_file:
    nitrile_list = cPickle.load(nitrile_file)

# `length` now refers to the nitrile list and sizes the next progress bar.
length = len(nitrile_list)
print(length)

997


In [23]:
# Nitrile templates: HYD rewrites C#N (+ O) to C(=O)O, RED rewrites C#N to C-N,
# METAL rewrites C#N to C(=O) bonded to the Mg-bound carbon.
# These rebind HYD / RED / METAL from earlier cells.
HYD = AllChem.ReactionFromSmarts('[C:1]#N.O>>[C:1](=O)O')
RED = AllChem.ReactionFromSmarts('[C:1]#[N:2]>>[C:1][N:2]')
METAL = AllChem.ReactionFromSmarts('[C:1]#N.[C,c:2][Mg+]>>[C:1](=O)[*:2]')

# Each entry: (name, reaction, second-reactant SMILES, agent SMILES); '' means none.
nitrile_reactions_list = [
    ('HYD', HYD, ['O'], ['Cl', 'OS(O)(=O)=O', '[Na+].[OH-]']),
    ('RED', RED, [''], ['[H-].[H-].[H-].[H-].[Al+3].[Li+]']),
    ('METAL', METAL,
     ['[Br-].[Mg+]C', '[Br-].[Mg+]CC', '[Br-].[Mg+]CCC',
      '[Br-].[Mg+]CCCC', '[Br-].[Mg+]c1ccccc1'],
     ['']),
]

bar = progressbar.ProgressBar(max_value=length)

for i, nitrile_smi in enumerate(nitrile_list):
    nitrile = Chem.MolFromSmiles(nitrile_smi)

    for rxn_name, rxn, reagent_list, agents in nitrile_reactions_list:
        for reagent in reagent_list:
            if reagent == '':
                reactants = (nitrile,)
            else:
                reactants = (nitrile, Chem.MolFromSmiles(reagent))
            products = rxn.RunReactants(reactants)
            if not products:
                continue
            # Keep only the first product set.
            product_smi = [Chem.MolToSmiles(product) for product in products[0]]
            for sub_reagent in agents:
                reactant_side = cano(nitrile_smi) + block('.', reagent)
                rxns.append('>'.join([reactant_side, cano(sub_reagent), '.'.join(product_smi)]))

    bar.update(i)

bar.finish()

100% (997 of 997) |########################| Elapsed Time: 0:00:02 Time: 0:00:02


In [24]:
# Total number of reaction strings after the amide, primary amide and nitrile cells (when cells are run in order).
print(len(rxns))

105205


In [25]:
# Persist the accumulated reaction strings as a gzip-compressed pickle (protocol 2).
with gzip.open('data/rxns/acid.pkl.gz', 'wb') as out_file:
    cPickle.dump(rxns, out_file, 2)