In [1]:
from rdkit.Chem import AllChem, MolFromSmiles, MolToSmiles, Draw
from rdkit.Chem.rdmolops import FastFindRings
from rdkit.Chem.Draw import IPythonConsole
from rdkit import RDLogger  
from rdkit import Chem
from tqdm import tqdm
import pandas as pd
import numpy as np

RDLogger.DisableLog('rdApp.*') # removes annoying RDKit warnings

# Diels-Alder cycloaddition
Define a simple Diels-Alder reaction SMARTS template between two molecules (does not count self-reactions):

In [2]:
import itertools

diels_alder = AllChem.ReactionFromSmarts('[C,N,O:1]=[C,N,O:2][C,N,O:3]=[C,N,O:4].[C:5]=[C:6]>>[C,N,O:1]1[C:2]=[C:3][C:4][C:5][C:6]1')
diels_alder2 = AllChem.ReactionFromSmarts('[C,N,O:1]=[C,N,O:2][C,N,O:3]=[C,N,O:4].[C:5]=[C:6]>>[C,N,O:1]1[C:2]=[C:3][C:4][C:6][C:5]1')

def check_dielsalder(src,prod):
    src_list = src.split('.')
    prod = MolToSmiles(MolFromSmiles(prod),isomericSmiles=False)
    src_mols = [MolFromSmiles(mol) for mol in src_list]
    if None in src_mols:
        return 0
    for pair in itertools.permutations(src_mols, r=2): # iterate over all src reactant pairings
        prods = diels_alder.RunReactants((pair[0],pair[1]))
        prods2 = diels_alder2.RunReactants((pair[0],pair[1])) # alternate arrangement
        prods = prods+prods2
        if prods!=():
            for mol in prods:
                pred = MolToSmiles(mol[0])
                if pred==prod:
                    return 1
    return 0

Check reactions from Pistachio labelled as 'Diels-Alder' reactions:

In [3]:
df = pd.read_csv('pist-diels.txt')
n=0
for i,row in tqdm(df.iterrows(), total=df.shape[0]):
    val=check_dielsalder(row['src'],row['prod'])
    if val==1:
        n+=1
    
print("Number of simple Diels-Alder cycloadditions in Pistachio: "+str(n))

100%|██████████| 2397/2397 [00:06<00:00, 352.94it/s]

Number of simple Diels-Alder cycloadditions in Pistachio: 1493





Iterate over USPTO and count lines:

In [4]:
df = pd.read_csv('MIT_train.txt')

n=0
rxn_list = []
for i,row in tqdm(df.iterrows(), total=df.shape[0]):
    val=check_dielsalder(row['src'],row['prod'])
    if val==1:
        n+=1
        rxn_list.append(row['src']+'>>'+row['prod'])
print("Number of simple Diels-Alder cycloadditions in USPTO: "+str(n))
for rxn in rxn_list:
    print(rxn)

100%|██████████| 377419/377419 [08:33<00:00, 734.85it/s]

Number of simple Diels-Alder cycloadditions in USPTO: 7
C1=CCCCC=C1.ClCCl.O=C1C=CC(=O)C=C1>>O=C1C=CC(=O)C2C3C=CC(CCC3)C12
C=CC(=C)CCCCC.C=CC(=O)OCC.Cc1ccccc1.[Al+3].[Cl-].[Cl-].[Cl-]>>CCCCCC1=CCC(C(=O)OCC)CC1
C=CC(=C)Cl.C=CC(C)=O.ClCCl.O.[Al+3].[Cl-].[Cl-].[Cl-]>>CC(=O)C1CC=C(Cl)CC1
C=C(C)C=CC.C=Cc1ccccn1.CC1CCC(c2ccccn2)C(C)C1.c1ccncc1>>CC1=CC(C)C(c2ccccn2)CC1
C1=CCCC=C1.CN1C(=O)C=CS1=O.ClCCCl>>CN1C(=O)C2C3C=CC(CC3)C2S1=O
C1=CCCC=C1.O=C1C=CC(=O)C=C1.c1ccccc1>>O=C1C=CC(=O)C2C3C=CC(CC3)C12
C1=CCCC=C1.Cc1ccccc1.O=S(=O)(C=CS(=O)(=O)c1ccccc1)c1ccccc1>>O=S(=O)(c1ccccc1)C1C2C=CC(CC2)C1S(=O)(=O)c1ccccc1





# Friedel-Crafts Acylation
Define SMARTS patterns for ortho-, meta-, and para- acylations on Benzene rings (ignores heteroatoms and 5-membered rings etc):

In [5]:
para_rxn = AllChem.ReactionFromSmarts('[*:1]-!:[c:2]1[c:3][c:4][c:5][c:6][c:7]1.[C:8](=[O:9])~[*:10]>>[*:1]-!:[c:2]1[c:3][c:4][c:5]([C:8](=[O:9]))[c:6][c:7]1')
meta_rxn = AllChem.ReactionFromSmarts('[*:1]-!:[c:2]1[c:3][c:4][c:5][c:6][c:7]1.[C:8](=[O:9])~[*:10]>>[*:1]-!:[c:2]1[c:3][c:4]([C:8](=[O:9]))[c:5][c:6][c:7]1')
ortho_rxn = AllChem.ReactionFromSmarts('[*:1]-!:[c:2]1[c:3][c:4][c:5][c:6][c:7]1.[C:8](=[O:9])~[*:10]>>[*:1]-!:[c:2]1[c:3]([C:8](=[O:9]))[c:4][c:5][c:6][c:7]1')

def check_para(src,prod):
    src_list = src.split('.')
    prod = MolToSmiles(MolFromSmiles(prod),isomericSmiles=False)
    src_mols = [MolFromSmiles(mol) for mol in src_list]
    if None in src_mols:
        return 0
    for pair in itertools.permutations(src_mols, r=2):
        prods = para_rxn.RunReactants((pair[0],pair[1]))
        if prods!=():
            for mol in prods:
                pred = MolToSmiles(mol[0])
                if pred==prod:
                    return 1
    return 0

def check_ortho(src,prod):
    src_list = src.split('.')
    prod = MolToSmiles(MolFromSmiles(prod),isomericSmiles=False)
    src_mols = [MolFromSmiles(mol) for mol in src_list]
    if None in src_mols:
        return 0
    for pair in itertools.permutations(src_mols, r=2):
        prods = ortho_rxn.RunReactants((pair[0],pair[1]))
        if prods!=():
            for mol in prods:
                pred = MolToSmiles(mol[0])
                if pred==prod:
                    return 1
    return 0

def check_meta(src,prod):
    src_list = src.split('.')
    prod = MolToSmiles(MolFromSmiles(prod),isomericSmiles=False)
    src_mols = [MolFromSmiles(mol) for mol in src_list]
    if None in src_mols:
        return 0
    for pair in itertools.permutations(src_mols, r=2):
        prods = meta_rxn.RunReactants((pair[0],pair[1]))
        if prods!=():
            for mol in prods:
                pred = MolToSmiles(mol[0])
                if pred==prod:
                    return 1
    return 0

Check reactions from Pistachio labelled as 'Friedel-Crafts Acylation':

In [6]:
df = pd.read_csv('pist-friedel.txt')
df['para']=False
df['meta']=False
df['ortho']=False
n_para=0
n_meta=0
n_ortho=0
neither=0
for i,row in tqdm(df.iterrows(), total=df.shape[0]):
    para=check_para(row['src'],row['prod'])
    meta=check_meta(row['src'],row['prod'])
    ortho=check_ortho(row['src'],row['prod'])
    if para==1:
        n_para+=1
        df.loc[i, 'para']=True
    if meta==1:
        n_meta+=1
        df.loc[i, 'meta']=True
    if ortho==1:
        n_ortho+=1
        df.loc[i, 'ortho']=True
    if meta==0 and para==0 and ortho==0:
        neither+=1
        
print("Number of para-directing Friedel-Crafts in Pistachio: "+str(n_para))
print("Number of meta-directing Friedel-Crafts in Pistachio: "+str(n_meta))
print("Number of ortho-directing Friedel-Crafts in Pistachio: "+str(n_ortho))
print("Number of non-Benzene Friedel-Crafts in Pistachio: "+str(neither))

100%|██████████| 3592/3592 [00:23<00:00, 151.20it/s]

Number of para-directing Friedel-Crafts in Pistachio: 1534
Number of meta-directing Friedel-Crafts in Pistachio: 896
Number of ortho-directing Friedel-Crafts in Pistachio: 574
Number of non-Benzene Friedel-Crafts in Pistachio: 1888





### Para:

In [7]:
df_para = df[df['para']]
df_para = df_para[~df_para['meta']]
df_para = df_para[~df_para['ortho']]
print('Number of pure para rxn: {}\n'.format(len(df_para)))
print('Some examples:')
for i,row in df_para[:5].iterrows():
    print(row['src']+'>>'+row['prod'])

Number of pure para rxn: 609
Some examples:
Brc1cccc2c(cccc12)Br.O=C(Cl)CCl.Brc1cccc2c(cccc12)Br.ClCCCl.Cl[Al](Cl)Cl.O>>O=C(CCl)c1ccc(c2cccc(c12)Br)Br
C=C(C)C(=O)Cl.Brc1ccccc1.ClCCl.Cl[Al](Cl)Cl>>C=C(C)C(=O)c1ccc(cc1)Br
C=C(C)C(=O)Cl.Fc1ccccc1.Cl[Al](Cl)Cl>>C=C(C)C(=O)c1ccc(cc1)F
C=CC(=O)Cl.Brc1ccccc1.ClCCl.Cl[Al](Cl)Cl>>C=CC(=O)c1ccc(cc1)Br
C=CC(=O)Cl.CCc1ccccc1.ClCCl.Cl[Al](Cl)Cl>>C=CC(=O)c1ccc(cc1)CC


### Meta:

In [8]:
df_meta = df[df['meta']]
df_meta = df_meta[~df_meta['para']]
df_meta = df_meta[~df_meta['ortho']]
print('Number of pure meta: {}\n'.format(len(df_meta)))
print('Some examples:')
for i,row in df_meta[:5].iterrows():
    print(row['src']+'>>'+row['prod'])

Number of pure meta: 23
Some examples:
CC(=O)Cl.CC(=O)NCCc1cccc2ccc(cc12)OC.ClCCl.Cl[Al](Cl)Cl>>COc1ccc2cc(cc(c2c1)CCNC(C)=O)C(C)=O
CC(=O)Cl.CC(=O)NOCc1cccc2ccc(cc12)OC.[O-][N+](=O)c1ccccc1.Cl[Al](Cl)Cl>>CC(=O)NOCc1cc(cc2ccc(cc12)OC)C(C)=O
CC(=O)Cl.COc1ccc2cccc(c2c1)F.O.Cl.[O-][N+](=O)c1ccccc1.Cl[Al](Cl)Cl>>COc1ccc2cc(cc(c2c1)F)C(C)=O
CC(=O)NCCc1cccc2ccc(cc12)OC.O=C(Cl)c1ccccc1.[O-][N+](=O)c1ccccc1.Cl[Al](Cl)Cl>>COc1ccc2cc(cc(c2c1)CCNC(C)=O)C(=O)c1ccccc1
CC(C)(C)c1cc(cc(c1O)C(C)(C)C)CCC(=O)Cl.Oc1cccc(c1)O.Cl[Zn]Cl>>CC(C)(C)c1cc(cc(c1O)C(C)(C)C)CCC(=O)c1cc(cc(c1)O)O


### Ortho:

In [9]:
df_ortho = df[df['ortho']]
df_ortho = df_ortho[~df_ortho['para']]
df_ortho = df_ortho[~df_ortho['meta']]
print('Number of pure ortho: {}\n'.format(len(df_ortho)))
print('Some examples:')
for i,row in df_ortho[:5].iterrows():
    print(row['src']+'>>'+row['prod'])

Number of pure ortho: 28
Some examples:
C=CC(=O)Cl.c1ccc(cc1)-c1cccc2ccccc21.ClCCl.Cl[Al](Cl)Cl>>C=CC(=O)c1ccc2ccccc2c1-c1ccccc1
CC(=O)Cl.CC(=O)Oc1cc2CCc3ccccc3-c2c2ccc(cc12)OC.ClCCl.Cl[Al](Cl)Cl>>CC(=O)Oc1cc2CCc3ccccc3-c2c2cc(c(cc12)OC)C(C)=O
CC(=O)Cl.CCOC(=O)c1oc2cccc(c2c1C)O.Clc1ccccc1.Cl[Ti](Cl)(Cl)Cl>>CCOC(=O)c1oc2ccc(c(c2c1C)O)C(C)=O
CC(=O)Cl.CCOC(=O)c1oc2cccc(c2c1C)O.O.CC(Cl)Cl.Cl[Al](Cl)Cl>>CCOC(=O)c1oc2ccc(c(c2c1C)O)C(C)=O
CC(=O)Cl.COc1ccc(cc1)-c1ccccc1.O.Cl.S=C=S.Cl[Al](Cl)Cl>>COc1ccc(cc1)-c1ccccc1C(C)=O


Count occurences of overlaps:

In [10]:
df_ortho = df[df['ortho']]
df_para = df[df['para']]
df_meta = df[df['meta']]

print('Number of ortho-meta in Pistachio: {}'.format(len(df_ortho[~df_ortho['para'] & df_ortho['meta']])))
print('Number of ortho-para in Pistachio: {}'.format(len(df_ortho[~df_ortho['meta'] & df_ortho['para']])))
print('Number of para-meta in Pistachio: {}'.format(len(df_para[~df_para['ortho'] & df_para['meta']])))
print('Number of all three in Pistachio: {}'.format(len(df_ortho[df_ortho['para'] & df_ortho['meta']])))

Number of ortho-meta: 119
Number of ortho-para: 171
Number of para-meta: 498
Number of all three: 256


Check USPTO (takes ~1hr):

In [11]:
df = pd.read_csv('MIT_train.txt')

df['para']=False
df['meta']=False
df['ortho']=False

n_para=0
n_meta=0
n_ortho=0
neither=0
for i,row in tqdm(df.iterrows(), total=df.shape[0]):
    para=check_para(row['src'],row['prod'])
    meta=check_meta(row['src'],row['prod'])
    ortho=check_ortho(row['src'],row['prod'])
    if para==1:
        n_para+=1
        df.loc[i, 'para']=True
    if meta==1:
        n_meta+=1
        df.loc[i, 'meta']=True
    if ortho==1:
        n_ortho+=1
        df.loc[i, 'ortho']=True
    
print("Number of para-directing Friedel-Crafts in USPTO: "+str(n_para))
print("Number of meta-directing Friedel-Crafts in USPTO: "+str(n_meta))
print("Number of ortho-directing Friedel-Crafts in USPTO: "+str(n_ortho))

100%|██████████| 377419/377419 [1:15:24<00:00, 83.41it/s] 

Number of para-directing Friedel-Crafts in USPTO: 952
Number of meta-directing Friedel-Crafts in USPTO: 680
Number of ortho-directing Friedel-Crafts in USPTO: 615





Occurence counting:

In [12]:
df_ortho = df[df['ortho']]
df_para = df[df['para']]
df_meta = df[df['meta']]

print('Number of ortho-meta in USPTO: {}'.format(len(df_ortho[~df_ortho['para'] & df_ortho['meta']])))
print('Number of ortho-para in USPTO: {}'.format(len(df_ortho[~df_ortho['meta'] & df_ortho['para']])))
print('Number of para-meta in USPTO: {}'.format(len(df_para[~df_para['ortho'] & df_para['meta']])))
print('Number of all three in USPTO: {}'.format(len(df_ortho[df_ortho['para'] & df_ortho['meta']])))

df_para = df_para[~df_para['meta']]
df_para = df_para[~df_para['ortho']]
print('Number of pure para in USPTO: {}'.format(len(df_para)))
df_meta = df_meta[~df_meta['para']]
df_meta = df_meta[~df_meta['ortho']]
print('Number of pure meta in USPTO: {}'.format(len(df_meta)))
df_ortho = df_ortho[~df_ortho['para']]
df_ortho = df_ortho[~df_ortho['meta']]
print('Number of pure ortho in USPTO: {}'.format(len(df_ortho)))


Number of ortho-meta in USPTO: 219
Number of ortho-para in USPTO: 151
Number of para-meta in USPTO: 261
Number of all three in USPTO: 193
Number of pure para in USPTO: 347
Number of pure meta in USPTO: 7
Number of pure ortho in USPTO: 52
