In [4]:
! pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.4


In [5]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from itertools import combinations_with_replacement
import pandas as pd
import random
import itertools
import re
from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions
from rdkit.Chem import rdRGroupDecomposition

In [6]:
# substituent list
subs_list_LR = ['C', 'F', 'Cl', 'Br', 'C#N', 'C(=O)OC', 'C(=O)C', 'C(=O)NC',
                'c1ccccc1', 'OC', 'C(F)(F)F', None]

In [58]:
# auxiliary functions
def generate_dipolarophiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol)

def single_edit_mol(mol, label, subs):
    if subs != None:
        mod_mol = Chem.ReplaceSubstructs(mol, Chem.MolFromSmiles(label), Chem.MolFromSmiles(subs))[0]
    else:
        mod_mol = Chem.DeleteSubstructs(mol, Chem.MolFromSmiles(label))
    return mod_mol

def modify_mol(dipole, subs_comb_LR, labels):
    mol = Chem.MolFromSmiles(dipole)
    for i, subs in enumerate(subs_comb_LR):
        mol = single_edit_mol(mol, labels[i], subs)

    return Chem.MolFromSmiles(Chem.MolToSmiles(mol))

def unmap_smiles(smiles):
    """Unmap atoms of SMILES"""
    mol = Chem.MolFromSmiles(smiles)
    [atom.SetAtomMapNum(0) for atom in mol.GetAtoms()]

    return Chem.MolToSmiles(mol)

In [55]:
# generate all acetylene-based dipolarophiles
dipolarophile = 'C(*)#C(*)'
labels = ['[Ti]', '[Cr]', '[Mn]', '[Fe]'] # make easy the replacement
connectable_substituents = set(['C', 'C(=O)OC', 'C(=O)C', 'C(=O)NC', 'c1ccccc1', 'OC'])
generated_full_dipolarophiles = []

valency_indices = [valency.start() for valency in re.finditer('\(\*\)', dipolarophile)]
for i in range(len(valency_indices)):
  dipolarophile = dipolarophile.replace('*', labels[i], 1)
substituent_combs = itertools.product(subs_list_LR, repeat = len(valency_indices))
for subs_comb in substituent_combs:
  generated_full_dipolarophiles.append(modify_mol(dipolarophile, subs_comb, labels))

In [None]:
Chem.Draw.MolsToGridImage(
    generated_full_dipolarophiles, molsPerRow=10, subImgSize=(250, 250), maxMols=150
)

In [7]:
df = pd.read_csv("data_smiles_curated.csv", sep=';')

In [77]:
rxns = df['rxn_smiles'].tolist()
subs_list = []
for rxn in rxns:
  reactants = rxn.split(">>")[0].split('.')
  for reac in reactants:
    mol = Chem.MolFromSmiles(reac)
    patt = Chem.MolFromSmarts('C=CC=C')  # diene core
    if mol.HasSubstructMatch(patt):
      gs, _ = rdRGroupDecomposition.RGroupDecompose([patt], [mol], asSmiles=True)
    subs = [unmap_smiles(gs[0][key]) for key in gs[0].keys() if key != 'Core']
    subs = list(set(subs))
  [subs_list.append(sub) for sub in subs]



In [79]:
list(set(subs_list))

['*C(=N[N+]#N)C(F)=C(*)F',
 '*CCC*',
 '*/C=C/C.*Sc1ccccc1',
 '*CCCC1=CC(=O)CCC1',
 '*/C(C)=C/C',
 '*/C=C/C(=O)O[C@H](CC)[C@H](C)/C=C(C)/C=C/[C@H](O)C[C@H](O)C*',
 '*COB(/C=C/c1ccccc1)OC',
 '*C(*)=C(C)C',
 '*CCC[C@@H](C)/C=C1/C(=O)NC=C(c2ccccc2)C1=O',
 '*S(*)(=O)=O',
 '*[CH-]/C(*)=C(\\C)C#[O+]',
 '*CCCN(CC*)c1ccccc1',
 '*C=CC(*)=C(C#N)C(=O)OC',
 '*C(=O)O',
 '*OC',
 '*CCS(=O)(=O)CC*',
 '*[C@H](C[C@H](C/C=C(/C)C(=O)OC)OC)OC',
 '*C(C)(C)C',
 '*C(=O)C(*)=O',
 '*O/C([O-])=C(/[N+]#N)c1ccccc1.*c1ccccc1',
 '*COC/C=C\\COC*',
 '*CN(C)CC#CCN(C)C*',
 '*O/C([O-])=C(\\[N+]#N)c1ccccc1.*c1ccccc1',
 '*C.*C[C@H](C)CC/C=C/C(=O)C1=C[C@H](C)NC1=O',
 '*C[C@@H](C)C[C@@H](C)/C=C/C(C)=O',
 '*/C=C/C.*C',
 '*C(=O)OC/C(*)=N\\C',
 '*/C=C/C.*C(=O)OC',
 '*C',
 '*/C=C/CCCC/C=C/C=C/CCC*',
 '*C(=O)CCC/C(*)=N\\C',
 '*OC.*[O-]',
 '*[C@@H](C)[Si](C)(C)C',
 '*COCC#CC',
 '*C=CC(*)=C(C#N)C#N',
 '*/C=C/C[C@@H](O)C/C=C/C=C/C(=O)[C@H](C)[C@@H](O)CCC[C@H](CC)OC(*)=O',
 '*/C(C)=C/[C@@H]1[C@@H](O)C[C@@H](C)N1C(=O)/C=C/C=C/C(C)=C/[C