In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from itertools import combinations_with_replacement
import pandas as pd
import random
import itertools
import re
from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions

In [2]:
# Candidate substituents for every open position of a scaffold, written as
# SMILES fragments. The trailing None is a non-SMILES sentinel entry.
subs_list_LR = [
    'C',
    'F',
    'Cl',
    'Br',
    'C#N',
    'C(=O)OC',
    'C(=O)C',
    'C(=O)NC',
    'c1ccccc1',
    'N',
    'C(F)(F)F',
    None,
]

In [3]:
# auxiliary functions
def generate_dipolarophiles(smiles):
    """Round-trip a SMILES string through RDKit.

    The string is parsed with ``Chem.MolFromSmiles`` and the resulting
    molecule is written back out with ``Chem.MolToSmiles``.
    """
    return Chem.MolToSmiles(Chem.MolFromSmiles(smiles))

def single_edit_mol(mol, label, subs):
    """Swap or remove one placeholder fragment in ``mol``.

    Args:
        mol: RDKit molecule that contains the placeholder.
        label: SMILES of the placeholder fragment (e.g. ``'[Ti]'``).
        subs: SMILES of the fragment to put in its place, or ``None`` to
            delete the placeholder instead of replacing it.

    Returns:
        The edited RDKit molecule.
    """
    placeholder = Chem.MolFromSmiles(label)
    # Identity comparison is the reliable way to test for the None sentinel.
    if subs is not None:
        # ReplaceSubstructs returns a tuple of results; the first one is kept.
        mod_mol = Chem.ReplaceSubstructs(mol, placeholder, Chem.MolFromSmiles(subs))[0]
    else:
        mod_mol = Chem.DeleteSubstructs(mol, placeholder)
    return mod_mol

def modify_mol(dipole, subs_comb_LR, labels):
    """Apply one substituent per placeholder label to a scaffold SMILES.

    ``subs_comb_LR[k]`` is applied to the placeholder ``labels[k]`` through
    ``single_edit_mol``, in order of increasing ``k``. The edited molecule is
    then written to SMILES and parsed again; the re-parsed molecule is
    returned.
    """
    edited = single_edit_mol(Chem.MolFromSmiles(dipole), labels[0], subs_comb_LR[0])
    for position in range(1, len(subs_comb_LR)):
        edited = single_edit_mol(edited, labels[position], subs_comb_LR[position])

    round_trip_smiles = Chem.MolToSmiles(edited)
    return Chem.MolFromSmiles(round_trip_smiles)

In [4]:
# generate all ethylene-based dipolarophiles
dipolarophile = 'C(*)(*)=C(*)(*)'
labels = ['[Ti]', '[Cr]', '[Mn]', '[Fe]']
connectable_substituents = set(['C', 'C(=O)OC', 'C(=O)C', 'C(=O)NC', 'c1ccccc1', 'N'])
generated_full_dipolarophiles = []

# Raw string: '\(' and '\*' are regex escapes, not valid Python string escapes.
valency_indices = [valency.start() for valency in re.finditer(r'\(\*\)', dipolarophile)]
# Give each open position its own placeholder label so it can be edited individually.
for i in range(len(valency_indices)):
    dipolarophile = dipolarophile.replace('*', labels[i], 1)
substituent_combs = itertools.product(subs_list_LR, repeat=len(valency_indices))
for subs_comb in substituent_combs:
    # make sure at least one substituent is connectable
    has_connectable = not connectable_substituents.isdisjoint(subs_comb)
    # make sure there are only two different types of substituents
    # (four positions, so "length minus two" distinct entries means exactly two)
    has_two_kinds = len(set(subs_comb)) == len(subs_comb) - 2
    if has_connectable and has_two_kinds:
        generated_full_dipolarophiles.append(modify_mol(dipolarophile, subs_comb, labels))

In [5]:
# Deduplicate the generated molecules by SMILES, then collect the SMILES of
# every enumerated stereoisomer of each unique structure.
full_dipolarophile_set = {Chem.MolToSmiles(mol) for mol in generated_full_dipolarophiles}
dipolarophiles_ethylene = set()

for full_dipolarophile in full_dipolarophile_set:
    parsed = Chem.MolFromSmiles(full_dipolarophile)
    dipolarophiles_ethylene.update(
        Chem.MolToSmiles(isomer) for isomer in EnumerateStereoisomers(parsed)
    )

print(len(dipolarophiles_ethylene))

255


In [6]:
# generate all acetylene-based dipolarophiles
dipolarophile = 'C(*)#C(*)'
connectable_substituents = set(['C', 'C(=O)OC', 'C(=O)C', 'C(=O)NC', 'c1ccccc1', 'N'])
generated_full_dipolarophiles = []

# Raw string: '\(' and '\*' are regex escapes, not valid Python string escapes.
valency_indices = [valency.start() for valency in re.finditer(r'\(\*\)', dipolarophile)]
# Give each open position its own placeholder label so it can be edited individually.
for i in range(len(valency_indices)):
    dipolarophile = dipolarophile.replace('*', labels[i], 1)
substituent_combs = itertools.product(subs_list_LR, repeat=len(valency_indices))
for subs_comb in substituent_combs:
    # make sure at least one substituent is connectable
    if not connectable_substituents.isdisjoint(subs_comb):
        generated_full_dipolarophiles.append(modify_mol(dipolarophile, subs_comb, labels))

In [7]:
# Deduplicate the generated molecules by SMILES, then collect the SMILES of
# every enumerated stereoisomer of each unique structure.
full_dipolarophile_set = {Chem.MolToSmiles(mol) for mol in generated_full_dipolarophiles}
dipolarophiles_acetylene = set()

for full_dipolarophile in full_dipolarophile_set:
    parsed = Chem.MolFromSmiles(full_dipolarophile)
    dipolarophiles_acetylene.update(
        Chem.MolToSmiles(isomer) for isomer in EnumerateStereoisomers(parsed)
    )

print(len(dipolarophiles_acetylene))

57


In [8]:
# generate all norbornene-based dipolarophiles
dipolarophile = 'C(*)1=C(*)C2CCC1C2'
# Not used below: every substituent combination is kept for this scaffold.
connectable_substituents = set(['C', 'C(=O)OC', 'C(=O)C', 'C(=O)NC', 'c1ccccc1', 'N'])
generated_full_dipolarophiles = []

# Raw string: '\(' and '\*' are regex escapes, not valid Python string escapes.
valency_indices = [valency.start() for valency in re.finditer(r'\(\*\)', dipolarophile)]
# Give each open position its own placeholder label so it can be edited individually.
for i in range(len(valency_indices)):
    dipolarophile = dipolarophile.replace('*', labels[i], 1)
substituent_combs = itertools.product(subs_list_LR, repeat=len(valency_indices))
for subs_comb in substituent_combs:
    generated_full_dipolarophiles.append(modify_mol(dipolarophile, subs_comb, labels))

In [9]:
# Deduplicate the generated molecules by SMILES; no stereoisomer expansion is
# done for this scaffold, so the result is a plain copy of the unique set.
full_dipolarophile_set = {Chem.MolToSmiles(mol) for mol in generated_full_dipolarophiles}
dipolarophiles_norbornene = set(full_dipolarophile_set)

print(len(dipolarophiles_norbornene))

78


In [10]:
# generate all oxo-norbornadiene-based dipolarophiles
dipolarophile = 'C(*)1=C(*)C2C=CC1O2'
generated_full_dipolarophiles = []

# Raw string: '\(' and '\*' are regex escapes, not valid Python string escapes.
valency_indices = [valency.start() for valency in re.finditer(r'\(\*\)', dipolarophile)]
# Give each open position its own placeholder label so it can be edited individually.
for i in range(len(valency_indices)):
    dipolarophile = dipolarophile.replace('*', labels[i], 1)
substituent_combs = itertools.product(subs_list_LR, repeat=len(valency_indices))

# Every substituent combination is kept for this scaffold.
for subs_comb in substituent_combs:
    generated_full_dipolarophiles.append(modify_mol(dipolarophile, subs_comb, labels))

In [11]:
# Deduplicate the generated molecules by SMILES; no stereoisomer expansion is
# done for this scaffold, so the result is a plain copy of the unique set.
full_dipolarophile_set = {Chem.MolToSmiles(mol) for mol in generated_full_dipolarophiles}
dipolarophiles_oxonorbornadiene = set(full_dipolarophile_set)

print(len(dipolarophiles_oxonorbornadiene))

78


In [12]:
# generate all cyclooctyne-based dipolarophiles
# (plain string: there are no placeholders, so no f-prefix is needed)
dipolarophile = 'C1CCC(*)(*)C#CC(*)(*)C1'
generated_full_dipolarophiles = []

# Raw string: '\(' and '\*' are regex escapes, not valid Python string escapes.
valency_indices = [valency.start() for valency in re.finditer(r'\(\*\)', dipolarophile)]
# Give each open position its own placeholder label so it can be edited individually.
for i in range(len(valency_indices)):
    dipolarophile = dipolarophile.replace('*', labels[i], 1)
substituent_combs = itertools.product(subs_list_LR, repeat=len(valency_indices))

for subs_comb in substituent_combs:
    # Keep combinations in which at least one of the two substituted carbons
    # carries two identical substituents.
    # NOTE(review): the original comment said this makes the reactant achiral,
    # but with `or` the other carbon may still carry two different
    # substituents -- confirm whether `and` was intended. Logic left unchanged.
    if subs_comb[0] == subs_comb[1] or subs_comb[2] == subs_comb[3]:
        generated_full_dipolarophiles.append(modify_mol(dipolarophile, subs_comb, labels))

In [14]:
# Deduplicate the generated molecules by SMILES; no stereoisomer expansion is
# done for this scaffold, so the result is a plain copy of the unique set.
full_dipolarophile_set = {Chem.MolToSmiles(mol) for mol in generated_full_dipolarophiles}
dipolarophiles_cyclooctyne = set(full_dipolarophile_set)

print(len(dipolarophiles_cyclooctyne))

870


In [15]:
# turn the SMILES sets into dataframes
# A set of strings has no stable iteration order from one interpreter run to
# the next (string hashing is randomised), so sort first to make the row
# order of each frame reproducible.
df_ethylene = pd.DataFrame(sorted(dipolarophiles_ethylene))
df_acetylene = pd.DataFrame(sorted(dipolarophiles_acetylene))
df_norbornene = pd.DataFrame(sorted(dipolarophiles_norbornene))
df_oxonorbornadiene = pd.DataFrame(sorted(dipolarophiles_oxonorbornadiene))
df_cyclooctyne = pd.DataFrame(sorted(dipolarophiles_cyclooctyne))

In [16]:
# sample from the dataframes
# Sampling is done with replacement, so asking for more rows than a frame
# holds is allowed and duplicate rows are expected. A fixed seed makes the
# draw repeatable for a given row order of the input frame; each family gets
# its own seed so that equally sized frames are not sampled at identical
# row positions.
SAMPLE_SEED = 0
df_ethylene_sample = df_ethylene.sample(n=200, replace=True, random_state=SAMPLE_SEED)
df_acetylene_sample = df_acetylene.sample(n=200, replace=True, random_state=SAMPLE_SEED + 1)
df_norbornene_sample = df_norbornene.sample(n=300, replace=True, random_state=SAMPLE_SEED + 2)
df_oxonorbornadiene_sample = df_oxonorbornadiene.sample(n=300, replace=True, random_state=SAMPLE_SEED + 3)
df_cyclooctyne_sample = df_cyclooctyne.sample(n=500, replace=True, random_state=SAMPLE_SEED + 4)

In [17]:
# Stack the per-family frames row-wise: once for the sampled rows and once
# for the complete sets. The original row labels of each frame are kept.
sampled_frames = [
    df_ethylene_sample,
    df_acetylene_sample,
    df_norbornene_sample,
    df_oxonorbornadiene_sample,
    df_cyclooctyne_sample,
]
full_frames = [
    df_ethylene,
    df_acetylene,
    df_norbornene,
    df_oxonorbornadiene,
    df_cyclooctyne,
]
df_sample = pd.concat(sampled_frames)
df = pd.concat(full_frames)

In [19]:
# Row counts of the full table and of the sampled table.
print(len(df.index), len(df_sample.index))

1338 1500


In [None]:
# Write the sampled table and the full table to CSV files (relative paths).
for frame, csv_path in ((df_sample, 'dipolarophiles_sample.csv'), (df, 'dipolarophiles.csv')):
    frame.to_csv(csv_path)