In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from itertools import combinations_with_replacement
import pandas as pd
import random
import itertools
import re
from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions

In [3]:
# substituent lists
# Candidates placed on the open '(*)' positions of a scaffold (LR presumably
# stands for the left/right termini). None means "no substituent": the
# placeholder is swapped for a hydrogen that is then stripped again.
subs_list_LR = [
    'C',
    'C#N',
    'C(=O)OC',
    'C(=O)C',
    'C(=O)NC',
    'c1ccccc1',
    None,
]
# Candidates placed on the '[Sc]' placeholder of a scaffold (M presumably
# stands for the middle atom).
subs_list_M = ['C', None]

In [4]:
# auxiliary functions
def generate_dipoles(smiles):
    """Round-trip a SMILES string through RDKit.

    The string is parsed into a molecule and immediately written back out,
    so the return value is RDKit's own SMILES for the same structure.

    Args:
        smiles: SMILES string to parse.

    Returns:
        The SMILES string produced by ``Chem.MolToSmiles``.
    """
    return Chem.MolToSmiles(Chem.MolFromSmiles(smiles))

def single_edit_mol(mol, label, subs):
    """Swap one placeholder atom of ``mol`` for a substituent.

    Args:
        mol: RDKit molecule containing the placeholder.
        label: SMILES of the placeholder to look for (e.g. ``'[Ti]'``).
        subs: SMILES of the substituent to insert, or ``None`` to leave the
            position unsubstituted.

    Returns:
        The first molecule returned by ``Chem.ReplaceSubstructs``. When
        ``subs`` is ``None`` the placeholder is replaced by ``[H]`` and the
        result is passed through ``Chem.RemoveHs``.
    """
    # None is the "no substituent" marker; an explicit [H] stands in for it.
    replacement_smiles = subs if subs is not None else '[H]'
    mod_mol = Chem.ReplaceSubstructs(
        mol,
        Chem.MolFromSmiles(label),
        Chem.MolFromSmiles(replacement_smiles),
    )[0]
    if subs is None:
        # strip the hydrogen that was only inserted to fill the placeholder
        mod_mol = Chem.RemoveHs(mod_mol)
    return mod_mol

def modify_mol(dipole, subs_comb_LR, subs_M, labels):
    """Decorate a labelled dipole scaffold with one set of substituents.

    Args:
        dipole: scaffold SMILES whose open positions already carry the
            placeholder atoms listed in ``labels``.
        subs_comb_LR: substituents for the labelled positions; entry ``i`` is
            applied to the placeholder ``labels[i]``. Must hold at least one
            entry.
        subs_M: substituent for the ``[Sc]`` placeholder; only used when the
            scaffold string contains ``'Sc'``.
        labels: placeholder atom SMILES, indexed like ``subs_comb_LR``.

    Returns:
        The molecule obtained by writing the edited structure to SMILES and
        parsing it again (i.e. whatever ``Chem.MolFromSmiles`` returns for it).
    """
    mod_mol = Chem.MolFromSmiles(dipole)

    # the central-atom placeholder, when present, is handled first
    if 'Sc' in dipole:
        mod_mol = single_edit_mol(mod_mol, '[Sc]', subs_M)

    # first labelled position is mandatory, the remaining ones follow in order
    mod_mol = single_edit_mol(mod_mol, labels[0], subs_comb_LR[0])
    for offset, substituent in enumerate(subs_comb_LR[1:], start=1):
        mod_mol = single_edit_mol(mod_mol, labels[offset], substituent)

    round_tripped_smiles = Chem.MolToSmiles(mod_mol)
    return Chem.MolFromSmiles(round_tripped_smiles)

In [5]:
# construct allyl-type dipoles
# Building blocks for "<left>=<middle><right>". O is not offered as the left
# terminus because it would leave no connection site there.
left_termini = ['C(*)(*)', 'N(*)']
central_atoms = ['[O+]', '[N+]([Sc])']
right_termini = ['[O-]', '[C-](*)(*)', '[N-](*)']

dipole_scaffolds = [
    f'{left}={middle}{right}'
    for left, middle, right in itertools.product(left_termini, central_atoms, right_termini)
]

# remove pseudo-duplicates (the resonance structure can be "pushed" to the
# other side, which gives a scaffold that is already in the list)
for mirrored_scaffold in ('N(*)=[N+]([Sc])[C-](*)(*)', 'N(*)=[O+][C-](*)(*)'):
    dipole_scaffolds.remove(mirrored_scaffold)

print(len(dipole_scaffolds))
print(dipole_scaffolds)

10
['C(*)(*)=[O+][O-]', 'C(*)(*)=[O+][C-](*)(*)', 'C(*)(*)=[O+][N-](*)', 'C(*)(*)=[N+]([Sc])[O-]', 'C(*)(*)=[N+]([Sc])[C-](*)(*)', 'C(*)(*)=[N+]([Sc])[N-](*)', 'N(*)=[O+][O-]', 'N(*)=[O+][N-](*)', 'N(*)=[N+]([Sc])[O-]', 'N(*)=[N+]([Sc])[N-](*)']


In [6]:
# Placeholder atoms: the i-th '*' of a scaffold is rewritten to labels[i] so
# that every open position can be substituted individually later on.
labels = ['[Ti]', '[Cr]', '[Mn]', '[Fe]']
# Every generated dipole must carry at least one of these substituents.
connectable_substituents = {'C', 'C(=O)OC', 'C(=O)C', 'C(=O)NC', 'c1ccccc1'}
generated_full_dipoles = []

for dipole in dipole_scaffolds:
    # Raw string: '\(' and '\*' are not valid escapes in a plain string literal.
    n_open_valencies = len(re.findall(r'\(\*\)', dipole))
    for i in range(n_open_valencies):
        dipole = dipole.replace('*', labels[i], 1)

    for subs_comb in itertools.product(subs_list_LR, repeat=n_open_valencies):
        # make sure at least one substituent is connectable
        if connectable_substituents.isdisjoint(subs_comb):
            continue
        # NOTE: scaffolds without '[Sc]' ignore subs_M, so this inner loop
        # produces the same molecule once per entry of subs_list_M for them.
        for subs_M in subs_list_M:
            generated = modify_mol(dipole, subs_comb, subs_M, labels)
            if generated is not None:
                generated_full_dipoles.append(generated)
            else:
                # modify_mol handed back None (its final SMILES re-parse gave
                # no molecule); report it instead of storing an entry that
                # would break the later SMILES conversion.
                print(dipole, subs_comb)



















































In [7]:
# number of generated molecules before de-duplication
print(len(generated_full_dipoles))

# collapse the molecules to their unique SMILES strings
full_dipole_set = {Chem.MolToSmiles(mol) for mol in generated_full_dipoles}
dipoles_double = set(full_dipole_set)

print(len(dipoles_double))

11260
3120


In [8]:
# construct propargyl-type dipoles
# Scaffolds follow "<left>#<middle><right>". O is not offered as the left
# terminus because it would leave no connection site there.
dipole_scaffolds2 = [
    f'{left}#{middle}{right}'
    for left in ['C(*)', 'N']
    for middle in ['[N+]']
    for right in ['[O-]', '[C-](*)(*)', '[N-](*)']
]

# this combination contains no '(*)' position at all, so it is dropped
dipole_scaffolds2.remove('N#[N+][O-]')
print(len(dipole_scaffolds2))

5


In [9]:
generated_full_dipoles2 = []

for dipole in dipole_scaffolds2:
    # Raw string: '\(' and '\*' are not valid escapes in a plain string literal.
    n_open_valencies = len(re.findall(r'\(\*\)', dipole))
    # tag the i-th open position with the placeholder atom labels[i]
    for i in range(n_open_valencies):
        dipole = dipole.replace('*', labels[i], 1)

    for subs_comb in itertools.product(subs_list_LR, repeat=n_open_valencies):
        # make sure at least one substituent is connectable
        if connectable_substituents.isdisjoint(subs_comb):
            continue
        # NOTE: scaffolds without '[Sc]' ignore subs_M, so this inner loop
        # produces the same molecule once per entry of subs_list_M for them.
        for subs_M in subs_list_M:
            generated = modify_mol(dipole, subs_comb, subs_M, labels)
            if generated is not None:
                generated_full_dipoles2.append(generated)
            else:
                # modify_mol handed back None (its final SMILES re-parse gave
                # no molecule); report it instead of storing an entry that
                # would break the later SMILES conversion.
                print(dipole, subs_comb)





In [10]:
# unique SMILES of the generated propargyl-type dipoles
full_dipole_set2 = {Chem.MolToSmiles(mol) for mol in generated_full_dipoles2}

# expand every unique dipole into its enumerated stereoisomers and pool the
# resulting SMILES strings in one set
dipoles_triple = set()
for parent_smiles in full_dipole_set2:
    parent_mol = Chem.MolFromSmiles(parent_smiles)
    dipoles_triple.update(
        Chem.MolToSmiles(isomer) for isomer in EnumerateStereoisomers(parent_mol)
    )

print(len(dipoles_triple))

270


In [11]:
# construct cyclic dipoles
# Scaffolds follow "<left>2=<middle><right>C(=O)O2"; the ring-closure digit 2
# ties the left atom to the trailing oxygen. O is not offered as the left atom.
ring_left_atoms = ['C(*)', 'N']
ring_central_atoms = ['[O+]', '[N+]([Sc])']
ring_right_atoms = ['[C-](*)', '[N-]']

dipole_scaffolds3 = [
    f'{left}2={middle}{right}C(=O)O2'
    for left, middle, right in itertools.product(
        ring_left_atoms, ring_central_atoms, ring_right_atoms
    )
]

print(len(dipole_scaffolds3))

8


In [12]:
generated_full_dipoles3 = []

for dipole in dipole_scaffolds3:
    # Raw string: '\(' and '\*' are not valid escapes in a plain string literal.
    n_open_valencies = len(re.findall(r'\(\*\)', dipole))
    # tag the i-th open position with the placeholder atom labels[i]
    for i in range(n_open_valencies):
        dipole = dipole.replace('*', labels[i], 1)

    # Scaffolds without any open position yield only the empty combination,
    # which is skipped by the connectability check below.
    for subs_comb in itertools.product(subs_list_LR, repeat=n_open_valencies):
        # make sure at least one substituent is connectable
        if connectable_substituents.isdisjoint(subs_comb):
            continue
        for subs_M in subs_list_M:
            generated = modify_mol(dipole, subs_comb, subs_M, labels)
            if generated is not None:
                generated_full_dipoles3.append(generated)
            else:
                # report combinations for which no molecule came back
                print(dipole, subs_comb)



In [15]:
# unique SMILES of the generated cyclic dipoles
full_dipole_set3 = {Chem.MolToSmiles(mol) for mol in generated_full_dipoles3}

In [16]:
# the cyclic dipoles are taken over as-is: an independent copy of the unique set
dipoles_ring = set(full_dipole_set3)

print(len(dipoles_ring))

165


In [14]:
# turn the SMILES sets into dataframes
# Sets of strings have no stable iteration order between interpreter sessions
# (string hashing is randomised), so the rows are sorted first. Without this,
# the seeded `.sample(random_state=...)` calls further down would select
# different molecules on every kernel restart.
df_double = pd.DataFrame(sorted(dipoles_double))
df_triple = pd.DataFrame(sorted(dipoles_triple))
df_ring = pd.DataFrame(sorted(dipoles_ring))

In [15]:
# sample from the dataframes - synthetic
# Rows are drawn with replacement (so a dipole can appear more than once),
# using the same fixed seed for all three dipole classes.
df_double_sample, df_triple_sample, df_ring_sample = [
    frame.sample(n=n_rows, replace=True, random_state=5)
    for frame, n_rows in [(df_double, 1000), (df_triple, 300), (df_ring, 200)]
]

In [16]:
# concatenate
# stack the three samples into one frame ...
df_sample = pd.concat([df_double_sample, df_triple_sample, df_ring_sample])
# ... and likewise the three complete dipole tables
df = pd.concat([df_double, df_triple, df_ring])

In [17]:
# total number of rows in the combined dipole table
print(df.shape[0])

3555


In [None]:
# write the sampled and the complete dipole tables to CSV files in the
# current working directory
for frame, csv_name in ((df_sample, 'dipoles_sample.csv'), (df, 'dipoles.csv')):
    frame.to_csv(csv_name)

In [18]:
# sample from the dataframes - second draw (labelled "bio" by the output file
# it feeds; presumably for the bio-fragment set - confirm)
# Same sizes and with-replacement sampling as the first draw, different seed.
df_double_sample2, df_triple_sample2, df_ring_sample2 = [
    frame.sample(n=n_rows, replace=True, random_state=6)
    for frame, n_rows in [(df_double, 1000), (df_triple, 300), (df_ring, 200)]
]

In [19]:
# Stack the second draw into one frame and export it.
# NOTE: this rebinds df_sample, replacing the frame built from the first draw.
bio_sample_frames = [df_double_sample2, df_triple_sample2, df_ring_sample2]
df_sample = pd.concat(bio_sample_frames)
df_sample.to_csv('dipoles_sample_ultimate_bio.csv')