Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding Stereoisomer Enumeration #21

Merged
merged 14 commits into from
Mar 14, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 79 additions & 34 deletions easydock/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from easydock.auxiliary import take
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions


def create_db(db_fname, args, args_to_save=(), config_args_to_save=('protein', 'protein_setup'), unique_smi=False):
Expand All @@ -32,7 +33,8 @@ def create_db(db_fname, args, args_to_save=(), config_args_to_save=('protein', '
cur = conn.cursor()
cur.execute(f"""CREATE TABLE IF NOT EXISTS mols
(
id TEXT PRIMARY KEY,
id TEXT,
stereo_id TEXT,
smi TEXT {'UNIQUE' if unique_smi else ''},
smi_protonated TEXT,
source_mol_block TEXT,
Expand All @@ -41,7 +43,8 @@ def create_db(db_fname, args, args_to_save=(), config_args_to_save=('protein', '
pdb_block TEXT,
mol_block TEXT,
dock_time REAL,
time TEXT
time TEXT,
PRIMARY KEY (id, stereo_id)
)""")

# this will create a setup table with the first item in YAML format which contains all input args, and additional
Expand Down Expand Up @@ -136,6 +139,18 @@ def restore_setup_from_db(db_fname):

return d, tmpfiles

def get_isomers(mol):
opts = StereoEnumerationOptions(tryEmbedding=True,maxIsomers=32,rand=0xf00d)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

max_isomers should be added to arguments and elevated to the level of script arguments (argparse arguments) with the default value 1.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add please spaces after the commas

# this is a workaround for rdkit issue - if a double bond has STEREOANY it will cause errors at
# stereoisomer enumeration, we replace STEREOANY with STEREONONE in these cases
try:
isomers = tuple(EnumerateStereoisomers(mol, options=opts))
except RuntimeError:
for bond in mol[1].GetBonds():
if bond.GetStereo() == Chem.BondStereo.STEREOANY:
bond.SetStereo(Chem.rdChem.BondStereo.STEREONONE)
isomers = tuple(EnumerateStereoisomers(mol,options=opts))
return isomers

def init_db(db_fname, input_fname, prefix=None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

max_isomers should be added to arguments with the default value 1


Expand All @@ -144,15 +159,17 @@ def init_db(db_fname, input_fname, prefix=None):
data_smi = [] # non 3D structures
data_mol = [] # 3D structures
for mol, mol_name in read_input.read_input(input_fname):
smi = Chem.MolToSmiles(mol, isomericSmiles=True)
if prefix:
mol_name = f'{prefix}-{mol_name}'
if mol_is_3d(mol):
data_mol.append((mol_name, smi, Chem.MolToMolBlock(mol)))
else:
data_smi.append((mol_name, smi))
cur.executemany(f'INSERT INTO mols (id, smi) VALUES(?, ?)', data_smi)
cur.executemany(f'INSERT INTO mols (id, smi, source_mol_block) VALUES(?, ?, ?)', data_mol)
isomers = get_isomers(mol)
for stereo_index, stereo_mol in enumerate(isomers):
smi = Chem.MolToSmiles(stereo_mol, isomericSmiles=True)
if prefix:
mol_name = f'{prefix}-{mol_name}'
if mol_is_3d(mol):
data_mol.append((mol_name, stereo_index, smi, Chem.MolToMolBlock(stereo_mol)))
else:
data_smi.append((mol_name, stereo_index, smi))
Copy link
Contributor

@DrrDom DrrDom Feb 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

        if prefix:
            mol_name = f'{prefix}-{mol_name}'
        if mol_is_3d(mol):
            data_mol.append((mol_name, 0, smi, Chem.MolToMolBlock(stereo_mol)))
        else:
            isomers = get_isomers(mol, max_isomers=max_isomers)
            for stereo_index, stereo_mol in enumerate(isomers):
                smi = Chem.MolToSmiles(stereo_mol, isomericSmiles=True)
                data_smi.append((mol_name, stereo_index, smi))
  1. Enumeration is only needed if input is not a 3D molecule
  2. I added max_isomers to input arguments
  3. I added 0 as a default stereo_id value

cur.executemany(f'INSERT INTO mols (id, stereo_id, smi) VALUES(?, ?, ?)', data_smi)
cur.executemany(f'INSERT INTO mols (id, stereo_id, smi, source_mol_block) VALUES(?, ?, ?, ?)', data_mol)
conn.commit()


Expand All @@ -166,7 +183,7 @@ def get_protonation_arg_value(db_conn):
return not d['no_protonation']


def update_db(db_conn, mol_id, data, table_name='mols', commit=True):
def update_db(db_conn, mol_id, data, table_name='mols', commit=True,complex_id=True):
"""

:param db_conn:
Expand All @@ -176,13 +193,17 @@ def update_db(db_conn, mol_id, data, table_name='mols', commit=True):
:return:
"""
if data:
if complex_id and '_' in mol_id:
mol_id, stereo_index = mol_id.rsplit('_',1)
else:
stereo_index = ''
cols, values = zip(*data.items())
db_conn.execute(f"""UPDATE {table_name}
SET {', '.join(['%s = ?'] * len(cols))},
time = CURRENT_TIMESTAMP
WHERE
id = ?
""" % cols, list(values) + [mol_id])
id = ? AND stereo_id = ?
""" % cols, list(values) + [mol_id,stereo_index])
if commit:
db_conn.commit()

Expand Down Expand Up @@ -242,24 +263,27 @@ def select_mols_to_dock(db_conn, table_name='mols', add_sql=None):
smi_field_name = 'smi_protonated' if protonation_status else 'smi'
mol_field_name = 'source_mol_block_protonated' if protonation_status else 'source_mol_block'

sql = f"""SELECT id, {smi_field_name}, {mol_field_name}
sql = f"""SELECT id, stereo_id, {smi_field_name}, {mol_field_name}
FROM {table_name}
WHERE docking_score IS NULL AND
(({smi_field_name} IS NOT NULL AND {smi_field_name != ''}) OR
({mol_field_name} IS NOT NULL AND {mol_field_name != ''})) """
if isinstance(add_sql, str) and add_sql:
sql += add_sql
for mol_id, smi, mol_block in cur.execute(sql):
for mol_id, stereo_index, smi, mol_block in cur.execute(sql):
if mol_block is None:
mol = Chem.MolFromSmiles(smi)
else:
mol = Chem.MolFromMolBlock(mol_block, removeHs=False)
if mol:
mol.SetProp('_Name', mol_id)
if stereo_index == '' or stereo_index == None:
mol.SetProp('_Name', mol_id)
else:
mol.SetProp('_Name', mol_id + '_' + stereo_index)
yield mol


def add_protonation(db_fname, tautomerize=True, table_name='mols', add_sql=''):
def add_protonation(db_fname, tautomerize=True, table_name='mols', add_sql='',complex_id=True):
'''
Protonate SMILES by Chemaxon cxcalc utility to get molecule ionization states at pH 7.4
:param db_fname:
Expand All @@ -274,7 +298,7 @@ def add_protonation(db_fname, tautomerize=True, table_name='mols', add_sql=''):
try:
cur = conn.cursor()

sql = f"""SELECT smi, source_mol_block, id
sql = f"""SELECT smi, source_mol_block, id, stereo_id
FROM {table_name}
WHERE docking_score is NULL AND smi_protonated is NULL """
sql += add_sql
Expand All @@ -286,14 +310,14 @@ def add_protonation(db_fname, tautomerize=True, table_name='mols', add_sql=''):

smi_ids = []
mol_ids = []
for i, (smi, mol_block, mol_name) in enumerate(data_list):
for i, (smi, mol_block, mol_name, stereo_index) in enumerate(data_list):
if mol_block is None:
smi_ids.append(mol_name)
# add missing mol blocks
m = Chem.MolFromSmiles(data_list[i][0])
m.SetProp('_Name', mol_name)
m_block = Chem.MolToMolBlock(m)
data_list[i] = (smi, m_block, mol_name)
data_list[i] = (smi, m_block, mol_name, stereo_index)
else:
mol_ids.append(mol_name)
smi_ids = set(smi_ids)
Expand All @@ -302,19 +326,36 @@ def add_protonation(db_fname, tautomerize=True, table_name='mols', add_sql=''):
output_data_smi = []
output_data_mol = []
with tempfile.NamedTemporaryFile(suffix='.smi', mode='w', encoding='utf-8') as tmp:
fd1, inputs = tempfile.mkstemp()
fd, output = tempfile.mkstemp() # use output file to avoid overflow of stdout in extreme cases
try:
for smi, _, mol_id in data_list:
tmp.write(f'{smi}\t{mol_id}\n')
# for smi, _, mol_id in data_list:
# tmp.write(f'{smi}\t{mol_id}\n')
# tmp.flush()
# cmd_run = ['cxcalc', '-S', '--ignore-error', 'majormicrospecies', '-H', '7.4', '-K',
# f'{"-M" if tautomerize else ""}', tmp.name]
# with open(output, 'w') as file:
# subprocess.run(cmd_run, stdout=file, text=True)
with open(inputs,'w') as input_file:
for smi, _, mol_id, stereo_index in data_list:
input_file.write(f'{smi}\t{mol_id}\t{stereo_index}\n')
tmp.flush()
cmd_run = ['cxcalc', '-S', '--ignore-error', 'majormicrospecies', '-H', '7.4', '-K',
f'{"-M" if tautomerize else ""}', tmp.name]
with open(output, 'w') as file:
subprocess.run(cmd_run, stdout=file, text=True)
for mol in Chem.SDMolSupplier(output, sanitize=False):
cmd_run = ['python','/home/user/miniforge3/envs/easydock/lib/python3.9/site-packages/easydock/dimorphite_dl.py', \
'--smiles_file',inputs,'--output_file',output,'--min_ph','7.3','--max_ph','7.5','--max_variants','1','--silent']
subprocess.run(cmd_run,text=True)

# for mol in Chem.SDMolSupplier(output, sanitize=False):
for index,line in enumerate([x.strip() for x in open(output,'r').readlines()]):
smi = line.split()[0]
mol = Chem.MolFromSmiles(line,sanitize=False)
if mol:
mol_name = mol.GetProp('_Name')
smi = mol.GetPropsAsDict().get('MAJORMS', None)
if complex_id and len(mol.GetProp('_Name').split()) >1:
mol_name, stereo_index = mol.GetProp('_Name').rsplit(maxsplit=1)
else:
mol_name = mol.GetProp('_Name')
stereo_index = ''

# smi = mol.GetPropsAsDict().get('MAJORMS', None)
if smi is not None:
try:
cansmi = Chem.CanonSmiles(smi)
Expand All @@ -323,7 +364,7 @@ def add_protonation(db_fname, tautomerize=True, table_name='mols', add_sql=''):
f'could not be read by RDKit. The molecule was skipped.\n')
continue
if mol_name in smi_ids:
output_data_smi.append((cansmi, mol_name))
output_data_smi.append((cansmi, mol_name, stereo_index))
elif mol_name in mol_ids:
try:
# mol block in chemaxon sdf is an input molecule but with 2d structure
Expand All @@ -338,25 +379,29 @@ def add_protonation(db_fname, tautomerize=True, table_name='mols', add_sql=''):
ref_mol = Chem.RemoveHs(Chem.MolFromSmiles(smi))
mol = AllChem.AssignBondOrdersFromTemplate(ref_mol, mol3d)
Chem.AssignStereochemistryFrom3D(mol) # not sure whether it is necessary
output_data_mol.append((cansmi, Chem.MolToMolBlock(mol), mol_name))
output_data_mol.append((cansmi, Chem.MolToMolBlock(mol), mol_name,stereo_index))
except ValueError:
continue
finally:
os.remove(output)
os.close(fd)
os.remove(inputs)
os.close(fd1)

cur.executemany(f"""UPDATE {table_name}
SET
smi_protonated = ?
WHERE
id = ?
id = ? AND
stereo_id = ?
""", output_data_smi)
cur.executemany(f"""UPDATE {table_name}
SET
smi_protonated = ?,
source_mol_block_protonated = ?
WHERE
id = ?
id = ? AND
stereo_id = ?
""", output_data_mol)
conn.commit()

Expand Down