In [1]:
import sys
sys.path.append("../")
from multiprocessing import Pool
import tqdm
import pickle
from pathlib import Path
from contextlib import closing
from rdkit import Chem
import rdkit
print(rdkit.__version__)

2021.03.3


In [2]:
from phenixml.fragmentation.fragmenter_restraints import BondFragmenter, AngleFragmenter

## Load openbabel .err files to see which files were converted to mol2 successfuly

In [3]:
%%time
filtered_dir = Path("/dev/shm/cschlick/COD_filtered_converted/")
err_files = [path for path in filtered_dir.glob("**/*") if path.suffix == ".err"] # all err files

CPU times: user 2.1 s, sys: 533 ms, total: 2.64 s
Wall time: 2.64 s


In [4]:
%%time
success_converted = [] # successfully converted err files
for err_file in err_files:
  with err_file.open("r") as fh:
    lines = fh.readlines()
    if len(lines)==1 and "1 molecule converted" in lines[0]:
      success_converted.append(Path(err_file.parent,err_file.stem+".mol2"))
print("Success:",len(success_converted))

Success: 65365
CPU times: user 2.38 s, sys: 391 ms, total: 2.77 s
Wall time: 2.78 s


In [5]:
# elements and parameters
elements_considered = ["O","C","H","N","P","S","Cl","B","F","I","Br"]
# I only converted files in this directory that were of these elements,
# but we will verify that when reading

## Load .mol2 files as RDKit molecules

In [6]:
def worker(mol2_file):
  # read mol2file
  sio = sys.stderr = StringIO()
  rdmol = Chem.MolFromMol2File(mol2_file.as_posix(),sanitize=True,removeHs=False,cleanupSubstructures=True)
  if rdmol is None:
    rdmol = Chem.MolFromMol2File(mol2_file.as_posix(),sanitize=False,removeHs=False,cleanupSubstructures=True)
  
  results = {"filepath":mol2_file,"rdmol":rdmol,"err":sio.getvalue()}
  return results

In [7]:
from rdkit import RDLogger
RDLogger.EnableLog('rdApp.*') 
from io import StringIO
work = success_converted
with closing(Pool(processes=32)) as pool:
  results = []
  for result in tqdm.tqdm(pool.imap_unordered(worker, work), total=len(work)):
      results.append(result)
  pool.terminate()

  0%|                                                                                                                                                          | 0/65365 [00:00<?, ?it/s][16:14:48] Can't kekulize mol.  Unkekulized atoms: 9 10 29 30

[16:14:48] Can't kekulize mol.  Unkekulized atoms: 8 9 10 11 12 13 14 15 16

[16:14:48] Can't kekulize mol.  Unkekulized atoms: 13 15 16 18 19 20 22 23 42

[16:14:48] Can't kekulize mol.  Unkekulized atoms: 0 12 13 15 16 17 19 20 22

[16:14:48] Can't kekulize mol.  Unkekulized atoms: 0 2 4 6 19

[16:14:48] Can't kekulize mol.  Unkekulized atoms: 11 12 18 20 22 26 27 28 29 32 34 35 39 41 45 51 53

[16:14:48] Can't kekulize mol.  Unkekulized atoms: 7 10 16 18 19 20 22 29 32 33 41 52 53 55 66 68 71

[16:14:48] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 5 6 7 9 11 13 15 16 18

[16:14:48] Can't kekulize mol.  Unkekulized atoms: 0 1 2 23 60

[16:14:48] Can't kekulize mol.  Unkekulized atoms: 0 1 2 4 6 8 10 11 12 13 14 16 18 20 22 23 24

[16:14

In [11]:
success_initialized = []
failed_initialized = []
have_errors = []
for result in results:
  if result["rdmol"] is not None:
    success_initialized.append(result)
  else:
    failed_initialized.append(result)
  if len(result["err"])>0:
    have_errors.append(result)
print("Success initialized:",len(success_initialized))
print("Failed initialized:",len(failed_initialized))
print("Have errors:",len(have_errors))

Success initialized: 65365
Failed initialized: 0
Have errors: 11202


In [12]:
kekulize_errors = []
other_errors = []
for result in have_errors:
  if "Can't kekulize mol" in result["err"]:
    kekulize_errors.append(result)
  else:
    other_errors.append(result)
    

# Fragment molecules on all bonds and angles

In [13]:
angle_fragmenter = AngleFragmenter(exclude_symbols=["H"])
bond_fragmenter = BondFragmenter(exclude_symbols="H")

In [14]:
from phenixml.featurizers.bond_angle_tokenizer import BondTokenizer, AngleTokenizer, bond_order
from phenixml.labelizers.bonds_angles import AngleFragLabeler

In [15]:
angle_tokenizer = AngleTokenizer(ignore_bond_type=True)
bond_tokenizer = BondTokenizer(ignore_bond_type=True)
angle_labeler = AngleFragLabeler()

def worker(work_dict):
  rdmol = work_dict["rdmol"]
  angle_fragments = angle_fragmenter.fragment(rdmol)
  angle_fragments = [frag for frag in angle_fragments if "H" not in frag.atom_symbols]
  angle_tokens = ["".join(angle_tokenizer.featurize(frag)) for frag in angle_fragments]
  bond_fragments = bond_fragmenter.fragment(rdmol)
  bond_fragments = [frag for frag in bond_fragments if "H" not in frag.atom_symbols]
  bond_tokens = ["".join(bond_tokenizer.featurize(frag)) for frag in bond_fragments]
    
  work_dict["angle_fragments"]=[]
  for i,frag in enumerate(angle_fragments):
    if angle_tokens[i] in ["CNO"]:
      work_dict["angle_fragments"].append(frag)
      

  work_dict["bond_fragments"]=[]
  
  
  # labels
  for frag in work_dict["angle_fragments"]:
    frag.properties["angle_deg"] = angle_labeler.labelize(frag)
    frag.properties["filepath"] = work_dict["filepath"]
  
  return work_dict

In [16]:
work = success_initialized
with closing(Pool(processes=32)) as pool:
  results = []
  for result in tqdm.tqdm(pool.map(worker, work), total=len(work)):
      results.append(result)
  pool.terminate()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 65365/65365 [00:00<00:00, 2867639.57it/s]


In [17]:
%%time
fragments_path = Path("/dev/shm/cschlick/COD_filtered_converted/fragmentation_results.pkl")
with fragments_path.open("wb") as fh:
  pickle.dump(results,fh)

CPU times: user 5.37 s, sys: 983 ms, total: 6.35 s
Wall time: 6.37 s


## Stop

In [28]:
results

[{'filepath': PosixPath('/net/cci/cschlick/Filtered_COD3/1/01/1520162/1520162.mol2'),
  'rdmol': <rdkit.Chem.rdchem.Mol at 0x7f1e56f0c770>,
  'err': '',
  'angle_fragments': [],
  'bond_fragments': [<phenixml.fragmentation.fragments_base.Fragment at 0x7f1e58819490>,
   <phenixml.fragmentation.fragments_base.Fragment at 0x7f1e58819bb0>,
   <phenixml.fragmentation.fragments_base.Fragment at 0x7f1e58819b20>,
   <phenixml.fragmentation.fragments_base.Fragment at 0x7f1e58819880>,
   <phenixml.fragmentation.fragments_base.Fragment at 0x7f1e58819d60>,
   <phenixml.fragmentation.fragments_base.Fragment at 0x7f1e58819400>,
   <phenixml.fragmentation.fragments_base.Fragment at 0x7f1e58819dc0>,
   <phenixml.fragmentation.fragments_base.Fragment at 0x7f1e588195e0>,
   <phenixml.fragmentation.fragments_base.Fragment at 0x7f1e58819700>]},
 {'filepath': PosixPath('/net/cci/cschlick/Filtered_COD3/1/01/1100172/1100172.mol2'),
  'rdmol': <rdkit.Chem.rdchem.Mol at 0x7f1e56f0c130>,
  'err': '',
  'angle_f