In [1]:
import sys
sys.path.append("../")
from multiprocessing import Pool
import tqdm
import pickle
from pathlib import Path
from contextlib import closing
from rdkit import Chem

In [3]:
from phenixml.fragmentation.fragmenter_restraints import BondFragmenter, AngleFragmenter

## Load openbabel .err files to see which files were converted to mol2 successfuly

In [5]:
%%time
filtered_dir = Path("/net/cci/cschlick/Filtered_COD3/")
err_files = [path for path in filtered_dir.glob("**/*") if path.suffix == ".err"] # all err files

CPU times: user 3.63 s, sys: 7.36 s, total: 11 s
Wall time: 1min 14s


In [6]:
%%time
success_converted = [] # successfully converted err files
for err_file in err_files:
  with err_file.open("r") as fh:
    lines = fh.readlines()
    if len(lines)=elementsand "1 molecule converted" in lines[0]:
      success_converted.append(Path(err_file.parent,err_file.stem+".mol2"))
print("Success:",len(success_converted))

Success: 74249
CPU times: user 3.19 s, sys: 3.6 s, total: 6.79 s
Wall time: 1min 4s


In [11]:
# elements and parameters
elements_considered = ["O","C","H","N","P","S","Cl","B","F","I","Br"]
# I only converted files in this directory that were of these elements,
# but we will verify that when reading

## Load .mol2 files as RDKit molecules

In [16]:
def worker(mol2_file):
  # read mol2file
  rdmol = Chem.MolFromMol2File(mol2_file.as_posix(),removeHs=False)
  
  if rdmol is not None:
    elements = [atom.GetSymbol() for atom in rdmol.GetAtoms()]
    if not set(elements).issubset(elements_considered):
      rdmol = None
  results = {"filepath":mol2_file,"rdmol":rdmol}
  return results

In [17]:
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*') 

work = success_converted
with closing(Pool(processes=32)) as pool:
  results = []
  for result in tqdm.tqdm(pool.map(worker, work), total=len(work)):
      results.append(result)
  pool.terminate()

100%|██████████| 74249/74249 [00:00<00:00, 2907233.74it/s]


In [18]:
success_initialized = []
failed_initialized = []
for result in results:
  if result["rdmol"] is not None:
    success_initialized.append(result)
  else:
    failed_initialized.append(result)
print("Success initialized:",len(success_initialized))
print("Failed initialized:",len(failed_initialized))

Success initialized: 63018
Failed initialized: 11231


# Fragment molecules on all bonds and angles

In [19]:
angle_fragmenter = AngleFragmenter()
bond_fragmenter = BondFragmenter()

In [20]:
def worker(work_dict):
  rdmol = work_dict["rdmol"]
  angle_fragments = angle_fragmenter.fragment(rdmol)
  angle_fragments = [frag for frag in angle_fragments if "H" not in frag.atom_symbols]

  
  bond_fragments = bond_fragmenter.fragment(rdmol)
  bond_fragments = [frag for frag in bond_fragments if "H" not in frag.atom_symbols]
  
  for frag in angle_fragments:
    frag.properties["filepath"]=work_dict["filepath"]
    conf = frag.rdmol.GetConformer()
    i,j,k = frag.atom_indices
    angle_deg = Chem.rdMolTransforms.GetAngleDeg(conf,i,j,k)
    frag.properties["angle_deg"] = angle_deg

    
  for frag in bond_fragments:
    frag.properties["filepath"]=work_dict["filepath"]
    
    conf = frag.rdmol.GetConformer()
    i,j = frag.atom_indices
    bond_length = Chem.rdMolTransforms.GetBondLength(conf,i,j)
    frag.properties["bond_length"] = bond_length
    
  work_dict["angle_fragments"]=angle_fragments
  work_dict["bond_fragments"]=bond_fragments
  return work_dict

In [21]:
work = success_initialized
with closing(Pool(processes=32)) as pool:
  results = []
  for result in tqdm.tqdm(pool.map(worker, work), total=len(work)):
      results.append(result)
  pool.terminate()

100%|██████████| 63018/63018 [00:00<00:00, 1123757.06it/s]


In [22]:
angle_fragments = []
bond_fragments = []
for result in results:
  angle_fragments+=result["angle_fragments"]
  bond_fragments+=result["bond_fragments"]

In [24]:
%%time
fragments_path = Path("/net/cci/cschlick/Filtered_COD3/fragmentation_results.pkl")
with fragments_path.open("wb") as fh:
  pickle.dump(results,fh)

CPU times: user 14.4 s, sys: 838 ms, total: 15.2 s
Wall time: 18.7 s


## Stop