In [1]:
import os, sys, shutil
import pathlib
import glob as glob
import numpy as np
import itertools
import re
import warnings
from pdbfixer import PDBFixer
from openmm.app import PDBxFile, PDBFile
from openmm.unit import *
import mdtraj as md

In [2]:
def make_dir(output_path):
    """
    Create a fresh, empty output directory.

    If a directory already exists at ``output_path`` it is deleted together
    with its contents, then the directory (and any missing parents) is
    created again.
    """
    target = pathlib.Path(output_path)
    already_present = os.path.isdir(target)

    if already_present:
        print(">remove directory")
        shutil.rmtree(target)

    target.mkdir(parents=True, exist_ok=True)

In [3]:
def check_file(files):
    """
    Screen PDB files and return those that raised none of the checked issues.

    Each file is loaded with PDBFixer while warnings are recorded. A file is
    left out of the result when PDBFixer reports missing atoms, or when a
    warning containing "duplicate atom" was recorded during loading.

    For a file with missing atoms, the atoms are added, the original file is
    renamed to ``<file>.missingAtoms`` and the repaired structure is written
    to the original path. The file is still left out of the returned list.

    Parameters
    ----------
    files : iterable of str
        Paths of the PDB files to check.

    Returns
    -------
    list of str
        Paths that triggered neither check, in iteration order.
    """
    curated_files = []

    for file in files:
        n_warnings = 0
        basename = os.path.basename(file)

        # Record warnings raised while parsing; they are inspected further down.
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("default")
            fixer = PDBFixer(filename=file)

        # check missing atoms
        fixer.findMissingResidues()
        fixer.findMissingAtoms()
        if fixer.missingAtoms:
            n_warnings = 1
            print("{}: Missing atoms\n{}".format(basename, fixer.missingAtoms))

            fixer.addMissingAtoms()
            shutil.move(file, file + ".missingAtoms")
            # Use a context manager so the repaired file is flushed and closed
            # deterministically instead of relying on garbage collection.
            with open(file, "w") as repaired:
                PDBFile.writeFile(fixer.topology, fixer.positions, repaired)

        # check duplicate atoms (same residue ID)
        duplicate_warnings = [ rec for rec in w if "duplicate atom" in str(rec) ]
        if duplicate_warnings:
            n_warnings = 2
            print("{}: Duplicate atoms -> Renumber residue ID".format(basename))

        if n_warnings == 0:
            curated_files.append(file)

    return curated_files

In [4]:
def export_file(output_path, file):
    """
    Export a heavy-atom copy of ``file`` without P, OP1 and OP2 atoms.

    Steps:
      1. Load ``file`` with PDBFixer and write it unchanged to
         ``<output_path>/dump/<basename>``.
      2. Compute the distance (angstroms, via ``calc_dist``) between every
         pair of atoms named O3' or P. If every distance is below 2, report
         the file as possibly connected and write nothing else.
      3. Otherwise reload the dump with mdtraj keeping only non-hydrogen atoms
         other than P/OP1/OP2, save it to ``<output_path>/<basename>`` and
         insert TER records when the chain count differs from the residue
         count.

    Parameters
    ----------
    output_path : str
        Directory receiving the exported file; must contain a ``dump``
        sub-directory.
    file : str
        Path of the input PDB file.
    """
    basename = os.path.basename(file)
    dumpfile = os.path.join(output_path, "dump", basename)
    outfile = os.path.join(output_path, basename)

    # load pdb, check connectivity, and handle 5'
    fixer = PDBFixer(filename=file)

    # The dump is read back by mdtraj below, so it must be flushed and closed
    # before then; the context manager guarantees that.
    with open(dumpfile, 'w') as dump_handle:
        PDBFile.writeFile(fixer.topology, fixer.positions, dump_handle)

    atom_indices = [ atom.index for atom in fixer.topology.atoms() if atom.name in ["O3'", "P"] ]
    d = [ calc_dist(fixer, pair) for pair in itertools.combinations(atom_indices, 2) ]

    # `d` is empty when fewer than two O3'/P atoms exist; there is then nothing
    # to be connected, so fall through to the export instead of letting max()
    # raise ValueError.
    # NOTE(review): max(d) < 2 requires *every* O3'/P pair to be within 2 A.
    # If the intent is "any O3'-P pair is bonded", min() over cross-residue
    # O3'/P pairs may be what is wanted -- confirm before changing.
    if d and max(d) < 2:
        print("{} maybe connected".format(basename))
        return

    # exclude P, OP1, and OP2 to handle as 5' base residue
    indices = [
        atom.index
        for atom in fixer.topology.atoms()
        if atom.name not in ["P", "OP1", "OP2"] and atom.element.symbol != "H"
    ]

    traj = md.load_pdb(dumpfile, atom_indices=indices)
    traj.save_pdb(outfile)

    # insert TER
    t = md.load(outfile)
    if t.topology.n_chains != t.topology.n_residues:
        insert_ter(t, outfile)

In [5]:
def insert_ter(t, f):
    """
    Rewrite the PDB file ``f`` in place with TER records between residues.

    The atom name of the first ATOM record is used as the marker for the start
    of a residue: every later ATOM record with that same atom name is preceded
    by a ``TER`` line. TER lines already present in the file are dropped, and
    all other lines are copied unchanged.

    Parameters
    ----------
    t : object
        Unused; kept so existing callers keep working.
    f : str
        Path of the PDB file to rewrite.
    """
    ref = None
    arr = []

    # read
    with open(f, "r") as rf:
        for l in rf:
            if l.startswith("ATOM"):
                # PDB records are fixed-column: the atom name occupies columns
                # 13-16. Slicing (rather than whitespace splitting) stays
                # correct when a neighbouring field such as the alt-loc
                # indicator abuts the name, and cannot raise on short lines.
                aname = l[12:16].strip()
                if ref is None:
                    # First ATOM record defines the marker; never gets a TER.
                    ref = aname
                elif aname == ref:
                    arr.append("TER\n")
                arr.append(l)
            elif l.startswith("TER"):
                # Drop pre-existing TER records; they are regenerated above.
                continue
            else:
                arr.append(l)

    # write
    with open(f, "w") as wf:
        wf.writelines(arr)

In [6]:
def calc_dist(fixer, i):
    """
    Return the distance between two atoms of ``fixer``, in angstroms.

    ``i`` is a pair of atom indices into ``fixer.positions``; the result is
    the Euclidean norm of the difference of the two positions after conversion
    to angstroms.
    """
    first, second = i[0], i[1]
    delta = (fixer.positions[first] - fixer.positions[second]).value_in_unit(angstroms)
    squared_norm = sum(component**2 for component in delta)

    return sqrt(squared_norm)

In [7]:
if __name__ == "__main__":
    # Project root: the parent of the working directory when running from a
    # "notebooks" directory, otherwise the working directory itself.
    # (str.strip('notebooks') removes any of those *characters* from both ends
    # of the path rather than the "notebooks" suffix, so it corrupts paths that
    # merely end in one of those letters.)
    cwd = pathlib.Path.cwd()
    base_path = str(cwd.parent if cwd.name == "notebooks" else cwd)
    release_versions = [ "bpcatalog", "triplebase" ]

    for release_version in release_versions:
        # create directory (both calls wipe any previous content)
        output_path = os.path.join(base_path, "pdb", release_version)
        make_dir(output_path)
        make_dir(os.path.join(output_path, "dump"))

        # Build the pattern from path components so it does not depend on a
        # trailing separator in base_path; sort for a reproducible run order.
        pattern = os.path.join(base_path, "data", release_version, "*", "*.pdb")
        files = sorted(glob.glob(pattern))

        curated_files = check_file(files)

        for file in curated_files:
            export_file(output_path, file)