In [1]:
import os, sys, shutil
import pathlib
import glob as glob
import numpy as np
import re
import warnings
from pdbfixer import PDBFixer
from openmm.app import PDBxFile, PDBFile
from openmm.unit import *
import mdtraj as md

In [2]:
# Optional: uncomment the next line to silence all Python warnings in this notebook.
#warnings.filterwarnings("ignore")

In [3]:
def make_dir(output_path):
    """
    Create a fresh, empty output directory.

    If `output_path` already exists as a directory, it is removed together
    with everything inside it (a notice is printed) and then recreated.
    Missing parent directories are created as needed.
    """
    needs_cleanup = os.path.isdir(output_path)
    if needs_cleanup:
        print(">remove directory")
        shutil.rmtree(output_path)

    target = pathlib.Path(output_path)
    target.mkdir(parents=True, exist_ok=True)

In [4]:
def check_file(files):
    """
    Screen structure files and keep those whose atom count is consistent.

    Each file is loaded with PDBFixer and the number of atoms in the
    resulting topology is compared with the number of lines in the raw file
    that start with ``ATOM`` (other record types are not counted).  A file
    whose counts differ is reported on stdout and renamed on disk to
    ``<file>.warning``; it is not returned.  Files with missing atoms are
    reported but are not excluded for that reason.

    Parameters
    ----------
    files : iterable of str
        Paths of the structure files to check.

    Returns
    -------
    list of str
        Paths whose atom counts agree, in input order.
    """
    curated_files = []

    for file in files:
        basename = os.path.basename(file)

        # Warnings raised while loading are recorded and then discarded.
        with warnings.catch_warnings(record=True):
            warnings.simplefilter("default")
            fixer = PDBFixer(filename=file)

        fixer.findMissingResidues()
        fixer.findMissingAtoms()
        if fixer.missingAtoms:
            print("{}: Missing atoms".format(basename))

        n_atoms = 0
        n_residues = 0
        with open(file, "r") as rf:
            for line in rf:
                if not line.startswith("ATOM"):
                    continue
                n_atoms += 1
                fields = line.split()
                # One "P" in the third field is counted as one residue.
                # NOTE(review): presumably the third field is the element
                # symbol column of the mmCIF atom record -- confirm.
                if len(fields) > 2 and fields[2] == "P":
                    n_residues += 1

        if fixer.topology.getNumAtoms() != n_atoms:
            print("{}: number of atoms does not match (residues: {}->{} / atoms: {}->{})".format(
                basename,
                n_residues, fixer.topology.getNumResidues(),
                n_atoms, fixer.topology.getNumAtoms()))
            # Rename the inconsistent input in place; it keeps its content
            # but no longer ends in ".cif".
            shutil.move(file, file + ".warning")
        else:
            curated_files.append(file)

    return curated_files

In [5]:
def export_file(output_path, file):
    """
    Convert one structure file to PDB and export connected residue triplets.

    The structure is loaded with PDBFixer and written to
    ``<output_path>/dump/<name>.pdb``.  Then, for every residue except the
    first and the last, its external bonds are inspected.  When the first
    two bonds are both shorter than 2 angstroms, the non-hydrogen atoms of
    the three residues they connect are loaded from the dump file with
    mdtraj and saved to ``<output_path>/triplebase/<name>_<seq>_<i>.pdb``,
    where ``<seq>`` is the concatenation of the three residue names and
    ``<i>`` is the index of the middle residue.

    Parameters
    ----------
    output_path : str
        Directory that must already contain ``dump`` and ``triplebase``
        subdirectories.
    file : str
        Path of the input ``.cif`` structure file.

    Warns
    -----
    UserWarning
        When a residue has no external bond, or when a residue with two or
        more external bonds could not be processed.
    """
    basename_noext = os.path.basename(file).split('.cif')[0]
    dumpfile = os.path.join(output_path, "dump", basename_noext + ".pdb")

    # load cif, check connectivity, and split structures
    fixer = PDBFixer(filename=file)

    # save cif as pdb (dump); the handle is closed explicitly so the file is
    # complete on disk before mdtraj reads it back below
    with open(dumpfile, 'w') as wf:
        PDBFile.writeFile(fixer.topology, fixer.positions, wf)
    residues = list(fixer.topology.residues())

    for i in range(1, len(residues) - 1):
        # Example of external_bonds() for an interior residue (residue 1):
        #   [Bond(<Atom 8 (O3') of chain 0 residue 0 (U)>, <Atom 20 (P) of chain 0 residue 1 (G)>),
        #    Bond(<Atom 28 (O3') of chain 0 residue 1 (G)>, <Atom 43 (P) of chain 0 residue 2 (C)>)]
        # and for a residue with a single neighbour (residue 0):
        #   [Bond(<Atom 8 (O3') of chain 0 residue 0 (U)>, <Atom 20 (P) of chain 0 residue 1 (G)>)]
        b = list(residues[i].external_bonds())

        if len(b) == 0:
            warnings.warn("Warning: no connectivity found ({})".format(file))
            warnings.warn("{}".format(b))
            continue
        if len(b) == 1:
            # only one neighbour: no triplet can be built around this residue
            continue

        try:
            dist1 = calc_dist(fixer, b[0])
            dist2 = calc_dist(fixer, b[1])

            # both bonds must be shorter than 2 angstroms to count as connected
            if dist1 < 2 and dist2 < 2:
                # Following the example above: b[0].atom1 belongs to the
                # preceding residue, b[1].atom1 to the current one and
                # b[1].atom2 to the following one.
                # exclude P, OP1, and OP2 to handle as 5' base residue
                indices = [ atom.index for atom in b[0].atom1.residue.atoms() if atom.name not in ["P", "OP1", "OP2"] and atom.element.symbol != "H" ]
                indices += [ atom.index for atom in b[1].atom1.residue.atoms() if atom.element.symbol != "H" ]
                indices += [ atom.index for atom in b[1].atom2.residue.atoms() if atom.element.symbol != "H" ]

                rna_seq = b[0].atom1.residue.name + b[1].atom1.residue.name + b[1].atom2.residue.name
                outfile = basename_noext + "_" + rna_seq + "_" + str(i) + ".pdb"

                traj = md.load_pdb(dumpfile, atom_indices=indices)
                traj.save_pdb(os.path.join(output_path, "triplebase", outfile))
        except Exception as err:
            # Best effort: report the failure and continue with the next residue.
            warnings.warn("Warning: two connectivity found but could not process ({})".format(file))
            warnings.warn("{}".format(b))
            warnings.warn("{!r}".format(err))
            
                

In [6]:
def calc_dist(fixer, b):
    """
    Return the distance between the two atoms of bond `b`, in angstroms.

    The atom positions are looked up in `fixer.positions` by atom index.
    `angstroms` and `sqrt` are not defined in this cell; they are expected
    to come from the notebook's ``from openmm.unit import *``.
    """
    first = fixer.positions[b.atom1.index]
    second = fixer.positions[b.atom2.index]
    delta = (first - second).value_in_unit(angstroms)

    squared = 0
    for component in delta:
        squared += component**2

    return sqrt(squared)

In [7]:
if __name__ == "__main__":
    # Resolve the project root from the current working directory: when the
    # notebook runs inside a "notebooks" directory the root is its parent,
    # otherwise the working directory itself is used.  (str.strip() removes a
    # set of characters, not a suffix, so it must not be used for this.)
    cwd = os.getcwd()
    if os.path.basename(cwd) == "notebooks":
        base_path = os.path.dirname(cwd)
    else:
        base_path = cwd
    release_versions = [ "HairpinLoopMotifAtlasRelease3.57", "InternalLoopMotifAtlasRelease3.57", "nrlist_3.233_2.5A/JunctionLoop" ]

    for release_version in release_versions:
        output_path = os.path.join(base_path, "pdb", "motif", release_version)

        # create directory (any existing output for this release is removed)
        make_dir(output_path)
        make_dir(os.path.join(output_path, "dump"))
        make_dir(os.path.join(output_path, "triplebase"))

        # sorted() makes the processing order reproducible between runs
        files = sorted(glob.glob(os.path.join(base_path, "data", release_version, "*.cif")))
        curated_files = check_file(files)

        for file in curated_files:
            export_file(output_path, file)

### Check warnings: inspect the external bonds of a single file

In [None]:
# File to inspect, located relative to the project root (`base_path` is set
# by the driver cell above) instead of a user-specific absolute path.
f = os.path.join(base_path, "data", "HairpinLoopMotifAtlasRelease3.57", "HL_01181.4.cif")

In [None]:
# Load the structure selected in `f` and collect its residues in a list.
fixer = PDBFixer(filename=f)
residues = list(fixer.topology.residues())

In [None]:
# For every residue except the first and last, print its index, the number
# of external bonds, and the bonds themselves.
for i in range(1, len(residues) - 1):
    b = list(residues[i].external_bonds())
    print(i, len(b), b)

In [None]:
# Show the full list of residues loaded above as the cell output.
residues