In [1]:
import os, sys, shutil
import pathlib
import glob as glob
import numpy as np
import re
import warnings
from pdbfixer import PDBFixer
from openmm.app import PDBxFile, PDBFile

In [2]:
#warnings.filterwarnings("ignore")

In [3]:
def replace_string(l, chain, new_chain, rid, new_rid):
    """Rewrite the chain-ID / residue-number field of a single PDB line.

    The field searched for is ``chain`` immediately followed by ``rid``
    right-justified to four characters (the residue-number width of a PDB
    ATOM record).  It is replaced by ``new_chain`` followed by ``new_rid``
    right-justified to the same width, so the columns that follow keep their
    positions even when the number of digits changes (e.g. 9 -> 10).

    Parameters
    ----------
    l : str
        The line to edit.
    chain : str
        Chain identifier currently present in the line.
    new_chain : str
        Chain identifier to write.
    rid : str
        Residue number currently present in the line.
    new_rid : str or int
        Residue number to write.

    Returns
    -------
    str
        The edited line, or ``l`` unchanged when the field is not found.

    Raises
    ------
    ValueError
        If ``rid`` is empty.
    """
    rid = str(rid)
    if not rid:
        raise ValueError("rid must be a non-empty residue number")

    # Residue numbers wider than four characters are matched as-is
    # (rjust never truncates).
    old_field = "{}{}".format(chain, rid.rjust(4))
    new_field = "{}{}".format(new_chain, str(new_rid).rjust(4))

    # Replace only the first (left-most) match: the same character sequence
    # can reappear later in the line, e.g. inside the coordinate columns when
    # the chain ID is a digit.
    return l.replace(old_field, new_field, 1)

In [4]:
def renumber_resid(f):
    """Renumber the residues of the ATOM records in PDB file ``f``.

    The file is read line by line.  Lines that do not start with ``ATOM``
    are kept verbatim.  For every ``ATOM`` line:

    * the chain ID and residue number are taken from the whitespace-split
      fields; a five-character fifth field (e.g. ``C1983``) is treated as a
      chain ID fused with a four-digit residue number;
    * a purely numeric chain ID is rewritten as ``X``;
    * each atom named ``P`` starts a new residue number equal to its own
      residue number plus the count of ``P`` atoms seen before it; the
      following non-``P`` atoms reuse that number (atoms preceding the first
      ``P`` atom are written with residue number 0).

    The original file is moved to ``f + ".duplicateAtoms"`` and the edited
    lines are written back to ``f``.
    """
    rewritten = []
    n_phosphates = 0
    new_rid = 0

    with open(f, "r") as handle:
        for line in handle:
            if not line.startswith('ATOM'):
                rewritten.append(line)
                continue

            fields = line.strip('\n').split()
            atom_name = str(fields[2])

            # handle four digit resid (e.g. C1983)
            token = str(fields[4])
            if len(token) == 5:
                chain, rid = token[:1], token[1:]
            else:
                chain, rid = token, str(fields[5])

            # convert numeric chain to alphabet
            new_chain = "X" if chain.isnumeric() else chain

            # a phosphorus atom marks the start of the next residue
            if atom_name == 'P':
                new_rid = str(int(rid) + n_phosphates)
                n_phosphates += 1

            rewritten.append(replace_string(line, chain, new_chain, rid, new_rid))

    # export pdb
    shutil.move(f, f + ".duplicateAtoms")
    with open(f, "w") as out:
        out.writelines(rewritten)

In [5]:
def check_pdb(f):
    """Screen one PDB file for missing atoms and duplicate-atom warnings.

    The file is loaded with ``PDBFixer`` while warnings are recorded.

    * If ``PDBFixer`` reports missing atoms, the file is moved to
      ``f + ".missingAtoms"`` and nothing else is done with it.
    * Otherwise, if any recorded warning mentions ``"duplicate atom"``,
      the file is passed to ``renumber_resid``.

    A one-line message is printed for each action taken; files without
    findings produce no output.

    Parameters
    ----------
    f : str
        Path to the PDB file to check.
    """
    basename = os.path.basename(f)

    # Capture the warnings emitted while the structure is parsed so they can
    # be inspected afterwards instead of being printed.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("default")
        fixer = PDBFixer(filename=f)

    has_duplicates = any("duplicate atom" in str(entry.message) for entry in caught)

    # check missing atoms
    fixer.findMissingResidues()
    fixer.findMissingAtoms()
    if fixer.missingAtoms:
        print("{}: Missing atoms -> Will be removed in the next step".format(basename))
        shutil.move(f, f + ".missingAtoms")
        # The file no longer exists under its original name, so renumbering
        # it would fail with FileNotFoundError; stop here.
        return

    # check duplicate atoms (same residue ID)
    if has_duplicates:
        print("{}: Duplicate atoms -> Renumber residue ID".format(basename))
        renumber_resid(f)

In [6]:
def resolve_base_path(cwd):
    """Return the project root for the working directory ``cwd``.

    If ``cwd`` is a directory named ``notebooks`` its parent directory is
    returned; otherwise ``cwd`` itself is returned.
    """
    # Compare whole path components.  str.strip('notebooks') would instead
    # remove any of the characters n/o/t/e/b/k/s from both ends of the path
    # (e.g. ".../project" -> ".../projec").
    if os.path.basename(cwd) == "notebooks":
        return os.path.dirname(cwd)
    return cwd


if __name__ == "__main__":
    # The notebook has no __file__, so the project root is located from the
    # current working directory.  os.path.join(..., "") keeps the trailing
    # separator so base_path can still be used by plain concatenation.
    base_path = os.path.join(resolve_base_path(os.getcwd()), "")

    files_for_bpcatalog = glob.glob(os.path.join(base_path, "pdb/bpcatalog/*.pdb"))
    files_for_triplebase = glob.glob(os.path.join(base_path, "pdb/triplebase/*/*.pdb"))
    files_for_hairpins = glob.glob(os.path.join(base_path, "pdb/motif/HairpinLoopMotifAtlasRelease3.57/*/*.pdb"))
    files_for_internals = glob.glob(os.path.join(base_path, "pdb/motif/InternalLoopMotifAtlasRelease3.57/*/*.pdb"))
    files_for_junctions = glob.glob(os.path.join(base_path, "pdb/motif/nrlist_3.233_2.5A/JunctionLoop/*/*.pdb"))

    files = files_for_bpcatalog + files_for_triplebase + files_for_hairpins + files_for_internals + files_for_junctions

    # Screen every collected structure; check_pdb moves or rewrites the
    # files it flags.
    for f in files:
        check_pdb(f)

BP_cWS_AC.pdb: Missing atoms -> Will be removed in the next step
BP_cWH_GG.pdb: Duplicate atoms -> Renumber residue ID
BP_tWH_CG.pdb: Missing atoms -> Will be removed in the next step
BP_tWS_AG.pdb: Missing atoms -> Will be removed in the next step
BP_cWH_UU.pdb: Duplicate atoms -> Renumber residue ID
Triple_tSW_tHS_AAC.pdb: Duplicate atoms -> Renumber residue ID
Triple_tSW_tSW_AAC.pdb: Duplicate atoms -> Renumber residue ID
Triple_tSW_tSH_AAC.pdb: Duplicate atoms -> Renumber residue ID
Triple_tSW_cSS_AAC.pdb: Duplicate atoms -> Renumber residue ID
Triple_tSW_cHS_AAC.pdb: Duplicate atoms -> Renumber residue ID
Triple_tSW_tHH_AAC.pdb: Duplicate atoms -> Renumber residue ID
Triple_tSW_cSW_AAC.pdb: Duplicate atoms -> Renumber residue ID
Triple_tSW_tHW_AAC.pdb: Duplicate atoms -> Renumber residue ID
Triple_tSW_cSH_AAC.pdb: Duplicate atoms -> Renumber residue ID
Triple_tSW_tSW_UCC.pdb: Duplicate atoms -> Renumber residue ID
Triple_tHW_tHS_GCC.pdb: Duplicate atoms -> Renumber residue ID
Trip