In [1]:
import os, sys, shutil
import pathlib
import glob as glob
import numpy as np
import re
import warnings
from pdbfixer import PDBFixer
from openmm.app import PDBxFile, PDBFile

In [2]:
#warnings.filterwarnings("ignore")

In [3]:
def replace_string(l, chain, new_chain, rid, new_rid):
    """Swap the chain ID / residue number field of a PDB record line.

    PDB ATOM records keep the chain ID in column 22 and the residue number
    right-justified in columns 23-26.  Both the old and the new field are
    therefore built as ``chain`` followed by the residue ID padded to four
    characters, so the record keeps its width even when the new residue ID has
    a different number of digits than the old one (e.g. 9 -> 10).

    Parameters
    ----------
    l : str
        The record line to rewrite.
    chain, new_chain : str
        Chain ID to look for and the chain ID to write instead.
    rid, new_rid : str or int
        Residue ID to look for and the residue ID to write instead.

    Returns
    -------
    str
        ``l`` with every occurrence of the old field replaced; ``l`` unchanged
        if the old field does not occur.

    Raises
    ------
    ValueError
        If ``rid`` is empty.
    """
    old_rid = str(rid)
    if not old_rid:
        raise ValueError("residue ID must not be empty")

    # rjust(4) leaves IDs of four or more characters untouched, which matches
    # the fused "C1983" layout of large residue numbers.
    old_field = "{}{}".format(chain, old_rid.rjust(4))
    new_field = "{}{}".format(new_chain, str(new_rid).rjust(4))
    return l.replace(old_field, new_field)

In [4]:
def renumber_resid(f):
    """Renumber the residues of a PDB file in place so that IDs stop colliding.

    Every ``ATOM`` record whose atom name is ``P`` starts a new residue: its
    new residue ID is its own ID plus the number of ``P`` atoms seen before it,
    and the records that follow reuse that ID until the next ``P``.  Numeric
    chain IDs are rewritten to ``X``.  Lines that are not ``ATOM`` records are
    copied through untouched.

    The original file is kept as ``<f>.duplicateAtoms`` and the rewritten
    records are stored under the original name.
    """
    rewritten = []
    n_phosphates = 0
    current_rid = 0  # applied to any atom that precedes the first P atom
    with open(f, "r") as src:
        for record in src:
            if not record.startswith('ATOM'):
                rewritten.append(record)
                continue

            fields = record.split()
            atom_name = fields[2]

            # a four digit resid leaves no gap after the chain ID, so both
            # arrive fused in a single field (e.g. C1983)
            chain_field = fields[4]
            if len(chain_field) == 5:
                chain, rid = chain_field[:1], chain_field[1:]
            else:
                chain, rid = chain_field, fields[5]

            # convert numeric chain to alphabet
            new_chain = "X" if chain.isnumeric() else chain

            # a P atom opens the next residue and shifts the numbering
            if atom_name == 'P':
                current_rid = str(int(rid) + n_phosphates)
                n_phosphates += 1

            rewritten.append(replace_string(record, chain, new_chain, rid, current_rid))

    # export pdb
    shutil.move(f, f + ".duplicateAtoms")
    with open(f, "w") as dst:
        dst.writelines(rewritten)

In [5]:
def check_pdb(file):
    """Load a PDB file with PDBFixer and flag missing or duplicate atoms.

    * Missing atoms: a message is printed and the file is copied to
      ``<file>.missingAtoms``.
    * Duplicate atoms (reported as a warning while loading): a message is
      printed and ``renumber_resid`` rewrites the file.
    * Any other failure while loading or checking: a message is printed and the
      file, if it is still in place, is renamed to ``<file>.error``.

    Parameters
    ----------
    file : str
        Path of the PDB file to check.
    """
    basename = os.path.basename(file)

    try:
        # Record the warnings raised while parsing so that duplicate atoms can
        # be detected afterwards.
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("default")
            fixer = PDBFixer(filename=file)

        # check missing atoms
        fixer.findMissingResidues()
        fixer.findMissingAtoms()
        if fixer.missingAtoms:
            print("{}: Missing atoms -> Will be removed in the next step".format(basename))
            shutil.copy(file, file + ".missingAtoms")

        # check duplicate atoms (same residue ID); match on the warning text
        # itself rather than on the whole recorded entry
        if any("duplicate atom" in str(entry.message) for entry in caught):
            print("{}: Duplicate atoms -> Renumber residue ID".format(basename))
            renumber_resid(file)

    except Exception:
        # Exception (not a bare except) so that interrupting the kernel does
        # not rename a perfectly good file to ".error".
        print("{}: Could not load file".format(basename))
        # renumber_resid may already have moved the file before failing
        if os.path.exists(file):
            shutil.move(file, file + ".error")

In [6]:
def check_cif(file):
    """Load an mmCIF file with PDBFixer and convert it to PDB if it is complete.

    The number of ``ATOM`` lines in the file is compared with the number of
    atoms in the topology PDBFixer built from it.  When they differ, a message
    is printed and the file is renamed to ``<file>.warning``.  Otherwise the
    structure is written next to the input with the extension replaced by
    ``.pdb``.

    Parameters
    ----------
    file : str
        Path of the mmCIF file to check.
    """
    basename = os.path.basename(file)

    # Warnings raised while parsing are recorded and then dropped.
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("default")
        fixer = PDBFixer(filename=file)

    # Missing-atom detection (findMissingResidues / findMissingAtoms) is not
    # run here: it seems to only work for pdb input.

    # Count atoms from the raw file; every atom whose third field is "P" is
    # counted as one residue.
    n_atoms = 0
    n_residues = 0
    with open(file, "r") as rf:
        for line in rf:
            if not line.startswith("ATOM"):
                continue
            n_atoms += 1
            if line.split()[2] == "P":
                n_residues += 1

    if fixer.topology.getNumAtoms() != n_atoms:
        print("{}: number of atoms does not match (residues: {}->{} / atoms: {}->{})".format(
            basename,
            n_residues, fixer.topology.getNumResidues(),
            n_atoms, fixer.topology.getNumAtoms()))
        # set the file aside
        shutil.move(file, file + ".warning")
    else:
        # export pdb; splitext only touches the final extension, so a ".cif"
        # elsewhere in the path (e.g. in a directory name) is left alone
        outfile = os.path.splitext(file)[0] + ".pdb"
        with open(outfile, "w") as wf:
            PDBFile.writeFile(fixer.topology, fixer.positions, wf)

In [7]:
if __name__ == "__main__":
    # __file__ is not defined inside a notebook, so the working directory
    # stands in for the notebook's location.
    notebook_dir = os.getcwd()

    # The data lives next to the "notebooks" directory.  Compare the directory
    # name explicitly: str.strip('notebooks') removes any of those
    # *characters* from both ends and mangles other paths.
    if os.path.basename(notebook_dir) == "notebooks":
        project_root = os.path.dirname(notebook_dir)
    else:
        project_root = notebook_dir

    # Keep the trailing separator: later cells build paths with
    # ``base_path + "data/..."``.
    base_path = os.path.join(project_root, "")

    # glob returns files in arbitrary order; sort for reproducible output
    files_for_bpcatalog = sorted(
        glob.glob(os.path.join(base_path, "data", "bpcatalog", "*", "*.cif")))
    files_for_triplebase = sorted(
        glob.glob(os.path.join(base_path, "data", "triplebase", "*", "*.pdb")))

    # triple
    for file in files_for_triplebase:
        check_pdb(file)

    # double
    for file in files_for_bpcatalog:
        check_cif(file)

BP_cWH_GG.cif: number of atoms does not match (residues: 2->1 / atoms: 46->23)
BP_cWH_UU.cif: number of atoms does not match (residues: 2->1 / atoms: 40->20)


### Check modified files

In [9]:
# Re-run the check on BP_cWH_GG.cif, which the batch run reported as having
# mismatched atom counts.  `base_path` is concatenated directly, so it must
# already end with a path separator.
f = base_path + "data/bpcatalog/cWH/BP_cWH_GG.cif"
check_cif(f)

In [10]:
# Re-run the check on BP_cWH_UU.cif, which the batch run reported as having
# mismatched atom counts.  `base_path` is concatenated directly, so it must
# already end with a path separator.
f = base_path + "data/bpcatalog/cWH/BP_cWH_UU.cif"
check_cif(f)