## Intro
PDBBind cleans up the original PDB entries in its own way; some of these modifications are investigated here.
## Conclusions

* Most PDBBind PDB files have SEQRES records that **exactly** correspond to polymer-derived ATOM/HETATM records
* There are ~70 exceptions, mostly due to dummy residues in SEQRES at the beginning or at the end 
* All non-polymer HETATM records were pushed to the end of the sequence section, with chainID information removed
* There are **few** exceptions of polymer chains having empty chainID


In [2]:
from matplotlib import pyplot as plt 
from pathlib import Path
from tqdm import tqdm
from rdkit import Chem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

In [3]:
# Directories of the two PDBBind v2016 subsets, given relative to the
# notebook's working directory.
general_set_dir, refined_set_dir = (
    Path('../pdbbind/v2016') / subset_name
    for subset_name in ('general-set-except-refined', 'refined-set')
)
# Every dataset-wide scan below walks these directories in this order.
set_dirs = [general_set_dir, refined_set_dir]

In [4]:
# Lookup table from three-letter residue names (as written in PDB records)
# to one-letter codes.
convert_residue = {
    'ALA': 'A',
    'VAL': 'V',
    'PHE': 'F',
    'PRO': 'P',
    'MET': 'M',
    'ILE': 'I',
    'LEU': 'L',
    'ASP': 'D',
    'GLU': 'E',
    'LYS': 'K',
    'ARG': 'R',
    'SER': 'S',
    'THR': 'T',
    'TYR': 'Y',
    'HIS': 'H',
    'CYS': 'C',
    'ASN': 'N',
    'GLN': 'Q',
    'TRP': 'W',
    'GLY': 'G',
    # Entries beyond the twenty names above; note MSE shares 'M' with MET.
    'MSE': 'M',
    'ASX': 'B',
    'UNK': 'X',
    'SEC': 'U',
    'PYL': 'O',
}

In [None]:
# Hard-coded sequence of one-letter residue codes examined in the cells below.
s='PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASCLYGQLPKFQDGDLTLYQSNTILRHLGRTLGLYGKDQQEAALVDMVNDGVEDLRCKYISLIYTNYEAGKDDYVKALPGQLKPFETLLSQNQGGKTFIVGDQISFADYNLLDLLLIHEVLAPGCLDAFPLLSAYVGRLSARPKLKAFLASPEYVNLPINGNGKQPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASCLYGQLPKFQDGDLTLYQSNTILRHLGRTLGLYGKDQQEAALVDMVNDGVEDLRCKYISLIYTNYEAGKDDYVKALPGQLKPFETLLSQNQGGKTFIVGDQISFADYNLLDLLLIHEVLAPGCLDAFPLLSAYVGRLSARPKLKAFLASPEYVNLPINGNGKQ'

In [None]:
# Split `s` at position 208 and check whether the part before the split is
# identical to the part after it.
split_at = 208
leading_part = s[:split_at]
trailing_part = s[split_at:]
print(leading_part)
print(leading_part == trailing_part)

In [None]:
# Display the total number of characters in `s`.
len(s)

In [None]:
# Sanity check: every character of `s` must be one of the one-letter codes
# that `convert_residue` can produce.
known_letters = convert_residue.values()
assert all(letter in known_letters for letter in s)

In [51]:
def parse_seqres(file, fasta=True):
    """Read the SEQRES records of a PDB file and group them by chain.

    Args:
        file: Path of the PDB file to read.
        fasta: When true, each chain is returned as a single string of
            one-letter codes looked up in ``convert_residue``; residue names
            missing from that table become ``'X'``. When false, each chain is
            returned as the list of residue names exactly as they appear in
            the file.

    Returns:
        A dict mapping the chain identifier (the character at index 11 of
        each SEQRES line) to that chain's sequence, in file order.
    """
    residues_by_chain = {}
    with open(file, 'r') as handle:
        seqres_rows = [row for row in handle if row.startswith('SEQRES')]

    for row in seqres_rows:
        # Residue names start at index 19 and are whitespace separated.
        residues_by_chain.setdefault(row[11], []).extend(row[19:].split())

    if not fasta:
        return residues_by_chain
    return {
        chain: ''.join(convert_residue.get(name, 'X') for name in names)
        for chain, names in residues_by_chain.items()
    }

In [56]:
# Compare, for every entry and every chain, the residue list given by the
# SEQRES records with the residue list reconstructed from the ATOM/HETATM
# records of the same file.
cc_lens = []            # per chain: residues reconstructed from ATOM/HETATM
sc_lens = []            # per chain: residues listed in SEQRES
exceptions = []         # (entry name, chainID, len(SEQRES) - len(ATOM chain))
num_jump_occurred = 0   # entries with at least one resSeq gap inside a chain
for set_dir in set_dirs:
    print(set_dir)
    for data_dir in tqdm(list(set_dir.glob('*'))):
        # These two directory entries are not PDB entries.
        if data_dir.name in ['readme', 'index']:
            continue
        protein_file = data_dir / f'{data_dir.name}_protein.pdb'
        seqres_dict = parse_seqres(protein_file, fasta=False)
        chain_dict = {}
        chainID = None
        resSeq = None
        iCode = None
        jump_occurred = False
        with open(protein_file, 'r') as f:
            for line in f:
                if not line.startswith(('ATOM', 'HETATM')):
                    continue
                prev_chainID, prev_resSeq, prev_iCode = chainID, resSeq, iCode

                # Fixed-column fields of an ATOM/HETATM record.
                chainID = line[21]
                resSeq = int(line[22:26])
                iCode = line[26]
                resname = line[17:20]

                # A gap in residue numbering within the same chain. The chain
                # comparison comes first so that prev_resSeq is never None
                # when the arithmetic is evaluated.
                if chainID == prev_chainID and resSeq > prev_resSeq + 1:
                    jump_occurred = True

                # Records with a blank chainID are not assigned to any chain.
                if chainID == ' ':
                    continue

                # A chain that was already seen must not reappear after
                # records belonging to a different chain.
                if chainID in chain_dict and prev_chainID != chainID:
                    raise Exception('Non-continugous:', str(protein_file), chainID, resSeq)

                # A new residue starts whenever chain, residue number or
                # insertion code changes. The chain is part of the key so the
                # first residue of a chain is not lost when it happens to
                # share (resSeq, iCode) with the last residue of the
                # previous chain.
                if (chainID, resSeq, iCode) != (prev_chainID, prev_resSeq, prev_iCode):
                    chain_dict.setdefault(chainID, []).append(resname)
        for chainID, chain in chain_dict.items():
            seq = seqres_dict[chainID]
            cc_lens.append(len(chain))
            sc_lens.append(len(seq))
            if chain != seq:
                # Length difference is computed from `seq` directly; the
                # previous code referenced an undefined name here.
                exceptions.append((data_dir.name, chainID, len(seq) - len(chain)))
        if jump_occurred:
            num_jump_occurred += 1

print(len(exceptions))
print(num_jump_occurred)
                    

                    

../pdbbind/v2016/general-set-except-refined


100%|███████████████████████████████████████| 9228/9228 [01:18<00:00, 117.29it/s]


../pdbbind/v2016/refined-set


100%|███████████████████████████████████████| 4059/4059 [00:33<00:00, 119.84it/s]

70
6359





In [39]:
# Display how many chains were recorded in `cc_lens`.
len(cc_lens)

23023

In [18]:
# NOTE(review): `examples` is not defined in any cell visible here; presumably
# it maps a residue name to a file in which that residue occurs - confirm.
examples['ACE']

'../pdbbind/v2016/general-set-except-refined/2r3f/2r3f_protein.pdb'

In [None]:
# For every entry, join the one-letter sequences of all its chains with '^'
# and record both the joined string and its length.
fastas = []
nums = []
for set_dir in set_dirs:
    print(set_dir)
    entries = list(set_dir.glob('*'))
    for data_dir in tqdm(entries):
        # These two directory entries are not PDB entries.
        if data_dir.name in ['readme', 'index']:
            continue
        entry_name = data_dir.name
        protein_file = data_dir / f'{entry_name}_protein.pdb'
        chain_sequences = parse_seqres(protein_file).values()
        fasta = '^'.join(chain_sequences)
        fastas.append(fasta)
        # The length includes the '^' separators between chains.
        nums.append(len(fasta))
    