# Using Biopython's PDB Header parser to get missing residues

Currently, this feature is only available in a development version of Biopython, which I got working [here](https://github.com/fomightez/BernBiopython); this notebook demonstrates it.

This eventually should be moved to [structurework cl_demo-binder](https://github.com/fomightez/cl_demo-binder).

You may also be interested in the notebook entitled 'Using Biopython's PDB module to list resolved residues and construct fit commands'. Think of this notebook as complementing that one. Depending on what you are trying to do (or what you want to use as the source of information), one or the other may be better suited.

In [3]:
# Get the structures: download the gzipped PDB files for 6AGB and 6AH3 from
# files.rcsb.org and decompress them into the working directory.
!curl -OL https://files.rcsb.org/download/6AGB.pdb.gz
!gunzip 6AGB.pdb.gz
!curl -OL https://files.rcsb.org/download/6AH3.pdb.gz
!gunzip 6AH3.pdb.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  491k  100  491k    0     0   902k      0 --:--:-- --:--:-- --:--:--  901k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  519k  100  519k    0     0  1114k      0 --:--:-- --:--:-- --:--:-- 1114k


In [2]:
# Ask the parsed header of 6AGB whether it reports any missing residues.
# Import only the name that is used instead of a wildcard import.
from Bio.PDB import parse_pdb_header

h = parse_pdb_header('6AGB.pdb')
# Last expression of the cell, so the notebook displays its value.
h['has_missing_residues']

True

In [None]:
# Display the header's full list of missing-residue records for 6AGB.
# Import only the name that is used instead of a wildcard import.
from Bio.PDB import parse_pdb_header

h = parse_pdb_header('6AGB.pdb')
# Last expression of the cell, so the notebook displays its value.
h['missing_residues']

In [22]:
# Missing residue positions for specific chains
from collections import defaultdict

# Explicit import instead of a wildcard: only parse_pdb_header is needed here.
from Bio.PDB import parse_pdb_header

h = parse_pdb_header('6AGB.pdb')
# Only missing residues on these chains are collected.
chains_of_interest = ["F", "G"]
# Map each chain ID to the list of its missing residue positions;
# defaultdict(list) creates the list the first time a chain is seen.
missing_per_chain = defaultdict(list)
# Go through the missing-residue records and populate each chain's list
# with the record's "ssseq" value.
for residue in h['missing_residues']:
    if residue["chain"] in chains_of_interest:
        missing_per_chain[residue["chain"]].append(residue["ssseq"])
#print(missing_per_chain)



print('')
print("Missing from chain 'G':\n{}".format(missing_per_chain['G']))
print('\n\n')
# One line per collected chain: chain ID followed by its missing positions.
for chain in missing_per_chain:
    print(chain, missing_per_chain[chain])

defaultdict(<class 'list'>, {'F': [1], 'G': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 108, 109, 110, 111, 112, 113, 114]})

Missing from chain 'G':
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 108, 109, 110, 111, 112, 113, 114]



F [1]
G [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 108, 109, 110, 111, 112, 113, 114]


In [23]:
# Missing residue positions for ALL chains
from collections import defaultdict

# Explicit imports instead of a wildcard: only these two names are used.
from Bio.PDB import PDBParser, parse_pdb_header

# Extract the IDs of the chains present in the structure.
structure = PDBParser().get_structure('6AGB', '6AGB.pdb')
chains = [each.id for each in structure.get_chains()]

h = parse_pdb_header('6AGB.pdb')

# Map each chain ID to the list of its missing residue positions;
# defaultdict(list) creates the list the first time a chain is seen.
missing_per_chain = defaultdict(list)
# Go through the missing-residue records and populate each chain's list.
for residue in h['missing_residues']:
    if residue["chain"] in chains:
        missing_per_chain[residue["chain"]].append(residue["ssseq"])
print(missing_per_chain)



print('')
# Use .get() rather than indexing: indexing a defaultdict inserts an empty
# entry for an absent key, which would alter the mapping for any later
# comparison against another structure's mapping.
print("Missing from chain 'K':\n{}".format(missing_per_chain.get('K', [])))

defaultdict(<class 'list'>, {'B': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 525, 526, 527, 528, 529, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 743, 744, 745, 746, 747, 748, 749, 750, 751], 'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 189, 190, 191, 192, 193, 194, 195], 'D': [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76], 'E': [1, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173], 'F': [1], 'G': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 108, 109, 110, 111, 112, 113, 114], 'H': [1, 2], 'I': [243, 244, 245, 246, 2

----

#### Compare two structures

In [24]:
# Does 6AH3 have any residues missing that 6AGB doesn't, for chains shared between them?
# NOTE: this cell relies on `missing_per_chain` (the 6AGB mapping) having been
# built by the preceding 'Missing residue positions for ALL chains' cell.
from collections import defaultdict

# Explicit imports instead of a wildcard: only these two names are used.
from Bio.PDB import PDBParser, parse_pdb_header

# Extract the chain IDs from 6AGB; that listing is used because only the
# chains shared with 6AGB are of interest.
structure = PDBParser().get_structure('6AGB', '6AGB.pdb')
chains = [each.id for each in structure.get_chains()]

h = parse_pdb_header('6AH3.pdb')

# Map each chain ID to the list of its missing residue positions in 6AH3.
missing_per_chainh3 = defaultdict(list)
# Go through the missing-residue records and populate each chain's list.
for residue in h['missing_residues']:
    if residue["chain"] in chains:
        missing_per_chainh3[residue["chain"]].append(residue["ssseq"])
print(missing_per_chainh3)



print('')
# Report chain K from the 6AH3 mapping built in this cell (the original
# printed the 6AGB mapping here). .get() avoids inserting an empty entry.
print("Missing from chain 'K':\n{}".format(missing_per_chainh3.get('K', [])))

print('')
# Compare chain by chain with .get() so that a chain with no missing residues
# counts as equal whether or not an (empty) entry exists for it in either
# mapping, and so that neither mapping is modified by the lookup.
chain_matches = {
    chain: missing_per_chainh3.get(chain, []) == missing_per_chain.get(chain, [])
    for chain in chains
}
same_result = all(chain_matches.values())
print("Same residues missing for chains shared by 6AGB and 6AH3?:\n{}".format(same_result))
print('')
print("Chain by chain accounting of whether missing same residues between 6AGB and 6AH3?:")
for chain in chains:
    print(chain)
    print(chain_matches[chain])

defaultdict(<class 'list'>, {'B': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 525, 526, 527, 528, 529, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 743, 744, 745, 746, 747, 748, 749, 750, 751], 'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 189, 190, 191, 192, 193, 194, 195], 'D': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76], 'E': [1, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173], 'F': [1], 'G': [1, 2, 3, 4, 5, 6, 7, 8, 9, 1

---- 