In [2]:
import pdbfixer
import mdtraj as md
from simtk.openmm.app import PDBFile

# 5UDC

In [None]:
# Chain mapping, one row per chain: index in mdtraj/PDBFixer : chain ID in input PDB : chain ID in output PDB

# Chain 0 : H : A
# Chain 1 : L : B
# Chain 2 : F : C
# Chain 3 : B : D
# Chain 4 : C : E
# Chain 5 : A : F
# Chain 6 : E : G
# Chain 7 : G : H
# Chain 8 : D : I

In [54]:
# Create the PDBFixer instance for the 5UDC input structure
fixer = pdbfixer.PDBFixer(
    filename='../data/5udc/5udc_splitchain_capped.pdb'
)


In [68]:
# Remove unwanted chains
# fixer.removeChains(indices)


In [55]:
# Identify missing residues.
# findMissingResidues() fills fixer.missingResidues, a dict keyed by
# (chain index, residue position within that chain); each value is the list
# of residue names missing at that position.
fixer.findMissingResidues()
fixer.missingResidues


{(0, 0): ['GLN'],
 (0, 139): ['SER', 'LYS', 'SER', 'THR', 'SER', 'GLY'],
 (0, 217): ['GLU', 'PRO', 'LYS', 'SER'],
 (1, 211): ['GLY', 'GLU', 'CYS'],
 (2, 0): ['MET',
  'GLU',
  'LEU',
  'LEU',
  'ILE',
  'LEU',
  'LYS',
  'ALA',
  'ASN',
  'ALA',
  'ILE',
  'THR',
  'THR',
  'ILE',
  'LEU',
  'THR',
  'ALA',
  'VAL',
  'THR',
  'PHE',
  'CYS',
  'PHE',
  'ALA',
  'SER',
  'GLY',
  'GLN'],
 (4, 0): ['GLN'],
 (4, 139): ['SER', 'LYS', 'SER', 'THR', 'SER', 'GLY', 'GLY'],
 (4, 218): ['LYS', 'SER'],
 (5, 211): ['GLY', 'GLU', 'CYS'],
 (6, 0): ['MET',
  'GLU',
  'LEU',
  'LEU',
  'ILE',
  'LEU',
  'LYS',
  'ALA',
  'ASN',
  'ALA',
  'ILE',
  'THR',
  'THR',
  'ILE',
  'LEU',
  'THR',
  'ALA',
  'VAL',
  'THR',
  'PHE',
  'CYS',
  'PHE',
  'ALA',
  'SER',
  'GLY',
  'GLN'],
 (8, 139): ['SER', 'SER', 'LYS', 'SER', 'THR', 'SER', 'GLY', 'GLY'],
 (9, 213): ['CYS'],
 (10, 0): ['MET',
  'GLU',
  'LEU',
  'LEU',
  'ILE',
  'LEU',
  'LYS',
  'ALA',
  'ASN',
  'ALA',
  'ILE',
  'THR',
  'THR',
  'ILE',
 

In [56]:
# Prune long terminal fragments from the missing-residue map.
# A key whose position is 0 or equal to the chain length marks a gap at the
# start or end of that chain; internal gaps are left untouched.
# Terminal fragments longer than the threshold are not rebuilt in full:
#   * starts with LEU -> keep a single LEU
#   * ends with GLN   -> keep a single GLN
#   * otherwise       -> report the entry and drop it
max_terminal_len = 10
chains = list(fixer.topology.chains())
keys = fixer.missingResidues.keys()
for key in list(keys):  # snapshot of the keys: entries are deleted inside the loop
    chain = chains[key[0]]
    n_residues = len(list(chain.residues()))
    if key[1] not in (0, n_residues):
        continue  # not at a chain terminus
    fragment = fixer.missingResidues[key]
    if len(fragment) <= max_terminal_len:
        continue  # short terminal fragment: rebuild it as-is
    if fragment[0] == 'LEU':
        fixer.missingResidues[key] = ['LEU']
    elif fragment[-1] == 'GLN':
        fixer.missingResidues[key] = ['GLN']
    else:
        print(key, chain, fragment)
        del fixer.missingResidues[key]


In [57]:
# Inspect the missing-residue map after the pruning step
fixer.missingResidues

{(0, 0): ['GLN'],
 (0, 139): ['SER', 'LYS', 'SER', 'THR', 'SER', 'GLY'],
 (0, 217): ['GLU', 'PRO', 'LYS', 'SER'],
 (1, 211): ['GLY', 'GLU', 'CYS'],
 (2, 0): ['GLN'],
 (4, 0): ['GLN'],
 (4, 139): ['SER', 'LYS', 'SER', 'THR', 'SER', 'GLY', 'GLY'],
 (4, 218): ['LYS', 'SER'],
 (5, 211): ['GLY', 'GLU', 'CYS'],
 (6, 0): ['GLN'],
 (8, 139): ['SER', 'SER', 'LYS', 'SER', 'THR', 'SER', 'GLY', 'GLY'],
 (9, 213): ['CYS'],
 (10, 0): ['GLN'],
 (10, 71): ['GLN']}

In [58]:
# Identify nonstandard residues; the result is stored in
# fixer.nonstandardResidues and displayed as this cell's output.
fixer.findNonstandardResidues()
fixer.nonstandardResidues
# Replacement is left disabled:
# fixer.replaceNonstandardResidues()

[]

In [59]:
# Strip all heterogens from the structure, water included.
# keepWater=True would keep the water molecules and remove only the
# other heterogens.

fixer.removeHeterogens(keepWater=False)


In [60]:
# findMissingAtoms() identifies all missing heavy atoms and stores them in two
# dictionaries, fixer.missingAtoms and fixer.missingTerminals. Both map a
# Residue object to a list of atom names:
#   - missingAtoms:     standard atoms that should be present in any residue of that type
#   - missingTerminals: terminal atoms that should be present at the start or end of a chain
# Entries can be removed from either dictionary before continuing, to prevent
# specific atoms from being added.

fixer.findMissingAtoms()


In [61]:
# Missing standard heavy atoms (Residue -> list of atom names)
fixer.missingAtoms


{}

In [62]:
# Missing chain-terminal atoms (Residue -> list of atom names)
fixer.missingTerminals


{}

In [63]:
# addMissingAtoms() is the point at which all heavy atoms get added:
#   - the atoms identified by findMissingAtoms(), and
#   - the missing residues identified by findMissingResidues().
# If replaceNonstandardResidues() was used, it removed atoms that do not belong
# in the replacement residue but did not add the ones missing from the original
# residue; those are added here as well.

fixer.addMissingAtoms()

In [78]:
# Add missing hydrogens
# fixer.addMissingHydrogens(7.0)


In [64]:
# Output names used for earlier variants of this structure (kept for reference):
# PDBFile.writeFile(fixer.topology, fixer.positions, open('../data/5udc/5udc_clean_nolongterms.pdb', 'w'))
# PDBFile.writeFile(fixer.topology, fixer.positions, open('../data/5udc/5udc_clean_nolongterms_noloop.pdb', 'w'))
# PDBFile.writeFile(fixer.topology, fixer.positions, open('../data/5udc/5udc_clean_nolongterms_noloop_noseqres.pdb', 'w'))

# Write the cleaned structure. The context manager guarantees the output file
# is flushed and closed instead of leaving an open handle behind.
with open('../data/5udc/5udc_clean.pdb', 'w') as pdb_out:
    PDBFile.writeFile(fixer.topology, fixer.positions, pdb_out)



# 4JHW

In [None]:
# Chain 0 : H
## Missing 214-217

# Chain 1 : L
## Missing 212-214

# Chain 2 : F
## Missing 125-136 and 514-550

In [41]:
# Create the PDBFixer instance for the 4JHW input structure
fixer = pdbfixer.PDBFixer(
    filename='../data/4jhw/4jhw_splitchain_capped.pdb'
)


In [42]:
# Remove unwanted chains
# fixer.removeChains([2])

In [43]:
# Identify missing residues.
# findMissingResidues() fills fixer.missingResidues, a dict keyed by
# (chain index, residue position within that chain); each value is the list
# of residue names missing at that position.
fixer.findMissingResidues()
fixer.missingResidues

{(0, 226): ['LYS', 'SER', 'CYS', 'ASP'],
 (1, 211): ['GLY', 'GLU', 'CYS'],
 (2, 72): ['GLN']}

In [44]:
# Trim the missing-residue map before rebuilding:
#   * every gap keyed to chain 0 is reduced to LYS-SER, so only those two
#     residues are rebuilt there;
#   * any gap longer than the threshold is collapsed to a single LEU (if it
#     starts with LEU) or a single GLN (if it ends with GLN); otherwise it is
#     reported and dropped.
# NOTE(review): the chain-0 rule is not restricted to the C-terminal gap —
# confirm chain 0 has no other gaps for this structure.
max_fragment_len = 10
chains = list(fixer.topology.chains())
keys = fixer.missingResidues.keys()
for key in list(keys):  # snapshot of the keys: entries are deleted inside the loop
    chain = chains[key[0]]
    if chain.index == 0:
        fixer.missingResidues[key] = ['LYS', 'SER']
    fragment = fixer.missingResidues[key]
    if len(fragment) <= max_fragment_len:
        continue  # short enough to rebuild as-is
    if fragment[0] == 'LEU':
        fixer.missingResidues[key] = ['LEU']
    elif fragment[-1] == 'GLN':
        fixer.missingResidues[key] = ['GLN']
    else:
        print(key, chain, fragment)
        del fixer.missingResidues[key]


In [45]:
# Inspect the missing-residue map after the trimming step
fixer.missingResidues

{(0, 226): ['LYS', 'SER'], (1, 211): ['GLY', 'GLU', 'CYS'], (2, 72): ['GLN']}

In [46]:
# Identify nonstandard residues; the result is stored in
# fixer.nonstandardResidues and displayed as this cell's output.
fixer.findNonstandardResidues()
fixer.nonstandardResidues
# Replacement is left disabled:
# fixer.replaceNonstandardResidues()

[]

In [47]:
# Strip all heterogens from the structure, water included.
# keepWater=True would keep the water molecules and remove only the
# other heterogens.

fixer.removeHeterogens(keepWater=False)


In [48]:
# findMissingAtoms() identifies all missing heavy atoms and stores them in two
# dictionaries, fixer.missingAtoms and fixer.missingTerminals. Both map a
# Residue object to a list of atom names:
#   - missingAtoms:     standard atoms that should be present in any residue of that type
#   - missingTerminals: terminal atoms that should be present at the start or end of a chain
# Entries can be removed from either dictionary before continuing, to prevent
# specific atoms from being added.

fixer.findMissingAtoms()


In [49]:
# Missing standard heavy atoms (Residue -> list of atom names)
fixer.missingAtoms

{}

In [50]:
# Missing chain-terminal atoms (Residue -> list of atom names)
fixer.missingTerminals

{}

In [51]:
# addMissingAtoms() is the point at which all heavy atoms get added:
#   - the atoms identified by findMissingAtoms(), and
#   - the missing residues identified by findMissingResidues().
# If replaceNonstandardResidues() was used, it removed atoms that do not belong
# in the replacement residue but did not add the ones missing from the original
# residue; those are added here as well.

fixer.addMissingAtoms()

In [52]:
# Add missing hydrogens
# fixer.addMissingHydrogens(7.0)

In [53]:
# Write the cleaned structure. The context manager guarantees the output file
# is flushed and closed instead of leaving an open handle behind.
with open('../data/4jhw/4jhw_clean.pdb', 'w') as pdb_out:
    PDBFile.writeFile(fixer.topology, fixer.positions, pdb_out)


# 4jha

In [2]:
# Create the PDBFixer instance for the 4jha input structure
fixer = pdbfixer.PDBFixer(
    filename='../data/renumbered/4jha.pdb'
)

In [3]:
# Remove unwanted chains
# fixer.removeChains([2])

In [4]:
# Identify missing residues.
# findMissingResidues() fills fixer.missingResidues, a dict keyed by
# (chain index, residue position) tuples; each value is the list of residue
# names missing at that position.
fixer.findMissingResidues()
fixer.missingResidues

{(0, 226): ['LYS', 'SER', 'CYS', 'ASP', 'LYS'],
 (1, 211): ['GLY', 'GLU', 'CYS']}

In [5]:
# Reduce every gap keyed to chain 0 to LYS-SER, so only those two residues are
# rebuilt there; gaps on all other chains are left unchanged.
# NOTE(review): this applies to any gap on chain 0, not only the C-terminal
# one — confirm chain 0 has no other gaps for this structure.
chains = list(fixer.topology.chains())
keys = fixer.missingResidues.keys()
for key in list(keys):  # iterate over a snapshot of the keys
    chain = chains[key[0]]
    if chain.index != 0:
        continue
    fixer.missingResidues[key] = ['LYS', 'SER']


In [6]:
# Inspect the missing-residue map after the chain-0 adjustment
fixer.missingResidues

{(0, 226): ['LYS', 'SER'], (1, 211): ['GLY', 'GLU', 'CYS']}

In [7]:
# Identify nonstandard residues; the result is stored in
# fixer.nonstandardResidues and displayed as this cell's output.
fixer.findNonstandardResidues()
fixer.nonstandardResidues
# Replacement is left disabled:
# fixer.replaceNonstandardResidues()

[]

In [8]:
# Strip all heterogens from the structure, water included.
# keepWater=True would keep the water molecules and remove only the
# other heterogens.

fixer.removeHeterogens(keepWater=False)


In [9]:
# findMissingAtoms() identifies all missing heavy atoms and stores them in two
# dictionaries, fixer.missingAtoms and fixer.missingTerminals. Both map a
# Residue object to a list of atom names:
#   - missingAtoms:     standard atoms that should be present in any residue of that type
#   - missingTerminals: terminal atoms that should be present at the start or end of a chain
# Entries can be removed from either dictionary before continuing, to prevent
# specific atoms from being added.

fixer.findMissingAtoms()


In [11]:
# addMissingAtoms() is the point at which all heavy atoms get added:
#   - the atoms identified by findMissingAtoms(), and
#   - the missing residues identified by findMissingResidues().
# If replaceNonstandardResidues() was used, it removed atoms that do not belong
# in the replacement residue but did not add the ones missing from the original
# residue; those are added here as well.

fixer.addMissingAtoms()

In [None]:
# Add missing hydrogens
# fixer.addMissingHydrogens(7.0)

In [13]:
# Write the cleaned structure. The context manager guarantees the output file
# is flushed and closed instead of leaving an open handle behind.
# keepIds=True asks writeFile to keep the Topology's chain/residue IDs rather
# than generating new ones (per the OpenMM PDBFile documentation).
with open('../data/renumbered/4jha_clean.pdb', 'w') as pdb_out:
    PDBFile.writeFile(fixer.topology, fixer.positions, pdb_out, keepIds=True)


In [12]:
# List every residue, chain by chain, with its id and insertion code
for chain in fixer.topology.chains():
    for res in list(chain.residues()):
        print(res, res.id, res.insertionCode)

<Residue 0 (GLN) of chain 0> 1  
<Residue 1 (VAL) of chain 0> 2  
<Residue 2 (GLN) of chain 0> 3  
<Residue 3 (LEU) of chain 0> 4  
<Residue 4 (VAL) of chain 0> 5  
<Residue 5 (GLN) of chain 0> 6  
<Residue 6 (SER) of chain 0> 7  
<Residue 7 (GLY) of chain 0> 8  
<Residue 8 (ALA) of chain 0> 9  
<Residue 9 (GLU) of chain 0> 10  
<Residue 10 (VAL) of chain 0> 11  
<Residue 11 (LYS) of chain 0> 12  
<Residue 12 (LYS) of chain 0> 13  
<Residue 13 (PRO) of chain 0> 14  
<Residue 14 (GLY) of chain 0> 15  
<Residue 15 (SER) of chain 0> 16  
<Residue 16 (SER) of chain 0> 17  
<Residue 17 (VAL) of chain 0> 18  
<Residue 18 (MET) of chain 0> 19  
<Residue 19 (VAL) of chain 0> 20  
<Residue 20 (SER) of chain 0> 21  
<Residue 21 (CYS) of chain 0> 22  
<Residue 22 (GLN) of chain 0> 23  
<Residue 23 (ALA) of chain 0> 24  
<Residue 24 (SER) of chain 0> 25  
<Residue 25 (GLY) of chain 0> 26  
<Residue 26 (GLY) of chain 0> 27  
<Residue 27 (PRO) of chain 0> 28  
<Residue 28 (LEU) of chain 0> 29  
<Res