In [1]:
import numpy as np
import csv
from Bio.PDB import *
import nglview as nv
from glob import glob
from sklearn import metrics



In [2]:
"""
Collect the PDB data files in the pdbFiles folder.
Then use Biopython to parse the PDB files and extract the structure data.
Append that structure data to the pdbs list for later processing.

This was the original idea, although most of the processing may instead be done inside the functions
rather than on a stored list, to save memory.
Also, certain steps require more than the Biopython structure object, since it cannot store all of
the data in the PDB file.
"""

parser = PDBParser()

# glob() yields paths in arbitrary, OS-dependent order; sort them so that
# pdbs[0] (the structure displayed below) is the same on every run.
pdb_paths = sorted(glob('pdbFiles/*.pdb'))
if not pdb_paths:
    # Fail with a clear message instead of an IndexError on pdbs[0] below.
    raise FileNotFoundError("No .pdb files found in 'pdbFiles/'")

# One parsed structure per file; the file path doubles as the structure id.
pdbs = [parser.get_structure(path, path) for path in pdb_paths]

# Show the first model of the first structure in the NGL viewer widget.
view = nv.show_biopython(pdbs[0][0])
view



NGLWidget()

In [7]:
def getBindingRegionResidues(chain1, chain2, cutoff=4.0):
    """Return the residues of ``chain1`` that are in contact with ``chain2``.

    A residue of ``chain1`` is reported when at least one of its atoms lies
    within ``cutoff`` angstroms of any atom of ``chain2``. Note that the
    returned residues belong to ``chain1`` (the chain that is indexed for the
    search), not to ``chain2``.

    Parameters
    ----------
    chain1 : object exposing ``get_atoms()`` (e.g. a Bio.PDB chain)
        Chain whose residues are returned.
    chain2 : object exposing ``get_atoms()`` (e.g. a Bio.PDB chain)
        Chain whose atoms are used as query points.
    cutoff : float, optional
        Contact distance threshold in angstroms. Defaults to 4.0, the value
        that was previously hard-coded.

    Returns
    -------
    list
        Unique residues in order of first encounter; empty when either chain
        has no atoms or no atom pair is within ``cutoff``.
    """
    # Convert the chain objects to lists of atoms.
    atoms1 = list(chain1.get_atoms())
    atoms2 = list(chain2.get_atoms())

    # With no atoms on either side there can be no contacts; returning early
    # also avoids building a NeighborSearch index from an empty atom list.
    if not atoms1 or not atoms2:
        return []

    # Spatial index over chain1 atoms, queried once per chain2 atom.
    ns = NeighborSearch(atoms1)

    nearby_residues = []
    # Set mirror of nearby_residues: constant-time duplicate check while the
    # list keeps the first-encounter order.
    seen = set()
    for atom in atoms2:
        for neighbor in ns.search(atom.coord, cutoff):
            residue = neighbor.get_parent()
            if residue not in seen:
                seen.add(residue)
                nearby_residues.append(residue)

    return nearby_residues

In [8]:
def getBiologicalAssemblies(file):
    """Read the chain lists of the biological assemblies from a PDB file.

    Biological assembly details are stored in the ``REMARK 350`` records. Each
    ``APPLY THE FOLLOWING TO CHAINS:`` record starts a new entry; chain lists
    that are too long for one line continue on ``AND CHAINS:`` records, which
    are appended to the entry they follow.

    Parameters
    ----------
    file : str or path-like
        Path of the PDB file to read.

    Returns
    -------
    dict
        Maps a running counter (starting at 1, incremented per
        ``APPLY THE FOLLOWING TO CHAINS:`` record) to the list of chain IDs
        named by that record. Empty when the file has no such records.
    """
    apply_marker = "APPLY THE FOLLOWING TO CHAINS:"
    continuation_marker = "AND CHAINS:"

    assemblies = {}
    counter = 0
    with open(file, 'r') as handle:
        for raw_line in handle:
            # Only genuine REMARK 350 records count; the text appearing
            # somewhere inside another record must not be picked up.
            if not raw_line.startswith('REMARK 350'):
                continue

            # Collapse the fixed-column padding so the markers match reliably.
            line = " ".join(raw_line.split())

            if apply_marker in line:
                counter += 1
                chain_field = line.split(apply_marker, 1)[1]
                # Commas become spaces so 'A,B' and 'A, B' both split cleanly.
                assemblies[counter] = chain_field.replace(',', ' ').split()
            elif continuation_marker in line and counter:
                chain_field = line.split(continuation_marker, 1)[1]
                assemblies[counter].extend(chain_field.replace(',', ' ').split())

    return assemblies

In [9]:
def _splitChainIds(chainField):
    """Turn a comma-separated header chain field (e.g. 'a, h') into ['A', 'H']."""
    return chainField.upper().replace(',', ' ').split()


def computeBindingRegion(pdbFile):
    """Compute the protein residues in contact with each partner chain.

    For every biological assembly listed in the file's REMARK 350 records,
    each protein (non-antibody) chain of the assembly is paired with every
    other chain of that assembly, and the contact residues of the pair are
    computed with ``getBindingRegionResidues`` on the first model.

    Parameters
    ----------
    pdbFile : str
        Path of the PDB file; also used as the Biopython structure id.

    Returns
    -------
    dict
        Maps ``(protein_chain_id, partner_chain_id)`` to the list of residues
        returned by ``getBindingRegionResidues(chain1=protein, chain2=partner)``.
        Assemblies without a protein chain, and pairs naming a chain that is
        absent from the first model, are skipped. The result is empty when no
        assembly or no protein chain can be identified.
    """
    # Get the biological assemblies in the structure. This requires reopening
    # the file and reading the REMARK lines.
    BioAssemblies = getBiologicalAssemblies(file = pdbFile)

    # Parse the pdb file using Biopython and extract the structure information.
    parser = PDBParser()
    structure = parser.get_structure(id = pdbFile, file = pdbFile)

    # Every compound whose molecule name does not mention 'antibody' is treated
    # as protein. Chains are accumulated over all such compounds so an earlier
    # compound is not overwritten by a later one.
    # NOTE(review): chain IDs are upper-cased as in the original code,
    # presumably because the parsed header text is lower-case; confirm for
    # structures that use lower-case chain IDs.
    protein = []
    for compound in structure.header['compound'].values():
        if 'antibody' in compound.get('molecule', ''):
            continue
        for chainId in _splitChainIds(compound.get('chain', '')):
            if chainId not in protein:
                protein.append(chainId)

    # Build the (protein, partner) chain pairs per assembly. Both lists are
    # derived from the current assembly only, so nothing carries over from a
    # previous assembly.
    chainPairs = {}
    for assemblyId, assemblyChains in BioAssemblies.items():
        proteinChains = [c for c in protein if c in assemblyChains]
        partnerChains = [c for c in assemblyChains if c not in protein]
        chainPairs[assemblyId] = [(p, q) for p in proteinChains for q in partnerChains]

    # Compute binding regions on the first model.
    model = structure[0]
    BindingRegion = {}
    for pairs in chainPairs.values():
        for pair in pairs:
            proteinId, partnerId = pair
            # Skip pairs naming a chain the model does not contain rather than
            # reusing a chain object left over from a previous pair.
            if not (model.has_id(proteinId) and model.has_id(partnerId)):
                continue
            BindingRegion[pair] = getBindingRegionResidues(
                chain1 = model[proteinId], chain2 = model[partnerId]
            )
    return BindingRegion

In [22]:
# Compute the binding regions for the example structure and print the resulting
# mapping of chain-ID pairs to lists of residues.
bind = computeBindingRegion(pdbFile='pdbFiles/7fah.pdb')
# Debug example: second element of the id tuple of the first residue found for
# the ('A', 'H') pair (presumably the residue sequence number; confirm).
#print(bind[('A', 'H')][0].get_id()[1])
print(bind)

{('A', 'H'): [<Residue ALA het=  resseq=148 icode= >, <Residue ALA het=  resseq=150 icode= >, <Residue HIS het=  resseq=147 icode= >, <Residue GLU het=  resseq=77 icode= >, <Residue GLY het=  resseq=149 icode= >, <Residue LYS het=  resseq=151 icode= >, <Residue ALA het=  resseq=143 icode= >], ('A', 'L'): [<Residue VAL het=  resseq=141 icode= >, <Residue LYS het=  resseq=139 icode= >, <Residue TRP het=  resseq=159 icode= >, <Residue GLN het=  resseq=232 icode= >, <Residue TYR het=  resseq=100 icode= >, <Residue HIS het=  resseq=189 icode= >, <Residue THR het=  resseq=142 icode= >, <Residue ASP het=  resseq=196 icode= >, <Residue ASP het=  resseq=231 icode= >, <Residue ALA het=  resseq=143 icode= >, <Residue LYS het=  resseq=151 icode= >], ('B', 'C'): [<Residue ALA het=  resseq=148 icode= >, <Residue ALA het=  resseq=150 icode= >, <Residue HIS het=  resseq=147 icode= >, <Residue GLU het=  resseq=77 icode= >, <Residue GLY het=  resseq=149 icode= >, <Residue LYS het=  resseq=151 icode= >, 



In [None]:
#Format the data and label residues participating in binding.
def labelBindingResidues(pdbFile):
    BindingRegion = computeBindingRegion(pdbFile = pdbFile)
    for k, v in BindingRegion.items():
            
    