# Treats PDB Files

# Imports

In [1]:
# -- General
import pandas as pd
import numpy as np
import yaml
import random
import requests
#from tqdm import tqdm
from tqdm.notebook import tqdm
from pathlib import Path

In [53]:
# -- IO
import io
from contextlib import redirect_stdout

In [2]:
# -- Biopython stuff
import Bio.PDB as bp

In [3]:
# -- RDKit
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw, PandasTools
from rdkit.Chem.Draw import IPythonConsole
print("RDKit Version: ", rdkit.__version__)

RDKit Version:  2022.09.5


# Functions

In [4]:
def is_het(residue):
    '''Tell whether a residue counts as a HET group.

    The decision uses the first field of ``residue.id``: a blank (" ")
    or a "W" value gives False, any other value gives True.
    '''
    hetero_flag = residue.id[0]
    return hetero_flag not in (" ", "W")

In [5]:
def get_heavy_atoms_list(residue):
    '''Return the atoms of ``residue`` whose element is not hydrogen.

    Atoms are taken from ``residue.get_atoms()`` and kept in iteration order.
    '''
    return [heavy for heavy in residue.get_atoms() if heavy.element != 'H']

In [6]:
def get_ligands(structure, ligand):
    '''Collect every residue of ``structure`` whose ``resname`` equals ``ligand``.

    Residues come from ``structure.get_residues()`` and are returned as a
    list in iteration order (empty when nothing matches).
    '''
    return [res for res in structure.get_residues() if res.resname == ligand]

In [15]:
class AtomsToPrintSelector(bp.Select):
    """Selection filter deciding which chains, residues and atoms get written.

    Parameters
    ----------
    chains : iterable
        Chain objects to keep; only their ``id`` values are stored.
    nonligand_hets : iterable of str, optional
        Residue names to reject for residues whose hetero flag
        (``residue.id[0]``) is not blank. The input is copied, never modified.
    remove_waters : bool, default True
        When True, 'WAT' and 'HOH' are added to the rejected residue names.
    """
    def __init__(self, chains, nonligand_hets=None, remove_waters=True):
        self.chains = [chain.id for chain in chains]
        # Work on a private copy: extending the caller's list (or a shared
        # mutable default) would make it grow by two entries on every
        # instantiation.
        self.nonligand_hets = list(nonligand_hets) if nonligand_hets is not None else []
        if remove_waters:
            self.nonligand_hets.extend(['WAT', 'HOH'])

    def accept_chain(self, chain):
        """Accept only chains whose id was given at construction time."""
        return chain.id in self.chains

    def accept_residue(self, residue):
        """Reject residues with a non-blank hetero flag whose name is in the unwanted list."""
        hetero_flag = residue.id[0]
        unwanted_het = hetero_flag != " " and residue.resname in self.nonligand_hets
        return not unwanted_het

    def accept_atom(self, atom):
        """Keep atoms that are not disordered, or whose alternate location is "A"."""
        if (not atom.is_disordered()) or atom.get_altloc() == "A":
            atom.set_altloc(" ")  # Eliminate alt location ID before output.
            return True
        return False

In [8]:
def count_entries(df):
    '''Print three summary counts for a dataframe of PDB entries.

    Reports the number of rows, the number of distinct ``PDB_ID`` values,
    and the number of distinct ``PDB_ID`` values among rows where
    ``IS_MEMBRANE`` equals True. Nothing is returned.
    '''
    total_records = len(df)
    print("Total # records        : ", total_records)

    unique_ids = df.PDB_ID.unique()
    print("Unique PDBIDs          : ", len(unique_ids))

    membrane_ids = df.loc[df.IS_MEMBRANE == True].PDB_ID.unique()
    print("Membrane Proteins      : ", len(membrane_ids))

In [16]:
# Residue codes (3-letter, plus some 2-letter ones at the end) of HET groups
# we DON'T want to consider.
# This list is passed as `nonligand_hets` to AtomsToPrintSelector, so HET
# residues carrying one of these names are left out of the written PDB files.
# For monoatomic ions, we can also filter by the size of the formula.
non_spec_ligands = [# -- Common metals, ions and solvents
                    '1PE', # PENTAETHYLENE GLYCOL
                    '2HT', # 3-methylbenzonitrile
                    '2PE', # NONAETHYLENE GLYCOL
                    '7PE', # 2-(2-(2-(2-(2-(2-ETHOXYETHOXY)ETHOXY)ETHOXY)ETHOXY)ETHOXY)ETHANOL
                    'ACT', # ACETATE ION
                    'ACY', # ACETIC ACID
                    'AKG', # 2-OXOGLUTARIC ACID
                    'BCT', # BICARBONATE ION
                    'BR' , # BROMIDE ION
                    'BMA', # beta-D-mannopyranose
                    'BME', # BETA-MERCAPTOETHANOL
                    'BOG', # octyl beta-D-glucopyranoside
                    'BU3', # (R,R)-2,3-BUTANEDIOL
                    'BUD', # (2S,3S)-butane-2,3-diol
                    'CAC', # CACODYLATE ION
                    'CIT', # CITRIC ACID
                    'CME', # S,S-(2-HYDROXYETHYL)THIOCYSTEINE
                    'CO3', # CARBONATE ION
                    'DMS', # DIMETHYL SULFOXIDE
                    'DTT', # 2,3-DIHYDROXY-1,4-DITHIOBUTANE
                    'DTV', # (2S,3S)-1,4-DIMERCAPTOBUTANE-2,3-DIOL
                    'EDO', # 1,2-ETHANEDIOL
                    'EPE', # 4-(2-HYDROXYETHYL)-1-PIPERAZINE ETHANESULFONIC ACID
                    'FES', # FE2/S2 (INORGANIC) CLUSTER
                    'FMT', # FORMIC ACID
                    'GBL', # GAMMA-BUTYROLACTONE
                    'GOL', # GLYCEROL
                    'GSH', # GLUTATHIONE
                    'HEC', # HEME C
                    'HED', # 2-HYDROXYETHYL DISULFIDE
                    'HEM', # PROTOPORPHYRIN IX CONTAINING FE
                    'IMD', # IMIDAZOLE
                    'IOD', # IODIDE ION
                    'IPA', # ISOPROPYL ALCOHOL
                    'MAN', # alpha-D-mannopyranose
                    'MES', # 2-(N-MORPHOLINO)-ETHANESULFONIC ACID
                    'MG8', # N-OCTANOYL-N-METHYLGLUCAMINE
                    'MLI', # MALONATE ION
                    'MPD', # (4S)-2-METHYL-2,4-PENTANEDIOL
                    'MYR', # MYRISTIC ACID
                    'NAG', # 2-acetamido-2-deoxy-beta-D-glucopyranose
                    'NCO', # COBALT HEXAMMINE(III)
                    'NH3', # AMMONIA
                    'NO3', # NITRATE ION
                    'OCT', # N-OCTANE
                    'OGA', # N-OXALYLGLYCINE
                    'OPG', # OXIRANPSEUDOGLUCOSE
                    'P2U', # 2'-DEOXY-PSEUDOURIDINE-5'MONOPHOSPHATE
                    'PEG', # DI(HYDROXYETHYL)ETHER
                    'PG4', # TETRAETHYLENE GLYCOL
                    'PGE', # TRIETHYLENE GLYCOL
                    'PGO', # S-1,2-PROPANEDIOL
                    'PHO', # PHEOPHYTIN A
                    'PI' , # HYDROGENPHOSPHATE ION (INORGANIC PHOSPHATE)
                    'PLP', # PYRIDOXAL-5'-PHOSPHATE
                    'PO4', # PHOSPHATE ION
                    'POP', # PYROPHOSPHATE 2-
                    'PSE', # O-PHOSPHOETHANOLAMINE
                    'PSU', # PSEUDOURIDINE-5'-MONOPHOSPHATE
                    'PTL', # PENTANAL
                    'SCN', # THIOCYANATE ION
                    'SF4', # IRON/SULFUR CLUSTER
                    'F3S', # FE3-S4 CLUSTER
                    'SGM', # MONOTHIOGLYCEROL
                    'SO4', # SULFATE ION
                    'SPD', # SPERMIDINE
                    'SPM', # SPERMINE
                    'SRT', # S,R MESO-TARTARIC ACID
                    'TAM', # TRIS(HYDROXYETHYL)AMINOMETHANE
                    'TAR', # D(-)-TARTARIC ACID
                    'TFA', # trifluoroacetic acid
                    'TLA', # L(+)-TARTARIC ACID
                    'TPP', # THIAMINE DIPHOSPHATE
                    'TRS', # 2-AMINO-2-HYDROXYMETHYL-PROPANE-1,3-DIOL
                    'WO4', # TUNGSTATE(VI)ION
                    # -- Small ligands (MW < 50 D) --
                    'CO2', # CARBON DIOXIDE
                    'PEO', # HYDROGEN PEROXIDE
                    'NH4', # AMMONIUM ION
                    'EOH', # ETHANOL
                    'CCN', # ACETONITRILE
                    'MOH', # METHANOL
                    'NO2', # NITRITE ION
                    'ACE', # ACETYL GROUP
                    'MEE', # METHANETHIOL
                    '74C', # methyl radical
                    'DMN', # DIMETHYLAMINE
                    'FOR', # FORMYL GROUP
                    'H2S', # HYDROSULFURIC ACID
                    'NSM', # NITROSOMETHANE
                    'ARF', # FORMAMIDE
                    'HOA', # HYDROXYAMINE
                    'HZN', # hydrazine
                    'N2O', # NITROUS OXIDE
                    'D3O', # trideuteriooxidanium
                    '0NM', # cyanic acid
                    'NH2', # AMINO GROUP
                    'TME', # PROPANE
                    'C2H', # acetylene
                    'NEH', # ETHANAMINE
                    'NME', # METHYLAMINE
                    'CNN', # CYANAMIDE
                    'BF2', # BERYLLIUM DIFLUORIDE
                    '2NO', # NITROGEN DIOXIDE
                    'MNC', # METHYL ISOCYANIDE
                    'HDN', # METHYLHYDRAZINE
                    # -- Ligands with 2-letter symbols
                    'PC', # 'PHOSPHOCHOLINE'
                    'EP', # 'EPOTHILONE A'
                    'DC', # "2'-DEOXYCYTIDINE-5'-MONOPHOSPHATE"
                    '5X', # '5R-(2E-METHYL-3-PHENYL-ALLYL)-3-(BENZENESULFONYLAMINO)-4-OXO-2-THIONOTHIAZOLIDINE'
                    'AS', # "2-DEOXY-ADENOSINE -5'-THIO-MONOPHOSPHATE"
                    'ET', # 'ETHIDIUM'
                    'AA', # '9-AMINOACRIDINE'
                    'T3', # "3,5,3'TRIIODOTHYRONINE"
                    'DT', # "THYMIDINE-5'-MONOPHOSPHATE"
                    '5H', # '5R-(4-BROMOPHENYLMETHYL)-3-(BENZENESULFONYLAMINO)-4-OXO-2-THIONOTHIAZOLIDINE'
                    '1N', # '1-[(1~{R},2~{R},4~{S},5~{S})-2,4-bis(4-carbamimidamidophenoxy)-5-[(4-carbamimidamidophenyl)amino]cyclohexyl]guanidine'
                    'DU', # "2'-DEOXYURIDINE-5'-MONOPHOSPHATE"
                    'NQ', # '2-HYDROXYNAPHTHOQUINONE'
                    'AO', # 'ACRIDINE ORANGE'
                    'MC', # '1,2-CIS-1-HYDROXY-2,7-DIAMINO-MITOSENE'
                    'Y3', # '4-ACETYLAMINO-5-HYDROXYNAPHTHALENE-2,7-DISULFONIC ACID'
                    'DG', # "2'-DEOXYGUANOSINE-5'-MONOPHOSPHATE"
                    'DA', # "2'-DEOXYADENOSINE-5'-MONOPHOSPHATE"
                    'CG', # "4-AMIDINOINDAN-1-ONE-2'-AMIDINOHYDRAZONE"
                    'VA', # '(Z)-OCTADEC-11-ENYL ACETATE'
                    'DI', # "2'-DEOXYINOSINE-5'-MONOPHOSPHATE"
                    'IU', # "5-IODOURIDINE-5'-MONOPHOSPHATE"
                    'VI', # '1,3-DIPHENYL-1H-PYRAZOLE-4,5-DICARBOXYLIC ACID'
                    'CH', # "N3-PROTONATED CYTIDINE-5'-MONOPHOSPHATE"
                    'SQ', # '3-ETHYLAMINO-4-METHYLAMINO-CYCLOBUTANE-1,2-DIONE'
                    ]

# Data

In [9]:
# Load the three pickled tables used in this notebook.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
pdb_data = pd.read_pickle(Path('pdb_full_filtered.pkl'))
pdb_with_cofactors = pd.read_pickle(Path('pdb_w_cofactors.pkl'))
pdb_wout_cofactors = pd.read_pickle(Path('pdb_no_cofactors.pkl'))

In [10]:
# Print the summary counts of each table, with a "--" line before each one.
for df in (pdb_data, pdb_with_cofactors, pdb_wout_cofactors):
    print("--")
    count_entries(df)

--
Total # records        :  64567
Unique PDBIDs          :  53992
Membrane Proteins      :  1970
--
Total # records        :  5801
Unique PDBIDs          :  5708
Membrane Proteins      :  100
--
Total # records        :  58766
Unique PDBIDs          :  50588
Membrane Proteins      :  1957


# Retrieve PDB files

Files were retrieved using the script `download_pdb_files.py`. 566 files failed to download, with a message saying that the structure doesn't exist; together they account for 723 entries in `pdb_data`. According to forums, this is caused by some generic error. I tried twice and got exactly the same failures, which suggests the problem is on the PDB's side, so we'll just remove those entries from the `pdb_data` dataframe.

In [60]:
# Table of PDB entries whose download failed (columns include `pdbid` and `message`).
failed_downloads = pd.read_csv(Path('failed_downloads.csv'))
# Show five randomly chosen rows (the selection differs between runs).
failed_downloads.sample(n=5)

Unnamed: 0,pdbid,message
533,7ZQC,Desired structure doesn't exist\n
10,7NBG,Desired structure doesn't exist\n
355,6SXU,Desired structure doesn't exist\n
444,6SGM,Desired structure doesn't exist\n
107,7Q5I,Desired structure doesn't exist\n


In [68]:
# PDB IDs of the failed downloads, as an array of values.
pdbs_to_remove = failed_downloads["pdbid"].values
len(pdbs_to_remove)

566

In [80]:
# Number of rows in pdb_data whose PDB_ID is among the failed downloads.
# Series.isin does the membership test in one vectorised call instead of
# scanning the whole array once per row from Python.
len(pdb_data.loc[pdb_data.PDB_ID.isin(pdbs_to_remove)])

723

In [83]:
# Row count of pdb_data at this point in the notebook.
pdb_data.shape[0]

64567

In [84]:
# Drop, in place, every row whose PDB_ID belongs to a failed download.
# Series.isin replaces the per-row Python lambda with one vectorised test.
failed_mask = pdb_data.PDB_ID.isin(pdbs_to_remove)
pdb_data.drop(index=pdb_data.index[failed_mask], inplace=True)
len(pdb_data)

63844

# Process Files

In [85]:
# PDB file reader (quiet mode, header parsing enabled) and PDB file writer.
pdb_parser = bp.PDBParser(QUIET=True, get_header=True)
# NOTE(review): this rebinds the name `io`, which the imports section also
# uses for the standard-library `io` module; after this line `io` refers to
# the PDBIO writer.
io = bp.PDBIO()

In [91]:
# Check whether the processing log already exists. The path is spelled out
# here because the `logfile` variable is only assigned in a later cell, so
# referring to it at this point fails on a fresh kernel.
Path(".", "pdb_process.log").is_file()

True

In [125]:
# Split every (PDB entry, ligand) row of pdb_data into two PDB files:
#   processed/ligands/<xx>/<PDB>-<LIG>_ligand.pdb  -> one copy of the ligand
#   processed/targets/<xx>/<PDB>-<LIG>_target.pdb  -> chains close to that ligand
# Rows that cannot be processed are written to the log file and recorded in df_errors.
debug = False
df_errors = {"idx": [], "reason": []}   # dataframe index and reason of each failed row
success = 0
cutoff_dist = 10.0                      # search radius (Angstroms) around each ligand heavy atom
root_dir = Path(".")
logfile = Path(".", "pdb_process.log")

# Dedicated PDB writer for this cell, so it does not depend on a global
# named `io` (a name the imports section also uses for the stdlib module).
pdb_writer = bp.PDBIO()

# NOTE(review): `orig_root` (root folder of the downloaded PDB files) is not
# defined in the visible part of this notebook; it must exist before this
# cell runs -- TODO confirm where it is set.
with open(logfile, 'w') as lf:

    for idx, row in tqdm(pdb_data.iterrows(), total=len(pdb_data)):

        pdb_id = row.PDB_ID
        lig_id = row.LIG_ID

        # Files are grouped in sub-folders named after the first two characters of the PDB ID
        orig_dir = Path(orig_root, pdb_id[:2])

        targets_dir = Path(root_dir, "processed", "targets", pdb_id[:2])
        ligands_dir = Path(root_dir, "processed", "ligands", pdb_id[:2])
        targets_dir.mkdir(exist_ok=True, parents=True)
        ligands_dir.mkdir(exist_ok=True, parents=True)

        # Original PDB file
        orig_file = Path(orig_dir, f"pdb{pdb_id.lower()}.ent")
        if not orig_file.is_file():
            lf.write(f"FILE ERROR: (df_index: {idx})  Could not find file: {orig_file} for PDB entry {pdb_id} \n")
            df_errors["idx"].append(idx)
            df_errors["reason"].append("PDB File not found")
            continue

        # Processed PDBs
        target_file = Path(targets_dir, f"{pdb_id}-{lig_id}_target.pdb")
        ligand_file = Path(ligands_dir, f"{pdb_id}-{lig_id}_ligand.pdb")
        if target_file.is_file() and ligand_file.is_file():
            # This PDB has already been processed
            success += 1
            continue

        # Read the PDB file (first model only)
        struct = pdb_parser.get_structure(pdb_id, orig_file)[0]

        # We will rename all ligand chains to "Y"
        # If there's a chain named "Y", we need to rename it to something else
        # Some PDB files have irregular chain names, such as "AAA". Here we fix that
        for chain in struct.get_chains():
            if len(chain.id) > 1:
                chain.id = chain.id[0]
            if chain.id == "Y":
                chain.id = " "

        # Sometimes, the same ligand appears more than once in the same chain.
        # Here we choose to eliminate these repetitions and keep the first copy.
        ligs = get_ligands(struct, lig_id)
        if len(ligs) == 0:
            lf.write(f"LIGAND ERROR: (df_index: {idx})  Could not find ligand {lig_id} in PDB entry {pdb_id} \n")
            df_errors["idx"].append(idx)
            df_errors["reason"].append("Ligand not found")
            continue
        lig = ligs[0]

        # Detach every copy of the ligand (including the first one) from its
        # original chain; only `lig` is re-attached below, to chain "Y".
        for ligand in ligs:
            orig_chain = ligand.get_parent()
            ligand.detach_parent()
            orig_chain.detach_child(ligand.get_id())

        # For convenience, make the ligand be in chain "Y"
        chain_Y = bp.Chain.Chain('Y')
        lig.set_parent(chain_Y)
        chain_Y.add(lig)
        struct.add(chain_Y)
        if debug:
            lf.write(f"  Converted to: {lig.resname} :: {lig.full_id} \n")

        # Find all chains with any atoms within cutoff_dist from the ligand
        neighbor_searcher = bp.NeighborSearch(bp.Selection.unfold_entities(struct, 'A'), bucket_size=10)

        closest = set()
        for atom in get_heavy_atoms_list(lig):
            closest |= set(neighbor_searcher.search(atom.coord, cutoff_dist, level='C'))
        # The neighbor search result *includes* the ligand's own chain. Remove it.
        closest -= {chain_Y}
        if debug:
            lf.write(f"  Found chains within {cutoff_dist} Angstroms of ligand atoms: {closest}\n")

        # Save the final PDB files.
        # A copy of non_spec_ligands is passed each time because the selector
        # may extend the list it is given; the shared list must stay unchanged.

        # Save a PDB file with the ligand only
        selector = AtomsToPrintSelector([chain_Y], nonligand_hets=list(non_spec_ligands), remove_waters=True)
        pdb_writer.set_structure(struct)
        pdb_writer.save(str(ligand_file), select=selector,
                        write_end=True, preserve_atom_numbering=False)

        # Save only the atoms from the neighbouring chains to a new PDB file
        selector = AtomsToPrintSelector(closest, nonligand_hets=list(non_spec_ligands), remove_waters=True)
        pdb_writer.set_structure(struct)
        pdb_writer.save(str(target_file), select=selector,
                        write_end=True, preserve_atom_numbering=False)
        success += 1

print(f"Processed {len(pdb_data)} entries.")
print(f"{success} entries were OK")
# Single quotes inside the f-string: reusing the enclosing double quotes is a
# SyntaxError on Python versions before 3.12.
print(f"{len(df_errors['idx'])} entries had errors.")

  0%|          | 0/63844 [00:00<?, ?it/s]

Processed 63844 entries.
63837 entries were OK


AttributeError: 'dict' object has no attribute 'idx'