# Get the PDB files of the "Drug and Drug Target Mapping" table from the database
All PDB files listed in the table (data as of Nov. 20, 2020) are already part of the distribution, so there is no need to run this notebook.

In [1]:
import os
import urllib
import urllib.error
import urllib.request

import numpy as np
import pandas as pd
from rdkit import Chem
from tqdm.notebook import tqdm

## Download "Drug and Drug Target Mapping" table from the PDB
https://www.rcsb.org/pdb/ligand/drugMapping.do

In [2]:
! wget https://www.rcsb.org/pdb/ligand/drugMapping.do?format=csv&is_target_only=false

Read in the table with `pandas`

In [3]:
# Load the downloaded mapping table and report its (rows, columns) size
table = pd.read_csv('drugTable.csv')
n_rows, n_cols = table.shape
print((n_rows, n_cols))
table.head()

(1215, 13)


Unnamed: 0,Generic Name,Brand Name,DrugBank ID,ATC Codes,Ligand ID,Target Name,UniProt ID,PDB ID 1,Seq. Identity 1,PDB ID 2,Seq. Identity 2,PDB ID 3,Seq. Identity 3
0,"3,4-Methylenedioxymethamphetamine",,DB01454,,B41,Synaptic vesicular amine transporter,Q05940,,,,,,
1,"3,4-Methylenedioxymethamphetamine",,DB01454,,B41,Sodium-dependent noradrenaline transporter,P23975,,,,,,
2,"3,4-Methylenedioxymethamphetamine",,DB01454,,B41,Sodium-dependent serotonin transporter,P31645,,,,,,
3,4-Androstenedione,,DB01536,,ASD,3 beta-hydroxysteroid dehydrogenase/Delta 5-->...,P14060,,,,,,
4,4-Androstenedione,,DB01536,,ASD,Estradiol 17-beta-dehydrogenase 1,P14061,1QYX,89%,,,,


There are up to 3 PDB codes per row. We want exactly one PDB code per row, so we copy each row once per PDB slot and collect the codes in a single `PDB ID` column (with the matching `Seq. Identity` column).

In [4]:
# Reshape from wide (up to three PDB hits per row) to long (one PDB hit per row).
# Columns shared by every hit are kept as-is; the numbered "PDB ID j" /
# "Seq. Identity j" pairs are folded into single "PDB ID" / "Seq. Identity" columns.
hit_slots = range(1, 4)
slot_columns = [f'PDB ID {k}' for k in hit_slots] + [f'Seq. Identity {k}' for k in hit_slots]
shared = table.drop(columns=slot_columns)

# One full-length frame per slot, built column-wise instead of copying every row
per_slot = [
    shared.assign(**{'PDB ID': table[f'PDB ID {j}'],
                     'Seq. Identity': table[f'Seq. Identity {j}']})
    for j in hit_slots
]

# Stacking the slots and sorting on the original row index with a *stable* sort
# interleaves them as (row 0: slot 1, 2, 3), (row 1: slot 1, 2, 3), ...
# This ordering matters later when duplicates are dropped with keep='first'.
newtable = pd.concat(per_slot).sort_index(kind='mergesort')
newtable.head()

Unnamed: 0,Generic Name,Brand Name,DrugBank ID,ATC Codes,Ligand ID,Target Name,UniProt ID,PDB ID,Seq. Identity
0,"3,4-Methylenedioxymethamphetamine",,DB01454,,B41,Synaptic vesicular amine transporter,Q05940,,
0,"3,4-Methylenedioxymethamphetamine",,DB01454,,B41,Synaptic vesicular amine transporter,Q05940,,
0,"3,4-Methylenedioxymethamphetamine",,DB01454,,B41,Synaptic vesicular amine transporter,Q05940,,
1,"3,4-Methylenedioxymethamphetamine",,DB01454,,B41,Sodium-dependent noradrenaline transporter,P23975,,
1,"3,4-Methylenedioxymethamphetamine",,DB01454,,B41,Sodium-dependent noradrenaline transporter,P23975,,


Filter out all rows with PDB codes being NaN.

In [5]:
# Keep only the rows that actually carry a PDB code
has_pdb_code = newtable['PDB ID'].notna()
newtable = newtable[has_pdb_code]
print(newtable.shape)
newtable.head()

(937, 9)


Unnamed: 0,Generic Name,Brand Name,DrugBank ID,ATC Codes,Ligand ID,Target Name,UniProt ID,PDB ID,Seq. Identity
4,4-Androstenedione,,DB01536,,ASD,Estradiol 17-beta-dehydrogenase 1,P14061,1QYX,89%
6,Abiraterone,Zytiga,DB05812,L02BX03,AER,"Steroid 17-alpha-hydroxylase/17,20 lyase",P05093,3RUK,97%
6,Abiraterone,Zytiga,DB05812,L02BX03,AER,"Steroid 17-alpha-hydroxylase/17,20 lyase",P05093,4NKV,97%
6,Abiraterone,Zytiga,DB05812,L02BX03,AER,"Steroid 17-alpha-hydroxylase/17,20 lyase",P05093,4R1Z,50%
7,Acarbose,Acarbose#Glucobay#Precose#Prandase,DB00284,A10BF01#A10BD17,"ACR,QPS","Maltase-glucoamylase, intestinal",O43451,2QMJ,99%


Remove duplicate pdb codes and reindex with PDB ID

In [6]:
# Keep the first row seen for every PDB code, then use the code as the row
# label while also leaving it available as a regular column (drop=False).
newtable = newtable.drop_duplicates(subset=['PDB ID'], keep='first')
newtable = newtable.set_index('PDB ID', drop=False)
print(newtable.shape)
newtable.head()

(602, 9)


Unnamed: 0_level_0,Generic Name,Brand Name,DrugBank ID,ATC Codes,Ligand ID,Target Name,UniProt ID,PDB ID,Seq. Identity
PDB ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1QYX,4-Androstenedione,,DB01536,,ASD,Estradiol 17-beta-dehydrogenase 1,P14061,1QYX,89%
3RUK,Abiraterone,Zytiga,DB05812,L02BX03,AER,"Steroid 17-alpha-hydroxylase/17,20 lyase",P05093,3RUK,97%
4NKV,Abiraterone,Zytiga,DB05812,L02BX03,AER,"Steroid 17-alpha-hydroxylase/17,20 lyase",P05093,4NKV,97%
4R1Z,Abiraterone,Zytiga,DB05812,L02BX03,AER,"Steroid 17-alpha-hydroxylase/17,20 lyase",P05093,4R1Z,50%
2QMJ,Acarbose,Acarbose#Glucobay#Precose#Prandase,DB00284,A10BF01#A10BD17,"ACR,QPS","Maltase-glucoamylase, intestinal",O43451,2QMJ,99%


## Download PDB files. If not available, download CIFs and convert to PDB

In [7]:
# Create the output directory for structure files; no error if it already exists.
# Pure-Python equivalent of `mkdir -p pdb`, so it also works without a POSIX shell.
os.makedirs('pdb', exist_ok=True)

The following cell calls maxit (https://sw-tools.rcsb.org/apps/MAXIT/index.html) to convert CIF to PDB files. All pdb files are part of the distribution, so if you do not have maxit, you will not need to install it.

In [8]:
for i, row in tqdm(newtable.iterrows()):
    if not os.path.exists(f'pdb/{row[f"PDB ID"]}.pdb'):
        try:
            address = f'https://files.rcsb.org/download/{row[f"PDB ID"].lower()}.pdb'
            urllib.request.urlretrieve(address, f'pdb/{row[f"PDB ID"]}.pdb')
        except urllib.error.HTTPError as e:
            print(f'PDB Error while retrieving {row[f"PDB ID"]}: {e}')
            # if pdb is not available, try to download cif file
            if not os.path.exists(f'pdb/{row[f"PDB ID"]}.cif'):
                try:
                    address = f'https://files.rcsb.org/download/{row[f"PDB ID"].lower()}.cif'
                    urllib.request.urlretrieve(address, f'pdb/{row[f"PDB ID"]}.cif')
                except urllib.error.HTTPError as e:
                    print(f'PDB Error while retrieving mmCIF file of  {row[f"PDB ID"]}: {e}')
                    
            # Convert CIF files to pdb; https://sw-tools.rcsb.org/apps/MAXIT/index.html
            inp = f'pdb/{row["PDB ID"]}.cif'
            out = f'pdb/{row["PDB ID"]}.pdb'
            ! maxit -input $inp -output $out -o 2 -log logfile

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




## Save table to CSV

In [9]:
# Write the cleaned table (including its index) to disk
cleaned_csv_path = 'drug_table_cleaned.csv'
newtable.to_csv(cleaned_csv_path)