# Downloads PDB Structure Information
**[Work in progress]**

This notebook downloads 3D-structure information from the Worldwide Protein Data Bank.

Data sources: 
[Protein Data Bank Japan](https://pdbj.org/), 
[PDBe Protein Data Bank Europe](https://www.ebi.ac.uk/pdbe/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import numpy as np
import pandas as pd
import dateutil
import urllib
from pathlib import Path

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


### Get PDB summary data

In [4]:
URL = 'https://pdbj.org/rest/mine2_sql'

In [5]:
sqlQuery = 'SELECT pdbid, release_date AS releasedate, pdbx_descriptor AS description, struct_title AS title, exptl_method AS methods, resolution FROM brief_summary'
encodedSQL = urllib.parse.quote(sqlQuery)
summary_url = URL + "?format=csv&q=" + encodedSQL

In [6]:
df_summary = pd.read_csv(summary_url, dtype=str)

In [7]:
df_summary.rename(columns={'pdbid': 'pdbId'}, inplace=True)
df_summary['pdbId'] = df_summary['pdbId'].str.upper()
df_summary.rename(columns={'releasedate': 'releaseDate'}, inplace=True)

In [8]:
print('Number of records:', df_summary.shape[0])
df_summary.head()

Number of records: 173005


Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution
0,1H4R,2002-01-16,"MERLIN, SULFATE ION, water",Crystal Structure of the FERM domain of Merlin...,['X-RAY DIFFRACTION'],1.8
1,1H4T,2001-06-18,"PROLYL-TRNA SYNTHETASE, PROLINE, ZINC ION, water",Prolyl-tRNA synthetase from Thermus thermophil...,['X-RAY DIFFRACTION'],2.9
2,1H4W,2002-02-11,"TRYPSIN IVA, BENZAMIDINE, CALCIUM ION, water",Structure of human trypsin IV (brain trypsin),['X-RAY DIFFRACTION'],1.7
3,1H60,2001-07-05,"PENTAERYTHRITOL TETRANITRATE REDUCTASE, FLAVIN...",Structure of Pentaerythritol Tetranitrate Redu...,['X-RAY DIFFRACTION'],1.6
4,1H61,2001-07-05,"PENTAERYTHRITOL TETRANITRATE REDUCTASE, FLAVIN...",Structure of Pentaerythritol Tetranitrate Redu...,['X-RAY DIFFRACTION'],1.4


### Get refinement data

In [9]:
sqlQuery = 'SELECT * FROM refine'
encodedSQL = urllib.parse.quote(sqlQuery)
refine_url = URL + '?format=csv&q=' + encodedSQL

In [10]:
df_refine = pd.read_csv(refine_url, usecols=['ls_R_factor_R_free', 'entry_id'], dtype=str)

In [11]:
df_refine.rename(columns={'ls_R_factor_R_free': 'rFree', 'entry_id': 'pdbId'}, inplace=True)
df_refine.fillna('', inplace=True)

In [12]:
print('Number of records:', df_refine.shape[0])
df_refine.sample(5)

Number of records: 156060


Unnamed: 0,pdbId,rFree
70989,4H9E,0.2038
35877,2W82,0.249
16556,1TQG,0.2111
114267,3EV6,0.249
152323,7A21,0.256


In [13]:
structures = df_summary.merge(df_refine, on='pdbId', how='left')
structures.fillna('', inplace=True)

In [14]:
structures.drop_duplicates(subset=['pdbId'], inplace=True)

In [15]:
print('Number of structures:', structures.shape[0])
structures.sample(10)

Number of structures: 173005


Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution,rFree
150819,5OJC,2018-01-24,"Myoglobin, PROTOPORPHYRIN IX CONTAINING FE, IM...",Structure of MbQ2.1 NMH,['X-RAY DIFFRACTION'],1.25,0.2148
64613,5FH7,2016-06-01,"Protein polybromo-1, 1,2-ETHANEDIOL, 6-chloran...",Crystal structure of the fifth bromodomain of ...,['X-RAY DIFFRACTION'],1.47,0.2085
7913,5DEP,2015-09-16,Acyl-[acyl-carrier-protein]--UDP-N-acetylgluco...,Structure of Pseudomonas aeruginosa LpxA in co...,['X-RAY DIFFRACTION'],2.16,0.226
140350,4URQ,2014-10-08,"DIGUANYLATE CYCLASE, water",Crystal Structure of GGDEF domain (I site muta...,['X-RAY DIFFRACTION'],2.5,0.2467
57360,6HRE,2018-10-10,Microtubule-associated protein tau,Paired helical filament from sporadic Alzheime...,['ELECTRON MICROSCOPY'],3.2,
151767,5Q4C,2018-08-08,"DCLRE1A, MALONATE ION, NICKEL (II) ION, water",PanDDA analysis group deposition -- Crystal St...,['X-RAY DIFFRACTION'],2.73,0.3379
124882,4EJS,2012-05-02,"Elongator complex protein 4, Elongator complex...",Structure of yeast elongator subcomplex Elp456,['X-RAY DIFFRACTION'],2.606,0.2239
136502,4P6S,2014-07-30,"Tyrosinase, ZINC ION, 3,4-DIHYDROXYPHENYLALANI...",Crystal Structure of tyrosinase from Bacillus ...,['X-RAY DIFFRACTION'],2.2,0.2216
141608,4X2A,2015-09-16,"Lactoylglutathione lyase, ZINC ION, 5,6,7-trih...",Crystal structure of mouse glyoxalase I comple...,['X-RAY DIFFRACTION'],2.0,0.2177
149458,5LNB,2017-08-16,"Ubiquitin-like-specific protease 2, ACETATE IO...",Crystal structure of the de-sumoylating protease,['X-RAY DIFFRACTION'],2.3,0.1968


In [16]:
structures['pdbId'] = 'pdb:' + structures['pdbId']

# convert methods string to a semicolon separated list (our default one-to-many representation in CSV files)
structures['methods'] = structures['methods'].str.replace('[', '')
structures['methods'] = structures['methods'].str.replace(']', '')
structures['methods'] = structures['methods'].str.replace("'", '')
structures['methods'] = structures['methods'].str.replace(',', ';')
structures['methods'] = structures['methods'].str.replace('; ', ';')

  after removing the cwd from sys.path.
  """


In [17]:
structures['methods'].unique()

array(['X-RAY DIFFRACTION', 'SOLUTION NMR', 'ELECTRON MICROSCOPY',
       'FIBER DIFFRACTION', 'X-RAY DIFFRACTION;SOLUTION NMR;HYBRID',
       'SOLID-STATE NMR', 'X-RAY DIFFRACTION;NEUTRON DIFFRACTION;HYBRID',
       'SOLUTION NMR;THEORETICAL MODEL;HYBRID', 'INFRARED SPECTROSCOPY',
       'X-RAY DIFFRACTION;EPR;HYBRID', 'ELECTRON CRYSTALLOGRAPHY',
       'POWDER DIFFRACTION', 'NEUTRON DIFFRACTION',
       'NEUTRON DIFFRACTION;X-RAY DIFFRACTION;HYBRID',
       'SOLUTION SCATTERING', 'SOLUTION NMR;SOLUTION SCATTERING;HYBRID',
       'SOLID-STATE NMR;ELECTRON MICROSCOPY;HYBRID',
       'ELECTRON MICROSCOPY;SOLID-STATE NMR;HYBRID',
       'X-RAY DIFFRACTION;SOLUTION SCATTERING;HYBRID',
       'ELECTRON MICROSCOPY;SOLUTION NMR;HYBRID', 'FLUORESCENCE TRANSFER',
       'SOLUTION NMR;SOLID-STATE NMR;HYBRID',
       'SOLUTION SCATTERING;SOLUTION NMR;HYBRID',
       'SOLUTION NMR;EPR;HYBRID',
       'ELECTRON MICROSCOPY;SOLUTION SCATTERING;HYBRID',
       'SOLID-STATE NMR;SOLUTION SCATTERING;ELE

In [18]:
structures.head()

Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution,rFree
0,pdb:1H4R,2002-01-16,"MERLIN, SULFATE ION, water",Crystal Structure of the FERM domain of Merlin...,X-RAY DIFFRACTION,1.8,0.227
1,pdb:1H4T,2001-06-18,"PROLYL-TRNA SYNTHETASE, PROLINE, ZINC ION, water",Prolyl-tRNA synthetase from Thermus thermophil...,X-RAY DIFFRACTION,2.9,0.243
2,pdb:1H4W,2002-02-11,"TRYPSIN IVA, BENZAMIDINE, CALCIUM ION, water",Structure of human trypsin IV (brain trypsin),X-RAY DIFFRACTION,1.7,0.203
3,pdb:1H60,2001-07-05,"PENTAERYTHRITOL TETRANITRATE REDUCTASE, FLAVIN...",Structure of Pentaerythritol Tetranitrate Redu...,X-RAY DIFFRACTION,1.6,0.182
4,pdb:1H61,2001-07-05,"PENTAERYTHRITOL TETRANITRATE REDUCTASE, FLAVIN...",Structure of Pentaerythritol Tetranitrate Redu...,X-RAY DIFFRACTION,1.4,0.212


In [19]:
structures.to_csv(NEO4J_IMPORT / "01f-PDBStructure.csv", index=False)

### Get PDB Entity info

In [20]:
sqlQuery = "select pdbid, id, pdbx_description from entity"
encodedSQL = urllib.parse.quote(sqlQuery)
entity_url = URL + '?format=csv&q=' + encodedSQL

In [21]:
df_entity = pd.read_csv(entity_url, dtype=str)

In [22]:
df_entity.rename(columns={'pdbid': 'pdbId', 'id': 'entityId', 'pdbx_description': 'description'}, inplace=True)

In [23]:
df_entity['pdbId'] = df_entity['pdbId'].str.upper()

In [24]:
print('Number of entities:', df_entity.shape[0])

Number of entities: 780751


In [25]:
df_entity.head()

Unnamed: 0,pdbId,entityId,description
0,3DAF,7,water
1,3DAG,1,"5,10-methenyltetrahydromethanopterin hydrogenase"
2,3DAG,2,FE (II) ION
3,3DAG,3,"5'-O-[(S)-{[2-(carboxymethyl)-6-hydroxy-3,5-di..."
4,3DAG,4,CARBON MONOXIDE


### Get PDB Polymer Entities

In [26]:
sqlQuery = "select pdbid, entity_id, pdbx_strand_id, type, pdbx_seq_one_letter_code_can from entity_poly"
encodedSQL = urllib.parse.quote(sqlQuery)
poly_url = URL + '?format=csv&q=' + encodedSQL

In [27]:
df_poly = pd.read_csv(poly_url, dtype=str)

In [28]:
df_poly.rename(columns={'pdbid': 'pdbId', 'entity_id': 'entityId', 'pdbx_seq_one_letter_code_can': 'sequence'}, inplace=True)

In [29]:
df_poly['pdbId'] = df_poly['pdbId'].str.upper()
df_poly['chainId'] = df_poly['pdbx_strand_id'].str.split(',')
df_poly = df_poly.explode('chainId')
df_poly = df_poly[['pdbId', 'entityId', 'chainId', 'type', 'sequence']]

In [30]:
print("Number of polymer entities:", df_poly.shape[0])

Number of polymer entities: 608695


In [31]:
df_poly.head()

Unnamed: 0,pdbId,entityId,chainId,type,sequence
0,100D,1,A,polydeoxyribonucleotide/polyribonucleotide hybrid,CCGGCGCCGG
0,100D,1,B,polydeoxyribonucleotide/polyribonucleotide hybrid,CCGGCGCCGG
1,101D,1,A,polydeoxyribonucleotide,CGCGAATTCGCG
1,101D,1,B,polydeoxyribonucleotide,CGCGAATTCGCG
2,101M,1,A,polypeptide(L),MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...


In [32]:
df_chain = pd.merge(df_entity, df_poly, on=['pdbId', 'entityId'])

In [33]:
print('Number of polymer chains:', df_chain.shape[0])

Number of polymer chains: 608695


In [34]:
df_chain.head()

Unnamed: 0,pdbId,entityId,description,chainId,type,sequence
0,3DAG,1,"5,10-methenyltetrahydromethanopterin hydrogenase",A,polypeptide(L),MKIAILGAGCYRTHAAAGITNFMRACEVAKEVGKPEIALTHSSITY...
1,3DAH,1,Ribose-phosphate pyrophosphokinase,A,polypeptide(L),SMSSHDGLMVFTGNANPALAQEVVKILGIPLGKAMVSRFSDGEIQV...
2,3DAH,1,Ribose-phosphate pyrophosphokinase,B,polypeptide(L),SMSSHDGLMVFTGNANPALAQEVVKILGIPLGKAMVSRFSDGEIQV...
3,3DAH,1,Ribose-phosphate pyrophosphokinase,C,polypeptide(L),SMSSHDGLMVFTGNANPALAQEVVKILGIPLGKAMVSRFSDGEIQV...
4,3DAI,1,ATPase family AAA domain-containing protein 2,A,polypeptide(L),SMQEEDTFRELRIFLRNVTHRLAIDKRFRVFTKPVDPDEVPDYVTV...


### Get PDB Chain - UniProt sequence mappings

In [35]:
sifts_url = 'http://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/uniprot_segments_observed.tsv.gz'

In [36]:
chains = pd.read_csv(sifts_url, sep='\t', skiprows=1, dtype=str)
print("Number of chains:", chains.shape[0])
chains.head()

Number of chains: 798880


Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
0,123l,A,P00720,1,162,1,162,1,162
1,128l,A,P00720,1,162,1,162,1,162
2,183l,A,P00720,1,162,1,162,1,162
3,185l,A,P00720,1,162,1,162,1,162
4,192l,A,P00720,1,162,1,162,1,162


In [37]:
chains.rename(columns={'PDB': 'pdbId', 'CHAIN': 'chainId', 'SP_PRIMARY': 'accession'}, inplace=True)
chains.rename(columns={'RES_BEG': 'seqresStart', 'RES_END': 'seqresEnd'}, inplace=True)
chains.rename(columns={'PDB_BEG': 'pdbStart', 'PDB_END': 'pdbEnd'}, inplace=True)
chains.rename(columns={'SP_BEG': 'uniprotStart', 'SP_END': 'uniprotEnd'}, inplace=True)

In [38]:
chains['pdbId'] = chains['pdbId'].str.upper()

In [39]:
chains.head()

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd
0,123L,A,P00720,1,162,1,162,1,162
1,128L,A,P00720,1,162,1,162,1,162
2,183L,A,P00720,1,162,1,162,1,162
3,185L,A,P00720,1,162,1,162,1,162
4,192L,A,P00720,1,162,1,162,1,162


### Sort chains by uniprot residue number

In [40]:
chains['uniprotStart'] = chains['uniprotStart'].astype(int)
chains['uniprotEnd'] = chains['uniprotEnd'].astype(int)
chains['length'] = chains['uniprotEnd'] - chains['uniprotStart'] + 1
chains.sort_values(by='uniprotStart', inplace=True)
chains['uniprotStart'] = chains['uniprotStart'].astype(str)
chains['uniprotEnd'] = chains['uniprotEnd'].astype(str)

In [41]:
chains.head()

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length
0,123L,A,P00720,1,162,1,162,1,162,162
148159,4V7Y,AP,Q5SJH3,1,84,1,84,1,84,84
148151,4V7Y,AH,A0A0M9AFS9,1,138,1,138,1,138,138
148149,4V7Y,AF,Q5SLP8,1,101,1,101,1,101,101
557340,5E81,78,Q5SHQ7,1,147,1,147,1,147,147


### Group data by PDB chains

In [42]:
chains = chains.groupby(['pdbId','chainId','accession']).agg(list).reset_index()

### Create semicolon separated string of residue numbers so they can be represented in a csv file

In [43]:
chains['uniprotStart'] = chains['uniprotStart'].apply(lambda x: ';'.join(x))
chains['uniprotEnd'] = chains['uniprotEnd'].apply(lambda x: ';'.join(x))
chains['seqresStart'] = chains['seqresStart'].apply(lambda x: ';'.join(x))
chains['seqresEnd'] = chains['seqresEnd'].apply(lambda x: ';'.join(x))
chains['pdbStart'] = chains['pdbStart'].apply(lambda x: ';'.join(x))
chains['pdbEnd'] = chains['pdbEnd'].apply(lambda x: ';'.join(x))

In [44]:
chains['residues'] = chains['length'].apply(lambda x: sum(x))

In [45]:
print("Number of chains with UniProt mapping:", chains.shape[0])
chains.tail()

Number of chains with UniProt mapping: 531349


Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length,residues
531344,9XIA,A,P24300,1,387,1,387,1,387,[387],387
531345,9XIM,A,P12851,2,393,3,394,3,394,[392],392
531346,9XIM,B,P12851,2,393,3,394,3,394,[392],392
531347,9XIM,C,P12851,3,393,4,394,4,394,[391],391
531348,9XIM,D,P12851,2,393,3,394,3,394,[392],392


In [46]:
chains = df_chain.merge(chains, on=['pdbId', 'chainId'], how='left')
chains.fillna('', inplace=True)

### Assign CURIES

In [48]:
chains['accession'] = 'uniprot:' + chains['accession']
chains['pdbId'] = 'pdb:' + chains['pdbId']
chains['pdbChainId'] = chains['pdbId'] + '.' + chains['chainId']

In [49]:
print("Total number of polymer chains:", chains.shape[0])

Total number of polymer chains: 612503


In [50]:
chains.sample(10)

Unnamed: 0,pdbId,entityId,description,chainId,type,sequence,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length,residues,pdbChainId
153778,pdb:pdb:4V3H,1,CYMA PROTEIN,B,polypeptide(L),ANVRLQHHHHHHHLEASDQRGYKPEDVAFDESFFSFGGHVGTSVEY...,uniprot:uniprot:Q48391,33.0,339.0,18.0,324.0,40.0,346.0,[307],307.0,pdb:pdb:4V3H.B
373322,pdb:pdb:6OP5,1,Styrylpyrone synthase 1,F,polypeptide(L),MSKTVEDRAAQRAKGPATVLAIGTATPANVVYQTDYPDYYFRVTKS...,uniprot:uniprot:A0A384E132,10.0,391.0,10.0,391.0,10.0,391.0,[382],382.0,pdb:pdb:6OP5.F
545004,pdb:pdb:2C7U,1,"HLA CLASS I HISTOCOMPATIBILITY ANTIGEN, A-2 AL...",A,polypeptide(L),GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,uniprot:uniprot:P04439,1.0,275.0,1.0,275.0,25.0,299.0,[275],275.0,pdb:pdb:2C7U.A
533590,pdb:pdb:1YEB,1,CYTOCHROME C,A,polypeptide(L),TEFKAGSAKKGATLFKTRCQQCHTIEEGGPNKVGPNLHGIFGRHSG...,uniprot:uniprot:P00045,1.0,106.0,-5.0,101.0,6.0,111.0,[106],106.0,pdb:pdb:1YEB.A
208189,pdb:pdb:5EF2,1,Transcription attenuation protein MtrB,M,polypeptide(L),MYTNSDFVVIKALEDGVNVIGLTRGADTRFHHSEKLDKGEVLIAQF...,uniprot:uniprot:Q9X6J6,3.0,72.0,5.0,74.0,3.0,72.0,[70],70.0,pdb:pdb:5EF2.M
50748,pdb:pdb:3QAZ,3,Cytokine receptor common subunit gamma,L,polypeptide(L),ADPPLPEVQCFVFNVEYMNCTWQSSSEPQPTNLTLHYWYKNSDNDK...,uniprot:uniprot:P31785,4.0,194.0,34.0,224.0,56.0,246.0,[191],191.0,pdb:pdb:3QAZ.L
35096,pdb:pdb:3LUE,2,Alpha-actinin-3,L,polypeptide(L),AWEKQQRKTFTAWCNSHLRKAGTQIENIEEDFRNGLKLMLLLEVIS...,uniprot:uniprot:Q08043,1.0,109.0,42.0,150.0,42.0,150.0,[109],109.0,pdb:pdb:3LUE.L
557404,pdb:pdb:2GYI,1,XYLOSE ISOMERASE,B,polypeptide(L),SYQPTPEDRFTFGLWTVGWQGRDPFGDATRPALDPVETVQRLAELG...,uniprot:uniprot:P15587,2.0,386.0,2.0,386.0,3.0,387.0,[385],385.0,pdb:pdb:2GYI.B
142102,pdb:pdb:4RSN,1,Bifunctional P-450/NADPH-P450 reductase,A,polypeptide(L),RGSHMTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFK...,uniprot:uniprot:P14779,5.0,460.0,0.0,455.0,1.0,456.0,[456],456.0,pdb:pdb:4RSN.A
200,pdb:pdb:3DCP,1,Histidinol-phosphatase,C,polypeptide(L),MKRDGHTHTEFCPHGTHDDVEEMVLKAIELDFDEYSIVEHAPLSSE...,uniprot:uniprot:,,,,,,,,,pdb:pdb:3DCP.C


In [None]:
chains.to_csv(NEO4J_IMPORT / "01f-PDBChain.csv", index=False)