# Downloads PDB Structure Information
**[Work in progress]**

This notebook downloads 3D-structure information from the Worldwide Protein Data Bank.

Data sources: 
[Protein Data Bank Japan](https://pdbj.org/), 
[PDBe Protein Data Bank Europe](https://www.ebi.ac.uk/pdbe/)

Author: Peter Rose (pwrose@ucsd.edu)

In [61]:
import os
import re
import numpy as np
import pandas as pd
import dateutil
import urllib
from pathlib import Path

In [62]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [63]:
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


### Get PDB summary data

In [64]:
URL = 'https://pdbj.org/rest/mine2_sql'

In [65]:
sqlQuery = 'SELECT pdbid, release_date AS releasedate, pdbx_descriptor AS description, struct_title AS title, exptl_method AS methods, resolution FROM brief_summary'
encodedSQL = urllib.parse.quote(sqlQuery)
summary_url = URL + "?format=csv&q=" + encodedSQL

In [66]:
df_summary = pd.read_csv(summary_url, dtype=str)

In [67]:
df_summary.rename(columns={'pdbid': 'pdbId'}, inplace=True)
df_summary['pdbId'] = df_summary['pdbId'].str.upper()
df_summary.rename(columns={'releasedate': 'releaseDate'}, inplace=True)

In [68]:
print('Number of records:', df_summary.shape[0])
df_summary.head()

Number of records: 173005


Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution
0,2DCY,2006-02-07,"Endo-1,4-beta-xylanase A, 1,4-DIETHYLENE DIOXI...",Crystal structure of Bacillus subtilis family-...,['X-RAY DIFFRACTION'],1.4
1,6HXJ,2019-04-10,"ATP-citrate lyase beta-subunit, ATP-citrate ly...",Structure of ATP citrate lyase from Chlorobium...,['X-RAY DIFFRACTION'],2.58
2,6HXN,2019-04-10,"ATP-citrate lyase alpha-subunit, COENZYME A, S...",Structure of the citryl-CoA lyase core module ...,['X-RAY DIFFRACTION'],1.7
3,2DCX,2006-02-28,"Dermaseptin-4, 12-AMINO-DODECANOIC ACID",NMR solution structure of the Dermaseptin anti...,['SOLUTION NMR'],
4,2DCZ,2006-02-07,"Endo-1,4-beta-xylanase A, SULFATE ION, 1,4-DIE...",Thermal Stabilization of Bacillus subtilis Fam...,['X-RAY DIFFRACTION'],1.9


### Get refinement data

In [69]:
sqlQuery = 'SELECT * FROM refine'
encodedSQL = urllib.parse.quote(sqlQuery)
refine_url = URL + '?format=csv&q=' + encodedSQL

In [70]:
df_refine = pd.read_csv(refine_url, usecols=['ls_R_factor_R_free', 'entry_id'], dtype=str)

In [71]:
df_refine.rename(columns={'ls_R_factor_R_free': 'rFree', 'entry_id': 'pdbId'}, inplace=True)
df_refine.fillna('', inplace=True)

In [72]:
print('Number of records:', df_refine.shape[0])
df_refine.sample(5)

Number of records: 156060


Unnamed: 0,pdbId,rFree
91846,5JLX,0.2464
140815,6KNS,0.2023
67039,4DQ1,0.25229
28782,2I1W,0.25696
50648,3KTX,0.2388


In [73]:
structures = df_summary.merge(df_refine, on='pdbId', how='left')
structures.fillna('', inplace=True)

In [74]:
structures.drop_duplicates(subset=['pdbId'], inplace=True)

In [75]:
print('Number of structures:', structures.shape[0])
structures.sample(10)

Number of structures: 173005


Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution,rFree
167327,1TBT,2005-01-25,"Carbonic anhydrase II, ZINC ION, water",Effect of Shuttle Location and pH Environment ...,['X-RAY DIFFRACTION'],2.0,0.184
116351,1JPH,2001-12-19,"UROPORPHYRINOGEN DECARBOXYLASE, water","Ile260Thr mutant of Human UroD, human uroporph...",['X-RAY DIFFRACTION'],2.1,0.235
62167,4A04,2011-10-12,"T-BOX TRANSCRIPTION FACTOR TBX1, DNA, water",Structure of the DNA-bound T-box domain of hum...,['X-RAY DIFFRACTION'],2.58,0.2269
55182,5VFZ,2018-04-18,"Gp33, GLYCEROL, SODIUM ION, ACETATE ION, water",Integrase from mycobacterium phage Brujita,['X-RAY DIFFRACTION'],1.847,0.1818
90675,5J09,2016-05-04,"Beak and feather disease virus capsid protein,...",Crystal structure of decameric BFDV Capsid Pro...,['X-RAY DIFFRACTION'],2.0,0.2387
129518,2Y7X,2011-03-16,"ACTIVATED FACTOR XA HEAVY CHAIN, FACTOR X LIGH...",The discovery of potent and long-acting oral f...,['X-RAY DIFFRACTION'],1.9,0.22952
158038,6CQE,2018-12-19,Mitogen-activated protein kinase kinase kinase...,Crystal structure of HPK1 kinase domain S171A ...,['X-RAY DIFFRACTION'],1.886,0.2576
87260,5A7P,2016-01-13,"LYSINE-SPECIFIC DEMETHYLASE 4A, SULFATE ION, M...",Crystal structure of human JMJD2A in complex w...,['X-RAY DIFFRACTION'],2.28,0.2362
3451,2FZW,2006-06-13,"Alcohol dehydrogenase class III chi chain, ZIN...",Structure of the binary complex of the E67L mu...,['X-RAY DIFFRACTION'],1.84,0.204
129963,1F40,2000-11-08,"FK506 BINDING PROTEIN (FKBP12), (2S)-[3-PYRIDY...",SOLUTION STRUCTURE OF FKBP12 COMPLEXED WITH GP...,['SOLUTION NMR'],,


In [76]:
structures['pdbId'] = 'pdb:' + structures['pdbId']

# convert methods string to a semicolon separated list (our default one-to-many representation in CSV files)
structures['methods'] = structures['methods'].str.replace('[', '')
structures['methods'] = structures['methods'].str.replace(']', '')
structures['methods'] = structures['methods'].str.replace("'", '')
structures['methods'] = structures['methods'].str.replace(',', ';')
structures['methods'] = structures['methods'].str.replace('; ', ';')

  after removing the cwd from sys.path.
  """


In [77]:
structures['methods'].unique()

array(['X-RAY DIFFRACTION', 'SOLUTION NMR', 'ELECTRON MICROSCOPY',
       'X-RAY DIFFRACTION;NEUTRON DIFFRACTION;HYBRID', 'SOLID-STATE NMR',
       'NEUTRON DIFFRACTION', 'SOLUTION NMR;SOLUTION SCATTERING;HYBRID',
       'ELECTRON CRYSTALLOGRAPHY', 'SOLUTION SCATTERING',
       'POWDER DIFFRACTION', 'FIBER DIFFRACTION',
       'SOLUTION SCATTERING;SOLUTION NMR;HYBRID',
       'SOLUTION NMR;SOLID-STATE NMR;HYBRID',
       'SOLID-STATE NMR;ELECTRON MICROSCOPY;HYBRID',
       'SOLUTION NMR;EPR;HYBRID', 'X-RAY DIFFRACTION;EPR;HYBRID',
       'ELECTRON MICROSCOPY;SOLUTION SCATTERING;HYBRID',
       'INFRARED SPECTROSCOPY',
       'NEUTRON DIFFRACTION;X-RAY DIFFRACTION;HYBRID',
       'SOLID-STATE NMR;SOLUTION SCATTERING;ELECTRON MICROSCOPY;HYBRID',
       'FIBER DIFFRACTION;SOLID-STATE NMR;HYBRID',
       'ELECTRON MICROSCOPY;SOLID-STATE NMR;HYBRID',
       'ELECTRON MICROSCOPY;SOLUTION NMR;HYBRID',
       'ELECTRON MICROSCOPY;SOLUTION NMR;SOLID-STATE NMR;HYBRID',
       'X-RAY DIFFRACTION;

In [78]:
structures.head()

Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution,rFree
0,pdb:2DCY,2006-02-07,"Endo-1,4-beta-xylanase A, 1,4-DIETHYLENE DIOXI...",Crystal structure of Bacillus subtilis family-...,X-RAY DIFFRACTION,1.4,0.21733
1,pdb:6HXJ,2019-04-10,"ATP-citrate lyase beta-subunit, ATP-citrate ly...",Structure of ATP citrate lyase from Chlorobium...,X-RAY DIFFRACTION,2.58,0.216
2,pdb:6HXN,2019-04-10,"ATP-citrate lyase alpha-subunit, COENZYME A, S...",Structure of the citryl-CoA lyase core module ...,X-RAY DIFFRACTION,1.7,0.192
3,pdb:2DCX,2006-02-28,"Dermaseptin-4, 12-AMINO-DODECANOIC ACID",NMR solution structure of the Dermaseptin anti...,SOLUTION NMR,,
4,pdb:2DCZ,2006-02-07,"Endo-1,4-beta-xylanase A, SULFATE ION, 1,4-DIE...",Thermal Stabilization of Bacillus subtilis Fam...,X-RAY DIFFRACTION,1.9,0.22302


In [79]:
structures.to_csv(NEO4J_IMPORT / "01f-PDBStructure.csv", index=False)

### Get PDB Entity info

In [80]:
sqlQuery = "select pdbid, id, pdbx_description from entity"
encodedSQL = urllib.parse.quote(sqlQuery)
entity_url = URL + '?format=csv&q=' + encodedSQL

In [81]:
df_entity = pd.read_csv(entity_url, dtype=str)

In [82]:
df_entity.rename(columns={'pdbid': 'pdbId', 'id': 'entityId', 'pdbx_description': 'description'}, inplace=True)

In [83]:
df_entity['pdbId'] = df_entity['pdbId'].str.upper()

In [84]:
print('Number of entities:', df_entity.shape[0])

Number of entities: 780751


In [85]:
df_entity.head()

Unnamed: 0,pdbId,entityId,description
0,3TOU,1,Glutathione s-transferase protein
1,3TOU,2,GLUTATHIONE
2,3TOU,3,ACETATE ION
3,3TOU,4,water
4,3TOV,1,Glycosyl transferase family 9


In [86]:
def categorize(description):
    words = re.split('[ ,\-()]', str(description).lower())
    
    keywords_ab = ['antibody', 'fab', 'nanobody']
    keywords_hc = ['heavy', 'h', 'hc']
    keywords_lc = ['light', 'l', 'lc']
    
    category = ''
    
    if any(w in words for w in keywords_ab):
        category = 'antibody'
        if any(w in words for w in keywords_hc):
            category = 'antibody heavy chain'
            
        if any(w in words for w in keywords_lc):
            category = 'antibody light chain'
    
    return category

In [87]:
df_entity['category'] = df_entity['description'].apply(categorize)

In [88]:
df_entity.query("category != ''").head()

Unnamed: 0,pdbId,entityId,description,category
88,3TPK,1,Immunoglobulin heavy chain antibody variable d...,antibody heavy chain
543,3TT1,2,"mouse monoclonal 1gG2a Fab fragment, heavy chain",antibody heavy chain
544,3TT1,3,"mouse monoclonal 1gG2a Fab fragment, kappa lig...",antibody light chain
553,3TT3,2,"mouse monoclonal 1gG1 Fab fragment, heavy chain",antibody heavy chain
554,3TT3,3,"mouse monoclonal 1gG1 Fab fragment, kappa ligh...",antibody light chain


### Get PDB Polymer Entities

In [89]:
sqlQuery = "select pdbid, entity_id, pdbx_strand_id, type, pdbx_seq_one_letter_code_can from entity_poly"
encodedSQL = urllib.parse.quote(sqlQuery)
poly_url = URL + '?format=csv&q=' + encodedSQL

In [90]:
df_poly = pd.read_csv(poly_url, dtype=str)

In [91]:
df_poly.rename(columns={'pdbid': 'pdbId', 'entity_id': 'entityId', 'pdbx_seq_one_letter_code_can': 'sequence'}, inplace=True)

In [92]:
df_poly['pdbId'] = df_poly['pdbId'].str.upper()
df_poly['chainId'] = df_poly['pdbx_strand_id'].str.split(',')
df_poly = df_poly.explode('chainId')
df_poly = df_poly[['pdbId', 'entityId', 'chainId', 'type', 'sequence']]

In [93]:
print("Number of polymer entities:", df_poly.shape[0])

Number of polymer entities: 608695


In [94]:
df_poly.head()

Unnamed: 0,pdbId,entityId,chainId,type,sequence
0,100D,1,A,polydeoxyribonucleotide/polyribonucleotide hybrid,CCGGCGCCGG
0,100D,1,B,polydeoxyribonucleotide/polyribonucleotide hybrid,CCGGCGCCGG
1,101D,1,A,polydeoxyribonucleotide,CGCGAATTCGCG
1,101D,1,B,polydeoxyribonucleotide,CGCGAATTCGCG
2,101M,1,A,polypeptide(L),MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...


In [95]:
df_chain = pd.merge(df_entity, df_poly, on=['pdbId', 'entityId'])

In [96]:
print('Number of polymer chains:', df_chain.shape[0])

Number of polymer chains: 608695


In [97]:
df_chain.head()

Unnamed: 0,pdbId,entityId,description,category,chainId,type,sequence
0,3TOU,1,Glutathione s-transferase protein,,A,polypeptide(L),MVMKLIGSHASPYTRKVRVVLAEKKIDYQFVLEDVWNADTQIHQFN...
1,3TOU,1,Glutathione s-transferase protein,,B,polypeptide(L),MVMKLIGSHASPYTRKVRVVLAEKKIDYQFVLEDVWNADTQIHQFN...
2,3TOV,1,Glycosyl transferase family 9,,A,polypeptide(L),SNAMELDYKRIVVTFLMHLGDVILTTPFLEVLRKAAPHSHITYVID...
3,3TOV,1,Glycosyl transferase family 9,,B,polypeptide(L),SNAMELDYKRIVVTFLMHLGDVILTTPFLEVLRKAAPHSHITYVID...
4,3TOW,1,Multivesicular body subunit 12B,,A,polypeptide(L),MDPITGVGVVASRNRAPTGYDVVAQTADGVDADLWKDGLFKSKVTR...


### Get PDB Chain - UniProt sequence mappings

In [122]:
sifts_url = 'http://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/uniprot_segments_observed.tsv.gz'

In [123]:
chains = pd.read_csv(sifts_url, sep='\t', skiprows=1, dtype=str)
print("Number of chains:", chains.shape[0])
chains.head()

Number of chains: 798880


Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
0,123l,A,P00720,1,162,1,162,1,162
1,128l,A,P00720,1,162,1,162,1,162
2,183l,A,P00720,1,162,1,162,1,162
3,185l,A,P00720,1,162,1,162,1,162
4,192l,A,P00720,1,162,1,162,1,162


In [124]:
chains.rename(columns={'PDB': 'pdbId', 'CHAIN': 'chainId', 'SP_PRIMARY': 'accession'}, inplace=True)
chains.rename(columns={'RES_BEG': 'seqresStart', 'RES_END': 'seqresEnd'}, inplace=True)
chains.rename(columns={'PDB_BEG': 'pdbStart', 'PDB_END': 'pdbEnd'}, inplace=True)
chains.rename(columns={'SP_BEG': 'uniprotStart', 'SP_END': 'uniprotEnd'}, inplace=True)

In [125]:
chains['pdbId'] = chains['pdbId'].str.upper()

In [126]:
chains.head()

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd
0,123L,A,P00720,1,162,1,162,1,162
1,128L,A,P00720,1,162,1,162,1,162
2,183L,A,P00720,1,162,1,162,1,162
3,185L,A,P00720,1,162,1,162,1,162
4,192L,A,P00720,1,162,1,162,1,162


In [127]:
#
chains.query("uniprotStart == '1;313'")

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd


### Sort chains by uniprot residue number

In [128]:
chains['uniprotStart'] = chains['uniprotStart'].astype(int)
chains['uniprotEnd'] = chains['uniprotEnd'].astype(int)
chains['length'] = chains['uniprotEnd'] - chains['uniprotStart'] + 1
# TODO length seems to be string??
chains['length'] = chains['length'].astype(int)
#
chains.sort_values(by='uniprotStart', inplace=True)
chains['uniprotStart'] = chains['uniprotStart'].astype(str)
chains['uniprotEnd'] = chains['uniprotEnd'].astype(str)

In [129]:
chains.head()

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length
0,123L,A,P00720,1,162,1,162,1,162,162
148159,4V7Y,AP,Q5SJH3,1,84,1,84,1,84,84
148151,4V7Y,AH,A0A0M9AFS9,1,138,1,138,1,138,138
148149,4V7Y,AF,Q5SLP8,1,101,1,101,1,101,101
557340,5E81,78,Q5SHQ7,1,147,1,147,1,147,147


In [130]:
chains.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 798880 entries, 0 to 538698
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   pdbId         798880 non-null  object
 1   chainId       798845 non-null  object
 2   accession     798880 non-null  object
 3   seqresStart   798880 non-null  object
 4   seqresEnd     798880 non-null  object
 5   pdbStart      798880 non-null  object
 6   pdbEnd        798880 non-null  object
 7   uniprotStart  798880 non-null  object
 8   uniprotEnd    798880 non-null  object
 9   length        798880 non-null  int64 
dtypes: int64(1), object(9)
memory usage: 67.0+ MB


### Group data by PDB chains

In [131]:
chains = chains.groupby(['pdbId','chainId','accession']).agg(list).reset_index()

### Create semicolon separated string of residue numbers so they can be represented in a csv file

In [132]:
chains['uniprotStart'] = chains['uniprotStart'].apply(lambda x: ';'.join(x))
chains['uniprotEnd'] = chains['uniprotEnd'].apply(lambda x: ';'.join(x))
chains['seqresStart'] = chains['seqresStart'].apply(lambda x: ';'.join(x))
chains['seqresEnd'] = chains['seqresEnd'].apply(lambda x: ';'.join(x))
chains['pdbStart'] = chains['pdbStart'].apply(lambda x: ';'.join(x))
chains['pdbEnd'] = chains['pdbEnd'].apply(lambda x: ';'.join(x))

In [133]:
chains['residues'] = chains['length'].apply(lambda x: sum(x))

In [134]:
print("Number of chains with UniProt mapping:", chains.shape[0])
chains.tail()

Number of chains with UniProt mapping: 531349


Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length,residues
531344,9XIA,A,P24300,1,387,1,387,1,387,[387],387
531345,9XIM,A,P12851,2,393,3,394,3,394,[392],392
531346,9XIM,B,P12851,2,393,3,394,3,394,[392],392
531347,9XIM,C,P12851,3,393,4,394,4,394,[391],391
531348,9XIM,D,P12851,2,393,3,394,3,394,[392],392


In [135]:
chains = df_chain.merge(chains, on=['pdbId', 'chainId'], how='left')
chains.fillna('', inplace=True)

### Assign CURIES

In [136]:
chains['accession'] = chains['accession'].apply(lambda x: x if x == '' else 'uniprot:' + x)
chains['pdbId'] = 'pdb:' + chains['pdbId']
chains['pdbChainId'] = chains['pdbId'] + '.' + chains['chainId']

In [137]:
print("Total number of polymer chains:", chains.shape[0])

Total number of polymer chains: 612503


In [138]:
chains.sample(10)

Unnamed: 0,pdbId,entityId,description,category,chainId,type,sequence,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length,residues,pdbChainId
394061,pdb:6VE1,1,Endo-beta-N-acetylglucosaminidase H,,A,polypeptide(L),SLSTGCYMVKQGPTSVAYVEVNNNSMLNVGKYTLADGGGNAFDVAV...,uniprot:P04067,9,275,9,275,47,313,[267],267.0,pdb:6VE1.A
134802,pdb:5C0W,2,Exosome complex component SKI6,,B,polypeptide(L),GHMSRLEIYSPEGLRLDGRRWNELRRFESSINTHPHAADGSSYMEQ...,uniprot:P46948,3,246,1,244,1,244,[244],244.0,pdb:5C0W.B
144215,pdb:5E24,2,Protein hairless,,B,polypeptide(L),GGRLQFFKDGKFILELARSKDGDKSGWVSVTRKTFRPP,uniprot:Q02308,1,38,232,269,232,269,[38],38.0,pdb:5E24.B
442866,pdb:1OFQ,1,PHOSPHO-2-DEHYDRO-3-DEOXYHEPTONATE ALDOLASE,,C,polypeptide(L),MSESPMFAANGMPKVNQGAEEDVRILGYDPLASPALLQVQIPATPT...,uniprot:P32449,23;117;230,114;228;369,3023;3117;3230,3114;3228;3369,23;117;230,114;228;369,"[92, 112, 140]",344.0,pdb:1OFQ.C
6747,pdb:3VQC,1,POL polyprotein,,B,polypeptide(L),DSSPGIWQLDCTHLEGKVILVAVHVASGYIEAEVIPAETGQETAYF...,uniprot:Q72498,3;99,84;154,57;153,138;208,772;868,853;923,"[82, 56]",138.0,pdb:3VQC.B
456119,pdb:1T3H,1,Dephospho-CoA kinase,,A,polypeptide(L),MRYIVALTGGIGSGKSTVANAFADLGINVIDADIIARQVVEPGAPA...,uniprot:P0A6I9,1,206,1,206,1,206,[206],206.0,pdb:1T3H.A
297728,pdb:6MTB,20,60S ribosomal protein L13a,,O,polypeptide(L),QVLVLDGRGHLLGRLAAIVAKQVLLGRKVVVVRCEGINISGNFYRN...,,,,,,,,,,pdb:6MTB.O
142444,pdb:5DMK,2,beta chain of Major Histocompatibility Complex...,,B,polypeptide(L),SRLGLWSRMDQLAKELTAELVPRGSGSERHFVHQFKGECYFTNGTQ...,uniprot:Q31135,28;135,127;212,5;115,107;192,31;138,130;215,"[100, 78]",178.0,pdb:5DMK.B
565970,pdb:3HXA,1,Pterin-4-alpha-carbinolamine dehydratase,,C,polypeptide(L),MAGKAHRLSAEERDQLLPNLRAVGWNELEGRDAIFKQFHFKDFNRA...,uniprot:P61459,6,103,6,103,6,103,[98],98.0,pdb:3HXA.C
552968,pdb:3E41,1,Type-2 restriction enzyme HindII,,B,polypeptide(L),SFIKPIYQDINSILIGQKVKRPKSGTLSGHAAGEPFEKLVYKFLKE...,uniprot:P17743,1;33,19;256,2;34,20;257,2;34,20;257,"[19, 224]",243.0,pdb:3E41.B


In [139]:
chains.to_csv(NEO4J_IMPORT / "01f-PDBChain.csv", index=False)