# Downloads PDB Structure Information
**[Work in progress]**

This notebook downloads 3D-structure information from the Worldwide Protein Data Bank.

Data sources: 
[Protein Data Bank Japan](https://pdbj.org/), 
[PDBe Protein Data Bank Europe](https://www.ebi.ac.uk/pdbe/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import urllib
import urllib.parse
from pathlib import Path

import dateutil
import numpy as np
import pandas as pd

In [2]:
# Show complete DataFrames in cell outputs: no truncation of rows or columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# Directory the generated CSV files are written to, taken from the NEO4J_IMPORT
# environment variable. Fail with an explicit message when the variable is unset;
# Path(None) would otherwise raise an unhelpful TypeError.
if os.getenv('NEO4J_IMPORT') is None:
    raise RuntimeError(
        "Environment variable 'NEO4J_IMPORT' is not set; "
        "set it to the Neo4j import directory before running this notebook."
    )
NEO4J_IMPORT = Path(os.environ['NEO4J_IMPORT'])
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


### Get PDB summary data

In [4]:
# Endpoint of the PDBj SQL REST service; the queries below are appended to it as URL parameters.
URL = "https://pdbj.org/rest/mine2_sql"

In [5]:
# One row per PDB entry; the AS aliases define the column names of the returned CSV.
sqlQuery = (
    'SELECT pdbid, release_date AS releasedate, pdbx_descriptor AS description, '
    'struct_title AS title, exptl_method AS methods, resolution '
    'FROM brief_summary'
)
# Percent-encode the SQL text so it can be passed as the `q` URL parameter.
encodedSQL = urllib.parse.quote(sqlQuery)
summary_url = URL + "?format=csv&q=" + encodedSQL

In [6]:
# Run the summary query against the REST service and load the CSV response.
# dtype=str keeps every column as text, so values are not converted to numbers on load.
df_summary = pd.read_csv(summary_url, dtype=str)

In [7]:
# Switch the ID and date columns to camelCase names and upper-case the PDB IDs.
df_summary = (
    df_summary
    .rename(columns={'pdbid': 'pdbId', 'releasedate': 'releaseDate'})
    .assign(pdbId=lambda frame: frame['pdbId'].str.upper())
)

In [8]:
# Report the row count, then preview the first rows of the summary table.
print('Number of records:', len(df_summary))
df_summary.head()

Number of records: 171313


Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution
0,2ZOZ,2008-07-08,"Transcriptional regulator, SULFATE ION, GLYCER...",Crystal structure of the ethidium-bound form o...,['X-RAY DIFFRACTION'],1.95
1,3VD2,2012-04-18,"Tumor protein p73, DNA (5'-D(*AP*TP*GP*GP*AP*C...",structure of p73 DNA binding domain tetramer m...,['X-RAY DIFFRACTION'],4.0
2,2ZPL,2008-10-21,"Regulator of sigma E protease, NICKEL (II) ION...",Crystal structure analysis of PDZ domain A,['X-RAY DIFFRACTION'],1.7
3,4F4P,2012-12-12,"Tyrosine-protein kinase SYK, N-{6-[3-(piperazi...",SYK in COMPLEX WITH LIGAND LASW836,['X-RAY DIFFRACTION'],2.37
4,2ZPQ,2009-07-28,"Anionic trypsin, SULFATE ION, CALCIUM ION, BEN...",Crystal structure of anionic trypsin isoform 1...,['X-RAY DIFFRACTION'],1.9


### Get refinement data

In [9]:
# TODO: fetch only the required columns instead of the whole table. The narrower query
#   SELECT ls_d_res_high AS resolution, ls_R_factor_R_free AS rFree FROM refine
# was tried and did not work (reason not yet understood).
# NOTE(review): the mixed-case column names may need double quotes in SQL -- unconfirmed.
sqlQuery = 'SELECT * FROM refine'
encodedSQL = urllib.parse.quote(sqlQuery)
refine_url = URL + "?format=csv&q=" + encodedSQL

In [10]:
# Download the complete `refine` table as CSV; dtype=str keeps all values as text.
df_refine = pd.read_csv(refine_url, dtype=str)

In [11]:
# Reduce the refinement table to the join key and the free R-factor.
# The entry ID is upper-cased so the key has the same normalization as the
# upper-cased pdbId of the summary table; a case mismatch would otherwise
# silently leave rFree empty after the left join.
# Missing values become empty strings.
df_refine = (
    df_refine
    .rename(columns={'entry_id': 'pdbId', 'ls_R_factor_R_free': 'rFree'})
    .loc[:, ['pdbId', 'rFree']]
    .assign(pdbId=lambda frame: frame['pdbId'].str.upper())
    .fillna('')
)

In [12]:
# Report the row count, then show five randomly chosen refinement records.
print('Number of records:', len(df_refine))
df_refine.sample(5)

Number of records: 154553


Unnamed: 0,pdbId,rFree
7657,1JCH,0.283
23175,2BX4,0.302
55515,3PQ1,0.3284
84667,4W7A,0.3018
129683,6CW3,0.2273


In [13]:
# Left join on pdbId: every summary row is kept, with rFree attached where a refinement
# record exists. All missing values are then replaced by empty strings.
structures = df_summary.merge(df_refine, on='pdbId', how='left').fillna('')

In [14]:
# Keep a single row per PDB ID (the first occurrence).
structures = structures.drop_duplicates(subset=['pdbId'])

In [15]:
# Report the number of structures, then show ten randomly chosen rows.
print('Number of structures:', len(structures))
structures.sample(10)

Number of structures: 171313


Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution,rFree
71240,2HR7,2006-08-15,"Insulin receptor, 2-acetamido-2-deoxy-beta-D-g...",Insulin receptor (domains 1-3),['X-RAY DIFFRACTION'],2.32,0.231
44866,5Z3N,2018-11-21,"DNA polymerase I, thermostable, DNA (5'-D(*GP*...",Structure of large fragment of DNA Polymerase ...,['X-RAY DIFFRACTION'],1.91,0.22933
132417,2H1C,2006-09-26,"Trafficking protein B, Trafficking protein A, ...",Crystal Structure of FitAcB from Neisseria gon...,['X-RAY DIFFRACTION'],1.8,0.223
14650,4G4Y,2013-07-24,"Outer membrane protein Omp38, ALANINE, SULFATE...",Crystal structure of OmpA peptidoglycan-bindin...,['X-RAY DIFFRACTION'],1.7,0.2238
114245,1U7S,2005-07-19,"Myoglobin, PROTOPORPHYRIN IX CONTAINING FE, water",Crystal structure of Native Sperm Whale myoglo...,['X-RAY DIFFRACTION'],1.4,0.1793
140313,2QDH,2007-08-21,"Fructose-1,6-bisphosphate aldolase, D-MANNITOL...","Fructose-1,6-bisphosphate aldolase from Leishm...",['X-RAY DIFFRACTION'],1.9,0.235
144488,2XKQ,2010-11-24,"DNA PROTECTION DURING STARVATION PROTEIN, MANG...",Crystal structure of Streptococcus suis Dpr wi...,['X-RAY DIFFRACTION'],2.4,0.2449
134506,2IXC,2006-10-26,"DTDP-4-DEHYDRORHAMNOSE 3,5-EPIMERASE RMLC, 2'-...",RmlC M. tuberculosis with dTDP-rhamnose,['X-RAY DIFFRACTION'],1.79,0.196
153116,3G1W,2009-02-17,"Sugar ABC transporter, water",Crystal structure of sugar ABC transporter (su...,['X-RAY DIFFRACTION'],2.02,0.23711
95550,6NUE,2019-03-13,CRISPR system single-strand-specific deoxyribo...,Small conformation of apo CRISPR_Csm complex,['ELECTRON MICROSCOPY'],3.3,


In [16]:
# Turn the PDB ID into a CURIE.
# NOTE: this cell is not idempotent -- running it again prepends 'pdb:' a second time.
structures['pdbId'] = 'pdb:' + structures['pdbId']

# `methods` holds the text of a list, e.g. "['X-RAY DIFFRACTION']". Strip the brackets
# and quotes and separate multiple methods with ';' (no space after the separator).
# regex=False makes every pattern a literal string: '[' is a regex metacharacter, and the
# default of `regex` differs between pandas versions.
structures['methods'] = (
    structures['methods']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .str.replace("'", '', regex=False)
    .str.replace(',', ';', regex=False)
    .str.replace('; ', ';', regex=False)
)

In [17]:
# List the distinct method strings to inspect the result of the clean-up.
structures['methods'].unique()

array(['X-RAY DIFFRACTION', 'SOLUTION NMR', 'NEUTRON DIFFRACTION',
       'ELECTRON MICROSCOPY',
       'X-RAY DIFFRACTION;NEUTRON DIFFRACTION;HYBRID',
       'ELECTRON CRYSTALLOGRAPHY', 'SOLUTION SCATTERING',
       'NEUTRON DIFFRACTION;X-RAY DIFFRACTION;HYBRID',
       'FIBER DIFFRACTION', 'SOLID-STATE NMR;ELECTRON MICROSCOPY;HYBRID',
       'POWDER DIFFRACTION', 'X-RAY DIFFRACTION;EPR;HYBRID',
       'SOLUTION NMR;SOLUTION SCATTERING;HYBRID', 'SOLID-STATE NMR',
       'ELECTRON MICROSCOPY;SOLID-STATE NMR;HYBRID',
       'ELECTRON MICROSCOPY;SOLUTION NMR;HYBRID',
       'ELECTRON MICROSCOPY;SOLUTION NMR;SOLID-STATE NMR;HYBRID',
       'X-RAY DIFFRACTION;SOLUTION SCATTERING;HYBRID',
       'NEUTRON DIFFRACTION;SOLUTION NMR;HYBRID',
       'SOLUTION NMR;SOLID-STATE NMR;HYBRID',
       'SOLUTION NMR;THEORETICAL MODEL;HYBRID',
       'X-RAY DIFFRACTION;SOLUTION NMR;HYBRID', 'INFRARED SPECTROSCOPY',
       'SOLUTION SCATTERING;SOLUTION NMR;HYBRID', 'FLUORESCENCE TRANSFER',
       'SOLUTIO

In [18]:
# Preview the first rows of the structure table.
structures.head()

Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution,rFree
0,pdb:2ZOZ,2008-07-08,"Transcriptional regulator, SULFATE ION, GLYCER...",Crystal structure of the ethidium-bound form o...,X-RAY DIFFRACTION,1.95,0.235
1,pdb:3VD2,2012-04-18,"Tumor protein p73, DNA (5'-D(*AP*TP*GP*GP*AP*C...",structure of p73 DNA binding domain tetramer m...,X-RAY DIFFRACTION,4.0,0.284
2,pdb:2ZPL,2008-10-21,"Regulator of sigma E protease, NICKEL (II) ION...",Crystal structure analysis of PDZ domain A,X-RAY DIFFRACTION,1.7,0.1755
3,pdb:4F4P,2012-12-12,"Tyrosine-protein kinase SYK, N-{6-[3-(piperazi...",SYK in COMPLEX WITH LIGAND LASW836,X-RAY DIFFRACTION,2.37,0.2846
4,pdb:2ZPQ,2009-07-28,"Anionic trypsin, SULFATE ION, CALCIUM ION, BEN...",Crystal structure of anionic trypsin isoform 1...,X-RAY DIFFRACTION,1.9,0.222


In [19]:
# Write the structure table as CSV into the NEO4J_IMPORT directory;
# index=False leaves the DataFrame index out of the file.
structures.to_csv(NEO4J_IMPORT / "01f-PDBStructure.csv", index=False)

### Get PDB Chain - UniProt sequence mappings

In [20]:
# SIFTS flat file with the observed PDB chain segments mapped to UniProt sequences.
# Fetched over HTTPS (same host and path as before) so the download is transport-protected.
sifts_url = 'https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/uniprot_segments_observed.tsv.gz'

In [21]:
# Load the tab-separated SIFTS table; all columns are kept as text (dtype=str).
# skiprows=1 skips the first line of the file, so the second line supplies the header.
# keep_default_na=False with na_values=[''] limits missing-value parsing to empty fields:
# with the pandas defaults a literal "NA" (e.g. a chain ID) is read as NaN, and rows with
# a NaN key are silently dropped by the later groupby.
chains = pd.read_csv(
    sifts_url,
    sep='\t',
    skiprows=1,
    dtype=str,
    keep_default_na=False,
    na_values=[''],
)
chains.head()

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
0,105m,A,P02185,1,153,1,153,2,154
1,120l,A,P00720,1,162,1,162,1,162
2,123l,A,P00720,1,162,1,162,1,162
3,128l,A,P00720,1,162,1,162,1,162
4,113l,A,P00720,1,162,1,162,1,162


In [22]:
# Map the SIFTS column headers to the camelCase names used in the exported CSV.
chains = chains.rename(columns={
    # identifiers
    'PDB': 'pdbId',
    'CHAIN': 'chainId',
    'SP_PRIMARY': 'accession',
    # RES_* range
    'RES_BEG': 'seqresStart',
    'RES_END': 'seqresEnd',
    # PDB_* range
    'PDB_BEG': 'pdbStart',
    'PDB_END': 'pdbEnd',
    # SP_* range
    'SP_BEG': 'uniprotStart',
    'SP_END': 'uniprotEnd',
})

In [23]:
# Preview the table with the renamed columns.
chains.head()

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd
0,105m,A,P02185,1,153,1,153,2,154
1,120l,A,P00720,1,162,1,162,1,162
2,123l,A,P00720,1,162,1,162,1,162
3,128l,A,P00720,1,162,1,162,1,162
4,113l,A,P00720,1,162,1,162,1,162


In [24]:
# Upper-case the PDB IDs (the preview above shows them in lower case, e.g. '105m'),
# matching the upper-cased IDs of the structure table.
chains['pdbId'] = chains['pdbId'].str.upper()

In [25]:
# Combined chain identifier of the form '<pdbId>.<chainId>'.
chains['pdbChainId'] = chains['pdbId'] + "." + chains['chainId']

### Sort chain segments by UniProt start residue number

In [26]:
# The positions were loaded as text: convert them to integers so that the segment length
# can be computed and the rows are ordered numerically rather than lexically.
chains = chains.astype({'uniprotStart': int, 'uniprotEnd': int})
chains['length'] = chains['uniprotEnd'] - chains['uniprotStart'] + 1
chains = chains.sort_values(by='uniprotStart')
# Back to text, so the positions can later be joined into ';'-separated strings.
chains = chains.astype({'uniprotStart': str, 'uniprotEnd': str})

### Assign CURIEs (compact URIs)

In [27]:
# Prefix each identifier with its namespace ('uniprot:' or 'pdb:').
chains = chains.assign(
    accession='uniprot:' + chains['accession'],
    pdbId='pdb:' + chains['pdbId'],
    pdbChainId='pdb:' + chains['pdbChainId'],
)

In [28]:
# Preview the sorted table with the prefixed identifiers.
chains.head()

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,pdbChainId,length
470445,pdb:6EVW,F,uniprot:P02554,1,429,1,429,1,429,pdb:6EVW.F,429
421657,pdb:3ND5,F,uniprot:Q831P9,1,38,1,38,1,38,pdb:3ND5.F,38
421678,pdb:3NEC,B,uniprot:Q58NA1,4,166,1,163,1,163,pdb:3NEC.B,163
421679,pdb:3NEC,C,uniprot:Q58NA1,4,166,1,163,1,163,pdb:3NEC.C,163
421680,pdb:3NEC,D,uniprot:Q58NA1,4,166,1,163,1,163,pdb:3NEC.D,163


### Group data by PDB chains

In [29]:
# Collapse all segments of one (pdbId, chainId, pdbChainId, accession) combination into a
# single row; every remaining column becomes a list with one element per segment.
# The element order within each list follows the current row order of `chains`.
chains = chains.groupby(['pdbId','chainId','pdbChainId','accession']).agg(list).reset_index()

In [30]:
# Preview the last rows of the grouped table (the position columns now hold lists).
chains.tail()

Unnamed: 0,pdbId,chainId,pdbChainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length
524160,pdb:9XIA,A,pdb:9XIA.A,uniprot:P24300,[1],[387],[1],[387],[1],[387],[387]
524161,pdb:9XIM,A,pdb:9XIM.A,uniprot:P12851,[2],[393],[3],[394],[3],[394],[392]
524162,pdb:9XIM,B,pdb:9XIM.B,uniprot:P12851,[2],[393],[3],[394],[3],[394],[392]
524163,pdb:9XIM,C,pdb:9XIM.C,uniprot:P12851,[3],[393],[4],[394],[4],[394],[391]
524164,pdb:9XIM,D,pdb:9XIM.D,uniprot:P12851,[2],[393],[3],[394],[3],[394],[392]


### Create semicolon-separated strings of residue numbers so they can be represented in a CSV file

In [31]:
# Flatten each list of position strings into one ';'-separated string per row.
chains = chains.assign(**{
    column: chains[column].apply(';'.join)
    for column in (
        'uniprotStart', 'uniprotEnd',
        'seqresStart', 'seqresEnd',
        'pdbStart', 'pdbEnd',
    )
})

In [32]:
# Total number of residues per row: the sum of the segment lengths in the `length` list.
chains['residues'] = chains['length'].map(sum)

In [33]:
# Preview the last rows of the final chain table.
chains.tail()

Unnamed: 0,pdbId,chainId,pdbChainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length,residues
524160,pdb:9XIA,A,pdb:9XIA.A,uniprot:P24300,1,387,1,387,1,387,[387],387
524161,pdb:9XIM,A,pdb:9XIM.A,uniprot:P12851,2,393,3,394,3,394,[392],392
524162,pdb:9XIM,B,pdb:9XIM.B,uniprot:P12851,2,393,3,394,3,394,[392],392
524163,pdb:9XIM,C,pdb:9XIM.C,uniprot:P12851,3,393,4,394,4,394,[391],391
524164,pdb:9XIM,D,pdb:9XIM.D,uniprot:P12851,2,393,3,394,3,394,[392],392


In [34]:
# Write the chain table as CSV into the NEO4J_IMPORT directory;
# index=False leaves the DataFrame index out of the file.
chains.to_csv(NEO4J_IMPORT / "01f-PDBChain.csv", index=False)