# Downloads PDB Structure Information
**[Work in progress]**

This notebook downloads 3D-structure information from the Worldwide Protein Data Bank.

Data sources: 
[Protein Data Bank Japan](https://pdbj.org/), 
[PDBe Protein Data Bank Europe](https://www.ebi.ac.uk/pdbe/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import urllib
import urllib.parse
from pathlib import Path

import dateutil
import numpy as np
import pandas as pd

In [2]:
# Lift pandas' display truncation so every row and column is rendered.
pd.set_option('display.max_rows', None)     # display all rows
pd.set_option('display.max_columns', None)  # display all columns

In [3]:
# Output directory for the CSV files written by this notebook, taken from the
# NEO4J_IMPORT environment variable.
neo4j_import_dir = os.getenv('NEO4J_IMPORT')
if not neo4j_import_dir:
    # Path(None) fails with an opaque TypeError and Path('') silently means the
    # current working directory, so stop early with an actionable message.
    raise RuntimeError(
        'Environment variable NEO4J_IMPORT is not set; '
        'set it to the directory the CSV files should be written to.'
    )
NEO4J_IMPORT = Path(neo4j_import_dir)
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


### Get PDB summary data

In [4]:
# Endpoint of the PDBj REST service that accepts SQL queries (see the `q` parameter used below).
URL = "https://pdbj.org/rest/mine2_sql"

In [5]:
# SQL sent to the REST endpoint: one row per entry of `brief_summary`, with the
# columns aliased to the names used in the rest of this notebook.
sqlQuery = (
    'SELECT pdbid, release_date AS releasedate, pdbx_descriptor AS description, '
    'struct_title AS title, exptl_method AS methods, resolution FROM brief_summary'
)
# Percent-encode the query so it can be passed as a URL parameter.
# Note: this needs `import urllib.parse`; a bare `import urllib` does not load the submodule.
encodedSQL = urllib.parse.quote(sqlQuery)
# Request the result as CSV so pandas can read it directly from the URL.
summary_url = f'{URL}?format=csv&q={encodedSQL}'

In [6]:
# Download the summary table; every column is kept as text (dtype=str).
df_summary = pd.read_csv(filepath_or_buffer=summary_url, dtype=str)

In [7]:
# Switch the lower-case SQL column names to camelCase and upper-case the PDB IDs.
df_summary = df_summary.rename(
    columns={'pdbid': 'pdbId', 'releasedate': 'releaseDate'}
)
df_summary['pdbId'] = df_summary['pdbId'].str.upper()

In [8]:
# Report the row count and preview the first rows of the summary table.
print(f'Number of records: {len(df_summary)}')
df_summary.head(5)

Number of records: 171313


Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution
0,4CBT,2013-12-11,"HISTONE DEACETYLASE 4, (1R,2R,3R)-2-[4-(5-fluo...","Design, synthesis, and biological evaluation o...",['X-RAY DIFFRACTION'],3.03
1,4CBU,2014-04-30,"ACTIN-1, GELSOLIN, ADENOSINE-5'-TRIPHOSPHATE, ...",Crystal structure of Plasmodium falciparum act...,['X-RAY DIFFRACTION'],1.3
2,4CBV,2014-02-12,COME,X-ray structure of full-length ComE from Strep...,['X-RAY DIFFRACTION'],3.39
3,4CBW,2014-04-30,"ACTIN, ALPHA SKELETAL MUSCLE, ACTIN, GELSOLIN,...",Crystal structure of Plasmodium berghei actin ...,['X-RAY DIFFRACTION'],2.501
4,4CBX,2014-04-30,"ACTIN-2, GELSOLIN, ADENOSINE-5'-TRIPHOSPHATE, ...",Crystal structure of Plasmodium berghei actin II,['X-RAY DIFFRACTION'],2.2


### Get refinement data

In [9]:
# TODO: fetch only the columns that are needed instead of the whole table.
# The narrower query
#   SELECT entity_id, ls_R_factor_R_free AS rFree FROM refine
# doesn't work, so all columns are requested for now.
# NOTE(review): possibly because the table exposes `entry_id` (not `entity_id`) and the
# mixed-case column name may need quoting -- confirm against the service before changing.
sqlQuery = 'SELECT * FROM refine'
encodedSQL = urllib.parse.quote(sqlQuery)
refine_url = f'{URL}?format=csv&q={encodedSQL}'

In [10]:
# Only the free R-factor and the entry identifier are kept from the refine table.
refine_columns = ['ls_R_factor_R_free', 'entry_id']
df_refine = pd.read_csv(refine_url, usecols=refine_columns, dtype=str)

In [11]:
# Use the notebook's column names and represent missing values as empty strings.
df_refine = df_refine.rename(
    columns={'ls_R_factor_R_free': 'rFree', 'entry_id': 'pdbId'}
).fillna('')

In [12]:
# Report the row count and show a random handful of refinement records.
print(f'Number of records: {len(df_refine)}')
df_refine.sample(n=5)

Number of records: 154553


Unnamed: 0,pdbId,rFree
13939,1R8K,0.234
118230,5J21,0.232
22503,2B8X,0.24811
14916,1SEL,
19700,1Y8X,0.259


In [13]:
# Left join: keep every summary record, attaching rFree where a refine record exists;
# entries without one end up with an empty string.
structures = df_summary.merge(df_refine, how='left', on='pdbId').fillna('')

In [14]:
# Keep a single row per PDB ID (the first one encountered).
structures = structures.drop_duplicates(subset='pdbId', keep='first')

In [15]:
# Report the number of structures and show a random sample of them.
print(f'Number of structures: {len(structures)}')
structures.sample(n=10)

Number of structures: 171313


Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution,rFree
67721,1LNL,2003-06-03,"hemocyanin, 2-acetamido-2-deoxy-beta-D-glucopy...",Structure of deoxygenated hemocyanin from Rapa...,['X-RAY DIFFRACTION'],3.3,0.28756
159432,3U0C,2012-02-15,"Invasin ipaB, water",Crystal structure of N-terminal region of Type...,['X-RAY DIFFRACTION'],2.05,0.2945
83213,3QV9,2011-07-06,"Pyruvate kinase 2, POTASSIUM ION, PHOSPHATE IO...",Crystal structure of Trypanosoma cruzi pyruvat...,['X-RAY DIFFRACTION'],2.1,0.2491
61581,2BMF,2005-08-03,"RNA HELICASE, water",Dengue virus RNA helicase at 2.4A,['X-RAY DIFFRACTION'],2.41,0.2703
102723,1T8Y,2004-08-17,"AMP nucleosidase, PHOSPHATE ION, water",Crystal Structure of E.coli AMP Nucleosidase c...,['X-RAY DIFFRACTION'],3.0,0.243
49564,1Y8P,2005-05-24,[Pyruvate dehydrogenase [lipoamide]] kinase is...,Crystal structure of the PDK3-L2 complex,['X-RAY DIFFRACTION'],2.63,0.2302
98184,6BIY,2018-01-17,"HLA class II histocompatibility antigen, DR al...",HLA-DRB1 in complex with Histone 2B peptide,['X-RAY DIFFRACTION'],2.05004900063,0.231849295707
170739,4BQR,2013-12-11,"ENOYL-[ACYL-CARRIER-PROTEIN] REDUCTASE [NADH],...",Mtb InhA complex with Methyl-thiazole compound 11,['X-RAY DIFFRACTION'],2.05,0.23209
113490,2AUO,2006-03-28,"Globin I, PROTOPORPHYRIN IX CONTAINING FE, CAR...",Residue F4 plays a key role in modulating the ...,['X-RAY DIFFRACTION'],1.53,0.216
116883,2DQ6,2006-08-01,"Aminopeptidase N, ZINC ION, SULFATE ION, water",Crystal Structure of Aminopeptidase N from Esc...,['X-RAY DIFFRACTION'],1.5,0.191


In [16]:
# Turn PDB IDs into CURIEs. IDs that already carry the prefix are left alone so
# that re-running this cell does not produce 'pdb:pdb:...' identifiers.
pdb_ids = structures['pdbId']
structures['pdbId'] = pdb_ids.where(
    pdb_ids.str.startswith('pdb:', na=False), 'pdb:' + pdb_ids
)

# convert methods string to a semicolon separated list (our default one-to-many representation in CSV files)
# e.g. "['X-RAY DIFFRACTION', 'NEUTRON DIFFRACTION']" -> "X-RAY DIFFRACTION;NEUTRON DIFFRACTION"
# regex=False is passed explicitly: '[' and ']' are regex metacharacters and the default
# of `regex` differs between pandas versions, so every replacement is forced to be literal.
methods_clean = structures['methods']
for old, new in (('[', ''), (']', ''), ("'", ''), (',', ';'), ('; ', ';')):
    methods_clean = methods_clean.str.replace(old, new, regex=False)
structures['methods'] = methods_clean

In [17]:
# List the distinct method strings to check the conversion above.
pd.unique(structures['methods'])

array(['X-RAY DIFFRACTION', 'ELECTRON MICROSCOPY', 'SOLUTION NMR',
       'ELECTRON CRYSTALLOGRAPHY',
       'X-RAY DIFFRACTION;NEUTRON DIFFRACTION;HYBRID',
       'NEUTRON DIFFRACTION;X-RAY DIFFRACTION;HYBRID',
       'NEUTRON DIFFRACTION', 'FIBER DIFFRACTION', 'POWDER DIFFRACTION',
       'X-RAY DIFFRACTION;EPR;HYBRID',
       'SOLUTION NMR;SOLUTION SCATTERING;HYBRID', 'SOLID-STATE NMR',
       'ELECTRON MICROSCOPY;SOLID-STATE NMR;HYBRID',
       'ELECTRON MICROSCOPY;SOLUTION NMR;HYBRID',
       'ELECTRON MICROSCOPY;SOLUTION NMR;SOLID-STATE NMR;HYBRID',
       'X-RAY DIFFRACTION;SOLUTION SCATTERING;HYBRID',
       'NEUTRON DIFFRACTION;SOLUTION NMR;HYBRID',
       'SOLUTION NMR;SOLID-STATE NMR;HYBRID', 'SOLUTION SCATTERING',
       'SOLUTION NMR;THEORETICAL MODEL;HYBRID',
       'X-RAY DIFFRACTION;SOLUTION NMR;HYBRID', 'INFRARED SPECTROSCOPY',
       'SOLUTION SCATTERING;SOLUTION NMR;HYBRID',
       'SOLID-STATE NMR;ELECTRON MICROSCOPY;HYBRID',
       'FLUORESCENCE TRANSFER', 'SOLUTIO

In [18]:
# Preview the first rows of the final structure table.
structures.head(5)

Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution,rFree
0,pdb:4CBT,2013-12-11,"HISTONE DEACETYLASE 4, (1R,2R,3R)-2-[4-(5-fluo...","Design, synthesis, and biological evaluation o...",X-RAY DIFFRACTION,3.03,0.27335
1,pdb:4CBU,2014-04-30,"ACTIN-1, GELSOLIN, ADENOSINE-5'-TRIPHOSPHATE, ...",Crystal structure of Plasmodium falciparum act...,X-RAY DIFFRACTION,1.3,0.1534
2,pdb:4CBV,2014-02-12,COME,X-ray structure of full-length ComE from Strep...,X-RAY DIFFRACTION,3.39,0.2229
3,pdb:4CBW,2014-04-30,"ACTIN, ALPHA SKELETAL MUSCLE, ACTIN, GELSOLIN,...",Crystal structure of Plasmodium berghei actin ...,X-RAY DIFFRACTION,2.501,0.264
4,pdb:4CBX,2014-04-30,"ACTIN-2, GELSOLIN, ADENOSINE-5'-TRIPHOSPHATE, ...",Crystal structure of Plasmodium berghei actin II,X-RAY DIFFRACTION,2.2,0.2275


In [19]:
# Write the structure table, without the DataFrame index, to the import directory.
structure_csv = NEO4J_IMPORT / '01f-PDBStructure.csv'
structures.to_csv(structure_csv, index=False)

### Get PDB Chain - UniProt sequence mappings

In [20]:
# SIFTS flat file with the observed PDB chain <-> UniProt segment mappings.
# Fetched over HTTPS rather than plain HTTP so the download cannot be tampered with in transit.
sifts_url = 'https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/uniprot_segments_observed.tsv.gz'

In [21]:
# Read the tab-separated SIFTS mapping; skiprows=1 skips the first line of the file,
# which precedes the column header row. All columns are read as text.
# keep_default_na=False: even with dtype=str, pandas would otherwise parse values such as
# "NA" as missing, so a chain whose ID is literally "NA" would become NaN and be dropped
# by the later groupby.
chains = pd.read_csv(sifts_url, sep='\t', skiprows=1, dtype=str, keep_default_na=False)
chains.head()

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
0,105m,A,P02185,1,153,1,153,2,154
1,120l,A,P00720,1,162,1,162,1,162
2,123l,A,P00720,1,162,1,162,1,162
3,128l,A,P00720,1,162,1,162,1,162
4,113l,A,P00720,1,162,1,162,1,162


In [22]:
# Map the SIFTS column headers to the camelCase names used in this notebook.
sifts_column_names = {
    'PDB': 'pdbId',
    'CHAIN': 'chainId',
    'SP_PRIMARY': 'accession',
    'RES_BEG': 'seqresStart',
    'RES_END': 'seqresEnd',
    'PDB_BEG': 'pdbStart',
    'PDB_END': 'pdbEnd',
    'SP_BEG': 'uniprotStart',
    'SP_END': 'uniprotEnd',
}
chains = chains.rename(columns=sifts_column_names)

In [23]:
# Preview the renamed columns.
chains.head(5)

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd
0,105m,A,P02185,1,153,1,153,2,154
1,120l,A,P00720,1,162,1,162,1,162
2,123l,A,P00720,1,162,1,162,1,162
3,128l,A,P00720,1,162,1,162,1,162
4,113l,A,P00720,1,162,1,162,1,162


In [24]:
# Upper-case the PDB IDs, as was done for the structure table.
chains = chains.assign(pdbId=chains['pdbId'].str.upper())

In [25]:
# Chain identifier in the form <pdbId>.<chainId>, e.g. 9XIM.A
chains['pdbChainId'] = chains['pdbId'].str.cat(chains['chainId'], sep=".")

### Sort chains by UniProt residue number

In [26]:
# The residue numbers were read as text; switch to integers temporarily so the
# segment length can be computed and the rows sorted numerically.
residue_columns = ('uniprotStart', 'uniprotEnd')
for column in residue_columns:
    chains[column] = chains[column].astype(int)

# Inclusive length of each mapped segment.
chains['length'] = chains['uniprotEnd'] - chains['uniprotStart'] + 1
chains = chains.sort_values(by='uniprotStart')

# Back to text so the values can later be joined into semicolon-separated strings.
for column in residue_columns:
    chains[column] = chains[column].astype(str)

### Assign CURIEs

In [27]:
# Prepend the namespace prefix to each identifier column to form CURIEs.
curie_prefixes = (
    ('accession', 'uniprot:'),
    ('pdbId', 'pdb:'),
    ('pdbChainId', 'pdb:'),
)
for column, prefix in curie_prefixes:
    chains[column] = prefix + chains[column]

In [28]:
# Preview the sorted, prefixed chain records.
chains.head(5)

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,pdbChainId,length
470445,pdb:6EVW,F,uniprot:P02554,1,429,1,429,1,429,pdb:6EVW.F,429
421657,pdb:3ND5,F,uniprot:Q831P9,1,38,1,38,1,38,pdb:3ND5.F,38
421678,pdb:3NEC,B,uniprot:Q58NA1,4,166,1,163,1,163,pdb:3NEC.B,163
421679,pdb:3NEC,C,uniprot:Q58NA1,4,166,1,163,1,163,pdb:3NEC.C,163
421680,pdb:3NEC,D,uniprot:Q58NA1,4,166,1,163,1,163,pdb:3NEC.D,163


### Group data by PDB chains

In [29]:
# One row per (chain, UniProt accession); every remaining column becomes the list
# of its per-segment values.
chain_keys = ['pdbId', 'chainId', 'pdbChainId', 'accession']
segments_by_chain = chains.groupby(chain_keys)
chains = segments_by_chain.agg(list).reset_index()

In [30]:
# Inspect the last grouped rows; the per-segment columns now hold lists.
chains.tail(5)

Unnamed: 0,pdbId,chainId,pdbChainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length
524160,pdb:9XIA,A,pdb:9XIA.A,uniprot:P24300,[1],[387],[1],[387],[1],[387],[387]
524161,pdb:9XIM,A,pdb:9XIM.A,uniprot:P12851,[2],[393],[3],[394],[3],[394],[392]
524162,pdb:9XIM,B,pdb:9XIM.B,uniprot:P12851,[2],[393],[3],[394],[3],[394],[392]
524163,pdb:9XIM,C,pdb:9XIM.C,uniprot:P12851,[3],[393],[4],[394],[4],[394],[391]
524164,pdb:9XIM,D,pdb:9XIM.D,uniprot:P12851,[2],[393],[3],[394],[3],[394],[392]


### Create semicolon-separated strings of residue numbers so they can be represented in a CSV file

In [31]:
# Collapse each list of residue numbers (already strings) into a single
# semicolon-separated string. The 'length' column is left as a list on purpose:
# it is summed in the next step.
list_columns = [
    'uniprotStart', 'uniprotEnd',
    'seqresStart', 'seqresEnd',
    'pdbStart', 'pdbEnd',
]
for column in list_columns:
    chains[column] = chains[column].apply(';'.join)

In [32]:
def _total_length(segment_lengths):
    """Return the sum of the segment lengths of one chain."""
    return sum(segment_lengths)


# Total number of mapped residues per chain.
chains['residues'] = chains['length'].apply(_total_length)

In [33]:
# Inspect the last rows of the final chain table.
chains.tail(5)

Unnamed: 0,pdbId,chainId,pdbChainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length,residues
524160,pdb:9XIA,A,pdb:9XIA.A,uniprot:P24300,1,387,1,387,1,387,[387],387
524161,pdb:9XIM,A,pdb:9XIM.A,uniprot:P12851,2,393,3,394,3,394,[392],392
524162,pdb:9XIM,B,pdb:9XIM.B,uniprot:P12851,2,393,3,394,3,394,[392],392
524163,pdb:9XIM,C,pdb:9XIM.C,uniprot:P12851,3,393,4,394,4,394,[391],391
524164,pdb:9XIM,D,pdb:9XIM.D,uniprot:P12851,2,393,3,394,3,394,[392],392


In [34]:
# Write the chain table, without the DataFrame index, to the import directory.
chain_csv = NEO4J_IMPORT / '01f-PDBChain.csv'
chains.to_csv(chain_csv, index=False)