# Downloads PDB Structure Information
**[Work in progress]**

This notebook downloads 3D-structure information from the Worldwide Protein Data Bank.

Data sources: 
[Protein Data Bank Japan](https://pdbj.org/), 
[PDBe Protein Data Bank Europe](https://www.ebi.ac.uk/pdbe/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import urllib
import urllib.parse
from pathlib import Path

import dateutil
import numpy as np
import pandas as pd

In [2]:
# Lift pandas' display limits so previews show all rows and all columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# Directory the CSV files of this notebook are written to, taken from the
# NEO4J_IMPORT environment variable.
# Fail early with a clear message when the variable is unset or empty:
# Path(None) raises an opaque TypeError, and Path('') would silently point
# at the current working directory.
neo4j_import_dir = os.getenv('NEO4J_IMPORT')
if not neo4j_import_dir:
    raise RuntimeError(
        "Environment variable NEO4J_IMPORT is not set; "
        "it must point to the Neo4j import directory."
    )
NEO4J_IMPORT = Path(neo4j_import_dir)
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


### Get PDB summary data

In [4]:
# PDBj endpoint the SQL queries below are sent to; the query goes in the `q`
# parameter and CSV output is requested with `format=csv`.
URL = 'https://pdbj.org/rest/mine2_sql'

In [5]:
# Per-entry summary columns, aliased to the names used further down.
sqlQuery = (
    'SELECT pdbid, release_date AS releasedate, pdbx_descriptor AS description, '
    'struct_title AS title, exptl_method AS methods, resolution '
    'FROM brief_summary'
)
# Percent-encode the SQL so it can be passed as the `q` URL parameter.
encodedSQL = urllib.parse.quote(sqlQuery)
summary_url = ''.join([URL, "?format=csv&q=", encodedSQL])

In [6]:
# Fetch the query result as CSV straight from the URL; dtype=str keeps every
# column as text instead of letting pandas infer numeric types.
df_summary = pd.read_csv(summary_url, dtype=str)

In [7]:
# Switch the two lower-case column names to camelCase in one pass, then
# upper-case the PDB IDs.
df_summary.rename(
    columns={'pdbid': 'pdbId', 'releasedate': 'releaseDate'},
    inplace=True,
)
df_summary['pdbId'] = df_summary['pdbId'].str.upper()

In [8]:
# Report the row count and preview the first rows of the summary table.
print('Number of records:', len(df_summary))
df_summary.head()

Number of records: 171588


Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution
0,1XV5,2005-08-30,"DNA alpha-glucosyltransferase, CHLORIDE ION, U...",alpha-glucosyltransferase (AGT) in complex wit...,['X-RAY DIFFRACTION'],1.73
1,2QAG,2007-08-07,"Septin-2, Septin-6, Septin-7, GUANOSINE-5'-DIP...",Crystal structure of human septin trimer 2/6/7,['X-RAY DIFFRACTION'],4.0
2,1XV6,2005-04-05,5'-R(*(C2L)P*(G2L)P*(C2L)P*(G2L)P*(A2L)P*(A2L)...,"The solution structure of 2',5'-linked 3'-O-(2...",['SOLUTION NMR'],
3,2QAI,2007-06-26,"V-type ATP synthase subunit F, water",Crystal structure of the V-type ATP synthase s...,['X-RAY DIFFRACTION'],2.4
4,1XV8,2005-10-11,"Alpha-amylase, CALCIUM ION, CHLORIDE ION, water",Crystal Structure of Human Salivary Alpha-Amyl...,['X-RAY DIFFRACTION'],3.0


### Get refinement data

In [9]:
# TODO: fetch only the two needed columns instead of the whole table.
# The attempted query
#     SELECT entity_id, ls_R_factor_R_free AS rFree FROM refine
# doesn't work, so all columns are requested for now.
# NOTE(review): that query asks for `entity_id`, whereas the read_csv call on
# this URL selects `entry_id`; the mixed-case column name may also need double
# quotes if the backend folds unquoted identifiers to lower case -- confirm.
sqlQuery = 'SELECT * FROM refine'
encodedSQL = urllib.parse.quote(sqlQuery)
refine_url = ''.join([URL, '?format=csv&q=', encodedSQL])

In [10]:
# The query returns every column of `refine`; usecols keeps only the R-free
# column and the entry ID, both read as text (dtype=str).
df_refine = pd.read_csv(refine_url, usecols=['ls_R_factor_R_free', 'entry_id'], dtype=str)

In [11]:
# Give both columns their final names in one call, then replace missing
# values with empty strings.
df_refine.rename(
    columns={'ls_R_factor_R_free': 'rFree', 'entry_id': 'pdbId'},
    inplace=True,
)
df_refine.fillna('', inplace=True)

In [12]:
# Report the row count and show five randomly chosen refinement rows.
print('Number of records:', len(df_refine))
df_refine.sample(5)

Number of records: 154811


Unnamed: 0,pdbId,rFree
11053,1N38,0.263
75668,4MCD,0.2349
117809,4Z8T,0.222
86607,4YFG,0.3148
20909,1Z0G,0.32876


In [13]:
# Left join: every summary row is kept; entries without a refine row get an
# empty rFree once missing values are replaced by ''.
structures = pd.merge(df_summary, df_refine, how='left', on='pdbId')
structures.fillna('', inplace=True)

In [14]:
# Keep only the first row per pdbId (presumably guards against entries that
# matched several refine rows in the left merge -- TODO confirm).
structures.drop_duplicates(subset=['pdbId'], inplace=True)

In [15]:
# Report the number of rows left after de-duplication and show ten random rows.
print('Number of structures:', len(structures))
structures.sample(10)

Number of structures: 171588


Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution,rFree
155278,6UOU,2020-05-13,Abscisic acid receptor PYL5,MicroED structure of OsPYL/RCAR5 (24-29) at 9 ...,['ELECTRON CRYSTALLOGRAPHY'],1.001,0.2497
151865,6IAF,2019-05-22,"Ferritin, middle subunit, FE (II) ION, MAGNESI...",Fifteen minutes iron loaded Rana Catesbeiana H...,['X-RAY DIFFRACTION'],1.35,0.14072
110076,6ETJ,2018-11-07,"6-phosphofructo-2-kinase/fructose-2,6-bisphosp...",HUMAN PFKFB3 IN COMPLEX WITH KAN0438241,['X-RAY DIFFRACTION'],2.51,0.2353
132917,1D6G,1999-11-17,"cholecystokinin type a receptor, cholecystokin...",MOLECULAR COMPLEX OF CHOLECYSTOKININ-8 AND N-T...,['SOLUTION NMR'],,
120483,6NY9,2020-01-22,"Mycophenolic acid acyl-glucuronide esterase, m...",Alpha/beta hydrolase domain-containing protein...,['X-RAY DIFFRACTION'],1.66,0.2188
68693,4UWU,2014-12-10,"LYSOZYME C, CHLORIDE ION, SODIUM ION, FORMIC A...",Lysozyme soaked with a ruthenium based CORM wi...,['X-RAY DIFFRACTION'],1.78,0.19522
26608,2GA6,2006-08-15,"orf1a polyprotein, ZINC ION, water",The crystal structure of SARS nsp10 without zi...,['X-RAY DIFFRACTION'],2.7,0.267
127411,1E83,2000-11-06,"CYTOCHROME C', HEME C, water",Cytochrome c' from Alcaligenes xylosoxidans - ...,['X-RAY DIFFRACTION'],2.05,0.252
95697,1EW8,2002-05-01,"ALKALINE PHOSPHATASE, ZINC ION, PHOSPHATE ION,...",ALKALINE PHOSPHATASE (E.C. 3.1.3.1) COMPLEX WI...,['X-RAY DIFFRACTION'],2.2,0.257
81259,5W9M,2017-08-16,"Spike glycoprotein, G4 VH, G4 VL",MERS S ectodomain trimer in complex with varia...,['ELECTRON MICROSCOPY'],4.7,


In [16]:
# Turn the bare PDB ID into a CURIE by prefixing it with 'pdb:'.
structures['pdbId'] = 'pdb:' + structures['pdbId']

# Convert the methods string (e.g. "['X-RAY DIFFRACTION']") to a semicolon
# separated list (our default one-to-many representation in CSV files).
# regex=False is passed explicitly: the default of `regex` differs between
# pandas versions, and '[' / ']' are regex metacharacters, so every pattern
# here must be treated as a literal string.
# Order matters: ',' -> ';' has to run before '; ' -> ';'.
for old, new in (('[', ''), (']', ''), ("'", ''), (',', ';'), ('; ', ';')):
    structures['methods'] = structures['methods'].str.replace(old, new, regex=False)

In [17]:
# List the distinct method strings to check the result of the conversion.
structures['methods'].unique()

array(['X-RAY DIFFRACTION', 'SOLUTION NMR', 'ELECTRON MICROSCOPY',
       'SOLUTION SCATTERING', 'SOLUTION NMR;SOLUTION SCATTERING;HYBRID',
       'X-RAY DIFFRACTION;NEUTRON DIFFRACTION;HYBRID', 'SOLID-STATE NMR',
       'ELECTRON CRYSTALLOGRAPHY', 'FIBER DIFFRACTION',
       'SOLUTION SCATTERING;SOLUTION NMR;HYBRID',
       'SOLUTION NMR;SOLID-STATE NMR;HYBRID',
       'SOLID-STATE NMR;ELECTRON MICROSCOPY;HYBRID', 'POWDER DIFFRACTION',
       'NEUTRON DIFFRACTION', 'SOLUTION NMR;THEORETICAL MODEL;HYBRID',
       'ELECTRON MICROSCOPY;SOLUTION SCATTERING;HYBRID',
       'NEUTRON DIFFRACTION;X-RAY DIFFRACTION;HYBRID',
       'X-RAY DIFFRACTION;EPR;HYBRID',
       'FIBER DIFFRACTION;SOLID-STATE NMR;HYBRID',
       'ELECTRON MICROSCOPY;SOLID-STATE NMR;HYBRID',
       'ELECTRON MICROSCOPY;SOLUTION NMR;HYBRID',
       'ELECTRON MICROSCOPY;SOLUTION NMR;SOLID-STATE NMR;HYBRID',
       'X-RAY DIFFRACTION;SOLUTION SCATTERING;HYBRID',
       'NEUTRON DIFFRACTION;SOLUTION NMR;HYBRID',
       'SOLI

In [18]:
# Preview the table with the CURIE prefix and the cleaned-up methods column.
structures.head()

Unnamed: 0,pdbId,releaseDate,description,title,methods,resolution,rFree
0,pdb:1XV5,2005-08-30,"DNA alpha-glucosyltransferase, CHLORIDE ION, U...",alpha-glucosyltransferase (AGT) in complex wit...,X-RAY DIFFRACTION,1.73,0.205
1,pdb:2QAG,2007-08-07,"Septin-2, Septin-6, Septin-7, GUANOSINE-5'-DIP...",Crystal structure of human septin trimer 2/6/7,X-RAY DIFFRACTION,4.0,0.39208
2,pdb:1XV6,2005-04-05,5'-R(*(C2L)P*(G2L)P*(C2L)P*(G2L)P*(A2L)P*(A2L)...,"The solution structure of 2',5'-linked 3'-O-(2...",SOLUTION NMR,,
3,pdb:2QAI,2007-06-26,"V-type ATP synthase subunit F, water",Crystal structure of the V-type ATP synthase s...,X-RAY DIFFRACTION,2.4,0.238
4,pdb:1XV8,2005-10-11,"Alpha-amylase, CALCIUM ION, CHLORIDE ION, water",Crystal Structure of Human Salivary Alpha-Amyl...,X-RAY DIFFRACTION,3.0,0.271


In [19]:
# Write the structure table into the NEO4J_IMPORT directory; index=False
# leaves the DataFrame index out of the file.
structures.to_csv(NEO4J_IMPORT / "01f-PDBStructure.csv", index=False)

### Get PDB Chain - UniProt sequence mappings

In [20]:
# SIFTS flat file with the observed UniProt segments per PDB chain.
# Fetched over HTTPS rather than plain HTTP so the download cannot be
# tampered with in transit before it is loaded into the graph.
sifts_url = 'https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/uniprot_segments_observed.tsv.gz'

In [21]:
# Read the tab-separated SIFTS mapping; skiprows=1 drops the first line of the
# file so that the following line supplies the column names.
# keep_default_na=False: even with dtype=str, read_csv otherwise converts
# literal values such as "NA", "N/A", "null" or "NaN" to missing values.
# A chain whose ID is "NA" would then end up with a NaN chainId/pdbChainId and
# be silently dropped by the later groupby, which excludes NaN keys.
chains = pd.read_csv(sifts_url, sep='\t', skiprows=1, dtype=str, keep_default_na=False)
chains.head()

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
0,105m,A,P02185,1,153,1,153,2,154
1,113l,A,P00720,1,162,1,162,1,162
2,120l,A,P00720,1,162,1,162,1,162
3,123l,A,P00720,1,162,1,162,1,162
4,128l,A,P00720,1,162,1,162,1,162


In [22]:
# Map the upper-case SIFTS column names to the camelCase names used from here on.
chains.rename(
    columns={
        'PDB': 'pdbId',
        'CHAIN': 'chainId',
        'SP_PRIMARY': 'accession',
        'RES_BEG': 'seqresStart',
        'RES_END': 'seqresEnd',
        'PDB_BEG': 'pdbStart',
        'PDB_END': 'pdbEnd',
        'SP_BEG': 'uniprotStart',
        'SP_END': 'uniprotEnd',
    },
    inplace=True,
)

In [23]:
# Preview the first rows with the renamed columns.
chains.head()

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd
0,105m,A,P02185,1,153,1,153,2,154
1,113l,A,P00720,1,162,1,162,1,162
2,120l,A,P00720,1,162,1,162,1,162
3,123l,A,P00720,1,162,1,162,1,162
4,128l,A,P00720,1,162,1,162,1,162


In [24]:
# Upper-case the PDB IDs (the preview above shows them in lower case, e.g. 105m).
chains['pdbId'] = chains['pdbId'].str.upper()

In [25]:
# Build a per-chain identifier of the form <pdbId>.<chainId>.
chains['pdbChainId'] = chains['pdbId'] + "." + chains['chainId']

### Sort chain segments by UniProt start residue number

In [26]:
# The residue numbers arrive as text: convert them to integers to compute the
# segment length and to sort numerically, then convert them back to text.
for residue_col in ('uniprotStart', 'uniprotEnd'):
    chains[residue_col] = chains[residue_col].astype(int)

# Inclusive range, hence the +1.
chains['length'] = chains['uniprotEnd'] - chains['uniprotStart'] + 1
chains.sort_values(by='uniprotStart', inplace=True)

for residue_col in ('uniprotStart', 'uniprotEnd'):
    chains[residue_col] = chains[residue_col].astype(str)

### Assign CURIEs

In [27]:
# Prefix each identifier column with its namespace to form a CURIE.
for id_column, prefix in (
    ('accession', 'uniprot:'),
    ('pdbId', 'pdb:'),
    ('pdbChainId', 'pdb:'),
):
    chains[id_column] = prefix + chains[id_column]

In [28]:
# Preview the sorted rows with the CURIE prefixes applied.
chains.head()

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,pdbChainId,length
239783,pdb:5CSS,A,uniprot:Q9HLB6,3,216,1,214,1,214,pdb:5CSS.A,214
76777,pdb:6SMD,B,uniprot:Q7N385,1,310,1,310,1,310,pdb:6SMD.B,310
392556,pdb:2HK9,B,uniprot:O67049,7,273,1,267,1,267,pdb:2HK9.B,267
604503,pdb:3K9U,A,uniprot:Q9HL57,1,159,1,159,1,159,pdb:3K9U.A,159
604504,pdb:3K9U,B,uniprot:Q9HL57,1,159,1,159,1,159,pdb:3K9U.B,159


### Group data by PDB chains

In [29]:
# One row per (pdbId, chainId, pdbChainId, accession); every remaining column
# is collected into a list holding one element per original row of the group.
chains = chains.groupby(['pdbId','chainId','pdbChainId','accession']).agg(list).reset_index()

In [30]:
# Preview the last rows of the grouped table; the aggregated columns now hold lists.
chains.tail()

Unnamed: 0,pdbId,chainId,pdbChainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length
515605,pdb:9XIA,A,pdb:9XIA.A,uniprot:P24300,[1],[387],[1],[387],[1],[387],[387]
515606,pdb:9XIM,A,pdb:9XIM.A,uniprot:P12851,[2],[393],[3],[394],[3],[394],[392]
515607,pdb:9XIM,B,pdb:9XIM.B,uniprot:P12851,[2],[393],[3],[394],[3],[394],[392]
515608,pdb:9XIM,C,pdb:9XIM.C,uniprot:P12851,[3],[393],[4],[394],[4],[394],[391]
515609,pdb:9XIM,D,pdb:9XIM.D,uniprot:P12851,[2],[393],[3],[394],[3],[394],[392]


### Create semicolon-separated strings of residue numbers so they can be stored in a CSV file

In [31]:
# Each of these columns holds a list of strings per chain after the grouping;
# collapse every list into a single ';'-joined string.
residue_columns = [
    'uniprotStart', 'uniprotEnd',
    'seqresStart', 'seqresEnd',
    'pdbStart', 'pdbEnd',
]
for residue_column in residue_columns:
    chains[residue_column] = chains[residue_column].apply(lambda values: ';'.join(values))

In [32]:
# Total number of residues per chain: the sum of its per-segment lengths.
chains['residues'] = chains['length'].map(lambda segment_lengths: sum(segment_lengths))

In [33]:
# Preview the final chain table, including the new `residues` column.
chains.tail()

Unnamed: 0,pdbId,chainId,pdbChainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length,residues
515605,pdb:9XIA,A,pdb:9XIA.A,uniprot:P24300,1,387,1,387,1,387,[387],387
515606,pdb:9XIM,A,pdb:9XIM.A,uniprot:P12851,2,393,3,394,3,394,[392],392
515607,pdb:9XIM,B,pdb:9XIM.B,uniprot:P12851,2,393,3,394,3,394,[392],392
515608,pdb:9XIM,C,pdb:9XIM.C,uniprot:P12851,3,393,4,394,4,394,[391],391
515609,pdb:9XIM,D,pdb:9XIM.D,uniprot:P12851,2,393,3,394,3,394,[392],392


In [34]:
# Write the chain table into the NEO4J_IMPORT directory without the DataFrame index.
# NOTE(review): `length` still holds Python lists at this point, so it is written
# in list form (e.g. [392]) rather than as a ';'-joined string like the residue
# number columns -- confirm this is intended.
chains.to_csv(NEO4J_IMPORT / "01f-PDBChain.csv", index=False)