# Downloads PDB Structure Information
**[Work in progress]**

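This notebook downloads 3D-structure metadata for SARS-CoV-2 proteins and the residue-level mappings between PDB chains and UniProt sequences, and saves both as CSV files in the Neo4j import directory.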

Data sources: 
[RCSB Protein Data Bank](https://www.rcsb.org), 
[PDBe Protein Data Bank Europe](https://www.ebi.ac.uk/pdbe/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import numpy as np
import pandas as pd
import requests
import json
import dateutil
from pathlib import Path
from py2neo import Graph
from rcsbsearch import TextQuery
from rcsbsearch import rcsb_attributes as attrs

In [2]:
# Lift pandas' display limits so cell outputs are never truncated.
pd.set_option('display.max_rows', None)  # display all rows
pd.set_option('display.max_columns', None)  # display all columns

In [3]:
# Directory the CSV files of this notebook are written to; it is taken from
# the NEO4J_IMPORT environment variable.
neo4j_import_dir = os.getenv('NEO4J_IMPORT')
if neo4j_import_dir is None:
    # Path(None) would otherwise fail with a TypeError that does not mention
    # the missing variable.
    raise RuntimeError(
        "Environment variable NEO4J_IMPORT is not set; "
        "it must point to the Neo4j import directory."
    )
NEO4J_IMPORT = Path(neo4j_import_dir)
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


In [4]:
# Taxonomy ID that the structure search compares against the source-organism
# taxonomy lineage of each entry (presumably the NCBI taxonomy ID of
# SARS-CoV-2 -- confirm). Kept as a string.
taxonomy_id = '2697049'

### Find PDB Structures containing SARS-CoV-2 proteins or nucleic acids

In [5]:
# Select every PDB entry whose source-organism taxonomy lineage contains
# `taxonomy_id`, and collect the returned entry IDs in a one-column frame.
query = attrs.rcsb_entity_source_organism.taxonomy_lineage.id == taxonomy_id
entries = query('entry')
df = pd.DataFrame({'pdbId': list(entries)})

In [6]:
# Preview the first PDB IDs returned by the search.
df.head(n=5)

Unnamed: 0,pdbId
0,6W9Q
1,6VXS
2,6W9C
3,6VWW
4,6VYO


In [7]:
# Report how many PDB entries the search returned.
print("Number of structures:", len(df['pdbId']))

Number of structures: 541


### Get structure metadata

In [8]:
def get_pdb_entry_data(row):
    """Fetch core metadata for one PDB entry from the RCSB Data API.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the bare PDB ID (e.g. ``'6W9Q'``) under the key ``'pdbId'``.

    Returns
    -------
    numpy.ndarray
        Object array of length 9 holding, in order: PDB ID, title,
        description, deposit date, initial release date, resolution,
        R-factor, R-free and experimental method. Optional values that the
        entry does not report are returned as empty strings.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. unknown PDB ID).
    KeyError
        If a mandatory section (``struct``, ``rcsb_accession_info``,
        ``exptl``) is missing from the reply.
    """
    pdb_id = row['pdbId']

    # HTTPS plus a timeout: one stalled request must not hang the whole run.
    reply = requests.get(f'https://data.rcsb.org/rest/v1/core/entry/{pdb_id}', timeout=60)
    # Fail with the HTTP error itself rather than a KeyError on an error body.
    reply.raise_for_status()
    response = reply.json()

    struct = response['struct']
    accession_info = response['rcsb_accession_info']
    # The validation summary is optional as a whole, not only its fields.
    validation = response.get('pdbx_vrpt_summary') or {}

    data = np.empty(9, dtype=object)

    data[0] = pdb_id
    data[1] = struct['title']
    data[2] = struct.get('pdbx_descriptor', '')
    data[3] = accession_info['deposit_date']
    data[4] = accession_info['initial_release_date']
    data[5] = validation.get('pdbresolution', '')
    data[6] = validation.get('pdbr', '')
    data[7] = validation.get('pdbrfree', '')
    # Only the first listed experimental method is recorded.
    data[8] = response['exptl'][0]['method']

    return data

In [9]:
# Fetch the metadata row by row (one call of get_pdb_entry_data per PDB ID);
# each returned array is expanded into the columns of `structures`.
structures = df.apply(
    get_pdb_entry_data,
    axis=1,
    result_type='expand',
)

In [10]:
# Column names, in the order get_pdb_entry_data fills its result array.
structures.columns = [
    'pdbId',
    'title',
    'description',
    'depositDate',
    'releaseDate',
    'resolution',
    'rFactor',
    'rFree',
    'method',
]

In [11]:
# Keep only the first 10 characters of each date string (the calendar date)
# and convert them to datetimes in one vectorized call. pd.to_datetime is
# used because the notebook only runs `import dateutil`, so dateutil.parser
# is available only if something else happened to import it.
for date_column in ('depositDate', 'releaseDate'):
    structures[date_column] = pd.to_datetime(structures[date_column].str[:10])

In [12]:
# Prefix the bare PDB IDs so they become CURIEs of the form pdb:<ID>.
# NOTE: running this cell twice adds the prefix twice.
pdb_curie_prefix = 'pdb:'
structures['pdbId'] = pdb_curie_prefix + structures['pdbId']

In [13]:
# Preview the first rows of the structure metadata table.
structures.head(n=5)

Unnamed: 0,pdbId,title,description,depositDate,releaseDate,resolution,rFactor,rFree,method
0,pdb:6W9Q,Peptide-bound SARS-CoV-2 Nsp9 RNA-replicase,Non-structural protein 9,2020-03-23,2020-04-08,2.05,0.23,0.25,X-RAY DIFFRACTION
1,pdb:6VXS,Crystal Structure of ADP ribose phosphatase of...,Non-structural protein 3,2020-02-24,2020-03-04,2.03,0.19,0.23,X-RAY DIFFRACTION
2,pdb:6W9C,The crystal structure of papain-like protease ...,Papain-like proteinase,2020-03-22,2020-04-01,2.7,0.23,0.28,X-RAY DIFFRACTION
3,pdb:6VWW,Crystal Structure of NSP15 Endoribonuclease fr...,Uridylate-specific endoribonuclease,2020-02-20,2020-03-04,2.2,0.16,0.18,X-RAY DIFFRACTION
4,pdb:6VYO,Crystal structure of RNA binding domain of nuc...,RNA binding domain of SARS-CoV-2 nucleocapsid ...,2020-02-27,2020-03-11,1.7,0.16,0.21,X-RAY DIFFRACTION


In [14]:
# Write the structure metadata (without the row index) to the import directory.
structure_csv_path = NEO4J_IMPORT / "01f-PDBStructure.csv"
structures.to_csv(structure_csv_path, index=False)

### Get PDB Chain - UniProt sequence mappings

In [15]:
# Location of the gzip-compressed, tab-separated SIFTS file that is read into
# `segments` (PDB chain / UniProt residue ranges).
sifts_url = (
    'http://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/'
    'uniprot_segments_observed.tsv.gz'
)

In [16]:
# Load the SIFTS segment table with every column as a string.
# By default pandas treats tokens such as "NA" as missing values even with
# dtype=str, which would turn a chain ID "NA" into NaN; such rows would then
# be dropped by the later groupby. Only empty fields are treated as missing.
segments = pd.read_csv(
    sifts_url,
    sep='\t',
    skiprows=1,  # the first line of the file precedes the column header
    dtype=str,
    keep_default_na=False,
    na_values=[''],
)
segments.head()

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
0,128l,A,P00720,1,162,1,162,1,162
1,123l,A,P00720,1,162,1,162,1,162
2,105m,A,P02185,1,153,1,153,2,154
3,113l,A,P00720,1,162,1,162,1,162
4,120l,A,P00720,1,162,1,162,1,162


In [17]:
# Map the SIFTS column headers to the property names used in this notebook.
segment_column_names = {
    'PDB': 'pdbId',
    'CHAIN': 'chainId',
    'SP_PRIMARY': 'accession',
    'RES_BEG': 'seqresStart',
    'RES_END': 'seqresEnd',
    'PDB_BEG': 'pdbStart',
    'PDB_END': 'pdbEnd',
    'SP_BEG': 'uniprotStart',
    'SP_END': 'uniprotEnd',
}
segments.rename(columns=segment_column_names, inplace=True)

In [18]:
# Preview the segment table after renaming its columns.
segments.head(n=5)

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd
0,128l,A,P00720,1,162,1,162,1,162
1,123l,A,P00720,1,162,1,162,1,162
2,105m,A,P02185,1,153,1,153,2,154
3,113l,A,P00720,1,162,1,162,1,162
4,120l,A,P00720,1,162,1,162,1,162


In [19]:
# The segment table lists PDB IDs in lower case, while the IDs in `df` are
# upper case; convert them so the merge on pdbId finds matching rows.
upper_case_pdb_ids = segments['pdbId'].str.upper()
segments['pdbId'] = upper_case_pdb_ids

In [20]:
# Build a chain identifier of the form <pdbId>.<chainId>.
pdb_ids = segments['pdbId']
chain_ids = segments['chainId']
segments['pdbChainId'] = pdb_ids + "." + chain_ids

In [21]:
# Keep only the segments whose PDB ID is among the entries found by the search.
coverage = segments.merge(right=df, how='inner', on='pdbId')

### Sort segments by UniProt residue number

In [22]:
# Residue numbers were read as strings; convert them to integers so that the
# segment length can be computed and the sort is numeric, not lexical.
uniprot_start_numbers = coverage['uniprotStart'].astype(int)
uniprot_end_numbers = coverage['uniprotEnd'].astype(int)

coverage['uniprotStart'] = uniprot_start_numbers
coverage['uniprotEnd'] = uniprot_end_numbers
coverage['length'] = uniprot_end_numbers - uniprot_start_numbers + 1  # inclusive range
coverage.sort_values(by='uniprotStart', inplace=True)

# Back to strings, as needed for the string join applied after grouping.
coverage['uniprotStart'] = coverage['uniprotStart'].astype(str)
coverage['uniprotEnd'] = coverage['uniprotEnd'].astype(str)

### Assign CURIEs

In [23]:
# Prepend the namespace prefix to each identifier column.
# NOTE: running this cell twice adds the prefixes twice.
curie_prefixes = (
    ('accession', 'uniprot:'),
    ('pdbId', 'pdb:'),
    ('pdbChainId', 'pdb:'),
)
for column, prefix in curie_prefixes:
    coverage[column] = prefix + coverage[column]

In [24]:
# Preview the sorted, prefixed segment table.
coverage.head(n=5)

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,pdbChainId,length
1091,pdb:6ZOJ,G,uniprot:P62753,1,230,1,230,1,230,pdb:6ZOJ.G,230
2365,pdb:6ZMI,Ln,uniprot:P62945,1,24,1,24,1,24,pdb:6ZMI.Ln,24
4637,pdb:6ZM7,SG,uniprot:P62753,1,237,1,237,1,237,pdb:6ZM7.SG,237
2372,pdb:6ZMI,Lz,uniprot:P62906,1,217,1,217,1,217,pdb:6ZMI.Lz,217
2376,pdb:6ZMI,SD,uniprot:P23396,1,227,1,227,1,227,pdb:6ZMI.SD,227


### Group data by PDB chains

In [25]:
# Collapse the segments of each chain/accession pair into one row; every
# remaining column becomes a list of per-segment values.
chain_key_columns = ['pdbId', 'chainId', 'pdbChainId', 'accession']
coverage = (
    coverage
    .groupby(chain_key_columns)
    .agg(list)
    .reset_index()
)

In [26]:
# Inspect the last grouped rows (lists of per-segment values).
coverage.tail(n=5)

Unnamed: 0,pdbId,chainId,pdbChainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length
1682,pdb:7KKL,C,pdb:7KKL.C,uniprot:P0DTC2,"[27, 80, 165, 186, 263, 641, 689, 854]","[69, 143, 172, 245, 620, 676, 827, 1147]","[27, 80, 165, 186, 263, 641, 689, 854]","[69, 143, 172, 245, 620, 676, 827, 1147]","[27, 80, 165, 186, 263, 641, 689, 854]","[69, 143, 172, 245, 620, 676, 827, 1147]","[43, 64, 8, 60, 358, 36, 139, 294]"
1683,pdb:7KKL,D,pdb:7KKL.D,uniprot:P0DTC2,"[27, 80, 165, 186, 263, 641, 689, 854]","[69, 143, 172, 245, 620, 676, 827, 1147]","[27, 80, 165, 186, 263, 641, 689, 854]","[69, 143, 172, 245, 620, 676, 827, 1147]","[27, 80, 165, 186, 263, 641, 689, 854]","[69, 143, 172, 245, 620, 676, 827, 1147]","[43, 64, 8, 60, 358, 36, 139, 294]"
1684,pdb:7KL9,A,pdb:7KL9.A,uniprot:P0DTC2,"[27, 80, 156, 188, 263, 520, 641, 690, 856]","[66, 143, 172, 243, 516, 620, 676, 826, 1145]","[27, 80, 156, 188, 263, 520, 641, 690, 856]","[66, 143, 172, 243, 516, 620, 676, 826, 1145]","[27, 80, 156, 188, 263, 520, 641, 690, 856]","[66, 143, 172, 243, 516, 620, 676, 826, 1145]","[40, 64, 17, 56, 254, 101, 36, 137, 290]"
1685,pdb:7KL9,B,pdb:7KL9.B,uniprot:P0DTC2,"[27, 80, 156, 188, 263, 641, 690, 856]","[66, 143, 172, 243, 620, 676, 826, 1145]","[27, 80, 156, 188, 263, 641, 690, 856]","[66, 143, 172, 243, 620, 676, 826, 1145]","[27, 80, 156, 188, 263, 641, 690, 856]","[66, 143, 172, 243, 620, 676, 826, 1145]","[40, 64, 17, 56, 358, 36, 137, 290]"
1686,pdb:7KL9,C,pdb:7KL9.C,uniprot:P0DTC2,"[27, 80, 156, 188, 263, 520, 641, 690, 856]","[66, 143, 172, 243, 516, 620, 676, 826, 1145]","[27, 80, 156, 188, 263, 520, 641, 690, 856]","[66, 143, 172, 243, 516, 620, 676, 826, 1145]","[27, 80, 156, 188, 263, 520, 641, 690, 856]","[66, 143, 172, 243, 516, 620, 676, 826, 1145]","[40, 64, 17, 56, 254, 101, 36, 137, 290]"


### Create semicolon-separated strings of residue numbers so they can be represented in a CSV file

In [27]:
# Flatten each list of residue-number strings into one ';'-separated string.
residue_number_columns = (
    'uniprotStart',
    'uniprotEnd',
    'seqresStart',
    'seqresEnd',
    'pdbStart',
    'pdbEnd',
)
for column in residue_number_columns:
    coverage[column] = coverage[column].apply(lambda values: ';'.join(values))

In [28]:
# Total number of residues per chain: the sum of its segment lengths.
coverage['residues'] = [sum(segment_lengths) for segment_lengths in coverage['length']]

In [29]:
# Inspect the last rows of the final per-chain table.
coverage.tail(n=5)

Unnamed: 0,pdbId,chainId,pdbChainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,length,residues
1682,pdb:7KKL,C,pdb:7KKL.C,uniprot:P0DTC2,27;80;165;186;263;641;689;854,69;143;172;245;620;676;827;1147,27;80;165;186;263;641;689;854,69;143;172;245;620;676;827;1147,27;80;165;186;263;641;689;854,69;143;172;245;620;676;827;1147,"[43, 64, 8, 60, 358, 36, 139, 294]",1002
1683,pdb:7KKL,D,pdb:7KKL.D,uniprot:P0DTC2,27;80;165;186;263;641;689;854,69;143;172;245;620;676;827;1147,27;80;165;186;263;641;689;854,69;143;172;245;620;676;827;1147,27;80;165;186;263;641;689;854,69;143;172;245;620;676;827;1147,"[43, 64, 8, 60, 358, 36, 139, 294]",1002
1684,pdb:7KL9,A,pdb:7KL9.A,uniprot:P0DTC2,27;80;156;188;263;520;641;690;856,66;143;172;243;516;620;676;826;1145,27;80;156;188;263;520;641;690;856,66;143;172;243;516;620;676;826;1145,27;80;156;188;263;520;641;690;856,66;143;172;243;516;620;676;826;1145,"[40, 64, 17, 56, 254, 101, 36, 137, 290]",995
1685,pdb:7KL9,B,pdb:7KL9.B,uniprot:P0DTC2,27;80;156;188;263;641;690;856,66;143;172;243;620;676;826;1145,27;80;156;188;263;641;690;856,66;143;172;243;620;676;826;1145,27;80;156;188;263;641;690;856,66;143;172;243;620;676;826;1145,"[40, 64, 17, 56, 358, 36, 137, 290]",998
1686,pdb:7KL9,C,pdb:7KL9.C,uniprot:P0DTC2,27;80;156;188;263;520;641;690;856,66;143;172;243;516;620;676;826;1145,27;80;156;188;263;520;641;690;856,66;143;172;243;516;620;676;826;1145,27;80;156;188;263;520;641;690;856,66;143;172;243;516;620;676;826;1145,"[40, 64, 17, 56, 254, 101, 36, 137, 290]",995


In [30]:
# Write the per-chain coverage table (without the row index) to the import directory.
chain_csv_path = NEO4J_IMPORT / "01f-PDBChain.csv"
coverage.to_csv(chain_csv_path, index=False)