# Downloads PDB Structure Information
**[Work in progress]**

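This notebook downloads 3D-structure information for SARS-CoV-2 proteins and nucleic acids, together with PDB chain to UniProt sequence mappings, and saves both as CSV files in the Neo4j import directory.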

Data sources: 
[RCSB Protein Data Bank](https://www.rcsb.org), 
[PDBe Protein Data Bank Europe](https://www.ebi.ac.uk/pdbe/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import numpy as np
import pandas as pd
import requests
import json
import dateutil
from pathlib import Path
from py2neo import Graph
from rcsbsearch import TextQuery
from rcsbsearch import rcsb_attributes as attrs

In [2]:
# Lift pandas' display limits so frames are rendered in full.
pd.set_option('display.max_rows', None)     # display all rows
pd.set_option('display.max_columns', None)  # display all columns

In [3]:
# Directory the CSV files are written to, taken from the NEO4J_IMPORT
# environment variable. Fail early with an explicit message: Path(None) raises
# an opaque TypeError, and Path('') would silently point at the current directory.
neo4j_import_dir = os.getenv('NEO4J_IMPORT')
if not neo4j_import_dir:
    raise RuntimeError(
        "Environment variable NEO4J_IMPORT is not set; "
        "it must point to the Neo4j import directory."
    )
NEO4J_IMPORT = Path(neo4j_import_dir)
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-328d8379-6ab4-4cc1-a397-2de37909d2e4/installation-4.1.0/import


In [4]:
# Taxonomy identifier matched against the source-organism taxonomy lineage
# in the structure search below.
taxonomy_id = '2697049'

### Find PDB Structures containing SARS-CoV-2 proteins or nucleic acids

In [5]:
# Create terminals for each query
# Disabled alternative: full-text search on the taxonomy id.
#q1 = TextQuery(taxonomy_id)
# Attribute terminal: rcsb_entity_source_organism.taxonomy_lineage.id equals taxonomy_id.
q2 = attrs.rcsb_entity_source_organism.taxonomy_lineage.id == taxonomy_id

# Terminals can be combined using bitwise operators (&, |, ~, etc.)
# Disabled alternative: require both terminals to match.
#query = q1 & q2  # AND of all queries
query = q2

# Call the query to execute it
# NOTE(review): 'entry' presumably selects entry-level results (the output below
# shows PDB entry ids) -- confirm against the rcsbsearch documentation.
entries = query('entry')
# Collect the returned identifiers into a single-column frame named 'pdbId'.
df = pd.DataFrame(entries, columns=['pdbId'])

In [6]:
df.head()

Unnamed: 0,pdbId
0,5R84
1,5R83
2,5R7Y
3,5R80
4,5R82


In [7]:
print("Number of structures:", df['pdbId'].shape[0])

Number of structures: 371


### Get structure metadata

In [8]:
def _extract_entry_fields(pdb_id, entry):
    """Flatten one decoded core-entry record into the 9-element metadata row.

    Parameters
    ----------
    pdb_id : str
        Identifier of the entry; stored unchanged as the first element.
    entry : dict
        Decoded JSON document returned by the core entry endpoint.

    Returns
    -------
    numpy.ndarray
        Object array: id, title, descriptor, deposit date, initial release
        date, resolution, R factor, R free, experimental method. Optional
        values that are absent are returned as ''.

    Raises
    ------
    KeyError
        If the title, accession dates, or experimental method are missing.
    """
    struct = entry['struct']
    accession_info = entry['rcsb_accession_info']
    # The individual validation fields were already treated as optional; treat
    # the enclosing summary the same way instead of raising KeyError without it.
    validation = entry.get('pdbx_vrpt_summary') or {}

    fields = (
        pdb_id,
        struct['title'],
        struct.get('pdbx_descriptor', ''),
        accession_info['deposit_date'],
        accession_info['initial_release_date'],
        validation.get('pdbresolution', ''),
        validation.get('pdbr', ''),
        validation.get('pdbrfree', ''),
        entry['exptl'][0]['method'],  # only the first listed method is kept
    )

    data = np.empty(len(fields), dtype=object)
    for position, value in enumerate(fields):
        data[position] = value
    return data


def get_pdb_entry_data(row):
    """Download the metadata of one PDB entry from the RCSB core entry endpoint.

    Parameters
    ----------
    row : mapping
        Row (or any mapping) providing the entry identifier under 'pdbId'.

    Returns
    -------
    numpy.ndarray
        Object array of 9 values, see `_extract_entry_fields`.

    Raises
    ------
    requests.HTTPError
        If the service answers with an error status.
    KeyError
        If a mandatory field is missing from the returned record.
    """
    pdb_id = row['pdbId']

    # The timeout keeps one stalled request from blocking the whole DataFrame.apply run.
    response = requests.get(f'https://data.rcsb.org/rest/v1/core/entry/{pdb_id}', timeout=30)
    # Surface HTTP failures directly rather than as a KeyError on the error payload.
    response.raise_for_status()

    return _extract_entry_fields(pdb_id, response.json())

In [9]:
structures = df.apply(get_pdb_entry_data, axis=1, result_type='expand')

In [10]:
structures.columns = ['pdbId', 'title', 'description', 'depositDate', 'releaseDate', 'resolution', 'rFactor', 'rFree', 'method']

In [11]:
# Keep only the first 10 characters (the date part) of each timestamp and convert
# the whole column at once. pd.to_datetime replaces the per-row
# dateutil.parser.parse call, which relied on the `dateutil.parser` submodule
# being loaded even though only `import dateutil` appears in the imports.
for date_column in ('depositDate', 'releaseDate'):
    structures[date_column] = pd.to_datetime(structures[date_column].str[:10])

In [12]:
structures['pdbId'] = 'pdb:' + structures['pdbId']

In [13]:
structures.head()

Unnamed: 0,pdbId,title,description,depositDate,releaseDate,resolution,rFactor,rFree,method
0,pdb:5R84,PanDDA analysis group deposition -- Crystal St...,SARS-CoV-2 main protease,2020-03-02,2020-03-10,1.83,0.22,0.29,X-RAY DIFFRACTION
1,pdb:5R83,PanDDA analysis group deposition -- Crystal St...,SARS-CoV-2 main protease,2020-03-02,2020-03-10,1.58,0.18,0.21,X-RAY DIFFRACTION
2,pdb:5R7Y,PanDDA analysis group deposition -- Crystal St...,SARS-CoV-2 main protease,2020-03-02,2020-03-10,1.65,0.18,0.24,X-RAY DIFFRACTION
3,pdb:5R80,PanDDA analysis group deposition -- Crystal St...,SARS-CoV-2 main protease,2020-03-02,2020-03-10,1.93,0.17,0.23,X-RAY DIFFRACTION
4,pdb:5R82,PanDDA analysis group deposition -- Crystal St...,SARS-CoV-2 main protease,2020-03-02,2020-03-10,1.31,0.18,0.21,X-RAY DIFFRACTION


In [14]:
structures.to_csv(NEO4J_IMPORT / "01f-PDBStructure.csv", index=False)

### Get PDB Chain - UniProt sequence mappings

In [15]:
sifts_url = 'http://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/uniprot_segments_observed.tsv.gz'

In [16]:
segments = pd.read_csv(sifts_url, sep='\t', skiprows=1, dtype=str)
segments.head()

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
0,105m,A,P02185,1,153,1,153,2,154
1,113l,A,P00720,1,162,1,162,1,162
2,120l,A,P00720,1,162,1,162,1,162
3,185l,A,P00720,1,162,1,162,1,162
4,128l,A,P00720,1,162,1,162,1,162


In [17]:
# Translate the column headers of the downloaded table to the camelCase names
# used in the rest of the notebook, in a single in-place rename.
segment_column_names = {
    'PDB': 'pdbId',
    'CHAIN': 'chainId',
    'SP_PRIMARY': 'accession',
    'RES_BEG': 'seqresStart',
    'RES_END': 'seqresEnd',
    'PDB_BEG': 'pdbStart',
    'PDB_END': 'pdbEnd',
    'SP_BEG': 'uniprotStart',
    'SP_END': 'uniprotEnd',
}
segments.rename(columns=segment_column_names, inplace=True)

In [18]:
segments.head()

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd
0,105m,A,P02185,1,153,1,153,2,154
1,113l,A,P00720,1,162,1,162,1,162
2,120l,A,P00720,1,162,1,162,1,162
3,185l,A,P00720,1,162,1,162,1,162
4,128l,A,P00720,1,162,1,162,1,162


In [19]:
segments['pdbId'] = segments['pdbId'].str.upper()

In [20]:
segments['pdbChainId'] = segments['pdbId'] + "." + segments['chainId']

In [21]:
coverage = segments.merge(df, on='pdbId')

### Sort segments by UniProt residue number

In [22]:
# The table was read with dtype=str, so cast to int to sort numerically
# instead of lexicographically.
coverage['uniprotStart'] = coverage['uniprotStart'].astype(int)
coverage.sort_values(by='uniprotStart', inplace=True)
# Cast back to str: the values are later concatenated with ';'.join, which needs strings.
coverage['uniprotStart'] = coverage['uniprotStart'].astype(str)

### Assign CURIEs

In [23]:
# Prepend the namespace prefix to each identifier column.
curie_prefixes = {
    'accession': 'uniprot:',
    'pdbId': 'pdb:',
    'pdbChainId': 'pdb:',
}
for id_column, curie_prefix in curie_prefixes.items():
    coverage[id_column] = curie_prefix + coverage[id_column]

In [24]:
coverage.head()

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,pdbChainId
316,pdb:6YVA,C,uniprot:Q64339,1,155,1,155,1,155,pdb:6YVA.C
140,pdb:6ZME,LY,uniprot:P61254,1,134,1,134,1,134,pdb:6ZME.LY
138,pdb:6ZME,LW,uniprot:P83731,1,124,1,124,1,124,pdb:6ZME.LW
123,pdb:6ZME,LH,uniprot:P32969,1,190,1,190,1,190,pdb:6ZME.LH
117,pdb:6ZME,LC,uniprot:P36578,1,368,1,368,1,368,pdb:6ZME.LC


### Group data by PDB chains

In [25]:
coverage = coverage.groupby(['pdbId','chainId','pdbChainId','accession']).agg(list).reset_index()

In [26]:
coverage.tail()

Unnamed: 0,pdbId,chainId,pdbChainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd
1250,pdb:7JTL,A,pdb:7JTL.A,uniprot:P0DTC8,"[4, 53]","[50, 107]","[18, 67]","[64, 121]","[18, 67]","[64, 121]"
1251,pdb:7JTL,B,pdb:7JTL.B,uniprot:P0DTC8,"[5, 55]","[51, 107]","[19, 69]","[65, 121]","[19, 69]","[65, 121]"
1252,pdb:7JU7,A,pdb:7JU7.A,uniprot:P0DTD1,[1],[304],[1],[304],[3264],[3567]
1253,pdb:7JUN,A,pdb:7JUN.A,uniprot:P0DTD1,[1],[306],[1],[306],[3264],[3569]
1254,pdb:7JYC,A,pdb:7JYC.A,uniprot:P0DTD1,[1],[306],[1],[306],[3264],[3569]


### Create semicolon-separated strings of residue numbers so they can be represented in a CSV file

In [27]:
# Collapse each per-chain list of residue numbers into one ';'-delimited string.
residue_columns = (
    'uniprotStart', 'uniprotEnd',
    'seqresStart', 'seqresEnd',
    'pdbStart', 'pdbEnd',
)
for residue_column in residue_columns:
    coverage[residue_column] = coverage[residue_column].apply(';'.join)

In [28]:
coverage.tail()

Unnamed: 0,pdbId,chainId,pdbChainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd
1250,pdb:7JTL,A,pdb:7JTL.A,uniprot:P0DTC8,4;53,50;107,18;67,64;121,18;67,64;121
1251,pdb:7JTL,B,pdb:7JTL.B,uniprot:P0DTC8,5;55,51;107,19;69,65;121,19;69,65;121
1252,pdb:7JU7,A,pdb:7JU7.A,uniprot:P0DTD1,1,304,1,304,3264,3567
1253,pdb:7JUN,A,pdb:7JUN.A,uniprot:P0DTD1,1,306,1,306,3264,3569
1254,pdb:7JYC,A,pdb:7JYC.A,uniprot:P0DTD1,1,306,1,306,3264,3569


In [29]:
coverage.to_csv(NEO4J_IMPORT / "01f-PDBChain.csv", index=False)