# Downloads PDB Structure Information
**[Work in progress]**

This notebook downloads 3D-structure information for SARS-CoV-2 proteins

Data sources: 
[RCSB Protein Data Bank](https://www.rcsb.org), 
[PDBe Protein Data Bank Europe](https://www.ebi.ac.uk/pdbe/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
from pathlib import Path
from py2neo import Graph
from rcsbsearch import TextQuery
from rcsbsearch import rcsb_attributes as attrs

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-328d8379-6ab4-4cc1-a397-2de37909d2e4/installation-4.1.0/import


In [4]:
taxonomy_id = 2697049
columns = 'id,genes(PREFERRED),length'

### Find PDB Structures containing SARS-CoV-2 proteins or nucleic acids

In [5]:
# Create terminals for each query
q1 = TextQuery('2697049')
q2 = attrs.rcsb_entity_source_organism.taxonomy_lineage.id == '2697049'

# combined using bitwise operators (&, |, ~, etc)
query = q1 & q2  # AND of all queries

# Call the query to execute it
polymer_entities = query('polymer_entity')

df = pd.DataFrame(polymer_entities, columns=['polymerEntity'])
df['pdbId'] = df['polymerEntity'].str[:4]

In [6]:
df.head()

Unnamed: 0,polymerEntity,pdbId
0,6XQB_4,6XQB
1,7CTT_5,7CTT
2,7CTT_4,7CTT
3,7BV2_4,7BV2
4,7BV2_5,7BV2


In [7]:
# TODO: Investigate why some chains have multiple polymerEntity ids, for now eliminate those entries
df.drop(columns='polymerEntity', inplace=True)

In [8]:
len(df['pdbId'].unique())

371

### Get PDB Chain - UniProt sequence mappings

In [9]:
sifts_url = 'http://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/uniprot_segments_observed.tsv.gz'

In [10]:
segments = pd.read_csv(sifts_url, sep='\t', skiprows=1, dtype=str)
segments.head()

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
0,105m,A,P02185,1,153,1,153,2,154
1,113l,A,P00720,1,162,1,162,1,162
2,120l,A,P00720,1,162,1,162,1,162
3,185l,A,P00720,1,162,1,162,1,162
4,128l,A,P00720,1,162,1,162,1,162


In [11]:
segments.rename(columns={'PDB': 'pdbId', 'CHAIN': 'chainId', 'SP_PRIMARY': 'accession'}, inplace=True)
segments.rename(columns={'RES_BEG': 'seqresStart', 'RES_END': 'seqresEnd'}, inplace=True)
segments.rename(columns={'PDB_BEG': 'pdbStart', 'PDB_END': 'pdbEnd'}, inplace=True)
segments.rename(columns={'SP_BEG': 'uniprotStart', 'SP_END': 'uniprotEnd'}, inplace=True)

In [12]:
segments.head()

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd
0,105m,A,P02185,1,153,1,153,2,154
1,113l,A,P00720,1,162,1,162,1,162
2,120l,A,P00720,1,162,1,162,1,162
3,185l,A,P00720,1,162,1,162,1,162
4,128l,A,P00720,1,162,1,162,1,162


In [13]:
segments['pdbId'] = segments['pdbId'].str.upper()

In [14]:
segments['pdbChainId'] = segments['pdbId'] + "." + segments['chainId']

In [15]:
coverage = segments.merge(df, on='pdbId')

In [16]:
coverage.shape

(2916, 10)

In [17]:
# TODO: why are there duplicates? For now, drop any duplicates
coverage.drop_duplicates(['pdbChainId', 'accession', 'uniprotStart', 'uniprotEnd'], inplace=True)

### Sort segments by uniprot residue number

In [18]:
coverage['uniprotStart'] = coverage['uniprotStart'].astype(int)
coverage.sort_values(by='uniprotStart', inplace=True)
coverage['uniprotStart'] = coverage['uniprotStart'].astype(str)

In [19]:
coverage.shape

(2636, 10)

### Assign CURIES

In [20]:
coverage['accession'] = 'uniprot:' + coverage['accession']
coverage['pdbId'] = 'pdb:' + coverage['pdbId']
coverage['pdbChainId'] = 'pdb:' + coverage['pdbChainId']

In [21]:
coverage.shape

(2636, 10)

In [22]:
coverage.head()

Unnamed: 0,pdbId,chainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd,pdbChainId
324,pdb:6YVA,C,uniprot:Q64339,1,155,1,155,1,155,pdb:6YVA.C
148,pdb:6ZME,LY,uniprot:P61254,1,134,1,134,1,134,pdb:6ZME.LY
146,pdb:6ZME,LW,uniprot:P83731,1,124,1,124,1,124,pdb:6ZME.LW
131,pdb:6ZME,LH,uniprot:P32969,1,190,1,190,1,190,pdb:6ZME.LH
125,pdb:6ZME,LC,uniprot:P36578,1,368,1,368,1,368,pdb:6ZME.LC


### Group data by PDB chains

In [23]:
coverage = coverage.groupby(['pdbId','chainId','pdbChainId','accession']).agg(list).reset_index()

In [24]:
coverage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1255 entries, 0 to 1254
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   pdbId         1255 non-null   object
 1   chainId       1255 non-null   object
 2   pdbChainId    1255 non-null   object
 3   accession     1255 non-null   object
 4   seqresStart   1255 non-null   object
 5   seqresEnd     1255 non-null   object
 6   pdbStart      1255 non-null   object
 7   pdbEnd        1255 non-null   object
 8   uniprotStart  1255 non-null   object
 9   uniprotEnd    1255 non-null   object
dtypes: object(10)
memory usage: 98.2+ KB


In [25]:
coverage.tail()

Unnamed: 0,pdbId,chainId,pdbChainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd
1250,pdb:7JTL,A,pdb:7JTL.A,uniprot:P0DTC8,"[4, 53]","[50, 107]","[18, 67]","[64, 121]","[18, 67]","[64, 121]"
1251,pdb:7JTL,B,pdb:7JTL.B,uniprot:P0DTC8,"[5, 55]","[51, 107]","[19, 69]","[65, 121]","[19, 69]","[65, 121]"
1252,pdb:7JU7,A,pdb:7JU7.A,uniprot:P0DTD1,[1],[304],[1],[304],[3264],[3567]
1253,pdb:7JUN,A,pdb:7JUN.A,uniprot:P0DTD1,[1],[306],[1],[306],[3264],[3569]
1254,pdb:7JYC,A,pdb:7JYC.A,uniprot:P0DTD1,[1],[306],[1],[306],[3264],[3569]


### Create semicolon separated string of residue numbers so they can be represented in a csv file

In [26]:
coverage['uniprotStart'] = coverage['uniprotStart'].apply(lambda x: ';'.join(x))
coverage['uniprotEnd'] = coverage['uniprotEnd'].apply(lambda x: ';'.join(x))
coverage['seqresStart'] = coverage['seqresStart'].apply(lambda x: ';'.join(x))
coverage['seqresEnd'] = coverage['seqresEnd'].apply(lambda x: ';'.join(x))
coverage['pdbStart'] = coverage['pdbStart'].apply(lambda x: ';'.join(x))
coverage['pdbEnd'] = coverage['pdbEnd'].apply(lambda x: ';'.join(x))

In [27]:
coverage.tail()

Unnamed: 0,pdbId,chainId,pdbChainId,accession,seqresStart,seqresEnd,pdbStart,pdbEnd,uniprotStart,uniprotEnd
1250,pdb:7JTL,A,pdb:7JTL.A,uniprot:P0DTC8,4;53,50;107,18;67,64;121,18;67,64;121
1251,pdb:7JTL,B,pdb:7JTL.B,uniprot:P0DTC8,5;55,51;107,19;69,65;121,19;69,65;121
1252,pdb:7JU7,A,pdb:7JU7.A,uniprot:P0DTD1,1,304,1,304,3264,3567
1253,pdb:7JUN,A,pdb:7JUN.A,uniprot:P0DTD1,1,306,1,306,3264,3569
1254,pdb:7JYC,A,pdb:7JYC.A,uniprot:P0DTD1,1,306,1,306,3264,3569


In [28]:
coverage.to_csv(NEO4J_IMPORT / "01f-PDBStructures.csv", index=False)