# Downloads PDB Structure Information
**[Work in progress]**

This notebook downloads PDB structure information for Coronavirus Structures

Data source: [RCSB Protein Data Bank](https://www.rcsb.org)

Author: Peter Rose (pwrose@ucsd.edu)

In [103]:
import os
import pandas as pd
from pathlib import Path
from py2neo import Graph
from rcsbsearch import TextQuery
from rcsbsearch import rcsb_attributes as attrs

In [104]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [105]:
NEO4J_HOME = Path(os.getenv('NEO4J_HOME'))
print(NEO4J_HOME)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-4af96121-2328-4e2f-ba60-6d8b728a26d5/installation-4.0.3


In [106]:
taxonomy_id = 2697049
columns = 'id,genes(PREFERRED),length'

### Find PDB Structures containing SARS-CoV-2 proteins

In [107]:
# Create terminals for each query
q1 = TextQuery('2697049')
q2 = attrs.rcsb_entity_source_organism.taxonomy_lineage.id == '2697049'

# combined using bitwise operators (&, |, ~, etc)
query = q1 & q2  # AND of all queries

# Call the query to execute it
polymer_entities = query('polymer_entity')

df = pd.DataFrame(polymer_entities, columns=['polymerEntity'])
df['pdbId'] = df['polymerEntity'].str[:4]

In [108]:
df.head()

Unnamed: 0,polymerEntity,pdbId
0,6X2G_4,6X2G
1,7BV2_4,7BV2
2,7BV2_5,7BV2
3,6LVN_1,6LVN
4,7C22_1,7C22


In [110]:
#http://data.rcsb.org/rest/v1/core/entry/6X2G

In [111]:
# https://stackoverflow.com/questions/33559660/valueerror-errors-while-reading-json-file-with-pd-read-json

from urllib.request import urlopen
import json 
from pandas.io.json import json_normalize
import pandas as pd
import requests


#df = json.loads(requests.get('https://data.rcsb.org/rest/v1/core/entry/6X2G').text)

#data = pd.DataFrame.from_dict(df, orient='index')

#print(data)

In [112]:
#df

In [113]:
url = f'https://www.uniprot.org/uniprot/?query=organism:{taxonomy_id}&columns={columns}&format=tab'

In [114]:
unp = pd.read_csv(url, sep='\t')
unp.rename(columns={'Gene names  (primary )': 'GENE'}, inplace=True)  ## create name without spaces
unp.head()

Unnamed: 0,Entry,GENE,Length
0,P0DTC2,S,1273
1,P0DTC9,N,419
2,P0DTC1,,4405
3,P0DTC7,,121
4,P0DTD2,,97


In [115]:
print('Unique proteins: ', len(unp['Entry'].unique()), 'for organism:', taxonomy_id)
print('Unique genes   : ', len(unp['GENE'].unique()), 'for organism:', taxonomy_id)

Unique proteins:  120 for organism: 2697049
Unique genes   :  20 for organism: 2697049


In [116]:
sifts_url = 'http://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/uniprot_segments_observed.tsv.gz'

In [117]:
segments = pd.read_csv(sifts_url, sep='\t', skiprows=1)
segments.head()

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,RES_BEG,RES_END,PDB_BEG,PDB_END,SP_BEG,SP_END
0,128l,A,P00720,1,162,1,162,1,162
1,113l,A,P00720,1,162,1,162,1,162
2,123l,A,P00720,1,162,1,162,1,162
3,105m,A,P02185,1,153,1,153,2,154
4,120l,A,P00720,1,162,1,162,1,162


In [118]:
segments['SEG_LENGTH'] = segments['SP_END'] - segments['SP_BEG'] + 1

In [119]:
segments['PDB_CHAIN_ID'] = segments['PDB'].str.upper()  + "." + segments['CHAIN']
segments = segments[['PDB_CHAIN_ID','SP_PRIMARY','SP_BEG','SP_END','SEG_LENGTH']]

In [120]:
coverage = segments.merge(unp, left_on=['SP_PRIMARY'], right_on=['Entry'])

In [121]:
coverage.head(100)

Unnamed: 0,PDB_CHAIN_ID,SP_PRIMARY,SP_BEG,SP_END,SEG_LENGTH,Entry,GENE,Length
0,5RE9.A,P0DTD1,3264,3567,304,P0DTD1,rep,7096
1,5REE.A,P0DTD1,3264,3567,304,P0DTD1,rep,7096
2,5REN.A,P0DTD1,3264,3567,304,P0DTD1,rep,7096
3,5REI.A,P0DTD1,3264,3567,304,P0DTD1,rep,7096
4,5REW.A,P0DTD1,3264,3567,304,P0DTD1,rep,7096
5,5RF4.A,P0DTD1,3264,3567,304,P0DTD1,rep,7096
6,5RF3.A,P0DTD1,3264,3567,304,P0DTD1,rep,7096
7,5RFB.A,P0DTD1,3264,3567,304,P0DTD1,rep,7096
8,5RFX.A,P0DTD1,3264,3567,304,P0DTD1,rep,7096
9,5RER.A,P0DTD1,3264,3567,304,P0DTD1,rep,7096
