# Downloads EPPIC Interface Information
**[Work in progress]**

This notebook processes 3D biological interface info from EPPIC

Data sources: 
[EPPIC](http://www.eppic-web.org/rest/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import numpy as np
import pandas as pd
import requests
import json
import dateutil
import urllib
from pathlib import Path
from py2neo import Graph

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [3]:
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


In [4]:
# Create a directory to cache variation data that could not be parsed
CACHE_PROCESSED = Path(NEO4J_IMPORT / 'cache/domain_interactions/processed')
CACHE_PROCESSED.mkdir(parents=True, exist_ok=True)

In [5]:
# Create a directory to cache variation data that could not be parsed
CACHE_SKIPPED = Path(NEO4J_IMPORT / 'cache/domain_interactions/skipped')
CACHE_SKIPPED.mkdir(parents=True, exist_ok=True)

### Get PDB chain - UniProt mappings

In [6]:
sifts_url = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_uniprot.tsv.gz'

In [7]:
chains = pd.read_csv(sifts_url, sep='\t', skiprows=1, dtype=str)
chains['PDB'] = chains['PDB'].str.upper()
chains = chains[['PDB', 'CHAIN', 'SP_PRIMARY']]
chains.rename(columns={'PDB': 'pdbId', 'CHAIN': 'chain1', 'SP_PRIMARY': 'proteinAccession1'}, inplace=True)
# add same info a chain 2 to simplify merging later on
chains['pdbChainId'] = 'pdb:' + chains['pdbId'] + '.' + chains['chain1']
chains['chain2'] = chains['chain1']
chains['proteinAccession2'] = chains['proteinAccession1']
print("Number of chains:", chains.shape[0])
chains.head()

Number of chains: 548236


Unnamed: 0,pdbId,chain1,proteinAccession1,pdbChainId,chain2,proteinAccession2
0,101M,A,P02185,pdb:101M.A,A,P02185
1,102L,A,P00720,pdb:102L.A,A,P00720
2,102L,A,P00720,pdb:102L.A,A,P00720
3,102M,A,P02185,pdb:102M.A,A,P02185
4,103L,A,P00720,pdb:103L.A,A,P00720


### Get Pfam Domain ranges

In [8]:
domains = pd.read_csv(NEO4J_IMPORT / "01g-PfamDomainPDB.csv")

In [9]:
domains = domains[['name', 'accession', 'pdbChainId', 'start', 'end']]

# remove domains with insertion codes
domains = domains[(domains['start'].str.isdigit()) & (domains['end'].str.isdigit())]

domains['start'] = domains['start'].astype(int)
domains['end'] = domains['end'].astype(int)

In [10]:
# TODO: for identical chains, Pfam only indexes the first chain, e.g. pdb:6VXX.A, but chains B and C are missing
domains.query('pdbChainId == "pdb:1OHR.A"') # pdb:1OHR.B is identical and missing

Unnamed: 0,name,accession,pdbChainId,start,end
81164,RVP,pfam:PF00077,pdb:1OHR.A,6,97


In [11]:
domains.head()

Unnamed: 0,name,accession,pdbChainId,start,end
0,Bromodomain,pfam:PF00439,pdb:5POC.A,40,120
1,Beta-lactamase,pfam:PF00144,pdb:3BLS.A,14,360
2,Apolipoprotein,pfam:PF01442,pdb:3R2P.A,46,182
3,DKCLD,pfam:PF08068,pdb:3MQK.A,20,66
4,TruB_N,pfam:PF01509,pdb:3MQK.A,70,121


In [12]:
chains.head()

Unnamed: 0,pdbId,chain1,proteinAccession1,pdbChainId,chain2,proteinAccession2
0,101M,A,P02185,pdb:101M.A,A,P02185
1,102L,A,P00720,pdb:102L.A,A,P00720
2,102L,A,P00720,pdb:102L.A,A,P00720
3,102M,A,P02185,pdb:102M.A,A,P02185
4,103L,A,P00720,pdb:103L.A,A,P00720


In [13]:
domains = pd.merge(domains, chains, on='pdbChainId')

In [14]:
domains.head()

Unnamed: 0,name,accession,pdbChainId,start,end,pdbId,chain1,proteinAccession1,chain2,proteinAccession2
0,Bromodomain,pfam:PF00439,pdb:5POC.A,40,120,5POC,A,O95696,A,O95696
1,Beta-lactamase,pfam:PF00144,pdb:3BLS.A,14,360,3BLS,A,P00811,A,P00811
2,Apolipoprotein,pfam:PF01442,pdb:3R2P.A,46,182,3R2P,A,P02647,A,P02647
3,DKCLD,pfam:PF08068,pdb:3MQK.A,20,66,3MQK,A,Q7LWY0,A,Q7LWY0
4,TruB_N,pfam:PF01509,pdb:3MQK.A,70,121,3MQK,A,Q7LWY0,A,Q7LWY0


In [15]:
pdb_entries_url = 'https://data.rcsb.org/rest/v1/holdings/current/entry_ids'

In [16]:
try:
    pdb_list = json.loads(requests.get(pdb_entries_url).text)
except:
    print("Could not load pdb entries")   

In [18]:
#pdb_id = '7cab'
#pdb_id = '7d19'
#pdb_id = '7dd2'
#pdb_id = '6vxx' # 12/744
#pdb_id = '6xdg'
#pdb_id = '4hhb'
#pdb_id = '1stp' #1/156
#pdb_id = '1ohr' # 1/83
#pdb_id = '3g46'
#pdb_id = 'junk'
#pdb_id = '3t6c' # D4
#pdb_id =  '4evc' # T
#pdb_id = '4po5' # O 48x
#pdb_id = '1c8d' # I x60 no interfaces found???
#pdb_id = '1rvv' # I x60
#pdb_id = '1a34' # I x60
#pdb_id = '1RVV' # I x60

In [19]:
# https://stackoverflow.com/questions/52795561/flattening-nested-json-in-pandas-data-frame
def flatten_json(nested_json, exclude=['']):
    """Flatten json object with nested keys into a single level.
        Args:
            nested_json: A nested json object.
            exclude: Keys to exclude from output.
        Returns:
            The flattened json object if successful, None otherwise.
    """
    out = {}

    def flatten(x, name='', exclude=exclude):
        if type(x) is dict:
            for a in x:
                if a not in exclude: flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(nested_json)
    return out

### Get Biological Interfaces

In [20]:
def download_interfaces(pdb_id):
    interface_url = f'http://www.eppic-web.org/rest/api/v3/job/interfaces/{pdb_id.lower()}'
    try:
        response = json.loads(requests.get(interface_url).text)
    except:
        return pd.DataFrame()
    
    interfaces = pd.DataFrame(flatten_json(x) for x in response)
    interfaces['pdbId'] = pdb_id.upper()

    if interfaces.shape[0] > 0:
        interfaces.query('interfaceScores_interfaceScore_3_callName == "bio"', inplace=True)
        print('   interfaces:', interfaces.shape)
        interfaces.rename(columns={'interfaceScores_interfaceScore_3_confidence': 'confidence'}, inplace=True)
        return interfaces[['pdbId', 'interfaceId', 'clusterId', 'chain1', 'chain2', 'isologous', 'area', 'confidence']].copy()
    else:
        return pd.DataFrame()

In [21]:
def get_interfaces(pdb_id):
    interfaces = download_interfaces(pdb_id)
    if interfaces.shape[0] > 0:
        interfaces = interfaces.merge(chains[['pdbId', 'chain1', 'proteinAccession1']], on=['pdbId', 'chain1'])
        interfaces = interfaces.merge(chains[['pdbId', 'chain2', 'proteinAccession2']], on=['pdbId', 'chain2'])
    return interfaces

### Get interface residues

In [22]:
def download_contacts(pdb_id, interface_id):
    contact_url = f'http://www.eppic-web.org/rest/api/v3/job/contacts/{pdb_id.lower()}/{interface_id}'
    try:
        response = json.loads(requests.get(contact_url).text)
        return pd.DataFrame([flatten_json(x) for x in response])
    except:
        return pd.DataFrame()

In [23]:
def get_interface_residues(pdb_id, interfaces):
    interface_ids = interfaces['interfaceId'].values
    contacts = pd.concat([download_contacts(pdb_id, i) for i in interface_ids], ignore_index=True)
    contacts.rename(columns={'pdbCode': 'pdbId'}, inplace=True)
    contacts['pdbId'] = contacts['pdbId'].str.upper()
    
    chain_contacts = contacts.merge(interfaces, on=['pdbId', 'interfaceId'])
    print('   contacts:', chain_contacts.shape)
    chain_contacts['pdbChainId1'] = 'pdb:' + chain_contacts['pdbId'] + '.' + chain_contacts['chain1']
    chain_contacts['pdbChainId2'] = 'pdb:' + chain_contacts['pdbId'] + '.' + chain_contacts['chain2']
    
    return chain_contacts

In [24]:
def unique_order(row):
    if row.accession1 > row.accession2:
        row.accession1, row.accession2 = row.accession2, row.accession1
        row.proteinAccession1, row.proteinAccession2 = row.proteinAccession2, row.proteinAccession1
        row.pdbChainId1, row.pdbChainId2 = row.pdbChainId2, row.pdbChainId1
    return row

In [25]:
def filter_domain_interactions(contacts, domains):
    
    # merge chain with domain boundary information
    domain_contacts = contacts.merge(domains[['pdbId', 'proteinAccession1', 'accession', 'start', 'end']], on=['pdbId','proteinAccession1'], suffixes=['1','2'])
    domain_contacts = domain_contacts.merge(domains[['pdbId', 'proteinAccession2', 'accession', 'start', 'end']], on=['pdbId','proteinAccession2'], suffixes=['1','2'])
    
    #print("Number of chain - chain contacts:", domain_contacts.shape[0])
    
    # keep only contacts within the domain boundaries
    domain_contacts = domain_contacts[(domain_contacts['firstResNumber'] >= domain_contacts['start1']) &
                                      (domain_contacts['firstResNumber'] <= domain_contacts['end1']) &
                                      (domain_contacts['secondResNumber'] >= domain_contacts['start2']) &
                                      (domain_contacts['secondResNumber'] <= domain_contacts['end2'])]
    
    #print("Number of domain - domain contacts:", domain_contacts.shape[0])
    
    # keep only subset of data required
    domain_contacts = domain_contacts[['pdbChainId1', 'pdbChainId2', 'confidence', 'isologous', 'proteinAccession1', 'proteinAccession2', 'accession1', 'accession2']].copy()

    # order domains in canonical order
    domain_contacts = domain_contacts.apply(unique_order, axis=1)
    domain_contacts.drop_duplicates(inplace=True)
    
    return domain_contacts

In [26]:
def get_domain_interactions(pdb_id, domains):
    interfaces = get_interfaces(pdb_id)
    if interfaces.shape[0] > 0:
        contacts = get_interface_residues(pdb_id, interfaces)
        return filter_domain_interactions(contacts, domains)
    else:
        return pd.DataFrame()

In [27]:
def process_pdbs(pdb_list):
    for pdb_id in pdb_list:
        print(pdb_id)
        domain_interactions = get_domain_interactions(pdb_id, domains)
        filename = pdb_id + '.csv'
        if domain_interactions.shape[0] > 0:
            domain_interactions.to_csv(CACHE_PROCESSED / filename, index=False)
        else:
            domain_interactions.to_csv(CACHE_SKIPPED / filename, index=False)   

In [28]:
process_pdbs(pdb_list)

3J3Q
