# Downloads EPPIC Interface Information
**[Work in progress]**

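This notebook downloads biological assembly and interface data from the EPPIC REST service and derives Pfam domain-domain interactions from the interface residue contacts.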

Data sources: 
[EPPIC](http://www.eppic-web.org/rest/)

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import numpy as np
import pandas as pd
import requests
import json
import dateutil
import urllib
import time
from pathlib import Path
from py2neo import Graph

In [2]:
# Lift the pandas display limits so that all rows and all columns are shown
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# Base directory for all files read and written by this notebook, taken from the
# NEO4J_IMPORT environment variable. Fail with an explicit message when the
# variable is missing instead of the TypeError that Path(None) would raise.
_neo4j_import_env = os.environ.get('NEO4J_IMPORT')
if _neo4j_import_env is None:
    raise RuntimeError("Environment variable NEO4J_IMPORT is not set")
NEO4J_IMPORT = Path(_neo4j_import_env)
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


In [4]:
# Directory for the combined interaction file that process_pdbs writes
# (created together with any missing parent directories)
CACHE_PROCESSED = NEO4J_IMPORT / 'cache/domain_interactions/processed'
CACHE_PROCESSED.mkdir(parents=True, exist_ok=True)

In [5]:
# Directory for the per-entry files that process_pdbs writes for PDB entries
# that yielded no domain interactions
CACHE_SKIPPED = NEO4J_IMPORT / 'cache/domain_interactions/skipped'
CACHE_SKIPPED.mkdir(parents=True, exist_ok=True)

### Get PDB chain - UniProt mappings

In [6]:
sifts_url = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_uniprot.tsv.gz'

In [7]:
chains = pd.read_csv(sifts_url, sep='\t', skiprows=1, dtype=str)

In [8]:
# Keep the PDB ID (upper-cased), chain ID and UniProt accession, renamed to the
# column names used in the rest of the notebook. The result is built as a new
# frame (no in-place rename of a column subset), so columns added to it later
# are not assigned onto a slice of the downloaded table.
chains = (
    chains[['PDB', 'CHAIN', 'SP_PRIMARY']]
    .assign(PDB=lambda df: df['PDB'].str.upper())
    .rename(columns={'PDB': 'pdbId', 'CHAIN': 'chain1', 'SP_PRIMARY': 'proteinAccession1'})
)

In [9]:
print("Number of chains:", chains.shape[0])
chains.head()

Number of chains: 548236


Unnamed: 0,pdbId,chain1,proteinAccession1
0,101M,A,P02185
1,102L,A,P00720
2,102L,A,P00720
3,102M,A,P02185
4,103L,A,P00720


In [10]:
# Repeat the chain and accession columns under "2" names (and add a combined
# PDB chain identifier) so the same table can be merged against either side of
# an interface later on
chains = chains.assign(
    pdbChainId='pdb:' + chains['pdbId'] + '.' + chains['chain1'],
    chain2=chains['chain1'],
    proteinAccession2=chains['proteinAccession1'],
)

In [11]:
# Some pdbIds are mapped to multiple segments of the UniProt chain (e.g., 3WYE),
# which shows up as repeated rows in this table; keep each row only once
chains = chains.drop_duplicates()

In [12]:
print('Number of chains after dropping duplicates', chains.shape[0])

Number of chains after dropping duplicates 535634


In [13]:
chains.query('pdbId == "6MEK"')

Unnamed: 0,pdbId,chain1,proteinAccession1,pdbChainId,chain2,proteinAccession2
458992,6MEK,A,H2FJ05,pdb:6MEK.A,A,H2FJ05
458994,6MEK,C,H2FJ05,pdb:6MEK.C,C,H2FJ05


### Get Pfam Domain ranges

In [14]:
domains = pd.read_csv(NEO4J_IMPORT / "01g-PfamDomainPDB.csv")

In [15]:
# keep only the columns needed to place a Pfam domain on a PDB chain
domains = domains[['name', 'accession', 'pdbChainId', 'start', 'end']]

# remove domains with insertion codes (start or end is not a plain digit string)
plain_numbering = domains['start'].str.isdigit() & domains['end'].str.isdigit()

# filter and convert in a single expression so the integer conversion is applied
# to a new frame instead of being assigned onto a filtered slice
domains = domains[plain_numbering].assign(
    start=lambda df: df['start'].astype(int),
    end=lambda df: df['end'].astype(int),
)

In [16]:
domains.head()

Unnamed: 0,name,accession,pdbChainId,start,end
0,Bromodomain,pfam:PF00439,pdb:5POC.A,40,120
1,Beta-lactamase,pfam:PF00144,pdb:3BLS.A,14,360
2,Apolipoprotein,pfam:PF01442,pdb:3R2P.A,46,182
3,DKCLD,pfam:PF08068,pdb:3MQK.A,20,66
4,TruB_N,pfam:PF01509,pdb:3MQK.A,70,121


In [17]:
domains.query('pdbChainId == "pdb:6MEK.A"')

Unnamed: 0,name,accession,pdbChainId,start,end
338715,HCV_NS1,pfam:PF01560,pdb:6MEK.A,423,645


In [18]:
chains.head()

Unnamed: 0,pdbId,chain1,proteinAccession1,pdbChainId,chain2,proteinAccession2
0,101M,A,P02185,pdb:101M.A,A,P02185
1,102L,A,P00720,pdb:102L.A,A,P00720
3,102M,A,P02185,pdb:102M.A,A,P02185
4,103L,A,P00720,pdb:103L.A,A,P00720
6,103M,A,P02185,pdb:103M.A,A,P02185


In [19]:
domains = pd.merge(domains, chains, on='pdbChainId')

In [20]:
domains.head()

Unnamed: 0,name,accession,pdbChainId,start,end,pdbId,chain1,proteinAccession1,chain2,proteinAccession2
0,Bromodomain,pfam:PF00439,pdb:5POC.A,40,120,5POC,A,O95696,A,O95696
1,Beta-lactamase,pfam:PF00144,pdb:3BLS.A,14,360,3BLS,A,P00811,A,P00811
2,Apolipoprotein,pfam:PF01442,pdb:3R2P.A,46,182,3R2P,A,P02647,A,P02647
3,DKCLD,pfam:PF08068,pdb:3MQK.A,20,66,3MQK,A,Q7LWY0,A,Q7LWY0
4,TruB_N,pfam:PF01509,pdb:3MQK.A,70,121,3MQK,A,Q7LWY0,A,Q7LWY0


In [21]:
print("Unique Pfams in 01g-PfamDomainPDB.csv", len(domains['accession'].unique()))
print("Unique PDBs in 01g-PfamDomainPDB.csv", len(domains['pdbId'].unique()))

Unique Pfams in 01g-PfamDomainPDB.csv 9654
Unique PDBs in 01g-PfamDomainPDB.csv 156272


In [22]:
### Pfam - PDB Mappings by SIFTS

In [23]:
pfam_url = 'ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_pfam.tsv.gz'

In [24]:
pfam_domains = pd.read_csv(pfam_url, sep='\t', skiprows=1, dtype=str)

In [25]:
pfam_domains.head()

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,PFAM_ID,COVERAGE
0,101m,A,P02185,PF00042,1
1,102l,A,P00720,PF00959,1
2,102m,A,P02185,PF00042,1
3,103l,A,P00720,PF00959,1
4,103m,A,P02185,PF00042,1


In [26]:
pfam_domains.query('PDB == "6mek"')

Unnamed: 0,PDB,CHAIN,SP_PRIMARY,PFAM_ID,COVERAGE


In [27]:
print("Unique Pfams in pdb_chain_pfam.tsv.gz", len(pfam_domains['PFAM_ID'].unique()))
print("Unique PDBs in pdb_chain_pfam.tsv.gz", len(pfam_domains['PDB'].unique()))

Unique Pfams in pdb_chain_pfam.tsv.gz 8947
Unique PDBs in pdb_chain_pfam.tsv.gz 144260


In [28]:
#CACHE_SIFTS_PROCESSED = Path(NEO4J_IMPORT / 'cache/sifts/processed/sifts.csv')

In [29]:
#df = pd.read_csv(CACHE_SIFTS_PROCESSED, dtype='str')

In [30]:
#df.head()

In [31]:
#df.query('pdbId == "6MEK"') # no Pfam assignments

In [32]:
#print("Unique Pfams in sifts.xml files", len(df['pfamId'].unique()))
#print("Unique PDBs in sifts.xml", len(df['pdbId'].unique()))

In [33]:
t0 = time.time()

In [34]:
pdb_entries_url = 'https://data.rcsb.org/rest/v1/holdings/current/entry_ids'

In [35]:
# Download the entry IDs served by pdb_entries_url. The request is bounded by a
# timeout and HTTP error statuses are treated as failures. On failure the
# message is printed and the error is re-raised: continuing would leave
# pdb_list undefined (or stale from an earlier run) for the cells below.
try:
    pdb_response = requests.get(pdb_entries_url, timeout=60)
    pdb_response.raise_for_status()
    pdb_list = json.loads(pdb_response.text)
except (requests.RequestException, ValueError):
    print("Could not load pdb entries")
    raise

In [36]:
# https://stackoverflow.com/questions/52795561/flattening-nested-json-in-pandas-data-frame
def flatten_json(nested_json, exclude=['']):
    """Flatten a nested JSON-like object into a single-level dict.

    Keys of nested dicts and positions of list items are joined with '_' to
    form the flat key (e.g. {'a': {'b': [5]}} becomes {'a_b_0': 5}).

        Args:
            nested_json: A nested json object (dicts, lists and leaf values).
            exclude: Dict keys to skip, together with everything below them.
        Returns:
            A dict mapping each flattened key to its leaf value. Empty dicts
            and empty lists contribute no entries.
    """
    flat = {}

    def _walk(node, prefix=''):
        if type(node) is dict:
            for key in node:
                if key in exclude:
                    continue
                _walk(node[key], prefix + key + '_')
        elif type(node) is list:
            for position, item in enumerate(node):
                _walk(item, prefix + str(position) + '_')
        else:
            # leaf value: drop the trailing '_' of the accumulated prefix
            flat[prefix[:-1]] = node

    _walk(nested_json)
    return flat

### Get Biological Assemblies (PDB, EPPIC)

In [37]:
def download_assemblies(pdb_id, timeout=60):
    """Download the EPPIC assemblies of a PDB entry and list the interface
    cluster ids of the assemblies called "bio".

    Args:
        pdb_id: PDB ID; lower-cased for the request URL, upper-cased in the result.
        timeout: Seconds to wait for the EPPIC server before giving up.

    Returns:
        A DataFrame with the columns 'pdbId' and 'interface' (int64), one row
        per selected assembly and id listed in its 'interfaceClusterIdsString'.
        An empty DataFrame is returned when the download fails, the response is
        not a non-empty list, the expected fields are missing, or no assembly
        qualifies.
    """
    assembly_url = f'http://www.eppic-web.org/rest/api/v3/job/assemblies/{pdb_id.lower()}'
    try:
        response = json.loads(requests.get(assembly_url, timeout=timeout).text)
    except (requests.RequestException, ValueError):
        # network failure/timeout, or a body that is not valid JSON
        return pd.DataFrame()

    # anything other than a non-empty list of assembly records cannot be processed
    if not isinstance(response, list) or not response:
        return pd.DataFrame()

    assemblies = pd.DataFrame([flatten_json(x, exclude=['graphNodes', 'graphEdges']) for x in response])
    columns = assemblies.columns

    # without these two fields neither the "bio" call nor the interface list can be read
    if ('assemblyScores_assemblyScore_0_callName' not in columns
            or 'interfaceClusterIdsString' not in columns):
        return pd.DataFrame()

    is_bio = assemblies['assemblyScores_assemblyScore_0_callName'] == "bio"
    if 'assemblyScores_assemblyScore_1_callName' in columns:
        # for NMR structure there is no assemblyScores_assemblyScore_1_callName, e.g. PDB 2DTB
        is_bio = is_bio | (assemblies['assemblyScores_assemblyScore_1_callName'] == "bio")

    # remove entries without cluster id
    cluster_strings = assemblies['interfaceClusterIdsString']
    keep = is_bio & cluster_strings.notna() & (cluster_strings != "{}")
    if not keep.any():
        return pd.DataFrame(columns=['pdbId', 'interface'])

    # remove braces from interface list and split it into the individual ids
    id_lists = (cluster_strings[keep]
                .astype(str)
                .str.replace('{', '', regex=False)
                .str.replace('}', '', regex=False)
                .str.split(','))

    # one row per id
    exploded = id_lists.to_frame(name='interface').explode('interface')
    exploded['interface'] = exploded['interface'].str.strip().astype('int64')
    exploded.insert(0, 'pdbId', pdb_id.upper())

    # the score/content columns of the assemblies are not always present in the
    # response (e.g. missing for 1STP), so only these two columns are returned
    return exploded[['pdbId', 'interface']]

### Get Biological Interfaces (PDB and EPPIC)

In [38]:
def download_interfaces(pdb_id, interface_ids, timeout=60):
    """Download the EPPIC interfaces of a PDB entry and keep the requested ones.

    Args:
        pdb_id: PDB ID; lower-cased for the request URL, upper-cased in the result.
        interface_ids: Ids to keep; rows are selected where 'interfaceId' is
            one of these values.
        timeout: Seconds to wait for the EPPIC server before giving up.

    Returns:
        A DataFrame with the columns 'pdbId', 'interfaceId', 'clusterId',
        'chain1', 'chain2', 'isologous', 'area' and 'confidence'. An empty
        DataFrame is returned when the download fails, the response is not a
        non-empty list, or any of these fields is missing.
    """
    interface_url = f'http://www.eppic-web.org/rest/api/v3/job/interfaces/{pdb_id.lower()}'
    try:
        response = json.loads(requests.get(interface_url, timeout=timeout).text)
    except (requests.RequestException, ValueError):
        # network failure/timeout, or a body that is not valid JSON
        return pd.DataFrame()

    if not isinstance(response, list) or not response:
        return pd.DataFrame()

    interfaces = pd.DataFrame([flatten_json(x) for x in response])
    interfaces['pdbId'] = pdb_id.upper()
    # the confidence is read from the score at position 3 of the interface scores
    # NOTE(review): presumably position 3 is the final/combined score - confirm
    interfaces = interfaces.rename(columns={'interfaceScores_interfaceScore_3_confidence': 'confidence'})

    wanted = ['pdbId', 'interfaceId', 'clusterId', 'chain1', 'chain2', 'isologous', 'area', 'confidence']
    if not set(wanted).issubset(interfaces.columns):
        # e.g. fewer than four interface scores; treat the entry as having no usable interfaces
        return pd.DataFrame()

    # NOTE(review): get_domain_interactions passes ids parsed from the assemblies'
    # 'interfaceClusterIdsString', yet they are matched against 'interfaceId'
    # here rather than 'clusterId' - confirm which column is intended
    selected = interfaces['interfaceId'].isin(interface_ids)
    return interfaces.loc[selected, wanted].copy()

In [39]:
def get_interfaces(pdb_id, interface_ids):
    """Download the selected interfaces of a PDB entry and attach the protein
    accession of each of the two chains from the `chains` table.

    Returns the downloaded frame unchanged when it has no rows.
    """
    interfaces = download_interfaces(pdb_id, interface_ids)
    if interfaces.shape[0] == 0:
        return interfaces

    first_chain = chains[['pdbId', 'chain1', 'proteinAccession1']]
    second_chain = chains[['pdbId', 'chain2', 'proteinAccession2']]

    with_first = interfaces.merge(first_chain, on=['pdbId', 'chain1'])
    return with_first.merge(second_chain, on=['pdbId', 'chain2'])

### Get interface residues

In [40]:
def download_contacts(pdb_id, interface_id, timeout=60):
    """Download the EPPIC contacts of one interface of a PDB entry.

    Args:
        pdb_id: PDB ID; lower-cased for the request URL.
        interface_id: Interface id appended to the request URL.
        timeout: Seconds to wait for the EPPIC server before giving up.

    Returns:
        A DataFrame with one row per record of the response, each flattened
        with flatten_json. An empty DataFrame is returned when the download
        fails or the response is not a list.
    """
    contact_url = f'http://www.eppic-web.org/rest/api/v3/job/contacts/{pdb_id.lower()}/{interface_id}'
    try:
        response = json.loads(requests.get(contact_url, timeout=timeout).text)
    except (requests.RequestException, ValueError):
        # network failure/timeout, or a body that is not valid JSON
        return pd.DataFrame()

    # a JSON body that is not a list (e.g. an error object) holds no contact records
    if not isinstance(response, list):
        return pd.DataFrame()

    return pd.DataFrame([flatten_json(x) for x in response])

In [41]:
def get_interface_residues(pdb_id, interfaces):
    """Download the contacts of every interface in `interfaces` and join them
    with the interface rows.

    Args:
        pdb_id: PDB ID passed on to download_contacts.
        interfaces: DataFrame with at least 'pdbId', 'interfaceId', 'chain1'
            and 'chain2'.

    Returns:
        The contacts merged with `interfaces` on 'pdbId' and 'interfaceId',
        plus the columns 'pdbChainId1' and 'pdbChainId2' ('pdb:<pdbId>.<chain>').
        An empty DataFrame is returned when no contacts could be downloaded or
        the contacts lack the merge keys.
    """
    interface_ids = interfaces['interfaceId'].unique()
    downloads = [download_contacts(pdb_id, i) for i in interface_ids]

    # failed or empty downloads carry none of the columns used below
    # (and pd.concat rejects an empty list)
    downloads = [frame for frame in downloads if frame.shape[0] > 0]
    if not downloads:
        return pd.DataFrame()

    contacts = pd.concat(downloads, ignore_index=True).rename(columns={'pdbCode': 'pdbId'})
    if 'pdbId' not in contacts.columns or 'interfaceId' not in contacts.columns:
        return pd.DataFrame()
    contacts['pdbId'] = contacts['pdbId'].str.upper()

    chain_contacts = contacts.merge(interfaces, on=['pdbId', 'interfaceId'])
    chain_contacts['pdbChainId1'] = 'pdb:' + chain_contacts['pdbId'] + '.' + chain_contacts['chain1']
    chain_contacts['pdbChainId2'] = 'pdb:' + chain_contacts['pdbId'] + '.' + chain_contacts['chain2']

    return chain_contacts

In [42]:
def unique_order(row):
    """Put the two sides of an interaction row into a canonical order.

    When accession1 sorts after accession2, the accession, proteinAccession
    and pdbChainId values of side 1 and side 2 are exchanged. The row is
    modified in place and returned.
    """
    if not row.accession1 > row.accession2:
        return row

    paired_fields = (
        ('accession1', 'accession2'),
        ('proteinAccession1', 'proteinAccession2'),
        ('pdbChainId1', 'pdbChainId2'),
    )
    for left, right in paired_fields:
        row[left], row[right] = row[right], row[left]
    return row

In [43]:
def filter_domain_interactions(contacts, domains):
    """Reduce residue-level interface contacts to distinct domain-domain interactions.

    Args:
        contacts: DataFrame of interface contacts with at least 'pdbId',
            'proteinAccession1', 'proteinAccession2', 'firstResNumber',
            'secondResNumber', 'pdbChainId1', 'pdbChainId2', 'confidence'
            and 'isologous'.
        domains: DataFrame of domain ranges with 'pdbId', 'proteinAccession1',
            'proteinAccession2', 'accession', 'start' and 'end'.

    Returns:
        A DataFrame with the columns 'pdbChainId1', 'pdbChainId2', 'confidence',
        'isologous', 'proteinAccession1', 'proteinAccession2', 'accession1' and
        'accession2', one row per distinct interaction, with the two sides
        ordered so that accession1 does not sort after accession2. An empty
        DataFrame is returned when `contacts` has no rows or lacks any of the
        columns listed above.
    """
    needed = {'pdbId', 'proteinAccession1', 'proteinAccession2', 'firstResNumber', 'secondResNumber',
              'pdbChainId1', 'pdbChainId2', 'confidence', 'isologous'}
    if contacts.shape[0] == 0 or not needed.issubset(contacts.columns):
        return pd.DataFrame()

    # merge chain with domain boundary information, first for side 1 ...
    domain_contacts = contacts.merge(domains[['pdbId', 'proteinAccession1', 'accession', 'start', 'end']],
                                     on=['pdbId', 'proteinAccession1'], suffixes=['1', '2'])
    # ... then for side 2; the overlapping accession/start/end columns get the suffixes 1 and 2
    domain_contacts = domain_contacts.merge(domains[['pdbId', 'proteinAccession2', 'accession', 'start', 'end']],
                                            on=['pdbId', 'proteinAccession2'], suffixes=['1', '2'])

    # keep only contacts within the domain boundaries
    domain_contacts = domain_contacts[(domain_contacts['firstResNumber'] >= domain_contacts['start1']) &
                                      (domain_contacts['firstResNumber'] <= domain_contacts['end1']) &
                                      (domain_contacts['secondResNumber'] >= domain_contacts['start2']) &
                                      (domain_contacts['secondResNumber'] <= domain_contacts['end2'])]

    # keep only subset of data required; residue-level rows collapse to a few
    # distinct rows here, so de-duplicate before reordering
    domain_contacts = domain_contacts[['pdbChainId1', 'pdbChainId2', 'confidence', 'isologous',
                                       'proteinAccession1', 'proteinAccession2',
                                       'accession1', 'accession2']].drop_duplicates()

    # order domains in canonical order: where accession1 sorts after accession2,
    # exchange the side-1 and side-2 values of every paired column
    swap = domain_contacts['accession1'] > domain_contacts['accession2']
    for first, second in (('accession1', 'accession2'),
                          ('proteinAccession1', 'proteinAccession2'),
                          ('pdbChainId1', 'pdbChainId2')):
        new_first = domain_contacts[first].mask(swap, domain_contacts[second])
        new_second = domain_contacts[second].mask(swap, domain_contacts[first])
        domain_contacts[first] = new_first
        domain_contacts[second] = new_second

    # A-B and B-A rows are identical after reordering
    return domain_contacts.drop_duplicates()

In [44]:
def get_domain_interactions(pdb_id, domains):
    """Run the download-and-filter steps for one PDB entry.

    Downloads the assemblies, then the interfaces they list, then the contacts
    of those interfaces, and reduces the contacts to domain interactions using
    `domains`. Returns an empty DataFrame as soon as the assembly or interface
    step yields no rows.
    """
    assemblies = download_assemblies(pdb_id)
    if assemblies.shape[0] == 0:
        return pd.DataFrame()

    interface_ids = list(assemblies['interface'].unique())
    interfaces = get_interfaces(pdb_id, interface_ids)
    if interfaces.shape[0] == 0:
        return pd.DataFrame()

    contacts = get_interface_residues(pdb_id, interfaces)
    return filter_domain_interactions(contacts, domains)

In [46]:
def process_pdbs(pdb_list):
    """Compute the domain interactions of each PDB entry and write them to disk.

    Entries with interactions are appended to '01f-EPPICInteractions.csv' in
    CACHE_PROCESSED; the header row is written only when the file does not
    exist yet. For entries without interactions the (empty) result is written
    to '<pdb_id>.csv' in CACHE_SKIPPED.
    """
    output_path = CACHE_PROCESSED / '01f-EPPICInteractions.csv'
    header_written = os.path.isfile(output_path)

    for pdb_id in pdb_list:
        interactions = get_domain_interactions(pdb_id, domains)

        if interactions.shape[0] == 0:
            interactions.to_csv(CACHE_SKIPPED / (pdb_id + '.csv'), index=False)
            continue

        if header_written:
            interactions.to_csv(output_path, header=False, mode='a', index=False)
        else:
            interactions.to_csv(output_path, index=False)
            header_written = True

In [47]:
# pdb_list = ['6MEK'] # example of missing Pfam domains
process_pdbs(pdb_list)

True


In [48]:
t1 = time.time()
print(f"Time to process {len(pdb_list)} files:", t1-t0)

Time to process 20 files: 33.328689098358154
