# Downloads ProtCID Interface Information
**[Work in progress]**

This notebook processes domain-domain interactions from ProtCID interface info from EPPIC

Data sources: 
[ProtCID]()

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import numpy as np
import pandas as pd
import gzip
import io
import requests
import json
import dateutil
import urllib
from pathlib import Path
from py2neo import Graph

In [2]:
# Never truncate displayed frames: show every column and every row
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Directory Neo4j imports from, taken from the NEO4J_IMPORT environment variable.
# os.getenv returns None when the variable is unset, and Path(None) would fail
# with an unhelpful TypeError, so report the missing variable explicitly.
neo4j_import_dir = os.getenv('NEO4J_IMPORT')
if neo4j_import_dir is None:
    raise EnvironmentError('The NEO4J_IMPORT environment variable is not set')
NEO4J_IMPORT = Path(neo4j_import_dir)
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


In [4]:
# Cached ProtCID domain-domain interaction dump inside the Neo4j import directory
# (NEO4J_IMPORT is already a Path, so the "/" operator yields a Path directly)
CACHE = NEO4J_IMPORT / 'cache/protcid/PdbPfamDomainDomainInteractions.txt.gz'

In [5]:
# The tab-separated table starts after the first 22 lines of the file
# (presumably a descriptive preamble -- confirm against the downloaded file).
n_skipped_lines = 22
df = pd.read_csv(CACHE, skiprows=n_skipped_lines, sep='\t')

In [6]:
# Preview the first ten rows of the loaded table
df.head(10)

Unnamed: 0,PdbID,AsymChain1,AsymChain2,AuthChain1,AuthChain2,SymmetryOp1,SymmetryOp2,UniProt1,UniProt2,PfamAcc1,PfamAcc2,PfamPairID,ClusterID,NumCFs_cluster,NumEntries_cluster,MinSeqIdentity,NumCFs_PfamPair,NumEntries_PfamPair,SurfaceArea,NumPdbBAs,NumPisaBAs
0,2rii,A,D,A,Y,1_555,1_555,Q06830,Q9BYN0,PF10417,PF02195,4,1,2,2,96,2,2,860.7375,2,2
1,3hy2,A,D,A,Y,1_555,1_555,Q06830,Q9BYN0,PF10417,PF02195,4,1,2,2,96,2,2,860.7375,2,2
2,1sc6,B,B,B,B,2_656,1_556,P0A9T0,P0A9T0,PF02826,PF01842,16,1,2,2,45,4,7,266.7667,0,0
3,3k5p,A,A,A,A,1_555,11_555,Q2YK82,Q2YK82,PF01842,PF02826,16,1,2,2,45,4,7,266.7667,0,0
4,1psd,B,B,B,B,1_555,4_456,P0A9T0,P0A9T0,PF02826,PF01842,16,2,1,3,98,4,7,313.9667,0,0
5,2p9c,A,A,A,A,1_555,3_456,P0A9T0,P0A9T0,PF02826,PF01842,16,2,1,3,98,4,7,313.9667,0,0
6,2p9g,A,A,A,A,3_555,1_555,P0A9T0,P0A9T0,PF01842,PF02826,16,2,1,3,98,4,7,313.9667,0,0
7,1c4t,A,B,A,B,1_555,1_555,P0AFG6,P0AFG6,PF00198,PF00198,30,1,9,21,26,10,22,1491.018,21,21
8,1dpb,A,A,A,A,1_555,5_555,P10802,P10802,PF00198,PF00198,30,1,9,21,26,10,22,1491.018,21,21
9,1scz,A,A,A,A,1_555,58_465,P0AFG6,P0AFG6,PF00198,PF00198,30,1,9,21,26,10,22,1491.018,21,21


In [7]:
# Number of distinct PDB entries in the loaded table
pdb_ids = df['PdbID'].unique()
len(pdb_ids)

78393

#### Remove intrachain interactions

In [8]:
# Keep only interfaces formed between two different asymmetric-unit chains.
# The previous second clause compared SymmetryOp1 with the string literal
# 'symmetryOp2' (not the SymmetryOp2 column); that comparison is effectively
# always true, so it never filtered anything. Dropping it leaves the result
# unchanged and removes the misleading condition.
# NOTE(review): contacts between symmetry copies of the same chain
# (AsymChain1 == AsymChain2 with different symmetry operators) are removed
# here as well -- confirm that this is intended.
df = df[df['AsymChain1'] != df['AsymChain2']]

In [9]:
# Number of distinct PDB entries left after dropping intrachain interactions
pdb_ids = df['PdbID'].unique()
len(pdb_ids)

51911

#### Remove homo interactions (same protein interacting in a complex)

In [10]:
# Keep only rows whose two interacting domains come from different UniProt entries
is_hetero = df['UniProt1'] != df['UniProt2']
df = df[is_hetero]

In [11]:
# Number of distinct PDB entries left after dropping same-protein interactions
pdb_ids = df['PdbID'].unique()
len(pdb_ids)

12113

In [12]:
#df.sort_values(by=['SurfaceArea'], ascending=False, inplace=True)

In [13]:
# Shape and first rows after removing intrachain and same-protein interactions
print(df.shape)
df.head()

(84340, 21)


Unnamed: 0,PdbID,AsymChain1,AsymChain2,AuthChain1,AuthChain2,SymmetryOp1,SymmetryOp2,UniProt1,UniProt2,PfamAcc1,PfamAcc2,PfamPairID,ClusterID,NumCFs_cluster,NumEntries_cluster,MinSeqIdentity,NumCFs_PfamPair,NumEntries_PfamPair,SurfaceArea,NumPdbBAs,NumPisaBAs
0,2rii,A,D,A,Y,1_555,1_555,Q06830,Q9BYN0,PF10417,PF02195,4,1,2,2,96,2,2,860.7375,2,2
1,3hy2,A,D,A,Y,1_555,1_555,Q06830,Q9BYN0,PF10417,PF02195,4,1,2,2,96,2,2,860.7375,2,2
142,4zwj,A,B,A,B,1_555,3_445,P20443,P08100,PF02752,PF00001,141,2,2,3,99,2,3,332.8917,0,0
143,5dgy,A,B,A,B,1_555,3_445,P20443,P08100,PF02752,PF00001,141,2,2,3,99,2,3,332.8917,0,0
144,5w0p,C,D,C,D,1_555,1_455,P20443,P08100,PF02752,PF00001,141,2,2,3,99,2,3,332.8917,0,0


In [14]:
# total: 320127
# (df['NumPdbBAs'] > 0) | (df['NumPisaBAs'] > 0): 186629
# (df['NumPisaBAs'] > 0): 174994
# (df['NumPdbBAs'] > 0): 163726

In [15]:
# Keep only interfaces whose NumPisaBAs count is positive
has_pisa_assembly = df['NumPisaBAs'] > 0
df = df[has_pisa_assembly]

In [16]:
# Rows and columns remaining after the NumPisaBAs > 0 filter
df.shape

(69317, 21)

In [17]:
def unique_id(row):
    """Build an order-independent key for the Pfam pair of a row.

    The two accessions are joined with '_' with the smaller one (by the ``>``
    comparison) first, so (A, B) and (B, A) produce the same key.

    Args:
        row: object exposing ``PfamAcc1`` and ``PfamAcc2`` string attributes,
            e.g. a DataFrame row.

    Returns:
        The key ``'<smaller>_<larger>'``.
    """
    first, second = row.PfamAcc1, row.PfamAcc2
    if first > second:
        first, second = second, first
    return first + '_' + second

In [18]:
# Add the order-independent Pfam-pair key to every row. DataFrame.assign
# returns a new frame, so the column is not written into `df` itself (a
# filtered selection of the loaded table), which avoids pandas'
# SettingWithCopyWarning. unique_id is passed directly; the lambda wrapper
# added nothing.
df = df.assign(uniqueId=df.apply(unique_id, axis=1))

In [19]:
# Check the newly added uniqueId column
df.head()

Unnamed: 0,PdbID,AsymChain1,AsymChain2,AuthChain1,AuthChain2,SymmetryOp1,SymmetryOp2,UniProt1,UniProt2,PfamAcc1,PfamAcc2,PfamPairID,ClusterID,NumCFs_cluster,NumEntries_cluster,MinSeqIdentity,NumCFs_PfamPair,NumEntries_PfamPair,SurfaceArea,NumPdbBAs,NumPisaBAs,uniqueId
0,2rii,A,D,A,Y,1_555,1_555,Q06830,Q9BYN0,PF10417,PF02195,4,1,2,2,96,2,2,860.7375,2,2,PF02195_PF10417
1,3hy2,A,D,A,Y,1_555,1_555,Q06830,Q9BYN0,PF10417,PF02195,4,1,2,2,96,2,2,860.7375,2,2,PF02195_PF10417
171,4cvn,A,H,A,H,1_555,1_555,Q9UZK4,P62010,PF13238,PF00411,321,1,1,2,100,1,2,1006.05,2,2,PF00411_PF13238
172,4cw7,A,B,A,B,1_555,1_555,Q9UZK4,P62010,PF13238,PF00411,321,1,1,2,100,1,2,1006.05,2,2,PF00411_PF13238
181,1g4a,A,F,E,C,1_555,1_555,P0A6H5,P0A7B8,PF07724,PF00227,338,1,4,9,17,4,9,224.4542,7,4,PF00227_PF07724


In [20]:
# Count the distinct order-independent Pfam pair keys
pfam_pair_keys = df['uniqueId'].unique()
print('Unique Pfam pairs:', len(pfam_pair_keys))

Unique Pfam pairs: 3585


In [21]:
def unique_order(row):
    """Return the row with its Pfam pair in canonical (ascending) order.

    When ``PfamAcc1`` sorts after ``PfamAcc2`` the two Pfam accessions are
    swapped, and the UniProt ids are swapped with them so each protein stays
    attached to its domain. Rows already in order are returned as-is.

    The swap is done on a copy: pandas does not support functions passed to
    ``DataFrame.apply`` that mutate the object they receive.

    Args:
        row: pandas Series with ``PfamAcc1``, ``PfamAcc2``, ``UniProt1`` and
            ``UniProt2`` entries (one row of the interaction table).

    Returns:
        A Series with the same labels whose ``PfamAcc1 <= PfamAcc2``.
    """
    if row.PfamAcc1 > row.PfamAcc2:
        row = row.copy()  # leave the caller's row untouched
        row['PfamAcc1'], row['PfamAcc2'] = row['PfamAcc2'], row['PfamAcc1']
        row['UniProt1'], row['UniProt2'] = row['UniProt2'], row['UniProt1']
    return row

In [22]:
# Work on an independent copy holding only the domain and protein identifiers
pair_columns = ['PfamAcc1', 'PfamAcc2', 'UniProt1', 'UniProt2']
df_unique = df[pair_columns].copy()

In [23]:
# Put every row's Pfam pair (and the matching UniProt ids) into canonical order
df_unique = df_unique.apply(unique_order, axis='columns')

In [24]:
# After reordering, PfamAcc1 <= PfamAcc2 in each row and the UniProt ids follow the swap
df_unique.head()

Unnamed: 0,PfamAcc1,PfamAcc2,UniProt1,UniProt2
0,PF02195,PF10417,Q9BYN0,Q06830
1,PF02195,PF10417,Q9BYN0,Q06830
171,PF00411,PF13238,P62010,Q9UZK4
172,PF00411,PF13238,P62010,Q9UZK4
181,PF00227,PF07724,P0A7B8,P0A6H5


In [25]:
# The row count is unchanged by the reordering: unique_order only swaps values within a row
df_unique.shape

(69317, 4)

In [26]:
# Inspect the last rows; in the recorded output the UniProt columns also hold
# identifiers that start with a digit (e.g. '6qxf2') rather than UniProt accessions
df_unique.tail()

Unnamed: 0,PfamAcc1,PfamAcc2,UniProt1,UniProt2
365535,PF00028,PF00541,Q14126,P04501
365536,PF00028,PF00541,Q14126,P04501
365561,PF01867,PF09711,6qxf2,6qxf1
365562,PF01867,PF09711,6qxt2,6qxt1
365563,PF01867,PF09711,6qy32,6qy31


In [27]:
# Derive the prefixed Pfam accessions and the '-'-joined pair id in one step;
# the columns are appended in the order accession1, accession2, id
df_unique = df_unique.assign(
    accession1='pfam:' + df_unique['PfamAcc1'],
    accession2='pfam:' + df_unique['PfamAcc2'],
    id=df_unique['PfamAcc1'] + '-' + df_unique['PfamAcc2'],
)

In [28]:
def _protein_accession(uniprot_id):
    """Return 'uniprot:<id>' for a usable id, or '' otherwise.

    Ids whose first character is a digit (e.g. '6qxf2' in the ProtCID table)
    are treated as not being UniProt accessions. Missing (non-string) and
    empty values also yield '' instead of raising on ``uniprot_id[0]``.
    """
    if not isinstance(uniprot_id, str) or not uniprot_id or uniprot_id[0].isdigit():
        return ''
    return 'uniprot:' + uniprot_id


# only use valid UniProt Ids; anything else becomes an empty accession
df_unique['proteinAccession1'] = df_unique['UniProt1'].apply(_protein_accession)
df_unique['proteinAccession2'] = df_unique['UniProt2'].apply(_protein_accession)

In [29]:
# Keep only the export columns and collapse rows that are identical across all of them
export_columns = ['id', 'accession1', 'accession2', 'proteinAccession1', 'proteinAccession2']
df_unique = df_unique[export_columns].drop_duplicates()

In [30]:
# Row count of the export table: one row per distinct combination of
# domain pair and protein accession pair
n_rows = len(df_unique)
print('Number of domain-domain pairs:', n_rows)

Number of domain-domain pairs: 14581


In [31]:
# Preview the table that is written to the Neo4j import directory
df_unique.head()

Unnamed: 0,id,accession1,accession2,proteinAccession1,proteinAccession2
0,PF02195-PF10417,pfam:PF02195,pfam:PF10417,uniprot:Q9BYN0,uniprot:Q06830
171,PF00411-PF13238,pfam:PF00411,pfam:PF13238,uniprot:P62010,uniprot:Q9UZK4
181,PF00227-PF07724,pfam:PF00227,pfam:PF07724,uniprot:P0A7B8,uniprot:P0A6H5
183,PF00227-PF07724,pfam:PF00227,pfam:PF07724,uniprot:P43772,uniprot:P43773
186,PF00227-PF07724,pfam:PF00227,pfam:PF07724,uniprot:P39070,uniprot:P0A6H5


In [32]:
# Write the interaction table (without the pandas index) to the Neo4j import directory
output_file = NEO4J_IMPORT / "01g-ProtCIDInteraction.csv"
df_unique.to_csv(output_file, index=False)