# Downloads ProtCID Interface Information
**[Work in progress]**

This notebook processes domain-domain interactions from ProtCID interface info from EPPIC

Data sources: 
[ProtCID]()

Author: Peter Rose (pwrose@ucsd.edu)

In [6]:
import os
import numpy as np
import pandas as pd
import gzip
import io
import requests
import json
import dateutil
import urllib
from pathlib import Path
from py2neo import Graph

In [7]:
# Lift pandas' display limits so frames are rendered without truncation.
pd.set_option('display.max_rows', None)  # display all rows
pd.set_option('display.max_columns', None)  # display all columns

In [8]:
# Neo4j import directory, taken from the NEO4J_IMPORT environment variable.
# Fail early with an explicit message: Path(None) would otherwise raise an
# opaque TypeError when the variable is not set.
neo4j_import_dir = os.getenv('NEO4J_IMPORT')
if not neo4j_import_dir:
    raise EnvironmentError('Environment variable NEO4J_IMPORT is not set')
NEO4J_IMPORT = Path(neo4j_import_dir)
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


In [9]:
# Location of the gzipped ProtCID interaction table below the import directory.
# NEO4J_IMPORT is already a Path, so the `/` operator yields a Path directly.
CACHE = NEO4J_IMPORT / 'cache/protcid/PdbPfamDomainDomainInteractions.txt.gz'

In [10]:
# Read the cached tab-separated table. The first 22 lines are skipped
# (presumably a descriptive preamble ahead of the column header — confirm
# against the file if its layout changes).
df = pd.read_csv(
    CACHE,
    sep='\t',
    skiprows=22,
)

In [11]:
# Preview the first ten rows of the loaded table.
df.head(n=10)

Unnamed: 0,PdbID,AsymChain1,AsymChain2,AuthChain1,AuthChain2,SymmetryOp1,SymmetryOp2,UniProt1,UniProt2,PfamAcc1,PfamAcc2,PfamPairID,ClusterID,NumCFs_cluster,NumEntries_cluster,MinSeqIdentity,NumCFs_PfamPair,NumEntries_PfamPair,SurfaceArea,NumPdbBAs,NumPisaBAs
0,2rii,A,D,A,Y,1_555,1_555,Q06830,Q9BYN0,PF10417,PF02195,4,1,2,2,96,2,2,860.7375,2,2
1,3hy2,A,D,A,Y,1_555,1_555,Q06830,Q9BYN0,PF10417,PF02195,4,1,2,2,96,2,2,860.7375,2,2
2,1sc6,B,B,B,B,2_656,1_556,P0A9T0,P0A9T0,PF02826,PF01842,16,1,2,2,45,4,7,266.7667,0,0
3,3k5p,A,A,A,A,1_555,11_555,Q2YK82,Q2YK82,PF01842,PF02826,16,1,2,2,45,4,7,266.7667,0,0
4,1psd,B,B,B,B,1_555,4_456,P0A9T0,P0A9T0,PF02826,PF01842,16,2,1,3,98,4,7,313.9667,0,0
5,2p9c,A,A,A,A,1_555,3_456,P0A9T0,P0A9T0,PF02826,PF01842,16,2,1,3,98,4,7,313.9667,0,0
6,2p9g,A,A,A,A,3_555,1_555,P0A9T0,P0A9T0,PF01842,PF02826,16,2,1,3,98,4,7,313.9667,0,0
7,1c4t,A,B,A,B,1_555,1_555,P0AFG6,P0AFG6,PF00198,PF00198,30,1,9,21,26,10,22,1491.018,21,21
8,1dpb,A,A,A,A,1_555,5_555,P10802,P10802,PF00198,PF00198,30,1,9,21,26,10,22,1491.018,21,21
9,1scz,A,A,A,A,1_555,58_465,P0AFG6,P0AFG6,PF00198,PF00198,30,1,9,21,26,10,22,1491.018,21,21


In [12]:
# Number of distinct PdbID values in the loaded table (a missing value, if any,
# counts as one distinct value, as it does with unique()).
df['PdbID'].nunique(dropna=False)

78393

In [15]:
# Keep only rows whose two asymmetric-unit chains differ.
#
# NOTE(review): the original filter also contained
#     (df['SymmetryOp1'] != 'symmetryOp2')
# which compares the column against the literal string 'symmetryOp2', not
# against the SymmetryOp2 column. Symmetry operators in this table look like
# '1_555', so that term was always True and never removed a row. It is dropped
# here so the code states what it actually does; the resulting rows are the
# same as before. If a symmetry-operator constraint was intended, add an
# explicit column comparison, e.g.
#     & (df['SymmetryOp1'] == df['SymmetryOp2'])
# and re-check the row counts reported further down.
df = df[df['AsymChain1'] != df['AsymChain2']]

In [16]:
# Number of distinct PdbID values remaining after the chain filter.
df['PdbID'].nunique(dropna=False)

51911

In [17]:
#df.sort_values(by=['SurfaceArea'], ascending=False, inplace=True)

In [18]:
# Report the (rows, columns) of the filtered table, then preview its first rows.
print(df.shape)
df.head(n=5)

(219884, 21)


Unnamed: 0,PdbID,AsymChain1,AsymChain2,AuthChain1,AuthChain2,SymmetryOp1,SymmetryOp2,UniProt1,UniProt2,PfamAcc1,PfamAcc2,PfamPairID,ClusterID,NumCFs_cluster,NumEntries_cluster,MinSeqIdentity,NumCFs_PfamPair,NumEntries_PfamPair,SurfaceArea,NumPdbBAs,NumPisaBAs
0,2rii,A,D,A,Y,1_555,1_555,Q06830,Q9BYN0,PF10417,PF02195,4,1,2,2,96,2,2,860.7375,2,2
1,3hy2,A,D,A,Y,1_555,1_555,Q06830,Q9BYN0,PF10417,PF02195,4,1,2,2,96,2,2,860.7375,2,2
7,1c4t,A,B,A,B,1_555,1_555,P0AFG6,P0AFG6,PF00198,PF00198,30,1,9,21,26,10,22,1491.018,21,21
10,2ii3,A,B,A,B,1_555,1_555,P11181,P11181,PF00198,PF00198,30,1,9,21,26,10,22,1491.018,21,21
12,3mae,A,B,A,B,1_555,1_555,3mae1,3mae1,PF00198,PF00198,30,1,9,21,26,10,22,1491.018,21,21


In [19]:
# total: 320127
# (df['NumPdbBAs'] > 0) | (df['NumPisaBAs'] > 0): 186629
# (df['NumPisaBAs'] > 0): 174994
# (df['NumPdbBAs'] > 0): 163726

In [20]:
# Keep only rows with a positive NumPisaBAs value
# (presumably the number of PISA biological assemblies — confirm).
df = df.loc[df['NumPisaBAs'].gt(0)]

In [21]:
# Dimensions (rows, columns) of the table after the NumPisaBAs filter.
df.shape

(142982, 21)

In [22]:
def unique_id(row):
    """Return an order-independent key for the row's Pfam pair.

    The values of ``row.PfamAcc1`` and ``row.PfamAcc2`` are joined with ``'_'``,
    smaller accession first, so (A, B) and (B, A) produce the same key.
    """
    first, second = sorted((row.PfamAcc1, row.PfamAcc2))
    return first + '_' + second

In [23]:
# Attach the order-independent Pfam pair key to every row.
# assign() builds a new frame instead of writing a column into `df`, which at
# this point is the result of boolean filtering; in-place column assignment on
# such a frame can trigger pandas' SettingWithCopyWarning. The redundant lambda
# wrapper around unique_id is dropped as well.
df = df.assign(uniqueId=df.apply(unique_id, axis=1))

In [24]:
# Preview the first rows, now including the uniqueId column.
df.head(n=5)

Unnamed: 0,PdbID,AsymChain1,AsymChain2,AuthChain1,AuthChain2,SymmetryOp1,SymmetryOp2,UniProt1,UniProt2,PfamAcc1,PfamAcc2,PfamPairID,ClusterID,NumCFs_cluster,NumEntries_cluster,MinSeqIdentity,NumCFs_PfamPair,NumEntries_PfamPair,SurfaceArea,NumPdbBAs,NumPisaBAs,uniqueId
0,2rii,A,D,A,Y,1_555,1_555,Q06830,Q9BYN0,PF10417,PF02195,4,1,2,2,96,2,2,860.7375,2,2,PF02195_PF10417
1,3hy2,A,D,A,Y,1_555,1_555,Q06830,Q9BYN0,PF10417,PF02195,4,1,2,2,96,2,2,860.7375,2,2,PF02195_PF10417
7,1c4t,A,B,A,B,1_555,1_555,P0AFG6,P0AFG6,PF00198,PF00198,30,1,9,21,26,10,22,1491.018,21,21,PF00198_PF00198
10,2ii3,A,B,A,B,1_555,1_555,P11181,P11181,PF00198,PF00198,30,1,9,21,26,10,22,1491.018,21,21,PF00198_PF00198
12,3mae,A,B,A,B,1_555,1_555,3mae1,3mae1,PF00198,PF00198,30,1,9,21,26,10,22,1491.018,21,21,PF00198_PF00198


In [25]:
# Count the distinct order-independent Pfam pair keys.
print('Unique Pfam pairs:', df['uniqueId'].nunique(dropna=False))

Unique Pfam pairs: 7684


In [26]:
def unique_order(row):
    """Return the row with its Pfam pair in ascending accession order.

    When ``PfamAcc1`` sorts after ``PfamAcc2`` the two Pfam accessions are
    swapped, and ``UniProt1``/``UniProt2`` are swapped together with them so
    each protein accession stays on the same side as its domain.

    Args:
        row: pandas Series with the labels 'PfamAcc1', 'PfamAcc2', 'UniProt1'
            and 'UniProt2' (one row handed over by ``DataFrame.apply(axis=1)``).

    Returns:
        The row itself when it is already ordered, otherwise a reordered copy.
        The input row is never modified.
    """
    if not (row['PfamAcc1'] > row['PfamAcc2']):
        return row
    # Work on a copy: pandas does not support functions that mutate the object
    # passed in by apply().
    ordered = row.copy()
    ordered['PfamAcc1'], ordered['PfamAcc2'] = row['PfamAcc2'], row['PfamAcc1']
    ordered['UniProt1'], ordered['UniProt2'] = row['UniProt2'], row['UniProt1']
    return ordered

In [27]:
# Independent copy holding only the Pfam and UniProt accession columns.
df_unique = df.loc[:, ['PfamAcc1', 'PfamAcc2', 'UniProt1', 'UniProt2']].copy()

In [28]:
# Put every row's Pfam pair (and the matching UniProt pair) into ascending order.
df_unique = df_unique.apply(func=unique_order, axis='columns')

In [29]:
# Preview the first rows of the reordered pair table.
df_unique.head(n=5)

Unnamed: 0,PfamAcc1,PfamAcc2,UniProt1,UniProt2
0,PF02195,PF10417,Q9BYN0,Q06830
1,PF02195,PF10417,Q9BYN0,Q06830
7,PF00198,PF00198,P0AFG6,P0AFG6
10,PF00198,PF00198,P11181,P11181
12,PF00198,PF00198,3mae1,3mae1


In [30]:
# Dimensions (rows, columns) of the reordered pair table.
df_unique.shape

(142982, 4)

In [31]:
# Preview the last rows of the reordered pair table.
df_unique.tail(n=5)

Unnamed: 0,PfamAcc1,PfamAcc2,UniProt1,UniProt2
365535,PF00028,PF00541,Q14126,P04501
365536,PF00028,PF00541,Q14126,P04501
365561,PF01867,PF09711,6qxf2,6qxf1
365562,PF01867,PF09711,6qxt2,6qxt1
365563,PF01867,PF09711,6qy32,6qy31


In [32]:
# Derive the prefixed Pfam accessions and a '-'-joined pair id in one step.
df_unique = df_unique.assign(
    accession1='pfam:' + df_unique['PfamAcc1'],
    accession2='pfam:' + df_unique['PfamAcc2'],
    id=df_unique['PfamAcc1'] + '-' + df_unique['PfamAcc2'],
)

In [33]:
# only use valid UniProt Ids
def protein_accession(uniprot_id):
    """Return 'uniprot:<id>' for a UniProt accession, or '' for anything else.

    Identifiers whose first character is a digit (e.g. '3mae1' in this table)
    are treated as not being UniProt accessions. Missing or empty values also
    yield '' instead of raising when the first character is inspected.
    """
    if not isinstance(uniprot_id, str) or not uniprot_id or uniprot_id[0].isdigit():
        return ''
    return 'uniprot:' + uniprot_id


# only use valid UniProt Ids
df_unique['proteinAccession1'] = df_unique['UniProt1'].apply(protein_accession)
df_unique['proteinAccession2'] = df_unique['UniProt2'].apply(protein_accession)

In [34]:
# Reduce to the export columns and drop repeated rows.
df_unique = df_unique.loc[
    :, ['id', 'accession1', 'accession2', 'proteinAccession1', 'proteinAccession2']
].drop_duplicates()

In [35]:
# Row count of the de-duplicated table; rows are distinct combinations of the
# Pfam pair and the two protein accessions.
print('Number of domain-domain pairs:', len(df_unique))

Number of domain-domain pairs: 37406


In [36]:
# Preview the first rows of the table that will be exported.
df_unique.head(n=5)

Unnamed: 0,id,accession1,accession2,proteinAccession1,proteinAccession2
0,PF02195-PF10417,pfam:PF02195,pfam:PF10417,uniprot:Q9BYN0,uniprot:Q06830
7,PF00198-PF00198,pfam:PF00198,pfam:PF00198,uniprot:P0AFG6,uniprot:P0AFG6
10,PF00198-PF00198,pfam:PF00198,pfam:PF00198,uniprot:P11181,uniprot:P11181
12,PF00198-PF00198,pfam:PF00198,pfam:PF00198,,
13,PF00198-PF00198,pfam:PF00198,pfam:PF00198,uniprot:Q9HIA5,uniprot:Q9HIA5


In [39]:
# Write the de-duplicated table to the Neo4j import directory, without the index.
output_path = NEO4J_IMPORT / "01g-ProtCIDInteraction.csv"
df_unique.to_csv(output_path, index=False)