# Protein Protein Interaction Data
**[Work in progress]**

This notebook downloads and standardizes viral-host protein data from IntAct for ingestion into the Knowledge Graph.

Data source: [IntAct](https://www.ebi.ac.uk/intact/query/pubid:IM-27814)

Authors: Kaushik Ganapathy, Eric Yu, Peter Rose (krganapa@ucsd.edu, ery010@ucsd.edu, pwrose@ucsd.edu)

In [1]:
import os
import re
import hashlib 

import pandas as pd
import numpy as np

from pathlib import Path
from Bio import SeqIO

pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [2]:
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


https://www.uniprot.org/help/uniprotkb_column_names
https://www.uniprot.org/uniprot/P0DTD1#PRO_0000449630

### Get list of organisms to include in the Knowledge Graph

In [3]:
organisms = pd.read_csv("../../reference_data/Organism.csv", dtype=str)

In [4]:
organisms = organisms[organisms['id'].str.startswith('taxonomy')]
# remove CURIE
organisms['taxonomy'] = organisms['id'].apply(lambda x: x.split(':')[1])
taxonomy_ids = organisms['taxonomy'].unique()

In [5]:
taxonomy_ids

array(['2697049', '1263720', '694009', '443239', '31631', '11137',
       '277944', '12131', '12134', '9606', '10090', '59477', '608659',
       '49442', '9974', '143292', '71116', '9608', '9685', '9666',
       '419130', '452646', '10036', '9689', '9694'], dtype=object)

### Retrieve interaction data from IntAct

In [6]:
urls = [f'https://www.ebi.ac.uk/intact/export?format=mitab_25&query=taxid%3A{taxon}&negative=false&spoke=false&ontology=false&sort=intact-miscore&asc=false'
        for taxon in taxonomy_ids]

In [7]:
data = pd.concat((pd.read_csv(url, sep='\t', dtype='str') for url in urls))

In [8]:
print('Number of interactions:', data.shape[0])

Number of interactions: 780441


In [9]:
data.head()

Unnamed: 0,#ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,Taxid interactor B,Interaction type(s),Source database(s),Interaction identifier(s),Confidence value(s)
0,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,"psi-mi:""MI:0410""(3D electron microscopy)",Walls et al. (2020),pubmed:32155444|imex:IM-27846,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25495631|wwpdb:6vyb|wwpdb:6vxx|imex...,intact-miscore:0.94
1,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,"psi-mi:""MI:0114""(x-ray crystallography)",Xia et al. (2020),imex:IM-27873|pubmed:32231345,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25503580|wwpdb:6LXT|imex:IM-27873-1,intact-miscore:0.94
2,uniprotkb:P0DTD1-PRO_0000449632,uniprotkb:P0DTD1-PRO_0000449632,intact:EBI-25475891,intact:EBI-25475891,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Kim. et al. (2020),pubmed:32304108|imex:IM-27884,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25504928|imex:IM-27884-1|wwpdb:6vww...,intact-miscore:0.44
3,uniprotkb:P0DTD1-PRO_0000449625,uniprotkb:P0DTD1-PRO_0000449626,intact:EBI-25475871,intact:EBI-25475874,psi-mi:p0dtd1-pro_0000449625(display_long)|uni...,psi-mi:p0dtd1-pro_0000449626(display_long)|uni...,"psi-mi:""MI:0410""(3D electron microscopy)",Gao et al. (2020),pubmed:32277040|imex:IM-27888,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25506373|wwpdb:7btf|wwpdb:6M71|imex...,intact-miscore:0.94
4,uniprotkb:P0DTD1-PRO_0000449625,uniprotkb:P0DTD1-PRO_0000449629,intact:EBI-25475871,intact:EBI-25475885,psi-mi:p0dtd1-pro_0000449625(display_long)|uni...,psi-mi:p0dtd1-pro_0000449629(display_long)|uni...,"psi-mi:""MI:0410""(3D electron microscopy)",Gao et al. (2020),pubmed:32277040|imex:IM-27888,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25506373|wwpdb:7btf|wwpdb:6M71|imex...,intact-miscore:0.86


### Process Data

In [10]:
# change CURIE from wwpdb to pdb
data['interactionId'] = data['Interaction identifier(s)'].str.replace('wwpdb', 'pdb')
# represent one-to-many relationship as semicolon separated string
data['interactionId'] = data['interactionId'].str.replace('|', ';')

#### Remove duplicates

In [11]:
data.drop_duplicates(subset=['interactionId'], inplace=True)

In [12]:
print('Number of interactions without duplicates:', data.shape[0])

Number of interactions without duplicates: 446520


In [13]:
data.head()

Unnamed: 0,#ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,Taxid interactor B,Interaction type(s),Source database(s),Interaction identifier(s),Confidence value(s),interactionId
0,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,"psi-mi:""MI:0410""(3D electron microscopy)",Walls et al. (2020),pubmed:32155444|imex:IM-27846,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25495631|wwpdb:6vyb|wwpdb:6vxx|imex...,intact-miscore:0.94,intact:EBI-25495631;pdb:6vyb;pdb:6vxx;imex:IM-...
1,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,"psi-mi:""MI:0114""(x-ray crystallography)",Xia et al. (2020),imex:IM-27873|pubmed:32231345,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25503580|wwpdb:6LXT|imex:IM-27873-1,intact-miscore:0.94,intact:EBI-25503580;pdb:6LXT;imex:IM-27873-1
2,uniprotkb:P0DTD1-PRO_0000449632,uniprotkb:P0DTD1-PRO_0000449632,intact:EBI-25475891,intact:EBI-25475891,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Kim. et al. (2020),pubmed:32304108|imex:IM-27884,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25504928|imex:IM-27884-1|wwpdb:6vww...,intact-miscore:0.44,intact:EBI-25504928;imex:IM-27884-1;pdb:6vww;p...
3,uniprotkb:P0DTD1-PRO_0000449625,uniprotkb:P0DTD1-PRO_0000449626,intact:EBI-25475871,intact:EBI-25475874,psi-mi:p0dtd1-pro_0000449625(display_long)|uni...,psi-mi:p0dtd1-pro_0000449626(display_long)|uni...,"psi-mi:""MI:0410""(3D electron microscopy)",Gao et al. (2020),pubmed:32277040|imex:IM-27888,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25506373|wwpdb:7btf|wwpdb:6M71|imex...,intact-miscore:0.94,intact:EBI-25506373;pdb:7btf;pdb:6M71;imex:IM-...
5,uniprotkb:P0DTD1-PRO_0000449626,uniprotkb:P0DTD1-PRO_0000449625,intact:EBI-25475874,intact:EBI-25475871,psi-mi:p0dtd1-pro_0000449626(display_long)|uni...,psi-mi:p0dtd1-pro_0000449625(display_long)|uni...,"psi-mi:""MI:0071""(molecular sieving)",Gao et al. (2020),pubmed:32277040|imex:IM-27888,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25510487|imex:IM-27888-3,intact-miscore:0.94,intact:EBI-25510487;imex:IM-27888-3


#### Apply naming conventions

In [14]:
data.rename(columns={'Alt. ID(s) interactor A': 'interactorA', 'Alt. ID(s) interactor B': 'interactorB'}, inplace=True)

In [15]:
#data.rename(columns={'#ID(s) interactor A': 'interactorA', 'ID(s) interactor B': 'interactorB'}, inplace=True)

#### Extract UniProt accession number and Uniprot protein ids from interactor columns

In [16]:
# uniprot:P0DTD1-PRO_0000449619 -> P0DTD1-PRO_0000449619
data['id_a'] = data['#ID(s) interactor A'].str.replace('uniprotkb:', '')
data['id_b'] = data['ID(s) interactor B'].str.replace('uniprotkb:', '')

# P0DTD1-PRO_0000449619 -> P0DTD1 (UniProt accession number)
data['accession_a'] = data['id_a'].str.split('-PRO', expand=True)[0]
data['accession_b'] = data['id_b'].str.split('-PRO', expand=True)[0]

# Remove isoform id: Q9UBL6-2 -> Q9UBL6
data['accession_a'] = data['accession_a'].str.split('-', expand=True)[0]
data['accession_b'] = data['accession_b'].str.split('-', expand=True)[0]

# ADD CURIE "uniprot" as prefix (see https://registry.identifiers.org/registry/uniprot)
data['accession_a'] = 'uniprot:' + data['accession_a']
data['accession_b'] = 'uniprot:' + data['accession_b']

# P0DTD1-PRO_0000449619 -> PRO_0000449619 (UniProt protein id)
data['pro_id_a'] = data['id_a'].str.split('-PRO', expand=True)[1]
data['pro_id_b'] = data['id_b'].str.split('-PRO', expand=True)[1]

# Add CURIE "uniprot.chain" as prefix (see https://registry.identifiers.org/registry/uniprot.chain)
data['pro_id_a'] = data['pro_id_a'].str.replace('_', 'uniprot.chain:PRO_')
data['pro_id_b'] = data['pro_id_b'].str.replace('_', 'uniprot.chain:PRO_')

data.fillna('', inplace=True)

In [17]:
# extract text descriptions in  parenthesis
data['detectionMethod'] = data['Interaction detection method(s)'].str.extract('.*\((.*)\).*') 
data['interactionType'] = data['Interaction type(s)'].str.extract('.*\((.*)\).*') 

In [18]:
# set no-numerical values to ""
data['confidenceValue'] = data['Confidence value(s)'].str.split(':', expand=True)[1]
data['confidenceValue'] = data['confidenceValue'].apply(lambda s: s if s and s.replace('.','',1).isdigit() else '')

In [19]:
data['confidenceValue'].unique()

array(['0.94', '0.44', '0.93', '0.37', '0.70', '0.84', '0.49', '0.36',
       '0.56', '0.99', '0.62', '0.50', '0.53', '0.35', '0.40', '0.27',
       '0.64', '0.59', '0.32', '0.54', '0.80', '0.77', '0.72', '0.71',
       '0.68', '0.46', '0.66', '0.63', '0.87', '0.52', '0.51', '0.73',
       '0.81', '0.67', '0.89', '0.98', '0.91', '0.65', '0.57', '0.74',
       '0.61', '0.75', '0.76', '0.69', '0.43', '0.55', '0.60', '0.58',
       '0.82', '0.78', '0.83', '0.88', '0.96', '0.97', '0.90', '0.95',
       '0.85', '0.86', '0.92', '0.79', '0.41', '0.48', '', '0.28', '0.47',
       '0.31', '0.22', '0.45', '0.23', '0.34'], dtype=object)

#### Extract pubmed id
Example: imex:IM-27912|pubmed:32275855 -> 2275855

In [20]:
data['pubmedId'] = data['Publication Identifier(s)'].str.extract('pubmed:(\d*)')
# add CURIE prefix if pubmedId exists
data['pubmedId'] = data['pubmedId'].apply(lambda s: 'pubmed:' + s if len(s) > 0 else '')

#### Extract taxonomy id
Example: taxid:9606(human)|taxid:9606(Homo sapiens) -> 9606

In [21]:
position_pattern = re.compile('taxid:(\d*)\(')

def extract_tax_id(s):
    groups = position_pattern.search(s)
    if groups == None:
        return ''
    else:
        return groups.group(1)

In [22]:
#data['taxonomy_id_a'] = data['Taxid interactor A'].apply(extract_tax_id)
#data['taxonomy_id_b'] = data['Taxid interactor B'].apply(extract_tax_id)
data['taxonomy_id_a'] = data['Taxid interactor A'].str.extract('taxid:(\d*)\(')
data['taxonomy_id_b'] = data['Taxid interactor B'].str.extract('taxid:(\d*)\(')

In [23]:
data = data[['id_a', 'id_b', 'accession_a', 'accession_b', 'pro_id_a', 'pro_id_b', 'interactorA', 'interactorB', 'taxonomy_id_a', 'taxonomy_id_b', 'interactionId', 'interactionType', 'detectionMethod', 'confidenceValue','pubmedId']]

In [24]:
data.head()

Unnamed: 0,id_a,id_b,accession_a,accession_b,pro_id_a,pro_id_b,interactorA,interactorB,taxonomy_id_a,taxonomy_id_b,interactionId,interactionType,detectionMethod,confidenceValue,pubmedId
0,P0DTC2,P0DTC2,uniprot:P0DTC2,uniprot:P0DTC2,,,intact:EBI-25474821,intact:EBI-25474821,2697049,2697049,intact:EBI-25495631;pdb:6vyb;pdb:6vxx;imex:IM-...,direct interaction,3D electron microscopy,0.94,pubmed:32155444
1,P0DTC2,P0DTC2,uniprot:P0DTC2,uniprot:P0DTC2,,,intact:EBI-25474821,intact:EBI-25474821,2697049,2697049,intact:EBI-25503580;pdb:6LXT;imex:IM-27873-1,direct interaction,x-ray crystallography,0.94,pubmed:32231345
2,P0DTD1-PRO_0000449632,P0DTD1-PRO_0000449632,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449632,uniprot.chain:PRO_0000449632,intact:EBI-25475891,intact:EBI-25475891,2697049,2697049,intact:EBI-25504928;imex:IM-27884-1;pdb:6vww;p...,direct interaction,x-ray crystallography,0.44,pubmed:32304108
3,P0DTD1-PRO_0000449625,P0DTD1-PRO_0000449626,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449625,uniprot.chain:PRO_0000449626,intact:EBI-25475871,intact:EBI-25475874,2697049,2697049,intact:EBI-25506373;pdb:7btf;pdb:6M71;imex:IM-...,physical association,3D electron microscopy,0.94,pubmed:32277040
5,P0DTD1-PRO_0000449626,P0DTD1-PRO_0000449625,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449626,uniprot.chain:PRO_0000449625,intact:EBI-25475874,intact:EBI-25475871,2697049,2697049,intact:EBI-25510487;imex:IM-27888-3,direct interaction,molecular sieving,0.94,pubmed:32277040


#### Restrict data to the set of currently supported taxonomy ids

In [25]:
data = data[data['taxonomy_id_a'].isin(taxonomy_ids) & data['taxonomy_id_b'].isin(taxonomy_ids)]

Remove data with accession numbers that are not UniProt accession numbers

In [26]:
data = data[~(data['id_a'].str.contains(':')) & ~(data['id_b'].str.contains(':'))]

Remove self-interactions (they make graph display too crowded)

In [27]:
data = data[~(data['accession_a'] == data['accession_b'])]

In [28]:
data.shape

(382232, 15)

In [29]:
data['source'] = 'IntAct'

### Save interaction data

In [30]:
#data = data[['accession_a', 'accession_b', 'pro_id_a', 'pro_id_b', 'interactorA', 'interactorB', 'source', 'pubmedId']]
data.drop_duplicates(inplace=True)
data.to_csv(NEO4J_IMPORT / '01e-ProteinProteinInteraction.csv', index = False)

In [31]:
print('Number of interactions:', data.shape[0])

Number of interactions: 382232


In [32]:
data.sample(5)

Unnamed: 0,id_a,id_b,accession_a,accession_b,pro_id_a,pro_id_b,interactorA,interactorB,taxonomy_id_a,taxonomy_id_b,interactionId,interactionType,detectionMethod,confidenceValue,pubmedId,source
117726,P35240,Q9UNE7,uniprot:P35240,uniprot:Q9UNE7,,,intact:EBI-1014472|uniprotkb:O95683|uniprotkb:...,intact:EBI-357085|uniprotkb:O60526|uniprotkb:Q...,9606,9606,intact:EBI-25878302;imex:IM-28217-54057,physical association,two hybrid array,0.56,pubmed:32814053,IntAct
469693,Q8TCT0,O60566,uniprot:Q8TCT0,uniprot:O60566,,,intact:EBI-10274247|uniprotkb:A8K611|uniprotkb...,intact:EBI-1001438|uniprotkb:O60501|uniprotkb:...,9606,9606,intact:EBI-11137874;imex:IM-24272-1123,association,anti tag coimmunoprecipitation,0.35,pubmed:26496610,IntAct
137772,Q9H8Y8,Q96HD9,uniprot:Q9H8Y8,uniprot:Q96HD9,,,intact:EBI-739467|uniprotkb:Q96I74|uniprotkb:Q...,intact:EBI-3916242,9606,9606,intact:EBI-22269183;imex:IM-27553-833,physical association,two hybrid array,0.87,pubmed:29892012,IntAct
94050,P42858,O00300,uniprot:P42858,uniprot:O00300,,,intact:EBI-466029|uniprotkb:Q9UQB7,intact:EBI-15481185|uniprotkb:B2R9A8|uniprotkb...,9606,9606,intact:EBI-26251576;imex:IM-28217-47651,physical association,validated two hybrid,0.56,pubmed:32814053,IntAct
417530,Q9H7E9,Q9UMY1,uniprot:Q9H7E9,uniprot:Q9UMY1,,,intact:EBI-715389|uniprotkb:A6NGC0|uniprotkb:Q...,intact:EBI-2862609|uniprotkb:Q9Y3U7|uniprotkb:...,9606,9606,intact:EBI-7109820;mint:MINT-8250490;imex:IM-1...,physical association,two hybrid,0.37,pubmed:21900206,IntAct
