# Protein Protein Interaction Data
**[Work in progress]**

This notebook downloads and standardizes viral-host protein data from IntAct for ingestion into the Knowledge Graph.

Data source: [IntAct](https://www.ebi.ac.uk/intact/query/pubid:IM-27814)

Authors: Kaushik Ganapathy, Eric Yu, Peter Rose (krganapa@ucsd.edu, ery010@ucsd.edu, pwrose@ucsd.edu)

In [1]:
import os
import re
import hashlib 

import pandas as pd
import numpy as np

from pathlib import Path
from Bio import SeqIO

# Never truncate rendered DataFrames: show all rows and all columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
# Output directory for the CSV file written at the end of this notebook, taken
# from the NEO4J_IMPORT environment variable. Fail early with a clear message
# when it is missing: Path(None) raises an opaque TypeError, and Path('')
# would silently resolve to the current working directory.
neo4j_import_dir = os.getenv('NEO4J_IMPORT')
if not neo4j_import_dir:
    raise EnvironmentError('Environment variable NEO4J_IMPORT is not set; it must point to the Neo4j import directory.')
NEO4J_IMPORT = Path(neo4j_import_dir)
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


https://www.uniprot.org/help/uniprotkb_column_names
https://www.uniprot.org/uniprot/P0DTD1#PRO_0000449630

### Get list of organisms to include in the Knowledge Graph

In [3]:
# Load the reference list of organisms; dtype=str reads every column as text.
organisms = pd.read_csv("../../reference_data/Organism.csv", dtype=str)

In [4]:
# Keep only organisms whose id is a taxonomy CURIE (starts with "taxonomy");
# rows with a missing id are dropped (na=False).
# .copy() makes the filtered frame independent of the original, so adding a
# column to it does not trigger pandas' SettingWithCopyWarning.
organisms = organisms[organisms['id'].str.startswith('taxonomy', na=False)].copy()
# Remove the CURIE prefix: "taxonomy:9606" -> "9606"
organisms['taxonomy'] = organisms['id'].str.split(':').str[1]
taxonomy_ids = organisms['taxonomy'].unique()

In [5]:
taxonomy_ids

array(['2697049', '1263720', '694009', '443239', '31631', '11137',
       '277944', '12131', '12134', '9606', '10090', '59477', '608659',
       '49442', '9974', '143292', '71116', '9608', '9685', '9666',
       '419130', '452646', '10036', '9689', '9694'], dtype=object)

### Retrieve interaction data from IntAct

In [6]:
def _intact_export_url(taxon):
    """Return the IntAct export URL (format mitab_25) for the query taxid:<taxon>."""
    return f'https://www.ebi.ac.uk/intact/export?format=mitab_25&query=taxid%3A{taxon}&negative=false&spoke=false&ontology=false&sort=intact-miscore&asc=false'


# One export request per taxonomy id
urls = list(map(_intact_export_url, taxonomy_ids))

In [7]:
# Download one tab-separated table per URL and stack them into a single frame;
# every column is read as a string.
# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it
# each downloaded table restarts at 0, and the repeated labels make any later
# label-based alignment or lookup ambiguous.
data = pd.concat(
    (pd.read_csv(url, sep='\t', dtype='str') for url in urls),
    ignore_index=True,
)

In [8]:
print('Number of interactions:', data.shape[0])

Number of interactions: 780441


In [9]:
data.head()

Unnamed: 0,#ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,Taxid interactor B,Interaction type(s),Source database(s),Interaction identifier(s),Confidence value(s)
0,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,"psi-mi:""MI:0410""(3D electron microscopy)",Walls et al. (2020),pubmed:32155444|imex:IM-27846,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25495631|wwpdb:6vyb|wwpdb:6vxx|imex...,intact-miscore:0.94
1,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,"psi-mi:""MI:0114""(x-ray crystallography)",Xia et al. (2020),imex:IM-27873|pubmed:32231345,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25503580|wwpdb:6LXT|imex:IM-27873-1,intact-miscore:0.94
2,uniprotkb:P0DTD1-PRO_0000449632,uniprotkb:P0DTD1-PRO_0000449632,intact:EBI-25475891,intact:EBI-25475891,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Kim. et al. (2020),pubmed:32304108|imex:IM-27884,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25504928|imex:IM-27884-1|wwpdb:6vww...,intact-miscore:0.44
3,uniprotkb:P0DTD1-PRO_0000449625,uniprotkb:P0DTD1-PRO_0000449626,intact:EBI-25475871,intact:EBI-25475874,psi-mi:p0dtd1-pro_0000449625(display_long)|uni...,psi-mi:p0dtd1-pro_0000449626(display_long)|uni...,"psi-mi:""MI:0410""(3D electron microscopy)",Gao et al. (2020),pubmed:32277040|imex:IM-27888,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25506373|wwpdb:7btf|wwpdb:6M71|imex...,intact-miscore:0.94
4,uniprotkb:P0DTD1-PRO_0000449625,uniprotkb:P0DTD1-PRO_0000449629,intact:EBI-25475871,intact:EBI-25475885,psi-mi:p0dtd1-pro_0000449625(display_long)|uni...,psi-mi:p0dtd1-pro_0000449629(display_long)|uni...,"psi-mi:""MI:0410""(3D electron microscopy)",Gao et al. (2020),pubmed:32277040|imex:IM-27888,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25506373|wwpdb:7btf|wwpdb:6M71|imex...,intact-miscore:0.86
5,uniprotkb:P0DTD1-PRO_0000449626,uniprotkb:P0DTD1-PRO_0000449625,intact:EBI-25475874,intact:EBI-25475871,psi-mi:p0dtd1-pro_0000449626(display_long)|uni...,psi-mi:p0dtd1-pro_0000449625(display_long)|uni...,"psi-mi:""MI:0071""(molecular sieving)",Gao et al. (2020),pubmed:32277040|imex:IM-27888,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25510487|imex:IM-27888-3,intact-miscore:0.94
6,uniprotkb:P0DTD1-PRO_0000449628,uniprotkb:P0DTD1-PRO_0000449633,intact:EBI-25475880,intact:EBI-25492395,psi-mi:p0dtd1-pro_0000449628(display_long)|uni...,psi-mi:p0dtd1-pro_0000449633(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Rosas-Lemus et al. (2020),pubmed:32511376|imex:IM-27900,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25508047|imex:IM-27900-1|wwpdb:6w61...,intact-miscore:0.93
7,uniprotkb:P0DTD1-PRO_0000449620,uniprotkb:P0DTD1-PRO_0000449632,intact:EBI-25475859,intact:EBI-25475891,psi-mi:p0dtd1-pro_0000449620(display_long)|uni...,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,"psi-mi:""MI:0018""(two hybrid)",Li et al. (2020),pubmed:32838362|imex:IM-27901|doi:10.1101/2020...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25508754|imex:IM-27901-1,intact-miscore:0.37
8,uniprotkb:P0DTD1-PRO_0000449619,uniprotkb:P0DTD8,intact:EBI-25475847,intact:EBI-25475914,psi-mi:p0dtd1-pro_0000449619(display_long)|uni...,psi-mi:ns7b_sars2(display_long)|uniprotkb:Acce...,"psi-mi:""MI:0018""(two hybrid)",Li et al. (2020),pubmed:32838362|imex:IM-27901|doi:10.1101/2020...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25508759|imex:IM-27901-81,intact-miscore:0.37
9,uniprotkb:P0DTD1-PRO_0000449625,uniprotkb:P0DTD8,intact:EBI-25475871,intact:EBI-25475914,psi-mi:p0dtd1-pro_0000449625(display_long)|uni...,psi-mi:ns7b_sars2(display_long)|uniprotkb:Acce...,"psi-mi:""MI:0018""(two hybrid)",Li et al. (2020),pubmed:32838362|imex:IM-27901|doi:10.1101/2020...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25508766|imex:IM-27901-3,intact-miscore:0.37


### Process Data

In [10]:
# Shorten the names of the two primary-identifier columns
primary_id_names = {
    '#ID(s) interactor A': 'interactorA',
    'ID(s) interactor B': 'interactorB',
}
data = data.rename(columns=primary_id_names)

#### Extract UniProt accession numbers and UniProt chain ids (PRO_...) from the interactor columns

In [11]:
# Derive three columns per interactor (a and b) from the interactor identifier.
# The three loops keep the original column order: id_a, id_b, accession_a,
# accession_b, pro_id_a, pro_id_b.
sides = {'a': 'interactorA', 'b': 'interactorB'}

# uniprotkb:P0DTD1-PRO_0000449619 -> P0DTD1-PRO_0000449619
# regex=False: the prefix is a literal, independent of the pandas default.
for side, column in sides.items():
    data[f'id_{side}'] = data[column].str.replace('uniprotkb:', '', regex=False)

# P0DTD1-PRO_0000449619 -> uniprot:P0DTD1 and Q9UBL6-2 -> uniprot:Q9UBL6
# Everything from the first '-' on (chain id or isoform number) is dropped, then the
# CURIE prefix "uniprot" is added (see https://registry.identifiers.org/registry/uniprot).
for side in sides:
    data[f'accession_{side}'] = 'uniprot:' + data[f'id_{side}'].str.split('-', n=1).str[0]

# P0DTD1-PRO_0000449619 -> uniprot.chain:PRO_0000449619
# (see https://registry.identifiers.org/registry/uniprot.chain)
# str.extract yields NaN for ids without a chain suffix. The former
# split('-PRO', expand=True)[1] raised a KeyError when no id in the column had one.
for side in sides:
    data[f'pro_id_{side}'] = 'uniprot.chain:' + data[f'id_{side}'].str.extract(r'-(PRO_\d+)', expand=False)

# Represent all missing values in the frame as empty strings
data.fillna('', inplace=True)

In [12]:
# NOTE(review): this rename reuses the labels 'interactorA'/'interactorB', which the
# primary-identifier columns already carry, so the frame ends up with two columns per
# label (the data.head() output further down lists them as interactorA / interactorA.1).
# From here on data['interactorA'] selects both columns, and a column list naming
# 'interactorA' and 'interactorB' selects four columns rather than two.
# TODO confirm which identifier should be exposed as interactorA/interactorB and give
# the other pair a distinct name.
data.rename(columns={'Alt. ID(s) interactor A': 'interactorA', 'Alt. ID(s) interactor B': 'interactorB'}, inplace=True)

In [15]:
# Extract the text description in parentheses, e.g.
#   psi-mi:"MI:0410"(3D electron microscopy) -> 3D electron microscopy
# The pattern is greedy: it captures the text between the last '(' and the last ')'.
# Raw string: '\(' in a normal string literal is an invalid escape sequence, which
# newer Python versions report with a warning.
# expand=False returns a Series (one capture group) instead of a one-column frame.
description_pattern = r'.*\((.*)\).*'
data['detectionMethod'] = data['Interaction detection method(s)'].str.extract(description_pattern, expand=False)
data['interactionType'] = data['Interaction type(s)'].str.extract(description_pattern, expand=False)

In [16]:
# Change the CURIE prefix from wwpdb to pdb, then represent the one-to-many
# relationship as a semicolon-separated string ('|' -> ';').
# regex=False makes both replacements literal: '|' is the regex alternation
# operator, and the default of the `regex` argument differs between pandas versions.
interaction_ids = data['Interaction identifier(s)'].str.replace('wwpdb', 'pdb', regex=False)
data['interactionId'] = interaction_ids.str.replace('|', ';', regex=False)

In [17]:
# Keep the numeric IntAct MI score, e.g. "intact-miscore:0.94" -> "0.94";
# rows without such a score get "".
# The score is matched by name instead of taking whatever follows the first ':'.
# NOTE(review): MITAB allows several '|'-separated scores in this column; with the
# former split(':')[1] such a field yielded a non-numeric fragment and was blanked,
# which is presumably why some rows had an empty value - confirm against the raw data.
data['confidenceValue'] = (
    data['Confidence value(s)']
    .str.extract(r'intact-miscore:(\d+(?:\.\d+)?)(?:\||$)', expand=False)
    .fillna('')
)

In [18]:
data['confidenceValue'].unique()

array(['0.94', '0.44', '0.86', '0.93', '0.37', '0.70', '0.84', '0.49',
       '0.36', '0.56', '0.99', '0.62', '0.79', '0.50', '0.53', '0.35',
       '0.40', '0.64', '0.73', '0.27', '0.59', '0.87', '0.32', '0.77',
       '0.54', '0.80', '0.72', '0.71', '0.68', '0.46', '0.63', '0.66',
       '0.58', '0.52', '0.51', '0.81', '0.67', '0.89', '0.98', '0.91',
       '0.65', '0.57', '0.74', '0.61', '0.75', '0.76', '0.69', '0.43',
       '0.55', '0.60', '0.82', '0.78', '0.95', '0.85', '0.92', '0.90',
       '0.83', '0.88', '0.96', '0.97', '0.41', '0.48', '', '0.28', '0.47',
       '0.31', '0.22', '0.45', '0.23', '0.34'], dtype=object)

In [19]:
data.head(20)

Unnamed: 0,interactorA,interactorB,interactorA.1,interactorB.1,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,Taxid interactor B,Interaction type(s),Source database(s),Interaction identifier(s),Confidence value(s),id_a,id_b,accession_a,accession_b,pro_id_a,pro_id_b,detectionMethod,interactionType,interactionId,confidenceValue
0,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,"psi-mi:""MI:0410""(3D electron microscopy)",Walls et al. (2020),pubmed:32155444|imex:IM-27846,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25495631|wwpdb:6vyb|wwpdb:6vxx|imex...,intact-miscore:0.94,P0DTC2,P0DTC2,uniprot:P0DTC2,uniprot:P0DTC2,,,3D electron microscopy,direct interaction,intact:EBI-25495631;pdb:6vyb;pdb:6vxx;imex:IM-...,0.94
1,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,"psi-mi:""MI:0114""(x-ray crystallography)",Xia et al. (2020),imex:IM-27873|pubmed:32231345,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25503580|wwpdb:6LXT|imex:IM-27873-1,intact-miscore:0.94,P0DTC2,P0DTC2,uniprot:P0DTC2,uniprot:P0DTC2,,,x-ray crystallography,direct interaction,intact:EBI-25503580;pdb:6LXT;imex:IM-27873-1,0.94
2,uniprotkb:P0DTD1-PRO_0000449632,uniprotkb:P0DTD1-PRO_0000449632,intact:EBI-25475891,intact:EBI-25475891,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Kim. et al. (2020),pubmed:32304108|imex:IM-27884,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25504928|imex:IM-27884-1|wwpdb:6vww...,intact-miscore:0.44,P0DTD1-PRO_0000449632,P0DTD1-PRO_0000449632,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449632,uniprot.chain:PRO_0000449632,x-ray crystallography,direct interaction,intact:EBI-25504928;imex:IM-27884-1;pdb:6vww;p...,0.44
3,uniprotkb:P0DTD1-PRO_0000449625,uniprotkb:P0DTD1-PRO_0000449626,intact:EBI-25475871,intact:EBI-25475874,psi-mi:p0dtd1-pro_0000449625(display_long)|uni...,psi-mi:p0dtd1-pro_0000449626(display_long)|uni...,"psi-mi:""MI:0410""(3D electron microscopy)",Gao et al. (2020),pubmed:32277040|imex:IM-27888,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25506373|wwpdb:7btf|wwpdb:6M71|imex...,intact-miscore:0.94,P0DTD1-PRO_0000449625,P0DTD1-PRO_0000449626,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449625,uniprot.chain:PRO_0000449626,3D electron microscopy,physical association,intact:EBI-25506373;pdb:7btf;pdb:6M71;imex:IM-...,0.94
4,uniprotkb:P0DTD1-PRO_0000449625,uniprotkb:P0DTD1-PRO_0000449629,intact:EBI-25475871,intact:EBI-25475885,psi-mi:p0dtd1-pro_0000449625(display_long)|uni...,psi-mi:p0dtd1-pro_0000449629(display_long)|uni...,"psi-mi:""MI:0410""(3D electron microscopy)",Gao et al. (2020),pubmed:32277040|imex:IM-27888,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25506373|wwpdb:7btf|wwpdb:6M71|imex...,intact-miscore:0.86,P0DTD1-PRO_0000449625,P0DTD1-PRO_0000449629,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449625,uniprot.chain:PRO_0000449629,3D electron microscopy,physical association,intact:EBI-25506373;pdb:7btf;pdb:6M71;imex:IM-...,0.86
5,uniprotkb:P0DTD1-PRO_0000449626,uniprotkb:P0DTD1-PRO_0000449625,intact:EBI-25475874,intact:EBI-25475871,psi-mi:p0dtd1-pro_0000449626(display_long)|uni...,psi-mi:p0dtd1-pro_0000449625(display_long)|uni...,"psi-mi:""MI:0071""(molecular sieving)",Gao et al. (2020),pubmed:32277040|imex:IM-27888,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25510487|imex:IM-27888-3,intact-miscore:0.94,P0DTD1-PRO_0000449626,P0DTD1-PRO_0000449625,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449626,uniprot.chain:PRO_0000449625,molecular sieving,direct interaction,intact:EBI-25510487;imex:IM-27888-3,0.94
6,uniprotkb:P0DTD1-PRO_0000449628,uniprotkb:P0DTD1-PRO_0000449633,intact:EBI-25475880,intact:EBI-25492395,psi-mi:p0dtd1-pro_0000449628(display_long)|uni...,psi-mi:p0dtd1-pro_0000449633(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Rosas-Lemus et al. (2020),pubmed:32511376|imex:IM-27900,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25508047|imex:IM-27900-1|wwpdb:6w61...,intact-miscore:0.93,P0DTD1-PRO_0000449628,P0DTD1-PRO_0000449633,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449628,uniprot.chain:PRO_0000449633,x-ray crystallography,direct interaction,intact:EBI-25508047;imex:IM-27900-1;pdb:6w61;p...,0.93
7,uniprotkb:P0DTD1-PRO_0000449620,uniprotkb:P0DTD1-PRO_0000449632,intact:EBI-25475859,intact:EBI-25475891,psi-mi:p0dtd1-pro_0000449620(display_long)|uni...,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,"psi-mi:""MI:0018""(two hybrid)",Li et al. (2020),pubmed:32838362|imex:IM-27901|doi:10.1101/2020...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25508754|imex:IM-27901-1,intact-miscore:0.37,P0DTD1-PRO_0000449620,P0DTD1-PRO_0000449632,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449620,uniprot.chain:PRO_0000449632,two hybrid,physical association,intact:EBI-25508754;imex:IM-27901-1,0.37
8,uniprotkb:P0DTD1-PRO_0000449619,uniprotkb:P0DTD8,intact:EBI-25475847,intact:EBI-25475914,psi-mi:p0dtd1-pro_0000449619(display_long)|uni...,psi-mi:ns7b_sars2(display_long)|uniprotkb:Acce...,"psi-mi:""MI:0018""(two hybrid)",Li et al. (2020),pubmed:32838362|imex:IM-27901|doi:10.1101/2020...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25508759|imex:IM-27901-81,intact-miscore:0.37,P0DTD1-PRO_0000449619,P0DTD8,uniprot:P0DTD1,uniprot:P0DTD8,uniprot.chain:PRO_0000449619,,two hybrid,physical association,intact:EBI-25508759;imex:IM-27901-81,0.37
9,uniprotkb:P0DTD1-PRO_0000449625,uniprotkb:P0DTD8,intact:EBI-25475871,intact:EBI-25475914,psi-mi:p0dtd1-pro_0000449625(display_long)|uni...,psi-mi:ns7b_sars2(display_long)|uniprotkb:Acce...,"psi-mi:""MI:0018""(two hybrid)",Li et al. (2020),pubmed:32838362|imex:IM-27901|doi:10.1101/2020...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25508766|imex:IM-27901-3,intact-miscore:0.37,P0DTD1-PRO_0000449625,P0DTD8,uniprot:P0DTD1,uniprot:P0DTD8,uniprot.chain:PRO_0000449625,,two hybrid,physical association,intact:EBI-25508766;imex:IM-27901-3,0.37


### Remove duplicates

Create a unique interaction id

In [20]:
# Order-independent key for a pair of interactors: the two ids are concatenated
# with the lexicographically smaller one first, so A-B and B-A share one key.
# Vectorised with np.where; the former row-wise apply used positional x[0]/x[1]
# lookups on a label-indexed row (deprecated in recent pandas) and called a
# Python lambda once per row.
id_a, id_b = data['id_a'], data['id_b']
data['interaction_id'] = np.where(id_a < id_b, id_a + id_b, id_b + id_a)

In [21]:
data.shape

(780441, 26)

In [22]:
# Keep only the first row encountered for each interaction_id
data = data.drop_duplicates(subset=['interaction_id'], keep='first')

In [23]:
data.shape

(446520, 26)

In [24]:
data.head()

Unnamed: 0,interactorA,interactorB,interactorA.1,interactorB.1,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,Taxid interactor B,Interaction type(s),Source database(s),Interaction identifier(s),Confidence value(s),id_a,id_b,accession_a,accession_b,pro_id_a,pro_id_b,detectionMethod,interactionType,interactionId,confidenceValue,interaction_id
0,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,"psi-mi:""MI:0410""(3D electron microscopy)",Walls et al. (2020),pubmed:32155444|imex:IM-27846,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25495631|wwpdb:6vyb|wwpdb:6vxx|imex...,intact-miscore:0.94,P0DTC2,P0DTC2,uniprot:P0DTC2,uniprot:P0DTC2,,,3D electron microscopy,direct interaction,intact:EBI-25495631;pdb:6vyb;pdb:6vxx;imex:IM-...,0.94,P0DTC2P0DTC2
1,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,psi-mi:spike_sars2-6(display_long)|uniprotkb:S...,"psi-mi:""MI:0114""(x-ray crystallography)",Xia et al. (2020),imex:IM-27873|pubmed:32231345,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25503580|wwpdb:6LXT|imex:IM-27873-1,intact-miscore:0.94,P0DTC2,P0DTC2,uniprot:P0DTC2,uniprot:P0DTC2,,,x-ray crystallography,direct interaction,intact:EBI-25503580;pdb:6LXT;imex:IM-27873-1,0.94,P0DTC2P0DTC2
2,uniprotkb:P0DTD1-PRO_0000449632,uniprotkb:P0DTD1-PRO_0000449632,intact:EBI-25475891,intact:EBI-25475891,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Kim. et al. (2020),pubmed:32304108|imex:IM-27884,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25504928|imex:IM-27884-1|wwpdb:6vww...,intact-miscore:0.44,P0DTD1-PRO_0000449632,P0DTD1-PRO_0000449632,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449632,uniprot.chain:PRO_0000449632,x-ray crystallography,direct interaction,intact:EBI-25504928;imex:IM-27884-1;pdb:6vww;p...,0.44,P0DTD1-PRO_0000449632P0DTD1-PRO_0000449632
3,uniprotkb:P0DTD1-PRO_0000449625,uniprotkb:P0DTD1-PRO_0000449626,intact:EBI-25475871,intact:EBI-25475874,psi-mi:p0dtd1-pro_0000449625(display_long)|uni...,psi-mi:p0dtd1-pro_0000449626(display_long)|uni...,"psi-mi:""MI:0410""(3D electron microscopy)",Gao et al. (2020),pubmed:32277040|imex:IM-27888,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25506373|wwpdb:7btf|wwpdb:6M71|imex...,intact-miscore:0.94,P0DTD1-PRO_0000449625,P0DTD1-PRO_0000449626,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449625,uniprot.chain:PRO_0000449626,3D electron microscopy,physical association,intact:EBI-25506373;pdb:7btf;pdb:6M71;imex:IM-...,0.94,P0DTD1-PRO_0000449625P0DTD1-PRO_0000449626
5,uniprotkb:P0DTD1-PRO_0000449626,uniprotkb:P0DTD1-PRO_0000449625,intact:EBI-25475874,intact:EBI-25475871,psi-mi:p0dtd1-pro_0000449626(display_long)|uni...,psi-mi:p0dtd1-pro_0000449625(display_long)|uni...,"psi-mi:""MI:0071""(molecular sieving)",Gao et al. (2020),pubmed:32277040|imex:IM-27888,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25510487|imex:IM-27888-3,intact-miscore:0.94,P0DTD1-PRO_0000449626,P0DTD1-PRO_0000449625,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449626,uniprot.chain:PRO_0000449625,molecular sieving,direct interaction,intact:EBI-25510487;imex:IM-27888-3,0.94,P0DTD1-PRO_0000449625P0DTD1-PRO_0000449626


#### Extract pubmed id
Example: imex:IM-27912|pubmed:32275855 -> 32275855

In [25]:
# Numeric id of a "pubmed:<digits>" entry. The pattern has its own name so that
# extract_pubmed_id keeps working when another cell defines a different pattern.
PUBMED_ID_PATTERN = re.compile(r'pubmed:(\d+)')


def extract_pubmed_id(s):
    """Return the first numeric PubMed id found in a publication-identifier string.

    Example: 'imex:IM-27912|pubmed:32275855' -> '32275855'

    A 'pubmed:' entry that is not directly followed by digits is skipped, so a
    numeric id later in the string is still found.

    Returns '' when the string holds no numeric PubMed id or when `s` is not a
    string (e.g. NaN).
    """
    if not isinstance(s, str):
        return ''
    match = PUBMED_ID_PATTERN.search(s)
    return match.group(1) if match else ''

In [26]:
# Build a pubmed CURIE ("pubmed:32155444") for every row that has a PubMed id.
# Rows without one stay empty instead of receiving the bare prefix "pubmed:",
# in line with the empty-string representation of other missing values here
# (pro_id_a/pro_id_b, confidenceValue).
pubmed_ids = data['Publication Identifier(s)'].apply(extract_pubmed_id)
data['pubmedId'] = np.where(pubmed_ids != '', 'pubmed:' + pubmed_ids, '')

#### Extract taxonomy id
Example: taxid:9606(human)|taxid:9606(Homo sapiens) -> 9606

In [27]:
# Digits between "taxid:" and the opening parenthesis of the organism name.
# The pattern has its own name, so defining it does not rebind a pattern that
# another function in this notebook reads.
TAXONOMY_ID_PATTERN = re.compile(r'taxid:(\d*)\(')


def extract_tax_id(s):
    """Return the taxonomy id of the first 'taxid:<digits>(' entry in `s`.

    Example: 'taxid:9606(human)|taxid:9606(Homo sapiens)' -> '9606'

    Returns '' when no such entry exists (e.g. the negative ids in
    'taxid:-1(in vitro)') or when `s` is not a string (e.g. None or NaN).
    """
    if not isinstance(s, str):
        return ''
    match = TAXONOMY_ID_PATTERN.search(s)
    return match.group(1) if match else ''

In [28]:
# Target column -> raw MITAB column holding the taxonomy entries
taxid_columns = {
    'taxonomy_id_a': 'Taxid interactor A',
    'taxonomy_id_b': 'Taxid interactor B',
}
for target_column, source_column in taxid_columns.items():
    data[target_column] = data[source_column].apply(extract_tax_id)

In [29]:
# Columns to keep, in output order
export_columns = [
    'id_a', 'id_b',
    'accession_a', 'accession_b',
    'pro_id_a', 'pro_id_b',
    'interactorA', 'interactorB',
    'taxonomy_id_a', 'taxonomy_id_b',
    'interactionId', 'interactionType', 'detectionMethod',
    'confidenceValue', 'pubmedId',
]
data = data[export_columns]

In [30]:
data.head()

Unnamed: 0,id_a,id_b,accession_a,accession_b,pro_id_a,pro_id_b,interactorA,interactorA.1,interactorB,interactorB.1,taxonomy_id_a,taxonomy_id_b,interactionId,interactionType,detectionMethod,confidenceValue,pubmedId
0,P0DTC2,P0DTC2,uniprot:P0DTC2,uniprot:P0DTC2,,,uniprotkb:P0DTC2,intact:EBI-25474821,uniprotkb:P0DTC2,intact:EBI-25474821,2697049,2697049,intact:EBI-25495631;pdb:6vyb;pdb:6vxx;imex:IM-...,direct interaction,3D electron microscopy,0.94,pubmed:32155444
1,P0DTC2,P0DTC2,uniprot:P0DTC2,uniprot:P0DTC2,,,uniprotkb:P0DTC2,intact:EBI-25474821,uniprotkb:P0DTC2,intact:EBI-25474821,2697049,2697049,intact:EBI-25503580;pdb:6LXT;imex:IM-27873-1,direct interaction,x-ray crystallography,0.94,pubmed:32231345
2,P0DTD1-PRO_0000449632,P0DTD1-PRO_0000449632,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449632,uniprot.chain:PRO_0000449632,uniprotkb:P0DTD1-PRO_0000449632,intact:EBI-25475891,uniprotkb:P0DTD1-PRO_0000449632,intact:EBI-25475891,2697049,2697049,intact:EBI-25504928;imex:IM-27884-1;pdb:6vww;p...,direct interaction,x-ray crystallography,0.44,pubmed:32304108
3,P0DTD1-PRO_0000449625,P0DTD1-PRO_0000449626,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449625,uniprot.chain:PRO_0000449626,uniprotkb:P0DTD1-PRO_0000449625,intact:EBI-25475871,uniprotkb:P0DTD1-PRO_0000449626,intact:EBI-25475874,2697049,2697049,intact:EBI-25506373;pdb:7btf;pdb:6M71;imex:IM-...,physical association,3D electron microscopy,0.94,pubmed:32277040
5,P0DTD1-PRO_0000449626,P0DTD1-PRO_0000449625,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449626,uniprot.chain:PRO_0000449625,uniprotkb:P0DTD1-PRO_0000449626,intact:EBI-25475874,uniprotkb:P0DTD1-PRO_0000449625,intact:EBI-25475871,2697049,2697049,intact:EBI-25510487;imex:IM-27888-3,direct interaction,molecular sieving,0.94,pubmed:32277040


#### Restrict data to the set of currently supported taxonomy ids

In [31]:
# Both interactors must belong to one of the supported taxonomy ids
a_supported = data['taxonomy_id_a'].isin(taxonomy_ids)
b_supported = data['taxonomy_id_b'].isin(taxonomy_ids)
data = data[a_supported & b_supported]

Remove data with accession numbers that are not UniProt accession numbers

In [32]:
# An id that still contains ':' kept a non-uniprotkb prefix; drop the row if either side has one
a_has_prefix = data['id_a'].str.contains(':')
b_has_prefix = data['id_b'].str.contains(':')
data = data[~(a_has_prefix | b_has_prefix)]

Remove self-interactions (they make graph display too crowded)

In [33]:
# Keep only rows whose two interactors have different accessions
different_accessions = data['accession_a'] != data['accession_b']
data = data[different_accessions]

In [34]:
data.shape

(382232, 17)

In [35]:
# Record the data source on every row.
# assign() returns a new frame; setting a column in place on `data`, which at this
# point is the result of boolean filtering, triggers pandas' SettingWithCopyWarning.
data = data.assign(source='IntAct')

### Save interaction data

In [36]:
# Drop rows that are identical in every column, then write the result (without
# the index) to the import directory.
data = data.drop_duplicates()
output_file = NEO4J_IMPORT / '01e-ProteinProteinInteraction.csv'
data.to_csv(output_file, index=False)

In [37]:
print('Number of interactions:', data.shape[0])

Number of interactions: 382232


In [38]:
data.sample(5)

Unnamed: 0,id_a,id_b,accession_a,accession_b,pro_id_a,pro_id_b,interactorA,interactorA.1,interactorB,interactorB.1,taxonomy_id_a,taxonomy_id_b,interactionId,interactionType,detectionMethod,confidenceValue,pubmedId,source
131709,P22607,O00755,uniprot:P22607,uniprot:O00755,,,uniprotkb:P22607,intact:EBI-348399|uniprotkb:Q14308|uniprotkb:Q...,uniprotkb:O00755,intact:EBI-727198|uniprotkb:Q9Y560|uniprotkb:Q...,9606,9606,intact:EBI-26009737;imex:IM-28217-49833,physical association,two hybrid pooling approach,0.56,pubmed:32814053,IntAct
235867,Q9UDW3,A5D8V6,uniprot:Q9UDW3,uniprot:A5D8V6,,,uniprotkb:Q9UDW3,intact:EBI-7850213|uniprotkb:A8K9F6|intact:MIN...,uniprotkb:A5D8V6,intact:EBI-2559305|uniprotkb:Q8N3K4,9606,9606,intact:EBI-24118030;imex:IM-25472-101805,physical association,two hybrid array,,pubmed:32296183,IntAct
127090,Q9H7H0-2,P54253,uniprot:Q9H7H0,uniprot:P54253,,,uniprotkb:Q9H7H0-2,intact:EBI-11098807,uniprotkb:P54253,intact:EBI-930964|uniprotkb:Q17S02|uniprotkb:Q...,9606,9606,intact:EBI-25978134;imex:IM-28217-90447,physical association,two hybrid array,0.56,pubmed:32814053,IntAct
576639,Q6A162,Q8N715,uniprot:Q6A162,uniprot:Q8N715,,,uniprotkb:Q6A162,intact:EBI-10171697|uniprotkb:Q6IFU5,uniprotkb:Q8N715,intact:EBI-740814|uniprotkb:Q8N746|uniprotkb:Q...,9606,9606,intact:EBI-23160176;imex:IM-25472-46616,physical association,two hybrid array,,pubmed:32296183,IntAct
98638,P27815-4,Q9Y371,uniprot:P27815,uniprot:Q9Y371,,,uniprotkb:P27815-4,intact:EBI-12080840,uniprotkb:Q9Y371,intact:EBI-2623095|uniprotkb:Q9H3Z0|uniprotkb:...,9606,9606,intact:EBI-26230883;imex:IM-28217-57811,physical association,validated two hybrid,0.56,pubmed:32814053,IntAct
