# Protein Protein Interaction Data
**[Work in progress]**

This notebook downloads and standardizes viral-host iiIIrotein data from IntAct for ingestion into the Knowledge Graph.

Data source: [IntAct](https://www.ebi.ac.uk/intact/query/pubid:IM-27814)

Authors: Kaushik Ganapathy, Eric Yu, Peter Rose (krganapa@ucsd.edu, ery010@ucsd.edu, pwrose@ucsd.edu)

In [1]:
import os
import re
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columns

In [3]:
NEO4J_IMPORT = Path(os.getenv('NEO4J_IMPORT'))
print(NEO4J_IMPORT)

/Users/peter/Library/Application Support/com.Neo4j.Relate/data/dbmss/dbms-8bf637fc-0d20-4d9f-9c6f-f7e72e92a4da/import


### Get list of organisms to include in the Knowledge Graph

In [4]:
organisms = pd.read_csv("../../reference_data/Organism.csv", dtype=str)

In [5]:
organisms = organisms[organisms['id'].str.startswith('taxonomy')]
# remove CURIE
organisms['taxonomy'] = organisms['id'].apply(lambda x: x.split(':')[1])
taxonomy_ids = organisms['taxonomy'].unique()

In [6]:
taxonomy_ids

array(['2697049', '1263720', '694009', '443239', '31631', '11137',
       '277944', '2709072', '2708335', '12131', '12134', '766791',
       '693998', '1487703', '285949', '9606', '60711', '10090', '59479',
       '59477', '608659', '49442', '608708', '9974', '143292', '71116',
       '9608', '9615', '9685', '9666', '419130', '452646', '10036',
       '9689', '9694', '151659'], dtype=object)

### Retrieve interaction data from IntAct

In [7]:
urls = [f'https://www.ebi.ac.uk/intact/export?format=mitab_25&query=taxid%3A{taxon}&negative=false&spoke=false&ontology=false&sort=intact-miscore&asc=false'
        for taxon in taxonomy_ids]

In [8]:
data = pd.concat((pd.read_csv(url, sep='\t', dtype='str') for url in urls))

In [9]:
print('Number of interactions:', data.shape[0])

Number of interactions: 787866


In [10]:
data.head()

Unnamed: 0,#ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,Taxid interactor B,Interaction type(s),Source database(s),Interaction identifier(s),Confidence value(s)
0,uniprotkb:P0DTD1-PRO_0000449628,uniprotkb:P0DTD1-PRO_0000449633,intact:EBI-25475880,intact:EBI-25492395,psi-mi:p0dtd1-pro_0000449628(display_long)|uni...,psi-mi:p0dtd1-pro_0000449633(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Krafcikova et al. (2020),pubmed:32709887|imex:IM-28212,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25763621|imex:IM-28212-1|wwpdb:6YZ1,intact-miscore:0.93
1,uniprotkb:P0DTD1-PRO_0000449628,uniprotkb:P0DTD1-PRO_0000449633,intact:EBI-25475880,intact:EBI-25492395,psi-mi:p0dtd1-pro_0000449628(display_long)|uni...,psi-mi:p0dtd1-pro_0000449633(display_long)|uni...,"psi-mi:""MI:0071""(molecular sieving)",Krafcikova et al. (2020),pubmed:32709887|imex:IM-28212,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25763651|imex:IM-28212-2,intact-miscore:0.93
2,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2(display_long)|uniprotkb:S(g...,psi-mi:spike_sars2(display_long)|uniprotkb:S(g...,"psi-mi:""MI:0410""(3D electron microscopy)",Hsieh et al. (2020),pubmed:32703906|imex:IM-28214,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25763710|emdb:EMD-22221|wwpdb:6xkl|...,intact-miscore:0.95
3,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2(display_long)|uniprotkb:S(g...,psi-mi:spike_sars2(display_long)|uniprotkb:S(g...,"psi-mi:""MI:0410""(3D electron microscopy)",Cai et al. (2020),imex:IM-28256|pubmed:32694201,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25766108|wwpdb:6XRA|wwpdb:6XR8|imex...,intact-miscore:0.95
4,uniprotkb:P0DTD1-PRO_0000449628,uniprotkb:P0DTD1-PRO_0000449633,intact:EBI-25475880,intact:EBI-25492395,psi-mi:p0dtd1-pro_0000449628(display_long)|uni...,psi-mi:p0dtd1-pro_0000449633(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Lin et al. (2020),imex:IM-28230|pubmed:32728018,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25766164|wwpdb:7C2I|wwpdb:7C2J|imex...,intact-miscore:0.93


### Process Data

In [11]:
# change CURIE from wwpdb to pdb to enable resolution by identifiers.org
data['interactionIds'] = data['Interaction identifier(s)'].str.replace('wwpdb', 'pdb')
# represent one-to-many relationship as semicolon separated string
data['interactionIds'] = data['interactionIds'].str.replace('|', ';')
# extract unique interaction id
data['interactionId'] = 'intact:EBI-' + data['interactionIds'].str.extract('intact:EBI-(\d*)')

  after removing the cwd from sys.path.


#### Remove duplicates

In [12]:
data.drop_duplicates(subset=['interactionId'], inplace=True)

In [13]:
print('Number of interactions without duplicates:', data.shape[0])

Number of interactions without duplicates: 450184


In [14]:
data.head()

Unnamed: 0,#ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,Taxid interactor B,Interaction type(s),Source database(s),Interaction identifier(s),Confidence value(s),interactionIds,interactionId
0,uniprotkb:P0DTD1-PRO_0000449628,uniprotkb:P0DTD1-PRO_0000449633,intact:EBI-25475880,intact:EBI-25492395,psi-mi:p0dtd1-pro_0000449628(display_long)|uni...,psi-mi:p0dtd1-pro_0000449633(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Krafcikova et al. (2020),pubmed:32709887|imex:IM-28212,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25763621|imex:IM-28212-1|wwpdb:6YZ1,intact-miscore:0.93,intact:EBI-25763621;imex:IM-28212-1;pdb:6YZ1,intact:EBI-25763621
1,uniprotkb:P0DTD1-PRO_0000449628,uniprotkb:P0DTD1-PRO_0000449633,intact:EBI-25475880,intact:EBI-25492395,psi-mi:p0dtd1-pro_0000449628(display_long)|uni...,psi-mi:p0dtd1-pro_0000449633(display_long)|uni...,"psi-mi:""MI:0071""(molecular sieving)",Krafcikova et al. (2020),pubmed:32709887|imex:IM-28212,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25763651|imex:IM-28212-2,intact-miscore:0.93,intact:EBI-25763651;imex:IM-28212-2,intact:EBI-25763651
2,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2(display_long)|uniprotkb:S(g...,psi-mi:spike_sars2(display_long)|uniprotkb:S(g...,"psi-mi:""MI:0410""(3D electron microscopy)",Hsieh et al. (2020),pubmed:32703906|imex:IM-28214,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25763710|emdb:EMD-22221|wwpdb:6xkl|...,intact-miscore:0.95,intact:EBI-25763710;emdb:EMD-22221;pdb:6xkl;im...,intact:EBI-25763710
3,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2(display_long)|uniprotkb:S(g...,psi-mi:spike_sars2(display_long)|uniprotkb:S(g...,"psi-mi:""MI:0410""(3D electron microscopy)",Cai et al. (2020),imex:IM-28256|pubmed:32694201,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25766108|wwpdb:6XRA|wwpdb:6XR8|imex...,intact-miscore:0.95,intact:EBI-25766108;pdb:6XRA;pdb:6XR8;imex:IM-...,intact:EBI-25766108
4,uniprotkb:P0DTD1-PRO_0000449628,uniprotkb:P0DTD1-PRO_0000449633,intact:EBI-25475880,intact:EBI-25492395,psi-mi:p0dtd1-pro_0000449628(display_long)|uni...,psi-mi:p0dtd1-pro_0000449633(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Lin et al. (2020),imex:IM-28230|pubmed:32728018,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25766164|wwpdb:7C2I|wwpdb:7C2J|imex...,intact-miscore:0.93,intact:EBI-25766164;pdb:7C2I;pdb:7C2J;imex:IM-...,intact:EBI-25766164


#### Apply naming conventions

In [15]:
data.rename(columns={'Alt. ID(s) interactor A': 'interactorA', 'Alt. ID(s) interactor B': 'interactorB'}, inplace=True)

#### Extract UniProt accession number and Uniprot protein ids from interactor columns

In [16]:
# uniprot:P0DTD1-PRO_0000449619 -> P0DTD1-PRO_0000449619
data['id_a'] = data['#ID(s) interactor A'].str.replace('uniprotkb:', '')
data['id_b'] = data['ID(s) interactor B'].str.replace('uniprotkb:', '')

# P0DTD1-PRO_0000449619 -> P0DTD1 (UniProt accession number)
data['accessionA'] = data['id_a'].str.split('-PRO', expand=True)[0]
data['accessionB'] = data['id_b'].str.split('-PRO', expand=True)[0]

# Remove isoform id: Q9UBL6-2 -> Q9UBL6
data['accessionA'] = data['accessionA'].str.split('-', expand=True)[0]
data['accessionB'] = data['accessionB'].str.split('-', expand=True)[0]

# ADD CURIE "uniprot" as prefix (see https://registry.identifiers.org/registry/uniprot)
data['accessionA'] = 'uniprot:' + data['accessionA']
data['accessionB'] = 'uniprot:' + data['accessionB']

# P0DTD1-PRO_0000449619 -> PRO_0000449619 (UniProt protein id)
data['proIdA'] = data['id_a'].str.split('-PRO', expand=True)[1]
data['proIdB'] = data['id_b'].str.split('-PRO', expand=True)[1]

# Add CURIE "uniprot.chain" as prefix (see https://registry.identifiers.org/registry/uniprot.chain)
data['proIdA'] = data['proIdA'].str.replace('_', 'uniprot.chain:PRO_')
data['proIdB'] = data['proIdB'].str.replace('_', 'uniprot.chain:PRO_')

In [17]:
# extract text descriptions in  parenthesis
data['detectionMethod'] = data['Interaction detection method(s)'].str.extract('.*\((.*)\).*') 
data['interactionType'] = data['Interaction type(s)'].str.extract('.*\((.*)\).*') 

In [18]:
# set no-numerical values to ""
data['confidenceValue'] = data['Confidence value(s)'].str.split(':', expand=True)[1]
data['confidenceValue'] = data['confidenceValue'].apply(lambda s: s if s and s.replace('.','',1).isdigit() else '')

In [19]:
data['confidenceValue'].unique()

array(['0.93', '0.95', '0.84', '0.49', '0.94', '0.36', '0.56', '0.37',
       '0.82', '0.44', '0.27', '0.35', '0.32', '0.66', '0.99', '0.54',
       '0.80', '0.77', '0.72', '0.64', '0.50', '0.40', '0.71', '0.62',
       '0.46', '0.63', '0.87', '0.53', '0.59', '0.73', '0.60', '0.67',
       '0.52', '0.51', '0.91', '0.61', '0.81', '0.98', '0.57', '0.65',
       '0.89', '0.74', '0.75', '0.76', '0.68', '0.42', '0.58', '0.69',
       '0.55', '0.43', '0.97', '0.83', '', '0.79', '0.92', '0.45', '0.70',
       '0.48', '0.85', '0.90', '0.86', '0.88', '0.96', '0.78', '0.47',
       '0.38', '0.41', '0.28', '0.23', '0.31', '0.22', '0.34'],
      dtype=object)

#### Extract pubmed id
Example: imex:IM-27912|pubmed:32275855 -> 2275855

In [20]:
data['pubmedId'] = data['Publication Identifier(s)'].str.extract('pubmed:(\d*)')
# add CURIE prefix if pubmedId exists
data['pubmedId'] = data['pubmedId'].apply(lambda s: 'pubmed:' + s if len(s) > 0 else '')

#### Extract taxonomy id
Example: taxid:9606(human)|taxid:9606(Homo sapiens) -> 9606

In [21]:
data['taxonomyIdA'] = data['Taxid interactor A'].str.extract('taxid:(\d*)\(')
data['taxonomyIdB'] = data['Taxid interactor B'].str.extract('taxid:(\d*)\(')

In [22]:
data.fillna('', inplace=True)

In [23]:
data.head()

Unnamed: 0,#ID(s) interactor A,ID(s) interactor B,interactorA,interactorB,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,Taxid interactor B,Interaction type(s),Source database(s),Interaction identifier(s),Confidence value(s),interactionIds,interactionId,id_a,id_b,accessionA,accessionB,proIdA,proIdB,detectionMethod,interactionType,confidenceValue,pubmedId,taxonomyIdA,taxonomyIdB
0,uniprotkb:P0DTD1-PRO_0000449628,uniprotkb:P0DTD1-PRO_0000449633,intact:EBI-25475880,intact:EBI-25492395,psi-mi:p0dtd1-pro_0000449628(display_long)|uni...,psi-mi:p0dtd1-pro_0000449633(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Krafcikova et al. (2020),pubmed:32709887|imex:IM-28212,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25763621|imex:IM-28212-1|wwpdb:6YZ1,intact-miscore:0.93,intact:EBI-25763621;imex:IM-28212-1;pdb:6YZ1,intact:EBI-25763621,P0DTD1-PRO_0000449628,P0DTD1-PRO_0000449633,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449628,uniprot.chain:PRO_0000449633,x-ray crystallography,direct interaction,0.93,pubmed:32709887,2697049,2697049
1,uniprotkb:P0DTD1-PRO_0000449628,uniprotkb:P0DTD1-PRO_0000449633,intact:EBI-25475880,intact:EBI-25492395,psi-mi:p0dtd1-pro_0000449628(display_long)|uni...,psi-mi:p0dtd1-pro_0000449633(display_long)|uni...,"psi-mi:""MI:0071""(molecular sieving)",Krafcikova et al. (2020),pubmed:32709887|imex:IM-28212,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25763651|imex:IM-28212-2,intact-miscore:0.93,intact:EBI-25763651;imex:IM-28212-2,intact:EBI-25763651,P0DTD1-PRO_0000449628,P0DTD1-PRO_0000449633,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449628,uniprot.chain:PRO_0000449633,molecular sieving,direct interaction,0.93,pubmed:32709887,2697049,2697049
2,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2(display_long)|uniprotkb:S(g...,psi-mi:spike_sars2(display_long)|uniprotkb:S(g...,"psi-mi:""MI:0410""(3D electron microscopy)",Hsieh et al. (2020),pubmed:32703906|imex:IM-28214,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25763710|emdb:EMD-22221|wwpdb:6xkl|...,intact-miscore:0.95,intact:EBI-25763710;emdb:EMD-22221;pdb:6xkl;im...,intact:EBI-25763710,P0DTC2,P0DTC2,uniprot:P0DTC2,uniprot:P0DTC2,,,3D electron microscopy,direct interaction,0.95,pubmed:32703906,2697049,2697049
3,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2(display_long)|uniprotkb:S(g...,psi-mi:spike_sars2(display_long)|uniprotkb:S(g...,"psi-mi:""MI:0410""(3D electron microscopy)",Cai et al. (2020),imex:IM-28256|pubmed:32694201,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25766108|wwpdb:6XRA|wwpdb:6XR8|imex...,intact-miscore:0.95,intact:EBI-25766108;pdb:6XRA;pdb:6XR8;imex:IM-...,intact:EBI-25766108,P0DTC2,P0DTC2,uniprot:P0DTC2,uniprot:P0DTC2,,,3D electron microscopy,direct interaction,0.95,pubmed:32694201,2697049,2697049
4,uniprotkb:P0DTD1-PRO_0000449628,uniprotkb:P0DTD1-PRO_0000449633,intact:EBI-25475880,intact:EBI-25492395,psi-mi:p0dtd1-pro_0000449628(display_long)|uni...,psi-mi:p0dtd1-pro_0000449633(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Lin et al. (2020),imex:IM-28230|pubmed:32728018,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25766164|wwpdb:7C2I|wwpdb:7C2J|imex...,intact-miscore:0.93,intact:EBI-25766164;pdb:7C2I;pdb:7C2J;imex:IM-...,intact:EBI-25766164,P0DTD1-PRO_0000449628,P0DTD1-PRO_0000449633,uniprot:P0DTD1,uniprot:P0DTD1,uniprot.chain:PRO_0000449628,uniprot.chain:PRO_0000449633,x-ray crystallography,direct interaction,0.93,pubmed:32728018,2697049,2697049


#### Restrict data to the set of currently supported taxonomy ids

In [24]:
data = data[data['taxonomyIdA'].isin(taxonomy_ids) & data['taxonomyIdB'].isin(taxonomy_ids)]

In [25]:
data.shape

(415577, 29)

Remove data with accession numbers that are not UniProt accession numbers

In [26]:
data = data[~(data['id_a'].str.contains(':')) & ~(data['id_b'].str.contains(':'))]

In [27]:
data.shape

(392103, 29)

Remove self-interactions (they make graph display too crowded)

In [28]:
data = data[~(data['accessionA'] == data['accessionB'])]

In [29]:
data.shape

(383651, 29)

In [30]:
data['taxonomyIdA'] = 'taxonomy:' + data['taxonomyIdA']
data['taxonomyIdB'] = 'taxonomy:' + data['taxonomyIdB']

In [31]:
data['source'] = 'IntAct'

In [32]:
data = data[['accessionA', 'accessionB', 'proIdA', 'proIdB', 'taxonomyIdA', 'taxonomyIdB', 'interactionId', 'interactionIds', 'interactionType', 'detectionMethod', 'confidenceValue','pubmedId']]

In [33]:
def unique_id(row):
    if row.accessionA > row.accessionB:
        return row.accessionB + '_' + row.accessionA + '_' + row.proIdB + '_' + row.proIdA
    else:
        return row.accessionA + '_' + row.accessionB + '_' + row.proIdA + '_' + row.proIdB

In [34]:
data['id'] = data.apply(lambda r: unique_id(r), axis=1)

In [35]:
def weight(row):
    value = 0.0
    # records with higher confidence value have priority
    if row.confidenceValue != '':
        value = float(row.confidenceValue)
    # records with a pubmedId have priority
    if row.pubmedId != '':
        value += 0.001
        
    return value

In [36]:
data['weight'] = data.apply(lambda r: weight(r), axis=1)

In [37]:
data.sort_values(by=['weight'], ascending=False, inplace=True)

In [38]:
data.drop_duplicates(['id'], inplace=True)

In [39]:
data.drop(columns=['id', 'weight'], inplace=True)

In [40]:
print('Number of interactions:', data.shape[0])

Number of interactions: 158660


In [41]:
data.head()

Unnamed: 0,accessionA,accessionB,proIdA,proIdB,taxonomyIdA,taxonomyIdB,interactionId,interactionIds,interactionType,detectionMethod,confidenceValue,pubmedId
455433,uniprot:P04637,uniprot:Q00987,,,taxonomy:9606,taxonomy:9606,intact:EBI-7983312,intact:EBI-7983312;mint:MINT-8385459,association,anti tag coimmunoprecipitation,0.99,pubmed:22819825
2685,uniprot:Q9BYF1,uniprot:P0DTC2,,,taxonomy:9606,taxonomy:2697049,intact:EBI-25564294,intact:EBI-25564294;imex:IM-27912-17,physical association,fluorescence-activated cell sorting,0.99,pubmed:32275855
4019,uniprot:Q9BYF1,uniprot:P59594,,,taxonomy:9606,taxonomy:694009,intact:EBI-25564314,intact:EBI-25564314;imex:IM-27912-23,physical association,fluorescence-activated cell sorting,0.99,pubmed:32275855
8143,uniprot:P42345,uniprot:Q8N122,,,taxonomy:9606,taxonomy:9606,intact:EBI-2434906,intact:EBI-2434906;imex:IM-20364-22,physical association,anti tag coimmunoprecipitation,0.98,pubmed:12408816
510059,uniprot:P51587,uniprot:Q06609,,,taxonomy:9606,taxonomy:9606,intact:EBI-4399681,intact:EBI-4399681;imex:IM-16571-6,physical association,pull down,0.98,pubmed:21399666


### Save interaction data

In [42]:
data.to_csv(NEO4J_IMPORT / '01e-ProteinProteinInteraction.csv', index = False)