# Protein Protein Interaction Data
**[Work in progress]**

This notebook downloads and standardizes viral-host protein data from IntAct for ingestion into the Knowledge Graph.

Data source: [IntAct](https://www.ebi.ac.uk/intact/query/pubid:IM-27814)

Authors: Kaushik Ganapathy, Eric Yu, Peter Rose (krganapa@ucsd.edu, ery010@ucsd.edu, pwrose@ucsd.edu)

In [1]:
import os
import re
import hashlib 

import pandas as pd
import numpy as np

from pathlib import Path
from Bio import SeqIO

pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

In [2]:
NEO4J_HOME = Path(os.getenv('NEO4J_HOME'))
print(NEO4J_HOME)

/Users/peter/Library/Application Support/Neo4j Desktop/Application/neo4jDatabases/database-4af96121-2328-4e2f-ba60-6d8b728a26d5/installation-4.0.3


https://www.uniprot.org/help/uniprotkb_column_names
https://www.uniprot.org/uniprot/P0DTD1#PRO_0000449630

### Retrieve interaction data from IntAct

##### Query for interactions with SARS-CoV-2: "taxid:2697049"

In [3]:
data = pd.read_csv("https://www.ebi.ac.uk/intact/export?format=mitab_25&query=taxid%3A2697049&negative=false&spoke=false&ontology=false&sort=intact-miscore&asc=false", sep='\t')

In [4]:
print('Number of interactions:', data.shape[0])

Number of interactions: 1036


In [5]:
data.head()

Unnamed: 0,#ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,Taxid interactor B,Interaction type(s),Source database(s),Interaction identifier(s),Confidence value(s)
0,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-1(display_long)|uniprotkb:S...,psi-mi:spike_sars2-1(display_long)|uniprotkb:S...,"psi-mi:""MI:0410""(3D electron microscopy)",Walls et al. (2020),pubmed:32155444|imex:IM-27846,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25495631|wwpdb:6vyb|wwpdb:6vxx|imex...,intact-miscore:0.74
1,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-1(display_long)|uniprotkb:S...,psi-mi:spike_sars2-1(display_long)|uniprotkb:S...,"psi-mi:""MI:0114""(x-ray crystallography)",Xia et al. (2020),imex:IM-27873|pubmed:32231345,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25503580|wwpdb:6LXT|imex:IM-27873-1,intact-miscore:0.74
2,uniprotkb:P0DTD1-PRO_0000449632,uniprotkb:P0DTD1-PRO_0000449632,intact:EBI-25475891,intact:EBI-25475891,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,psi-mi:p0dtd1-pro_0000449632(display_long)|uni...,"psi-mi:""MI:0114""(x-ray crystallography)",Kim. et al. (2020),pubmed:32304108|imex:IM-27884,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25504928|imex:IM-27884-1|wwpdb:6vww...,intact-miscore:0.44
3,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-1(display_long)|uniprotkb:S...,psi-mi:spike_sars2-1(display_long)|uniprotkb:S...,"psi-mi:""MI:2338""(electron tomography)",Wrapp et al. (2020),pubmed:32075877|imex:IM-27946,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25590861|imex:IM-27946-1|wwpdb:6vsb,intact-miscore:0.74
4,uniprotkb:P0DTC2,uniprotkb:P0DTC2,intact:EBI-25474821,intact:EBI-25474821,psi-mi:spike_sars2-1(display_long)|uniprotkb:S...,psi-mi:spike_sars2-1(display_long)|uniprotkb:S...,"psi-mi:""MI:0071""(molecular sieving)",Wrapp et al. (2020),pubmed:32075877|imex:IM-27946,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-C...,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0469""(IntAct)",intact:EBI-25591137|imex:IM-27946-11,intact-miscore:0.74


### Process Data

In [6]:
data.rename(columns={'#ID(s) interactor A': 'interactorA', 'ID(s) interactor B': 'interactorB'}, inplace=True)

In [7]:
data = data.replace('uniprotkb','uniprot', regex=True)

#### Extract UniProt accession number and Uniprot protein id from interactor columns

In [8]:
# uniprot:P0DTD1-PRO_0000449619 -> P0DTD1-PRO_0000449619
data['id_a'] = data['interactorA'].str.replace('uniprot:', '')
data['id_b'] = data['interactorB'].str.replace('uniprot:', '')

# P0DTD1-PRO_0000449619 -> P0DTD1 (UniProt accession number)
data['accession_a'] = data['id_a'].str.split('-', expand=True)[0]
data['accession_b'] = data['id_b'].str.split('-', expand=True)[0]

# P0DTD1-PRO_0000449619 -> PRO_0000449619 (UniProt protein id)
data['pro_id_a'] = data['id_a'].str.split('-', expand=True)[1]
data['pro_id_b'] = data['id_b'].str.split('-', expand=True)[1]

# Add CURIE "uniprot.chain" as prefix (see https://registry.identifiers.org/registry/uniprot.chain)
data['pro_id_a'] = data['pro_id_a'].str.replace('PRO_', 'uniprot.chain:PRO_')
data['pro_id_b'] = data['pro_id_b'].str.replace('PRO_', 'uniprot.chain:PRO_')

data.fillna('', inplace=True)

#### Extract pubmed id
Example: imex:IM-27912|pubmed:32275855 -> 2275855

In [9]:
position_pattern = re.compile('pubmed:(\d*).')

def extract_pubmed_id(s):
    groups = position_pattern.search(s)
    if groups == None:
        return ''
    else:
        return groups.group(1)

In [10]:
data['pubmedId'] = data['Publication Identifier(s)'].apply(extract_pubmed_id)

#### Extract taxonomy id
Example: taxid:9606(human)|taxid:9606(Homo sapiens) -> 9606

In [11]:
position_pattern = re.compile('taxid:(\d*)\(')

def extract_tax_id(s):
    groups = position_pattern.search(s)
    if groups == None:
        return ''
    else:
        return groups.group(1)

In [12]:
data['taxonomy_id_a'] = data['Taxid interactor A'].apply(extract_tax_id)
data['taxonomy_id_b'] = data['Taxid interactor B'].apply(extract_tax_id)

In [13]:
data = data[['id_a', 'id_b', 'accession_a', 'accession_b', 'pro_id_a', 'pro_id_b', 'taxonomy_id_a', 'taxonomy_id_b', 'pubmedId']]

In [14]:
data.head()

Unnamed: 0,id_a,id_b,accession_a,accession_b,pro_id_a,pro_id_b,taxonomy_id_a,taxonomy_id_b,pubmedId
0,P0DTC2,P0DTC2,P0DTC2,P0DTC2,,,2697049,2697049,32155444
1,P0DTC2,P0DTC2,P0DTC2,P0DTC2,,,2697049,2697049,3223134
2,P0DTD1-PRO_0000449632,P0DTD1-PRO_0000449632,P0DTD1,P0DTD1,uniprot.chain:PRO_0000449632,uniprot.chain:PRO_0000449632,2697049,2697049,32304108
3,P0DTC2,P0DTC2,P0DTC2,P0DTC2,,,2697049,2697049,32075877
4,P0DTC2,P0DTC2,P0DTC2,P0DTC2,,,2697049,2697049,32075877


#### Restrict data to SARS-CoV-2 - human protein-protein interactions (TODO: expand to SARS, MERS, etc.)

In [15]:
data = data[((data['taxonomy_id_a'] == '2697049') & (data['taxonomy_id_b'] == '9606')) |
            ((data['taxonomy_id_b'] == '2697049') & (data['taxonomy_id_a'] == '9606'))]

In [16]:
data.drop_duplicates(inplace=True)

#### Retrieve Protein names and sequences from UniProt.org

Collect unique accession numbers

In [17]:
a = data['accession_a'].unique()
b = data['accession_b'].unique()
ids = np.unique(np.concatenate((a, b)))

In [18]:
def split_chains(accession, chains):
    chain_list = []
    for chain in chains:
        items = chain.split(';')
        feature_dict = {}
        
        try:
            if len(items) > 1:
                feature_dict['accession'] = accession
                for item in items:
                    item = item.strip()
                    if '..' in item:
                        feature_dict['start'] = int(items[0].split('..')[0])
                        feature_dict['end'] = int(items[0].split('..')[1])
                    elif item.startswith("/note="):
                        name = item[6:].replace('\"', '')
                        feature_dict['name'] = name
                    elif item.startswith("/id="):
                        pro_id = item[4:].replace('\"', '')
                        feature_dict['proId'] = 'uniprot.chain:' + pro_id
                chain_list.append(feature_dict)
        except:
            print('Skipping incomplete features for', accession, ':', chain)
            
    df = pd.DataFrame(chain_list)
    df.dropna(inplace=True)
    return df

In [19]:
seq_dfs = []
for accession in ids:
    url = f'https://www.uniprot.org/uniprot/?query=accession:{accession}&columns=id,sequence,length,protein%20names,feature(CHAIN),organism-id&format=tab'
    try:
        df = pd.read_csv(url, sep='\t')
        df.fillna('', inplace=True)
        
        whole_chain = df.copy()
        
        start = 1
        end = whole_chain.iloc[0]['Length']

        whole_chain['accession'] = accession
        whole_chain['start'] = start
        whole_chain['end'] = end
        whole_chain['name'] = whole_chain['Protein names']
        whole_chain['name'] = whole_chain['Protein names'].str.split('\(', expand=True)
        whole_chain['proId'] =''
        whole_chain = whole_chain[['accession', 'start', 'end', 'name', 'proId']]
        
        # process features (Chains)
        feature_string = df['Chain'].values[0]
        if (feature_string != ''):
            chains = feature_string.split("CHAIN ")
            chain_df = split_chains(accession, chains)
            if chain_df.shape[0] > 0:
                # Add the whole chain if it is not in the list of chains
                if chain_df.iloc[0]['start'] != start or chain_df.iloc[0]['end'] != end:
                    chain_df = pd.concat([whole_chain, chain_df])
                    
                df = df.merge(chain_df, left_on='Entry', right_on='accession', how='left')
                print(accession, end=' ')
                seq_dfs.append(df)
        else:
            df = df.merge(whole_chain, left_on='Entry', right_on='accession', how='left')
            seq_dfs.append(df)
    except:
        print("Uniprot accession not found:", accession)
        
seq = pd.concat(seq_dfs, axis=0, ignore_index=True)

A3KN83 A7MCY6 O00116 O00124 O00148 O00203 O00231 O00411 O00469 O00566 O00567 O14578 O14656 O14745 O14818 O14874 O14975 O14980 O15027 O15226 O15381 O15397 O43169 O43292 O43395 O43592 O43633 O43660 O43795 O43818 O43823 O43852 O60293 O60573 O60762 O60832 O60884 O60885 O75347 O75439 O75506 O75534 O75569 O75592 O75688 O75746 O75934 O75964 O76021 O76024 O76061 O76094 O94826 O94973 O95070 O95071 O95104 O95260 O95347 O95391 O95573 O95613 O95684 O95757 O95816 O95831 O96028 P00338 P00387 P00750 P04075 P04406 P05023 P05026 P05556 P06280 P06493 P06576 P06733 P06753 P07195 P07203 P07355 P08621 P08708 P08865 P09012 P09132 P09601 P09622 P09874 P09884 P0C7P0 P0DN76 P0DP23 P0DTC1 P0DTC2 P0DTC3 P0DTC4 P0DTC5 P0DTC6 P0DTC7 P0DTC8 P0DTC9 P0DTD1 P0DTD2 P0DTD3 P0DTD8 P10155 P11233 P11310 P11387 P11586 P11940 P12004 P12109 P12268 P13797 P13804 P13861 P13984 P14735 P15151 P15924 P16435 P16615 P16989 P17612 P17987 P19105 P19784 P21964 P22234 P22314 P25205 P25398 P25440 P25705 P25789 P26358 P26599 P27105 P27348

#### Cleave sequences into peptides

In [20]:
def get_subsequence(row):
    start = row['start']
    end = row['end']
    sequence = row['Sequence']
    return sequence[start-1: end]

In [21]:
seq['sequence'] = seq.apply(lambda row: get_subsequence(row), axis=1)

Set flag if protein chain is full length

In [22]:
seq['fullLength'] = (seq['start'] == 1) & (seq['end'] == seq['Length'])

In [23]:
seq['id'] = seq['sequence'].apply(lambda seq: 'md5:' + hashlib.md5(seq.encode()).hexdigest())

In [24]:
seq.rename(columns={'Organism ID': 'taxonomyId'}, inplace=True)

In [25]:
seq['name'] = seq['name'].str.strip()

In [26]:
seq.head()

Unnamed: 0,Entry,Sequence,Length,Protein names,Chain,taxonomyId,accession,start,end,name,proId,sequence,fullLength,id
0,A0A663DJA2,MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT,38,ORF10 proteiin (ORF10 protein),,2697049,A0A663DJA2,1,38,ORF10 proteiin,,MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT,True,md5:06bfef85cf1319e805e05f5bee3ec601
1,A3KN83,MVEPGQDLLLAALSESGISPNDLFDIDGGDAGLATPMPTPSVQQSV...,1393,Protein strawberry notch homolog 1 (Monocyte p...,"CHAIN 1..1393; /note=""Protein strawberry notc...",9606,A3KN83,1,1393,Protein strawberry notch homolog 1,uniprot.chain:PRO_0000314555,MVEPGQDLLLAALSESGISPNDLFDIDGGDAGLATPMPTPSVQQSV...,True,md5:a8f6908047b03fcfde873a516979cacd
2,A7MCY6,MESMFEDDISILTQEALGPSEVWLDSPGDPSLGGDMCSASHFALIT...,615,TANK-binding kinase 1-binding protein 1 (TBK1-...,"CHAIN 1..615; /note=""TANK-binding kinase 1-bi...",9606,A7MCY6,1,615,TANK-binding kinase 1-binding protein 1,uniprot.chain:PRO_0000324654,MESMFEDDISILTQEALGPSEVWLDSPGDPSLGGDMCSASHFALIT...,True,md5:38a1c0fe55ce1552e3e417004717dd21
3,O00116,MAEAAAAAGGTGLGAGASYGSAADRDRDPDPDRAGRRLRVLSGHLL...,658,"Alkyldihydroxyacetonephosphate synthase, perox...","CHAIN 59..658; /note=""Alkyldihydroxyacetoneph...",9606,O00116,1,658,"Alkyldihydroxyacetonephosphate synthase, perox...",,MAEAAAAAGGTGLGAGASYGSAADRDRDPDPDRAGRRLRVLSGHLL...,True,md5:9357ca15dc8abc6d5efb4d6b3f7c5b80
4,O00116,MAEAAAAAGGTGLGAGASYGSAADRDRDPDPDRAGRRLRVLSGHLL...,658,"Alkyldihydroxyacetonephosphate synthase, perox...","CHAIN 59..658; /note=""Alkyldihydroxyacetoneph...",9606,O00116,59,658,"Alkyldihydroxyacetonephosphate synthase, perox...",uniprot.chain:PRO_0000020431,KARRAASAATAAPTATPAAQESGTIPKKRQEVMKWNGWGYNDSKFI...,False,md5:edf403cd7e4b04b81da14b792d847d76


### Save proteins

In [27]:
proteins = seq[['id', 'name', 'accession', 'proId', 'sequence', 'start', 'end', 'fullLength', 'taxonomyId']].copy()
proteins['accession'] = 'uniprot:' + proteins['accession']
proteins['taxonomyId'] = 'taxonomy:' + proteins['taxonomyId'].astype(str)
proteins.to_csv(NEO4J_HOME / 'import/01e-ProteinProteinInteractionProtein.csv', index = False)

In [28]:
print('Number of proteins:', proteins.shape[0])

Number of proteins: 855


In [29]:
proteins.head()

Unnamed: 0,id,name,accession,proId,sequence,start,end,fullLength,taxonomyId
0,md5:06bfef85cf1319e805e05f5bee3ec601,ORF10 proteiin,uniprot:A0A663DJA2,,MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT,1,38,True,taxonomy:2697049
1,md5:a8f6908047b03fcfde873a516979cacd,Protein strawberry notch homolog 1,uniprot:A3KN83,uniprot.chain:PRO_0000314555,MVEPGQDLLLAALSESGISPNDLFDIDGGDAGLATPMPTPSVQQSV...,1,1393,True,taxonomy:9606
2,md5:38a1c0fe55ce1552e3e417004717dd21,TANK-binding kinase 1-binding protein 1,uniprot:A7MCY6,uniprot.chain:PRO_0000324654,MESMFEDDISILTQEALGPSEVWLDSPGDPSLGGDMCSASHFALIT...,1,615,True,taxonomy:9606
3,md5:9357ca15dc8abc6d5efb4d6b3f7c5b80,"Alkyldihydroxyacetonephosphate synthase, perox...",uniprot:O00116,,MAEAAAAAGGTGLGAGASYGSAADRDRDPDPDRAGRRLRVLSGHLL...,1,658,True,taxonomy:9606
4,md5:edf403cd7e4b04b81da14b792d847d76,"Alkyldihydroxyacetonephosphate synthase, perox...",uniprot:O00116,uniprot.chain:PRO_0000020431,KARRAASAATAAPTATPAAQESGTIPKKRQEVMKWNGWGYNDSKFI...,59,658,False,taxonomy:9606


### Merge interaction data with sequence data

In [30]:
sequences = seq[['accession', 'proId', 'id']].copy()
sequences_full_length = seq.query('fullLength')
sequences_full_length = sequences_full_length[['accession', 'proId', 'id']].copy()

In [31]:
sequences_full_length.head()

Unnamed: 0,accession,proId,id
0,A0A663DJA2,,md5:06bfef85cf1319e805e05f5bee3ec601
1,A3KN83,uniprot.chain:PRO_0000314555,md5:a8f6908047b03fcfde873a516979cacd
2,A7MCY6,uniprot.chain:PRO_0000324654,md5:38a1c0fe55ce1552e3e417004717dd21
3,O00116,,md5:9357ca15dc8abc6d5efb4d6b3f7c5b80
5,O00124,uniprot.chain:PRO_0000211033,md5:5291ee7a85d801c849a32f90c955a077


In [32]:
sequences_full_length.query("accession == 'Q9BYF1'")

Unnamed: 0,accession,proId,id
710,Q9BYF1,,md5:906dc56d5c9c5513eef859ee82e80267


In [33]:
interact = data[['accession_a', 'accession_b', 'pro_id_a', 'pro_id_b', 'pubmedId']]

In [34]:
interact.query("accession_a == 'P0DTC2'")

Unnamed: 0,accession_a,accession_b,pro_id_a,pro_id_b,pubmedId
60,P0DTC2,Q9C0B5,,,
61,P0DTC2,Q7Z5G4,,,
599,P0DTC2,P62269,,,
600,P0DTC2,Q96N67,,,
601,P0DTC2,P63244,,,
602,P0DTC2,P40227,,,
603,P0DTC2,P07195,,,
604,P0DTC2,P08621,,,
605,P0DTC2,P60660,,,
606,P0DTC2,P47756,,,


Split dataframe into two parts, depending on the presence of a protein id for interactor A. Each part is processed differently.

In [35]:
interact_pro_a = interact.query("pro_id_a != ''").copy()
interact_acc_a = interact.query("pro_id_a == ''").copy()

In [36]:
interact_pro_a = interact_pro_a.merge(sequences, left_on='pro_id_a', right_on='proId', how='left')
interact_pro_a.fillna('', inplace=True)

In [37]:
interact_acc_a = interact_acc_a.merge(sequences_full_length, left_on='accession_a', right_on='accession', how='left')
interact_acc_a.fillna('', inplace=True)

Concatenate the dataframes back together

In [38]:
interact = pd.concat([interact_pro_a, interact_acc_a])
interact.rename(columns={'id': 'id_a'}, inplace=True)

In [39]:
print('total interactions:', interact.shape[0])

total interactions: 963


Split dataframe into two parts, depending on the presence of a protein id for interactor B. Each part is processed differently.

In [40]:
interact_pro_b = interact.query("pro_id_b != ''").copy()
interact_acc_b = interact.query("pro_id_b == ''").copy()

In [41]:
interact_pro_b = interact_pro_b.merge(sequences, left_on='pro_id_b', right_on='proId', how='left')
interact_pro_b.fillna('', inplace=True)

In [42]:
interact_acc_b = interact_acc_b.merge(sequences_full_length, left_on='accession_b', right_on='accession', how='left')
interact_acc_b.fillna('', inplace=True)

Concatenate the dataframes back together

In [43]:
interact = pd.concat([interact_pro_b, interact_acc_b])
interact.rename(columns={'id': 'id_b'}, inplace=True)
interact.head()

Unnamed: 0,accession_a,accession_b,pro_id_a,pro_id_b,pubmedId,accession_x,proId_x,id_a,accession_y,proId_y,id_b
0,O75347,P0DTC1,,uniprot.chain:PRO_0000449645,,O75347,,md5:895df31d01af35825589954917c6b682,P0DTC1,uniprot.chain:PRO_0000449645,md5:63d2c81f37726f44c600eb5225676a66
1,Q92769,P0DTD1,,uniprot.chain:PRO_0000449623,,Q92769,uniprot.chain:PRO_0000114693,md5:6d48f5d9d96f10557d680247a7cdcfe9,P0DTD1,uniprot.chain:PRO_0000449623,md5:dc6436f559bc873ac013085f6e56d467
2,Q9UJZ1,intact:EBI,,25475912,,Q9UJZ1,,md5:4f0a474a755af072900d84d3cacc6d92,,,
3,P61158,P0DTD1,,uniprot.chain:PRO_0000449633,,P61158,,md5:c07966db47593469f42f4ee1a7b0ccdf,P0DTD1,uniprot.chain:PRO_0000449633,md5:42b973dceb72cdea04fda203c55b67bc
0,P0DTD1,Q14181,uniprot.chain:PRO_0000449619,,,P0DTD1,uniprot.chain:PRO_0000449619,md5:5c2c364f44079728c451280435c4236a,Q14181,uniprot.chain:PRO_0000194035,md5:41e036ebe08ce3a96edad90d8f2efef0


In [44]:
interact.query("accession_a == 'P0DTC1'")

Unnamed: 0,accession_a,accession_b,pro_id_a,pro_id_b,pubmedId,accession_x,proId_x,id_a,accession_y,proId_y,id_b
259,P0DTC1,Q9UM54,uniprot.chain:PRO_0000449645,,,P0DTC1,uniprot.chain:PRO_0000449645,md5:63d2c81f37726f44c600eb5225676a66,Q9UM54,uniprot.chain:PRO_0000123464,md5:6c4869132f6880065ac9e8d67ff15787
260,P0DTC1,O43795,uniprot.chain:PRO_0000449645,,,P0DTC1,uniprot.chain:PRO_0000449645,md5:63d2c81f37726f44c600eb5225676a66,O43795,uniprot.chain:PRO_0000123442,md5:250d73405f887504ee3046934f5cd258
261,P0DTC1,P61158,uniprot.chain:PRO_0000449645,,,P0DTC1,uniprot.chain:PRO_0000449645,md5:63d2c81f37726f44c600eb5225676a66,P61158,,md5:c07966db47593469f42f4ee1a7b0ccdf


### Save interactions

Remove interactions without md5 ids

In [45]:
interact = interact[(interact['id_a'] != '') & (interact['id_b'] != '')]

In [46]:
interact = interact[['id_a', 'id_b']]
interact.drop_duplicates(inplace=True)
interact.to_csv(NEO4J_HOME / 'import/01e-ProteinProteinInteraction.csv', index = False)

In [47]:
print('Number of interactions:', interact.shape[0])

Number of interactions: 951


In [48]:
interact.head()

Unnamed: 0,id_a,id_b
0,md5:895df31d01af35825589954917c6b682,md5:63d2c81f37726f44c600eb5225676a66
1,md5:6d48f5d9d96f10557d680247a7cdcfe9,md5:dc6436f559bc873ac013085f6e56d467
3,md5:c07966db47593469f42f4ee1a7b0ccdf,md5:42b973dceb72cdea04fda203c55b67bc
0,md5:5c2c364f44079728c451280435c4236a,md5:41e036ebe08ce3a96edad90d8f2efef0
1,md5:5c2c364f44079728c451280435c4236a,md5:9b6eac378c77b98ec8ea9925973fbd03
